In [275]:
import pandas as pd
import os
import yaml
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import re


In [276]:
def load_csv():
    dir = os.getcwd()
    df = pd.read_csv( f'{dir}/../../data/local/linkedin_job_listing_sample.csv')

    return df

df = load_csv()

df['text'] = df['job_title'] + df['job_description_text']

print( df.head() )

   job_posting_id         job_title  \
0             NaN               NaN   
1    4.325451e+09     Data Engineer   
2    4.325478e+09     Data Engineer   
3    4.325606e+09  AI Data Engineer   
4    4.325686e+09     Data Engineer   

                                                 url  \
0                                                NaN   
1  https://www.linkedin.com/jobs/view/data-engine...   
2  https://www.linkedin.com/jobs/view/data-engine...   
3  https://www.linkedin.com/jobs/view/ai-data-eng...   
4  https://www.linkedin.com/jobs/view/data-engine...   

                                job_description_text  \
0                                                NaN   
1   Software Guidance &amp; Assistance, Inc., (SG...   
2   Position: &#xA0;Data Engineer Compensation: &...   
3   Join our newly developing AI Competence Cente...   
4   Overview BigBear.ai are seeking a highly skil...   

                                                text  
0                                   

In [277]:
from __future__ import annotations

from typing import Any, Dict, List, Iterable, Tuple, Optional
import pandas as pd
import re


def variations_to_df(
    obj: Any,
    *,
    include_canonical: bool = False,
    dedup: bool = True,
    normalize: bool = True,
) -> pd.DataFrame:
    """
    Coleta TODAS as listas 'variations' de um YAML já carregado (dict/list),
    e devolve um DataFrame com 1 row por variação.

    Params:
      include_canonical: se True, quando encontrar um dict com 'canonical',
        inclui canonical como variação também.
      dedup: remove duplicadas (pelo texto normalizado ou original, dependendo de normalize)
      normalize: cria coluna 'variation_norm' (lower + trim + espaços)
    """

    def as_list(x: Any) -> List[str]:
        if x is None:
            return []
        if isinstance(x, str):
            return [x]
        if isinstance(x, list):
            out = []
            for it in x:
                if it is None:
                    continue
                s = it if isinstance(it, str) else str(it)
                s = s.strip()
                if s:
                    out.append(s)
            return out
        return [str(x).strip()]

    ws = re.compile(r"\s+")
    def norm(s: str) -> str:
        s = s.strip().lower()
        s = ws.sub(" ", s)
        return s

    rows: List[Dict[str, Any]] = []

    def walk(node: Any, path: Tuple[Any, ...] = ()):
        if isinstance(node, dict):
            # se tem variations, coleta
            if "variations" in node:
                vars_ = as_list(node.get("variations"))
                if include_canonical and "canonical" in node and node["canonical"]:
                    vars_ = [str(node["canonical"]).strip(), *vars_]

                for v in vars_:
                    rows.append({
                        "variation": v,
                        "path": ".".join(map(str, path + ("variations",))),
                    })

            # segue recursão
            for k, v in node.items():
                walk(v, path + (k,))

        elif isinstance(node, list):
            for i, item in enumerate(node):
                walk(item, path + (i,))

        # outros tipos: ignora

    walk(obj)

    dfv = pd.DataFrame(rows)
    if dfv.empty:
        return dfv.assign(variation_norm=pd.Series(dtype=str)) if normalize else dfv

    if normalize:
        dfv["variation_norm"] = dfv["variation"].map(norm)

    if dedup:
        subset = ["variation_norm"] if normalize else ["variation"]
        dfv = dfv.drop_duplicates(subset=subset).reset_index(drop=True)

    return dfv


In [278]:
def load_skill_catalog():
    dir = os.getcwd()
    catalog_file = (f"{dir}/../../src/skills_detection/config/skills_catalog.yaml")

    with open(catalog_file,'r') as f:
        yaml_data = yaml.safe_load(f) 

    return pd.DataFrame([{"data": yaml_data}])

catalog = load_skill_catalog()

df_catalog = variations_to_df(catalog.loc[0, "data"], include_canonical=False, dedup=True, normalize=True)

catalog_unities = set();

print(catalog_unities)

for ct in df_catalog["variation_norm"].tolist():
    catalog_unities.add(ct)

print( len(catalog_unities))

print(catalog_unities)

set()
284
{'presto', 'azure event hubs', 'azure cosmos', 'denodo', 'google cloud platform', 'oracle', 'sql', 'collaborative', 'aws lambda', 'delta lake', 'gitlab ci/cd', 'exadata', 'mssql', 'agile', 'lineage', 'luigi', 'data lakehouse', 'circle ci', 'pl-sql', 'cosmosdb', 'dagster', 'psql', 'tableau', 'scikit-learn', 'google dataproc', 'leadership', 'kubernetes', 'sklearn', 'arm templates', 'transact-sql', 'prestodb', 'aws cloudformation', 'google cloud composer', 'adls', 'github', 'aws rds', 'cloudformation', 'aws redshift', 'scikit learn', 'elasticsearch', 'dataops', 'gcp data engineer', 'amazon glue', 'aws dynamodb', 'ms azure', 'restful', 'azure sql', 'neo4j', 'dask', 'pyspark', 'redshift', 'data governance', 'go', 'azure sql database', 'synapse analytics', 'oracle database', 'nifi', 'looker', 'collibra', 'spotify luigi', 'rust', 'mongo', 'continuous integration', 'apache kafka', 'apache beam', 'delta', 'oracle db', 'ansible', 'data lineage', 'ts', 'eventhub', 'grafana', 'newrelic',

In [279]:

BAD_TOKEN_RE  = re.compile(r"(?i)\b(?:x[0-9a-f]{2,4}|u[0-9a-f]{4})\b")   # xE9, xA0, x2019, u2019
BAD_ESCAPE_RE = re.compile(r"(?i)(?:\\x[0-9a-f]{2}|\\u[0-9a-f]{4})")     # \xNN, \uNNNN
HTML_JUNK_RE  = re.compile(r"(?i)\b(?:amp|apos|nbsp|quot|lt|gt)\b")      # entidades comuns

def preprocess(text: str) -> str:
    text = "" if text is None else str(text)
    text = BAD_ESCAPE_RE.sub(" ", text)
    text = BAD_TOKEN_RE.sub(" ", text)
    text = HTML_JUNK_RE.sub(" ", text)
    return text

In [280]:

vectorizer = TfidfVectorizer(
    preprocessor=preprocess,
    stop_words='english', 
    ngram_range=(1, 5),
    min_df=2,
    max_df=0.8,
    token_pattern=r"(?u)[A-Za-z0-9][A-Za-z0-9+.#/_-]{1,}"
)

data_sample = df['text'].fillna("").astype(str)

print( data_sample )

X = vectorizer.fit_transform( data_sample )

feature_names  = (vectorizer.get_feature_names_out())


0                                                        
1       Data Engineer Software Guidance &amp; Assistan...
2       Data Engineer Position: &#xA0;Data Engineer Co...
3       AI Data Engineer Join our newly developing AI ...
4       Data Engineer Overview BigBear.ai are seeking ...
                              ...                        
3346    Senior Data Engineer Role: Senior Data Enginee...
3347    Data Architect / Lead Data Engineer Hola, Soy ...
3348    Test Data Engineer IT Strong and modern IT is ...
3349    Backend / Data Engineer Senior Software Engine...
3350    HPC & Data Engineer BI-REX Big Data Innovation...
Name: text, Length: 3351, dtype: object


In [281]:
def topk_by_ngram(
    X,
    vectorizer,
    *,
    n_values=(1, 2),
    topk=200,
    sort_by="tfidf_sum",   # "tfidf_sum" | "doc_freq"
):
    feature_names = vectorizer.get_feature_names_out().astype(str)
    scores = X.sum(axis=0).A1
    df_counts = (X > 0).sum(axis=0).A1

    # n = número de palavras no termo (unigram=1, bigram=2, ...)
    ngram_len = np.array([len(t.split()) for t in feature_names])

    out = {}
    for n in n_values:
        idx = np.where(ngram_len == n)[0]
        if idx.size == 0:
            out[n] = pd.DataFrame(columns=["term", "tfidf_sum", "doc_freq"])
            continue

        key = scores[idx] if sort_by == "tfidf_sum" else df_counts[idx]
        top_idx = idx[np.argsort(key)[::-1][:topk]]

        out[n] = (
            pd.DataFrame({
                "term": feature_names[top_idx],
                "tfidf_sum": scores[top_idx],
                "doc_freq": df_counts[top_idx],
            })
            .sort_values(sort_by, ascending=False)
            .reset_index(drop=True)
        )

    return out

tables = topk_by_ngram(X, vectorizer, n_values=(1,2,3,4,5), topk=10000)



In [282]:
print("=== UNIGRAMS ===")
print(tables[1].to_string(index=False))

=== UNIGRAMS ===
                                                               term  tfidf_sum  doc_freq
                                                                 en  39.638233       233
                                                                 We  39.265886      2082
                                                         Experience  37.975639      2313
                                                              Azure  37.395040      1685
                                                                 AI  35.364400       949
                                                          pipelines  32.653198      2378
                                                           business  32.075991      2047
                                                               work  30.001953      1887
                                                                You  29.740784      1501
                                                               team  28.899676      1807
    

In [283]:

print("\n=== BIGRAMS ===")
print(tables[2].to_string(index=False))


=== BIGRAMS ===


                                 term  tfidf_sum  doc_freq
                       data pipelines  22.742888      1771
                     data engineering  20.150076      1565
                     years experience  16.541795      1324
                           Azure Data  16.428126       553
                          Senior Data  14.829825       640
                         data quality  14.645297      1189
                     Data Engineering  13.619497       816
                      data processing  12.959077       780
                       best practices  12.300692      1030
                        scalable data  12.176163      1013
                               You ll  11.531466       663
                     Computer Science  11.497320      1083
                        data modeling  10.941782       814
                      data governance  10.509961       705
                      experience data  10.458005       854
                       data solutions  10.329717       6

In [284]:
print("\n=== TRIGRAMS ===")
print(tables[3].to_string(index=False))


=== TRIGRAMS ===
                                                                   term  tfidf_sum  doc_freq
                                                   Senior Data Engineer  13.387339       599
                                                scalable data pipelines   8.481220       576
                                            experience data engineering   8.120462       595
                                                     Azure Data Factory   7.957007       383
                                                degree Computer Science   7.750283       571
                                                  years experience data   7.587412       485
                                                     Product Owner Data   6.922588        98
                                                       Apply Dice today   6.573031       353
                                               seeking following. Apply   6.573031       353
                                                    

In [285]:
print("\n=== 4-TETRAGRAMS ===")
print(tables[4].to_string(index=False))


=== 4-TETRAGRAMS ===
                                                                               term  tfidf_sum  doc_freq
                                                    Dice leading career destination   6.573031       353
                                                    career destination tech experts   6.573031       353
                                                         experts stage careers. Our   6.573031       353
                                                      seeking following. Apply Dice   6.573031       353
                                                     destination tech experts stage   6.573031       353
                                                        tech experts stage careers.   6.573031       353
                                                          stage careers. Our client   6.573031       353
                                                    leading career destination tech   6.573031       353
                                 

In [286]:
print("\n=== 5-GRAMS ===")
print(tables[5].to_string(index=False))


=== 5-GRAMS ===


                                                                                                  term  tfidf_sum  doc_freq
                                                               destination tech experts stage careers.   6.573031       353
                                                               leading career destination tech experts   6.573031       353
                                                                  Dice leading career destination tech   6.573031       353
                                                                   seeking following. Apply Dice today   6.573031       353
                                                                       tech experts stage careers. Our   6.573031       353
                                                                 career destination tech experts stage   6.573031       353
                                                                     experts stage careers. Our client   6.573031       353
        

In [287]:
#Filtrar pelos termos mais frequentes
scores = X.sum(axis=0).A1
topk = 200
idx = scores.argsort()[::-1][:topk]

X_f = X[:, idx]
terms_f = feature_names[idx]

print( terms_f )

['en' 'We' 'Experience' 'Azure' 'AI' 'pipelines' 'business' 'work' 'You'
 'team' 'solutions' 'engineering' 'et' 'SQL' 'development' 'AWS' 'tools'
 'technical' 'The' 'including' 'years' 'using' 'Python' 'Databricks'
 'data pipelines' 'und' 'cloud' 'Snowflake' 'Cloud' 'Strong' 'skills'
 'support' 'la' 'teams' 'working' 'e.g.' 'analytics' 'role' 'design'
 'performance' 'ETL' 'Engineering' 'data engineering' 'scalable' 'quality'
 'maintain' 'platforms' 'building' 'll' 'Senior' 'des' 'Our' 'services'
 'systems' 'practices' 'processing' 'best' 'Spark' 'This' 'build'
 'years experience' 'Azure Data' 'like' 'technologies' 'BI' 'processes'
 'architecture' 'workflows' 'platform' 'management' 'GCP' 'related' 'Job'
 'knowledge' 'Skills' 'ensure' 'global' 'Science' 'healthcare' 'Work'
 'product' 'governance' 'Knowledge' 'expertise' 'Senior Data'
 'Familiarity' 'Design' 'clients' 'models' 'Product' 'data quality'
 'requirements' 'What' 'CI/CD' 'deliver' 'complex' 'security'
 'understanding' 'help' '

In [288]:
#Termos raros
idf = vectorizer.idf_
mask = (idf >= 1.5) & (idf <= 5.0)  # ajuste os limites

X_f = X[:, mask]
terms_f = feature_names[mask]

print(terms_f)


['000' '000+' '1+' ... 'years relevant experience equivalent'
 'years relevant experience equivalent combination' 'you.']


In [289]:
_ws = re.compile(r"\s+")
def norm_term(s: str) -> str:
    s = str(s).strip().lower()
    s = _ws.sub(" ", s)
    return s

def filter_new_terms(tables: dict[int, pd.DataFrame], catalog_unities: set[str]):
    new_tables = {}
    for n, t in tables.items():
        if t.empty:
            new_tables[n] = t
            continue
        t2 = t.copy()
        t2["term_norm"] = t2["term"].map(norm_term)
        new_tables[n] = (
            t2[~t2["term_norm"].isin(catalog_unities)]
            .reset_index(drop=True)
        )
    return new_tables


new_tables = filter_new_terms(tables, catalog_unities)

In [290]:
print("=== NOVOS UNIGRAMS ===")
print(new_tables[1].head(50).to_string(index=False))

=== NOVOS UNIGRAMS ===
       term  tfidf_sum  doc_freq   term_norm
         en  39.638233       233          en
         We  39.265886      2082          we
 Experience  37.975639      2313  experience
         AI  35.364400       949          ai
  pipelines  32.653198      2378   pipelines
   business  32.075991      2047    business
       work  30.001953      1887        work
        You  29.740784      1501         you
       team  28.899676      1807        team
  solutions  28.825001      1984   solutions
engineering  28.673098      1991 engineering
         et  26.864284       105          et
development  26.562899      1732 development
      tools  25.079549      1832       tools
  technical  24.402318      1579   technical
        The  24.352227      1602         the
  including  24.321512      1880   including
      years  24.196097      2213       years
      using  23.814286      1788       using
        und  22.703034        57         und
      cloud  22.161867      1665

In [291]:
print("\n=== NOVOS BIGRAMS ===")
print(new_tables[2].head(50).to_string(index=False))


=== NOVOS BIGRAMS ===
                  term  tfidf_sum  doc_freq              term_norm
        data pipelines  22.742888      1771         data pipelines
      data engineering  20.150076      1565       data engineering
      years experience  16.541795      1324       years experience
            Azure Data  16.428126       553             azure data
           Senior Data  14.829825       640            senior data
      Data Engineering  13.619497       816       data engineering
       data processing  12.959077       780        data processing
        best practices  12.300692      1030         best practices
         scalable data  12.176163      1013          scalable data
                You ll  11.531466       663                 you ll
      Computer Science  11.497320      1083       computer science
         data modeling  10.941782       814          data modeling
       experience data  10.458005       854        experience data
        data solutions  10.329717      

In [292]:
print("\n=== NOVOS TRIGRAMS ===")
print(new_tables[3].head(50).to_string(index=False))


=== NOVOS TRIGRAMS ===
                                    term  tfidf_sum  doc_freq                                term_norm
                    Senior Data Engineer  13.387339       599                     senior data engineer
                 scalable data pipelines   8.481220       576                  scalable data pipelines
             experience data engineering   8.120462       595              experience data engineering
                 degree Computer Science   7.750283       571                  degree computer science
                   years experience data   7.587412       485                    years experience data
                      Product Owner Data   6.922588        98                       product owner data
                        Apply Dice today   6.573031       353                         apply dice today
                seeking following. Apply   6.573031       353                 seeking following. apply
                      tech experts stage   6.5730

In [293]:
def pick_ngrams_with_word(tables: dict[int, pd.DataFrame], word: str, n_values=(3,4,5)):
    pat = re.compile(rf"(?i)\b{re.escape(word)}\b")

    frames = []
    for n in n_values:
        t = tables.get(n)
        if t is None or t.empty:
            continue
        frames.append(t[t["term"].str.contains(pat, na=False)].assign(n=n))

    if not frames:
        return pd.DataFrame(columns=["n","term","tfidf_sum","doc_freq"])
    return pd.concat(frames, ignore_index=True).sort_values("tfidf_sum", ascending=False)

In [294]:
cand = pick_ngrams_with_word(tables, word="years experience", n_values=(3,4,5))
print(cand.head(100).to_string(index=False))


                                         term  tfidf_sum  doc_freq  n
                        years experience data   7.587412       485  3
            years experience data engineering   6.466333       384  4
                          5+ years experience   4.726812       255  3
                        years experience Data   3.682290       182  3
                          3+ years experience   3.525865       132  3
                          2+ years experience   3.274757       115  3
                  experience years experience   3.035496        92  3
             experience years experience data   3.015400        89  4
       equivalent experience years experience   3.015400        89  4
  equivalent experience years experience data   3.015400        89  5
 experience years experience data engineering   2.994597        88  5
 field equivalent experience years experience   2.994597        88  5
  years experience data engineering Technical   2.994597        88  5
                    

In [295]:
def pick_ngrams_with_all_words(tables, words, n_values=(3,4,5)):
    pats = [re.compile(rf"(?i)\b{re.escape(w)}\b") for w in words]
    frames = []
    for n in n_values:
        t = tables.get(n)
        if t is None or t.empty:
            continue
        mask = pd.Series(True, index=t.index)
        for p in pats:
            mask &= t["term"].str.contains(p, na=False)
            
        frames.append(t[mask].assign(n=n))
    return pd.concat(frames, ignore_index=True).sort_values("tfidf_sum", ascending=False) if frames else \
           pd.DataFrame(columns=["n","term","tfidf_sum","doc_freq"])



In [296]:

cand2 = pick_ngrams_with_all_words(tables, ["years","experience"], n_values=(3,4,5))
print(cand2.head(100).to_string(index=False))

                                            term  tfidf_sum  doc_freq  n
                           years experience data   7.587412       485  3
               years experience data engineering   6.466333       384  4
                             5+ years experience   4.726812       255  3
                           years experience Data   3.682290       182  3
                       years hands-on experience   3.645730       185  3
                             3+ years experience   3.525865       132  3
                       years relevant experience   3.276670       154  3
                             2+ years experience   3.274757       115  3
                     experience years experience   3.035496        92  3
          equivalent experience years experience   3.015400        89  4
                experience years experience data   3.015400        89  4
     equivalent experience years experience data   3.015400        89  5
                     equivalent experience years   

In [297]:

cand3 = pick_ngrams_with_all_words(tables, ["canada"], n_values=(2,3,4))
print(cand3.head(100).to_string(index=False))

                          term  tfidf_sum  doc_freq  n
position posted Canada Capital    0.29565        18  4
   Capital One Canada position    0.29565        18  4
    One Canada position posted    0.29565        18  4
     posted Canada Capital One    0.29565        18  4
   note position posted Canada    0.29565        18  4
 Canada position posted United    0.29565        18  4
     Canada Capital One Canada    0.29565        18  4


In [298]:
fn = vectorizer.get_feature_names_out().astype(str)
print("tem 'brasil' em alguma feature?", np.any(np.char.find(fn, "brasil") >= 0))
print("tem 'brazil' em alguma feature?", np.any(np.char.find(fn, "brazil") >= 0))
print("tem 'portugues' em alguma feature?", np.any(np.char.find(fn, "portugues") >= 0))
print("tem 'portuguese' em alguma feature?", np.any(np.char.find(fn, "portuguese") >= 0))
print("tem 'latam' em alguma feature?", np.any(np.char.find(fn, "latam") >= 0))
print("tem 'latin' em alguma feature?", np.any(np.char.find(fn, "latin") >= 0))
print("tem 'canada' em alguma feature?", np.any(np.char.find(fn, "canada") >= 0))


tem 'brasil' em alguma feature? True
tem 'brazil' em alguma feature? False
tem 'portugues' em alguma feature? False
tem 'portuguese' em alguma feature? False
tem 'latam' em alguma feature? False
tem 'latin' em alguma feature? True
tem 'canada' em alguma feature? False


In [299]:

GENERIC_WORDS = {
    "years","year","experience","strong","work","team","business","including","responsibilities",
    "skills","ability","knowledge","required","preferred","must","plus","degree","develop","development",
    "engineer","engineering","role","salary","benefits","company","client"
}

def is_probably_non_tech(term: str) -> bool:
    t = term.lower().strip()

    # mata frases de recrutamento clássicas
    if re.search(r"\b\d+\+?\s+years?\b", t):                 # "5+ years"
        return True
    if re.search(r"\byears?\s+of\s+experience\b", t):        # "years of experience"
        return True
    if re.search(r"\b(bachelor|master|phd|degree)\b", t):
        return True

    words = t.split()
    # se TODAS as palavras são genéricas => não-tech
    if words and all(w in GENERIC_WORDS for w in words):
        return True

    return False


def filter_new_terms_strict(df, catalog_unities):
    df2 = df.copy()
    df2["term_norm"] = df2["term"].str.lower().str.replace(r"\s+", " ", regex=True).str.strip()

    # 1) remove o que já está no catálogo
    df2 = df2[~df2["term_norm"].isin(catalog_unities)]

    # 2) remove não-tech por heurística
    df2 = df2[~df2["term"].map(is_probably_non_tech)]

    return df2.reset_index(drop=True)


In [300]:
tables = topk_by_ngram(X, vectorizer, n_values=(2,3,4,5), topk=10000)  # aumenta topk p/ não perder candidatos
new_tables = {n: filter_new_terms_strict(t, catalog_unities) for n, t in tables.items()}

print(new_tables[2].head(50).to_string(index=False))


                  term  tfidf_sum  doc_freq              term_norm
        data pipelines  22.742888      1771         data pipelines
      data engineering  20.150076      1565       data engineering
            Azure Data  16.428126       553             azure data
           Senior Data  14.829825       640            senior data
      Data Engineering  13.619497       816       data engineering
       data processing  12.959077       780        data processing
        best practices  12.300692      1030         best practices
         scalable data  12.176163      1013          scalable data
                You ll  11.531466       663                 you ll
      Computer Science  11.497320      1083       computer science
         data modeling  10.941782       814          data modeling
       experience data  10.458005       854        experience data
        data solutions  10.329717       668         data solutions
   Hands-on experience   9.797034       719    hands-on experi

In [301]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer as TFChar

def train_term_tech_classifier(positive_terms, negative_terms):
    y = np.array([1]*len(positive_terms) + [0]*len(negative_terms))
    X_terms = positive_terms + negative_terms

    vec = TFChar(analyzer="char_wb", ngram_range=(2,5), min_df=1)
    Xv = vec.fit_transform(X_terms)

    clf = LogisticRegression(max_iter=2000, class_weight="balanced")
    clf.fit(Xv, y)
    return vec, clf

# positivos do catálogo
pos = df_catalog["variation_norm"].tolist()

# negativos: pegue um monte de candidatos "óbvios não-tech"
# (ex: unigrams top por doc_freq que não estão no catálogo)
all_terms = vectorizer.get_feature_names_out().astype(str)

neg = [t.lower().strip() for t in all_terms
       if t.lower().strip() not in catalog_unities and is_probably_non_tech(t)]

neg = neg[:5000]  # suficiente

vec_term, clf = train_term_tech_classifier(pos, neg)

def score_terms(terms):
    Xt = vec_term.transform([t.lower().strip() for t in terms])
    return clf.predict_proba(Xt)[:,1]


In [None]:
cands = new_tables[3].copy()
cands["tech_prob"] = score_terms(cands["term"].tolist())
cands = cands.sort_values(["tech_prob","tfidf_sum"], ascending=False)
print(cands.head(1000).to_string(index=False))


                                    term  tfidf_sum  doc_freq                                term_norm  tech_prob
                        AWS Azure Google   1.375603        65                         aws azure google   0.989859
                           AWS Azure GCP   5.287150       246                            aws azure gcp   0.986241
                           AWS GCP Azure   1.673190        69                            aws gcp azure   0.986241
                           Azure AWS GCP   1.326786        67                            azure aws gcp   0.986241
                           Azure GCP AWS   0.414883        12                            azure gcp aws   0.986241
                           GCP Azure AWS   0.381612        17                            gcp azure aws   0.986241
                  Microsoft Azure Google   0.709834        24                   microsoft azure google   0.986186
                     AWS Microsoft Azure   0.736023        24                      aws m