# Evaluating Gensim Phrases
Choosing threshold values.

In [1]:
from gensim.models.phrases import Phrases
import pandas as pd
import os.path

from hidden_vars import TABLES_DIR

In [2]:
# Load the three pickled gensim phrase models, keyed "bigram", "trigram"
# and "fourgram".
models = {
    f"{prefix}gram": Phrases.load(f"gensim_phrase_models/{prefix}gram_phrases.pkl")
    for prefix in ("bi", "tri", "four")
}

# Build one (Phrase, Score) table per model, each ordered by ascending score.
# All three frames are constructed before any of the names is bound.
bigram_df, trigram_df, fourgram_df = (
    pd.DataFrame(
        models[model_key].export_phrases().items(),
        columns=["Phrase", "Score"],
    ).sort_values(by="Score")
    for model_key in ("bigram", "trigram", "fourgram")
)

In [3]:
# Show the four-gram table row(s) whose phrase is exactly "money_supply".
fourgram_df.loc[fourgram_df["Phrase"].eq("money_supply")]

Unnamed: 0,Phrase,Score
74,money_supply,0.597756


In [4]:
# Ask the four-gram model to score "federal_reserve" followed by "bank" as a
# candidate phrase. The empty list is the third argument (presumably the
# tokens lying between the two words -- confirm against the gensim docs).
models["fourgram"].score_candidate("federal_reserve", "bank", [])

('federal_reserve_bank', 0.5975215517786429)

In [5]:
# Hand-written token sequence containing multi-word terms that the phrase
# models are expected to join (plus a few deliberately incomplete ones, such
# as the bare "federal", "open").
test_sent = [
    "federal", "open", "market", "committee", 
    "meeting", 
    "federal", "reserve", "bank", 
    "new", "york",
    "repurchase", "agreement", 
    "residential", "real", "estate", 
    "commercial", "real", "estate", 
    "core", "inflation", 
    "federal", "open",
    "euro", "area",
    "maximum", "employment",
    "federal", "fund", "rate",
    "board", "governor", "federal", "reserve", "system",
    "discount", "rate"
]

# Apply the models in dict insertion order (bigram, trigram, fourgram); each
# model is applied to the previous model's output, so phrases can grow by one
# token per pass. Printing the model shows its parameters.
n_gram_word_list = test_sent
for phrase_model in models.values():
    print(phrase_model)
    n_gram_word_list = phrase_model[n_gram_word_list]

# Print one resulting token per line. A named loop variable is used instead
# of `_`, which IPython reserves for the most recent cell output.
for token in n_gram_word_list:
    print(token)

# Drop the scratch names, including the loop variable, from the namespace.
del test_sent, n_gram_word_list, phrase_model, token

Phrases<3413836 vocab, min_count=3067, threshold=0.66, max_vocab_size=40000000>
Phrases<3454842 vocab, min_count=3067, threshold=0.6, max_vocab_size=40000000>
Phrases<3496643 vocab, min_count=3067, threshold=0.5, max_vocab_size=40000000>
federal_open_market
committee
meeting
federal_reserve_bank
new_york
repurchase_agreement
residential_real_estate
commercial_real_estate
core_inflation
federal
open
euro_area
maximum_employment
federal_fund_rate
board_governor_federal_reserve
system
discount_rate


## Getting list of identified n-grams & frequency

In [6]:
# Score "united" followed by "kingdom" as a candidate phrase with the
# four-gram model; the empty list is the third argument (presumably the
# tokens lying between the two words -- confirm against the gensim docs).
models["fourgram"].score_candidate("united", "kingdom", [])

('united_kingdom', 1.4994320636210945)

Note: the score above is greater than 1, although an [NPMI score should be < 1](https://github.com/RaRe-Technologies/gensim/issues/3042) (see the linked gensim issue).

In [7]:
# Pool the phrases exported by all three models without duplicates. Sorting
# makes the order reproducible (set iteration order can differ between runs).
all_phrases = sorted(
    set(bigram_df["Phrase"]) | set(trigram_df["Phrase"]) | set(fourgram_df["Phrase"])
)

# Pad every phrase with the "_" token delimiter so that containment is tested
# on whole tokens only: "_rate_cut_" is not inside "_corporate_cut_", whereas
# a plain substring test would wrongly treat "rate_cut" as part of
# "corporate_cut".
delimited = [f"_{phrase}_" for phrase in all_phrases]

# Keep only the maximal phrases, i.e. those that are not a contiguous token
# sub-sequence of some other phrase, and split each into a tuple of tokens.
# The phrases are unique, so `padded != other` only excludes the phrase itself.
phrase_list = [
    tuple(phrase.split("_"))
    for phrase, padded in zip(all_phrases, delimited)
    if not any(padded != other and padded in other for other in delimited)
]

# Comprehension variables do not leak, so only the two helpers need removing
# (and this cannot fail when there are no phrases at all).
del all_phrases, delimited

In [8]:
# NOTE(review): nrows=1000 means only the first 1000 rows of n_grams.csv are
# consulted; any phrase not found in those rows gets a missing frequency.
# Confirm this limit is intended.
n_gram_freq_df = pd.read_csv("n_grams.csv", nrows=1000, usecols=["n_gram", "frequency"])

def get_freq(phrase_tuple):
    """Return the frequency recorded for ``phrase_tuple`` in ``n_gram_freq_df``.

    The ``n_gram`` column is compared against ``str(phrase_tuple)``.

    Returns:
        The frequency as an ``int`` when exactly one row matches, otherwise
        ``pd.NA`` (no match, or an ambiguous multiple match).
    """
    df_slice = n_gram_freq_df[n_gram_freq_df["n_gram"] == str(phrase_tuple)]
    if len(df_slice) != 1:
        return pd.NA
    # Pull the scalar out explicitly: calling int() directly on a one-element
    # Series is deprecated in pandas.
    return int(df_slice["frequency"].iloc[0])

# One row per phrase (tokens joined by spaces), most frequent first.
phrase_df = pd.DataFrame({
    "Phrase": [" ".join(phrase) for phrase in phrase_list],
    "Frequency": [get_freq(phrase) for phrase in phrase_list],
}).sort_values(by="Frequency", ascending=False)
# Replace the index with a 1-based rank following the sorted order.
phrase_df.index = range(1, len(phrase_df)+1)

del n_gram_freq_df, phrase_list

In [9]:
# Export the phrase/frequency table as a LaTeX longtable in TABLES_DIR.
# `label` is used both as the file stem and as the LaTeX label.
label = "phrase_frequency"
s = phrase_df.style
#s.hide(axis="index")
# Render frequencies with thousands separators and no decimals.
s.format("{:,.0f}".format, subset="Frequency")
s.to_latex(
    buf = os.path.join(TABLES_DIR, f"{label}.tex"),
    environment="longtable",
    label=label,
    hrules=True,
    # (full caption, short caption). Every LaTeX backslash is written as
    # "\\": the previous "\char" relied on an invalid escape sequence, which
    # Python warns about. The resulting string is unchanged.
    caption=(
        "Identified phrases using the collocation statistics of \\texttt{fomc\\char`_documents.csv}",
        "Identified phrases"
    )
)

del label, s