In [1]:
import glob
import random
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from gensim import corpora, models
from tqdm import tqdm
import numpy as np

stemmer = SnowballStemmer('english')
def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

def preprocess(text):
    result = []
    for token in simple_preprocess(text, min_len = 4):
        if token not in STOPWORDS:
            result.append(lemmatize_stemming(token))
    return result

cat_list = glob.glob ("cuisines/*")
cat_size = len(cat_list)

random.seed(0)
cat_names = []
cat_text = []
# sample_size = min(30, cat_size)
# cat_sample = sorted(random.sample(range(cat_size), sample_size))
cat_sample = range(0, cat_size)

count = 0
for i in cat_sample:
    cat_names.append(cat_list[i].replace("\\", "/").split('/')[-1][:-4].replace("_"," "))
    with open(cat_list[i]) as f:
        cat_text.append(f.read().replace("\n", "").replace("\r",""))

processed_docs = [preprocess(text) for text in tqdm(cat_text)]
dictionary = corpora.Dictionary(processed_docs)
print("Before prunn:%d"%(len(dictionary)))
dictionary.filter_extremes(no_below = 2, no_above = 0.5)
print("After prunn:%d"%(len(dictionary)))
corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 150/150 [25:27<00:00,  8.38s/it]


Before prunn:193061
After prunn:100000


In [2]:
import math

def cosine_similarity(a, b):
    b = dict(b)
    norm_a = 0
    norm_b = 0
    denom = 0
    for a_i, a_v in a:
        norm_a += a_v * a_v
        if a_i in b:
            denom += a_v * b[a_i]
    for b_i in b:
        norm_b += b[b_i] * b[b_i]
    
    norm_a = math.sqrt(norm_a)
    norm_b = math.sqrt(norm_b)
#     print(norm_a)
#     print(norm_b)
#     print(denom)
    
    return denom / (norm_a * norm_b)

def top_n(df, n, thresh_hold = 0.1):
    df_count = np.zeros(df.shape)
    df_bak = df
    df_count[df >= thresh_hold] = 1
    _counts = np.sum(df_count, axis=1)
    max_index = []
    for i in range(0, n):
        _index = np.argmax(_counts)
        max_index.append(_index)
        _counts[_index] = -1
    
    return df.iloc[max_index][df.columns[max_index]]

def format_obj(df, groups):
    _nodes = "nodes"
    _links = "links"
    json_obj = {_nodes:[], _links:[]}
    for i in range(0, len(df.columns)):
        json_obj[_nodes].append({"name":df.columns[i], "group":groups[i]})
    
    for i in range(0, df.shape[0] - 1):
        for j in range(i + 1, df.shape[0]):
            json_obj[_links].append({"source":i, "target":j, "value":df.iloc[i][j]})
    
    return json_obj

def corpus_similarity(corpus):
    _sim = np.zeros([len(corpus), len(corpus)])

    for i in tqdm(range(0, len(corpus) - 1)):
        _sim[i][i] = 1
        for j in range(i + 1, len(corpus)):
            _sim[i][j] = cosine_similarity(corpus[i], corpus[j])
            _sim[j][i] = _sim[i][j]
    
    return _sim

In [3]:
import numpy as np
import pandas as pd

# sim = np.zeros([len(corpus), len(corpus)])

# for i in tqdm(range(0, len(corpus) - 1)):
#     sim[i][i] = 1
#     for j in range(i + 1, len(corpus)):
#         sim[i][j] = cosine_similarity(corpus[i], corpus[j])
#         sim[j][i] = sim[i][j]
        
sim = corpus_similarity(corpus)

sim_df = pd.DataFrame(sim)
sim_df.index = cat_names
sim_df.columns = cat_names
data = top_n(sim_df, 50)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 149/149 [00:27<00:00,  5.33it/s]


In [4]:
import json

with open("display/output.json", "w") as f:
    f.write(json.dumps(format_obj(data, np.ones(data.shape[0]))))

In [5]:
import seaborn as sns; 
import matplotlib.pyplot as plt

sample = 20
ax = sns.heatmap(data.iloc[0:sample][data.columns[0:sample]],cmap="YlGnBu", xticklabels=True, yticklabels=True)
plt.show()

<Figure size 640x480 with 2 Axes>

In [6]:
from gensim.models import TfidfModel

model = TfidfModel(corpus)
tfidf_corpus = model[corpus]

tfidf_sim = np.zeros([len(tfidf_corpus), len(tfidf_corpus)])

for i in tqdm(range(0, len(tfidf_corpus) - 1)):
    tfidf_sim[i][i] = 1
    for j in range(i + 1, len(tfidf_corpus)):
        tfidf_sim[i][j] = cosine_similarity(tfidf_corpus[i], tfidf_corpus[j])
        tfidf_sim[j][i] = tfidf_sim[i][j]
        

tfidf_sim_df = pd.DataFrame(tfidf_sim)
tfidf_sim_df.index = cat_names
tfidf_sim_df.columns = cat_names
tfidf_data = top_n(tfidf_sim_df, 50)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 149/149 [11:20<00:00,  4.57s/it]


In [7]:
import json

with open("display/tfidf_output.json", "w") as f:
    f.write(json.dumps(format_obj(tfidf_data, np.ones(tfidf_data.shape[0]))))

In [8]:
from time import time

t0 = time()
lda_model = models.LdaModel(tfidf_corpus, num_topics = 100, id2word=dictionary,  eval_every=5, alpha='auto', gamma_threshold=0.01)
doc_topics = lda_model.get_document_topics(tfidf_corpus)
print("done in %fs" % (time() - t0))

  perwordbound, np.exp2(-perwordbound), len(chunk), corpus_words


done in 21.213967s


In [9]:
lda_sim = corpus_similarity(doc_topics)

lda_sim_df = pd.DataFrame(lda_sim)
lda_sim_df.index = cat_names
lda_sim_df.columns = cat_names
lda_data = top_n(lda_sim_df, 50)

with open("display/lda_output.json", "w") as f:
    f.write(json.dumps(format_obj(lda_data, np.ones(lda_data.shape[0]))))

100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 149/149 [15:10<00:00,  6.11s/it]


In [15]:
lda_model.show_topics(-1)`

[(29,
  '0.000*"prompf" + 0.000*"prooo" + 0.000*"prohibitivli" + 0.000*"prokkimchi" + 0.000*"promiscu" + 0.000*"professori" + 0.000*"proviso" + 0.000*"proteinsi" + 0.000*"probobl" + 0.000*"prosinexpens"'),
 (32,
  '0.000*"prompf" + 0.000*"prooo" + 0.000*"prohibitivli" + 0.000*"prokkimchi" + 0.000*"promiscu" + 0.000*"professori" + 0.000*"proviso" + 0.000*"proteinsi" + 0.000*"probobl" + 0.000*"prosinexpens"'),
 (55,
  '0.000*"bellagio" + 0.000*"kobe" + 0.000*"mastro" + 0.000*"wellington" + 0.000*"fogo" + 0.000*"wagyu" + 0.000*"brazilian" + 0.000*"delmonico" + 0.000*"porterhous" + 0.000*"outback"'),
 (87,
  '0.000*"rula" + 0.000*"casey" + 0.000*"ireland" + 0.000*"boxti" + 0.000*"guin" + 0.000*"irishmen" + 0.000*"shepherd" + 0.000*"smithwick" + 0.000*"mcmullan" + 0.000*"fibber"'),
 (8,
  '0.001*"firefli" + 0.000*"paella" + 0.000*"julian" + 0.000*"serrano" + 0.000*"manchego" + 0.000*"brava" + 0.000*"cevich" + 0.000*"croqueta" + 0.000*"empanada" + 0.000*"patata"'),
 (18,
  '0.000*"havana" + 

In [None]:
largest_coherence = -1e20
best_k = 0
best_model = None
for k in range(5, 150, 2):
    model = models.LdaModel(tfidf_corpus, num_topics = k, id2word=dictionary)
    cm = models.coherencemodel.CoherenceModel(model=model, corpus=tfidf_corpus, coherence='u_mass')
    coherence = cm.get_coherence()
    print("k=%d coherence=%f"%(k, coherence))
    if (coherence > largest_coherence):
        largest_coherence = coherence
        best_model = model
        best_k = k

print("best_k:%d"%(best_k))
for idx, topic in best_model.print_topics(-1):
    print('Topic: {} Words: {}'.format(idx, topic))

k=5 coherence=-1.498333
k=7 coherence=-2.210039
k=9 coherence=-1.753167
k=11 coherence=-1.878003
k=13 coherence=-2.161507
k=15 coherence=-2.158024
k=17 coherence=-2.020575
k=19 coherence=-1.971740
k=21 coherence=-1.714946
k=23 coherence=-2.001825
k=25 coherence=-1.957821
k=27 coherence=-1.612905
k=29 coherence=-1.732868
k=31 coherence=-1.810027
k=33 coherence=-1.767502
k=35 coherence=-1.652618
k=37 coherence=-1.887585
k=39 coherence=-1.662607
k=41 coherence=-1.734076
k=43 coherence=-1.682676
k=45 coherence=-1.786588
k=47 coherence=-1.781682
k=49 coherence=-1.677419
