In [1]:
import glob
import random
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from gensim import corpora, models
from tqdm import tqdm
import numpy as np

stemmer = SnowballStemmer('english')
def lemmatize_stemming(text):
    """Reduce a single token to its stem.

    The token is first lemmatized as a verb (``pos='v'``) with a
    ``WordNetLemmatizer``; the resulting lemma is then passed through the
    module-level English Snowball ``stemmer``.

    Args:
        text: one token (a string).

    Returns:
        The stemmed lemma of ``text``.
    """
    verb_lemma = WordNetLemmatizer().lemmatize(text, pos='v')
    return stemmer.stem(verb_lemma)

def preprocess(text):
    """Tokenize ``text`` and return its stemmed, stop-word-free tokens.

    Tokens are produced by gensim's ``simple_preprocess`` with ``min_len = 4``.
    Any token present in ``STOPWORDS`` is dropped; every remaining token is
    passed through ``lemmatize_stemming``. The order of the surviving tokens
    is preserved.

    Args:
        text: raw document text.

    Returns:
        A list of processed tokens.
    """
    return [
        lemmatize_stemming(word)
        for word in simple_preprocess(text, min_len = 4)
        if word not in STOPWORDS
    ]

# glob returns entries in arbitrary, OS-dependent order; sorting makes the
# document order (and therefore the row order of every matrix built from it)
# reproducible between runs and machines.
cat_list = sorted(glob.glob("cuisines/*"))
cat_size = len(cat_list)

random.seed(0)
cat_names = []
cat_text = []
# sample_size = min(30, cat_size)
# cat_sample = sorted(random.sample(range(cat_size), sample_size))
cat_sample = range(0, cat_size)

count = 0
for i in cat_sample:
    # Display name: the file name without its last four characters (this
    # assumes a three-letter extension such as ".txt"), underscores -> spaces.
    cat_names.append(cat_list[i].replace("\\", "/").split('/')[-1][:-4].replace("_"," "))
    with open(cat_list[i]) as f:
        # Line breaks are turned into spaces, not removed: deleting them glued
        # the last word of one line to the first word of the next and created
        # bogus merged tokens during preprocessing.
        cat_text.append(f.read().replace("\n", " ").replace("\r", " "))

# One token list per cuisine file.
processed_docs = [preprocess(text) for text in tqdm(cat_text)]
dictionary = corpora.Dictionary(processed_docs)
print("Before prunn:%d"%(len(dictionary)))
# Drop tokens that occur in fewer than 2 documents or in more than half of them.
dictionary.filter_extremes(no_below = 2, no_above = 0.5)
print("After prunn:%d"%(len(dictionary)))
# Sparse bag-of-words vectors: one list of (token_id, count) pairs per cuisine.
corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

  1%|▌                                                                                 | 1/163 [00:03<09:10,  3.40s/it]

KeyboardInterrupt: 

In [None]:
import math
from scipy import spatial

def cosine_similarity(a, b):
    """Cosine similarity between two sparse vectors.

    Args:
        a: iterable of ``(index, value)`` pairs.
        b: iterable of ``(index, value)`` pairs (anything ``dict()`` accepts).

    Returns:
        ``dot(a, b) / (|a| * |b|)``. When either vector is empty or has zero
        norm the similarity is undefined; ``0.0`` is returned in that case
        instead of raising ``ZeroDivisionError``.
    """
    b = dict(b)
    dot = 0
    sq_norm_a = 0
    for a_i, a_v in a:
        sq_norm_a += a_v * a_v
        # Only indices present in both vectors contribute to the dot product.
        if a_i in b:
            dot += a_v * b[a_i]
    sq_norm_b = 0
    for b_v in b.values():
        sq_norm_b += b_v * b_v

    if sq_norm_a == 0 or sq_norm_b == 0:
        return 0.0

    return dot / (math.sqrt(sq_norm_a) * math.sqrt(sq_norm_b))

def top_n(df, n, thresh_hold = 0.1):
    """Select the ``n`` best-connected rows of a square similarity frame.

    A row's score is the number of its entries that are ``>= thresh_hold``.
    The rows with the highest scores are kept (ties go to the row that comes
    first), and the same positions are kept for the columns, so the result
    is the corresponding square sub-matrix.

    Args:
        df: square ``pandas.DataFrame`` of similarities.
        n: number of rows/columns to keep. Values larger than the number of
            rows are clamped, so no row is ever returned twice.
        thresh_hold: minimum similarity for an entry to count.

    Returns:
        A DataFrame with at most ``n`` rows and columns, ordered by
        decreasing score.
    """
    n = max(0, min(n, df.shape[0]))
    scores = (df.values >= thresh_hold).sum(axis=1)
    # A stable sort on the negated scores gives the same order as repeatedly
    # taking argmax: highest score first, lowest position first among ties.
    keep = np.argsort(-scores, kind="stable")[:n]
    # Positional selection on both axes; label-based column selection would
    # misbehave if column labels were duplicated.
    return df.iloc[keep, keep]

def format_obj(df, groups):
    """Convert a square similarity frame into a node/link graph dictionary.

    Args:
        df: square ``pandas.DataFrame``; column labels become node names.
        groups: indexable sequence with one group value per column.

    Returns:
        ``{"nodes": [{"name", "group"}, ...],
        "links": [{"source", "target", "value"}, ...]}`` with one link per
        unordered pair ``i < j`` of row positions (upper triangle only).
    """
    def _native(value):
        # numpy scalars (e.g. int64, float32) are not all JSON serialisable;
        # plain Python numbers are.
        return value.item() if isinstance(value, np.generic) else value

    nodes = [
        {"name": name, "group": _native(groups[i])}
        for i, name in enumerate(df.columns)
    ]

    n_rows = df.shape[0]
    # ``iat`` is purely positional. The former ``df.iloc[i][j]`` did an
    # integer lookup on a label-indexed Series, which pandas has deprecated.
    links = [
        {"source": i, "target": j, "value": _native(df.iat[i, j])}
        for i in range(n_rows - 1)
        for j in range(i + 1, n_rows)
    ]

    return {"nodes": nodes, "links": links}

def corpus_similarity(corpus, vector_dimension):
    """Pairwise cosine similarity between the documents of a sparse corpus.

    Args:
        corpus: sized iterable of documents, each an iterable of
            ``(index, value)`` pairs with ``0 <= index < vector_dimension``.
        vector_dimension: length of the dense vector built for each document.

    Returns:
        A symmetric ``(len(corpus), len(corpus))`` numpy array of cosine
        similarities with ones on the diagonal. A document whose vector is
        all zeros has similarity ``0`` to every other document (rather than
        NaN).

    Raises:
        IndexError: if a document contains an index outside
            ``[0, vector_dimension)``.
    """
    n_docs = len(corpus)
    dense = np.zeros([n_docs, vector_dimension])
    for row, doc in enumerate(corpus):
        for col, value in doc:
            dense[row, col] = value

    # Scale every non-zero row to unit length; the matrix product of the
    # scaled rows is then the full cosine-similarity matrix in one step,
    # replacing the former Python loop over all document pairs.
    norms = np.linalg.norm(dense, axis=1)
    has_mass = norms > 0
    dense[has_mass] /= norms[has_mass][:, np.newaxis]

    # Clip to absorb floating-point overshoot just outside [-1, 1].
    sim = np.clip(np.dot(dense, dense.T), -1.0, 1.0)
    # Every document is defined to be fully similar to itself.
    np.fill_diagonal(sim, 1.0)
    return sim

def corpus_similarity_1(corpus):
    """Pairwise cosine similarity computed directly on sparse documents.

    Pure-Python counterpart of ``corpus_similarity`` that needs no dense
    matrix: each pair is scored with ``cosine_similarity``.

    Args:
        corpus: indexable, sized sequence of documents, each an iterable of
            ``(index, value)`` pairs.

    Returns:
        A symmetric ``(len(corpus), len(corpus))`` numpy array of cosine
        similarities with ones on the diagonal.
    """
    n_docs = len(corpus)
    # Start from the identity so every diagonal entry, including the last
    # one, is 1.
    _sim = np.eye(n_docs)

    for i in tqdm(range(0, n_docs - 1)):
        for j in range(i + 1, n_docs):
            _sim[i, j] = _sim[j, i] = cosine_similarity(corpus[i], corpus[j])

    # cosine_similarity already yields a similarity, so the matrix is returned
    # as is; the former ``1 - _sim`` turned it into a distance.
    return _sim

def slice_df_by_name(df,names):
    """Return the sub-frame of ``df`` restricted to ``names`` on both axes.

    Rows are selected by label first, then the same labels are selected as
    columns; the result follows the order of ``names``.
    """
    selected_rows = df.loc[names]
    return selected_rows[names]

In [None]:
import numpy as np
import pandas as pd

# sim = np.zeros([len(corpus), len(corpus)])

# for i in tqdm(range(0, len(corpus) - 1)):
#     sim[i][i] = 1
#     for j in range(i + 1, len(corpus)):
#         sim[i][j] = cosine_similarity(corpus[i], corpus[j])
#         sim[j][i] = sim[i][j]
        
# Cosine similarity between the raw bag-of-words vectors of all cuisines.
sim = corpus_similarity(corpus, len(dictionary))

# Label both axes with the cuisine names.
sim_df = pd.DataFrame(sim, index=cat_names, columns=cat_names)
# Keep the 50 cuisines with the most similarities at or above the default threshold.
data = top_n(sim_df, 50)
# Remembered so that later similarity matrices can be cut to the same cuisines.
selected_names = data.columns

In [None]:
import json

# Dump the bag-of-words graph; every node is placed in the same group (1.0).
with open("display/output.json", "w") as f:
    json.dump(format_obj(data, np.ones(len(data))), f)

In [None]:
# import seaborn as sns; 
# import matplotlib.pyplot as plt

# sample = 20
# ax = sns.heatmap(data.iloc[0:sample][data.columns[0:sample]],cmap="YlGnBu", xticklabels=True, yticklabels=True)
# plt.show()

In [None]:
from gensim.models import TfidfModel

# Fit TF-IDF weights on the bag-of-words corpus and apply them to it.
tfidf_model = TfidfModel(corpus)
tfidf_corpus = tfidf_model[corpus]

# Cosine similarity between the TF-IDF vectors of all cuisines.
tfidf_sim = corpus_similarity(tfidf_corpus, len(dictionary))
# np.zeros([len(tfidf_corpus), len(tfidf_corpus)])

# for i in tqdm(range(0, len(tfidf_corpus) - 1)):
#     tfidf_sim[i][i] = 1
#     for j in range(i + 1, len(tfidf_corpus)):
#         tfidf_sim[i][j] = cosine_similarity(tfidf_corpus[i], tfidf_corpus[j])
#         tfidf_sim[j][i] = tfidf_sim[i][j]

tfidf_sim_df = pd.DataFrame(tfidf_sim, index=cat_names, columns=cat_names)
# tfidf_data = top_n(tfidf_sim_df, 50)
# Restrict to the cuisines chosen from the bag-of-words matrix so that the
# two graphs show the same nodes.
tfidf_data = slice_df_by_name(tfidf_sim_df, selected_names)

In [None]:
import json

# Dump the TF-IDF graph; every node is placed in the same group (1.0).
with open("display/tfidf_output.json", "w") as f:
    json.dump(format_obj(tfidf_data, np.ones(len(tfidf_data))), f)

In [None]:
from time import time

t0 = time()
# random_state pins the model's random initialisation so that re-running the
# notebook produces the same topics and the same similarity graph downstream
# (0 matches the seed used for `random` at the top of the notebook).
lda_model = models.LdaModel(
    tfidf_corpus,
    num_topics = 100,
    id2word=dictionary,
    eval_every=5,
    alpha='auto',
    gamma_threshold=0.01,
    random_state=0,
)
# Per-document topic mixture as (topic_id, probability) pairs.
doc_topics = lda_model.get_document_topics(tfidf_corpus)
print("done in %fs" % (time() - t0))

In [None]:
# doc_topics holds (topic_id, probability) pairs, so the dense vectors must be
# as long as the number of topics. Sizing them by the vocabulary only worked
# while the dictionary happened to have at least that many entries, and it
# allocated needless all-zero columns.
lda_sim = corpus_similarity(doc_topics, lda_model.num_topics)

lda_sim_df = pd.DataFrame(lda_sim)
lda_sim_df.index = cat_names
lda_sim_df.columns = cat_names
# lda_data = top_n(lda_sim_df, 50)
# Same cuisines as the bag-of-words graph, for a like-for-like comparison.
lda_data = slice_df_by_name(lda_sim_df, selected_names)

with open("display/lda_output.json", "w") as f:
    f.write(json.dumps(format_obj(lda_data, np.ones(lda_data.shape[0]))))

In [None]:
# largest_coherence = -1e20
# best_k = 0
# best_model = None
# for k in range(5, 150, 2):
#     model = models.LdaModel(tfidf_corpus, num_topics = k, id2word=dictionary)
#     cm = models.coherencemodel.CoherenceModel(model=model, corpus=tfidf_corpus, coherence='u_mass')
#     coherence = cm.get_coherence()
#     print("k=%d coherence=%f"%(k, coherence))
#     if (coherence > largest_coherence):
#         largest_coherence = coherence
#         best_model = model
#         best_k = k

# print("best_k:%d"%(best_k))
# for idx, topic in best_model.print_topics(-1):
#     print('Topic: {} Words: {}'.format(idx, topic))

In [None]:
names_file = "cuisine_indices.txt"
matrix_file = "cuisine_sim_matrix.csv"

# One label per line. splitlines() instead of split("\n"): a newline at the
# end of the file must not produce an extra empty label, which would make the
# index assignment below fail with a length mismatch; it also copes with
# "\r\n" line endings.
with open (names_file, 'r') as f:
    names = f.read().splitlines()

# The CSV has no header row; rows and columns are labelled from the names file.
demo_data = pd.read_csv(matrix_file, header=None)
demo_data.index = names
demo_data.columns = names

with open("display/demo_output.json", "w") as f:
    f.write(json.dumps(format_obj(demo_data, np.ones(demo_data.shape[0]))))

In [None]:
path2reviewdump = "reviews/reviews.dat"

# Each line of the dump is treated as one document.
with open(path2reviewdump, "r") as f:
    reviews = list(f)

review_docs = [preprocess(text) for text in tqdm(reviews)]
review_dictionary = corpora.Dictionary(review_docs)
print("Before prunn:%d" % len(review_dictionary))
# Drop tokens seen in fewer than 15 documents or in more than half of them.
review_dictionary.filter_extremes(no_below=15, no_above = 0.5)
print("After prunn:%d" % len(review_dictionary))
# Sparse bag-of-words vector for every review.
review_corpus = [review_dictionary.doc2bow(tokens) for tokens in review_docs]

In [None]:
from time import time

t0 = time()
# random_state pins the model's random initialisation so that re-running the
# notebook yields the same topics, and hence the same per-cuisine topic
# vectors computed from this model later on.
review_model = models.LdaModel(
    review_corpus,
    num_topics=100,
    id2word=review_dictionary,
    eval_every=5,
    alpha='auto',
    gamma_threshold=0.01,
    random_state=0,
)
print("done in %fs" % (time() - t0))

# Show every topic (-1 = no limit on the number of topics printed).
for idx, topic in review_model.print_topics(-1):
    print('Topic: {} Words: {}'.format(idx, topic))

In [None]:
def combine_topics(cat_topics):
    """Sum topic weights over several documents.

    Args:
        cat_topics: iterable of documents, each an iterable of
            ``(topic_id, weight)`` pairs.

    Returns:
        A dict mapping every topic id that occurs to the sum of its weights
        across all documents, in order of first occurrence.
    """
    totals = {}
    for doc_topics in cat_topics:
        for topic_id, weight in doc_topics:
            try:
                totals[topic_id] += weight
            except KeyError:
                # First time this topic is seen.
                totals[topic_id] = weight

    return totals

all_topics = []
cat_names = []
for i in tqdm(range(len(cat_list))):
    # Display name: file name without its last four characters, underscores -> spaces.
    file_name = cat_list[i].replace("\\", "/").split('/')[-1]
    cat_names.append(file_name[:-4].replace("_"," "))
    # Every line of a cuisine file is handled as a separate document.
    with open(cat_list[i]) as f:
        cat_docs = [preprocess(text) for text in f]
    cat_corpus = [review_dictionary.doc2bow(doc) for doc in cat_docs]
    # Topic mixture of each document under the review-level LDA model,
    # summed into a single topic-weight dict for the cuisine.
    cat_topics = review_model[cat_corpus]
    all_topics.append(combine_topics(cat_topics))

In [None]:
# all_topics maps topic_id -> summed weight for each cuisine, so the dense
# vectors need one slot per topic of review_model. Sizing them by the review
# vocabulary only worked while the dictionary happened to have at least that
# many entries, and it allocated needless all-zero columns.
lda_individual_sim = corpus_similarity(
    [list(topic.items()) for topic in all_topics],
    review_model.num_topics,
)

lda_individual_sim_df = pd.DataFrame(lda_individual_sim)
lda_individual_sim_df.index = cat_names
lda_individual_sim_df.columns = cat_names
lda_individual_data = top_n(lda_individual_sim_df, 50)

with open("display/lda_ind_output.json", "w") as f:
    f.write(json.dumps(format_obj(lda_individual_data, np.ones(lda_individual_data.shape[0]))))