Topic Modeling with Gensim

https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/#17howtofindtheoptimalnumberoftopicsforlda

In [1]:
import glob
import random
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from gensim import corpora, models
from tqdm import tqdm
import numpy as np
import pandas as pd
import json

stemmer = SnowballStemmer('english')
def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

def preprocess(text):
    result = []
    for token in simple_preprocess(text, min_len = 4):
        if token not in STOPWORDS:
            result.append(lemmatize_stemming(token))
    return result

cat_list = sorted(glob.glob ("cuisines/*"))
cat_size = len(cat_list)

random.seed(0)
cat_names = []
cat_text = []
# sample_size = min(30, cat_size)
# cat_sample = sorted(random.sample(range(cat_size), sample_size))
cat_sample = range(0, cat_size)

count = 0
for i in cat_sample:
    cat_names.append(cat_list[i].replace("\\", "/").split('/')[-1][:-4].replace("_"," "))
    with open(cat_list[i]) as f:
        cat_text.append(f.read().replace("\n", "").replace("\r",""))

processed_docs = [preprocess(text) for text in tqdm(cat_text)]
dictionary = corpora.Dictionary(processed_docs)
print("Before prunn:%d"%(len(dictionary)))
dictionary.filter_extremes(no_below = 2, no_above = 0.5)
print("After prunn:%d"%(len(dictionary)))
corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

100%|██████████| 147/147 [29:22<00:00,  7.92s/it] 


Before prunn:193047
After prunn:100000


In [2]:
import math
import sklearn
from scipy import spatial

def cosine_similarity(a, b):
    b = dict(b)
    norm_a = 0
    norm_b = 0
    denom = 0
    for a_i, a_v in a:
        norm_a += a_v * a_v
        if a_i in b:
            denom += a_v * b[a_i]
    for b_i in b:
        norm_b += b[b_i] * b[b_i]
    
    norm_a = math.sqrt(norm_a)
    norm_b = math.sqrt(norm_b)
    
    return denom / (norm_a * norm_b)

def top_n(df, n, thresh_hold = 0.1):
    df_count = np.zeros(df.shape)
    df_bak = df
    df_count[df >= thresh_hold] = 1
    _counts = np.sum(df_count, axis=1)
    max_index = []
    for i in range(0, n):
        _index = np.argmax(_counts)
        max_index.append(_index)
        _counts[_index] = -1
    
    return df.iloc[max_index][df.columns[max_index]]

def slice_df_by_name(df,names):
    return df.loc[names][names]

def format_obj(df, groups):
    _nodes = "nodes"
    _links = "links"
    json_obj = {_nodes:[], _links:[]}
    sorted_names = []
    name2gid = dict()
    for g in range(0, len(groups)):
        for name in groups[g]:
            name2gid[name] = g
            if name in df.columns:
                sorted_names.append(name)
    
    df = slice_df_by_name(df, sorted_names)
    for c_name in df.columns:
        json_obj[_nodes].append({"name": c_name, "group":name2gid[c_name]})
    
    for i in range(0, df.shape[0] - 1):
        for j in range(i + 1, df.shape[0]):
            json_obj[_links].append({"source":i, "target":j, "value":float(df.iloc[i][j])})
    
    return json_obj

def corpus2matrix(corpus, vector_dimension):
    _corpus_matrix = np.zeros([len(corpus), vector_dimension])
    for i, row in enumerate(corpus):
        for j, v in row:
            _corpus_matrix[i][j] = v
    
    return _corpus_matrix
    
def corpus_similarity(corpus, vector_dimension, distance_func = sklearn.metrics.pairwise.cosine_similarity):
    _corpus_matrix = corpus2matrix(corpus, vector_dimension)
    #Normailzation
#     _corpus_matrix = Normalizer().transform(_corpus_matrix)    
    return distance_func(_corpus_matrix)


def corpus_similarity_1(corpus):
    _sim = np.zeros([len(corpus), len(corpus)])

    for i in tqdm(range(0, len(corpus) - 1)):
        _sim[i][i] = 1
        for j in range(i + 1, len(corpus)):
            _sim[i][j] = cosine_similarity(corpus[i], corpus[j])
            _sim[j][i] = _sim[i][j]
    
    return _sim

In [3]:
from sklearn.cluster import KMeans, Birch, DBSCAN
from sklearn.preprocessing import Normalizer
from sklearn import metrics

def kmean_predict(X, n_clusters):
    return KMeans(n_clusters = n_clusters).fit_predict(X)

def birch_predict(X, n_clusters):
    return Birch(compute_labels = True, n_clusters = n_clusters).fit_predict(X)

def dbscan_predict(X, n_clusters):
    return DBSCAN(eps=n_clusters * 0.01, min_samples=1).fit_predict(X)

cluster_method = {"kmean": kmean_predict,
                 "birch": birch_predict,
                 "dbscan": dbscan_predict}

def get_cluster(features_list, feature_dimension, names, num_cluster = -1, method = "kmean", verbose = False):
    if type(features_list) == np.ndarray:
        X = features_list
    else:
        X = corpus2matrix(features_list, feature_dimension)
    
    Norm_X = Normalizer().transform(X)
    
    if num_cluster < 0:
        best_score = -1
        best_k = -1
        for k in range(2, 100):
            y_pred = cluster_method[method](Norm_X, k)
            _score = metrics.silhouette_score(Norm_X, y_pred, metric='euclidean')
#             _score = metrics.calinski_harabasz_score(Norm_X, y_pred) 
            if verbose:
                print(_score)
            if _score > best_score:
                best_k = k
                best_score = _score
        if verbose:
            print("Best k:%d"%(best_k))
    else:
        best_k = num_cluster
        
    y_pred = cluster_method[method](Norm_X, best_k)
    clusters = dict()
    name2cluster = dict()
    for i in range(0, len(y_pred)):
        name2cluster[names[i]] = y_pred[i]
        if y_pred[i] in clusters:
            clusters[y_pred[i]].append(names[i])
        else:
            clusters[y_pred[i]] = [names[i]]

    return (clusters, name2cluster)

In [4]:
sim = corpus_similarity(corpus, len(dictionary))
sim_clusters, i = get_cluster(corpus, len(dictionary), cat_names, 10, method='birch')

sim_df = pd.DataFrame(sim)
sim_df.index = cat_names
sim_df.columns = cat_names

sim_df_50 = top_n(sim_df, 50)
selected_names = sim_df_50.columns

with open("display/output.json", "w") as f:
    f.write(json.dumps(format_obj(sim_df, sim_clusters)))

with open("display/output_50.json", "w") as f:
    f.write(json.dumps(format_obj(sim_df_50, sim_clusters)))

In [5]:
# import seaborn as sns; 
# import matplotlib.pyplot as plt

# sample = 20
# ax = sns.heatmap(data.iloc[0:sample][data.columns[0:sample]],cmap="YlGnBu", xticklabels=True, yticklabels=True)
# plt.show()

In [6]:
from gensim.models import TfidfModel
import json

tfidf_model = TfidfModel(corpus)
tfidf_corpus = tfidf_model[corpus]

tfidf_sim = corpus_similarity(tfidf_corpus, len(dictionary))
tfidf_sim_clusters, i = get_cluster(tfidf_corpus, len(dictionary), cat_names, 10, method='birch')

tfidf_sim_df = pd.DataFrame(tfidf_sim)
tfidf_sim_df.index = cat_names
tfidf_sim_df.columns = cat_names
#tfidf_sim_df_50 = top_n(tfidf_sim_df, 50)
tfidf_sim_df_50 = slice_df_by_name(tfidf_sim_df, selected_names)
                              
with open("display/tfidf_output.json", "w") as f:
    f.write(json.dumps(format_obj(tfidf_sim_df, tfidf_sim_clusters)))
with open("display/tfidf_output_50.json", "w") as f:
    f.write(json.dumps(format_obj(tfidf_sim_df_50, tfidf_sim_clusters)))

In [7]:
from time import time
from gensim.models.coherencemodel import CoherenceModel

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

num_topics = 100

t0 = time()
lda_model = models.LdaModel(tfidf_corpus, 
                            num_topics = num_topics, 
                            id2word = dictionary,
                            random_state = 100,
                            eval_every=5, 
                            alpha='auto', 
                            gamma_threshold=0.01)
# lda_model = models.LdaModel(tfidf_corpus, 
#                             num_topics = num_topics, 
#                             id2word = dictionary,
#                             random_state = 100,
#                             update_every = 1,
#                             chunksize = 100,
#                             passes = 10,
#                             alpha = 'auto')

doc_topics = lda_model[tfidf_corpus]
print("Training done in %fs" % (time() - t0))

# t0 = time()
# # Compute Perplexity
# print('\nPerplexity: ', lda_model.log_perplexity(tfidf_corpus))  # a measure of how good the model is. lower the better.
# # Compute Coherence Score
# coherence_model_lda = CoherenceModel(model = lda_model, texts = processed_docs, dictionary = dictionary, coherence = 'c_v')
# coherence_lda = coherence_model_lda.get_coherence()
# print('\nCoherence Score: ', coherence_lda)
# print("Evaluation done in %fs" % (time() - t0))

  perwordbound, np.exp2(-perwordbound), len(chunk), corpus_words


Training done in 24.541629s


In [8]:
import os

mallet_path = ".." + os.sep + "mallet-2.0.8"+ os.sep + "bin" + os.sep +"mallet"
t0 = time()
lda_mallet_model = models.wrappers.LdaMallet(mallet_path, 
                                             corpus = corpus, 
                                             num_topics = num_topics, 
                                             id2word = dictionary)
mallet_doc_topics = lda_mallet_model[corpus]
print("Training done in %fs" % (time() - t0))

# # Compute Coherence Score
# t0 = time()
# coherence_model_ldamallet = CoherenceModel(model = lda_mallet_model, texts = processed_docs, dictionary = dictionary, coherence='c_v')
# coherence_ldamallet = coherence_model_ldamallet.get_coherence()
# print('\nCoherence Score: ', coherence_ldamallet)
# print("Evaluation done in %fs" % (time() - t0))

Training done in 1238.064361s


In [9]:
lda_sim = corpus_similarity(doc_topics, num_topics)
lda_sim_clusters, i = get_cluster(doc_topics, num_topics, cat_names, 10, method='birch')

lda_sim_df = pd.DataFrame(lda_sim)
lda_sim_df.index = cat_names
lda_sim_df.columns = cat_names
# lda_sim_df_50 = top_n(lda_sim_df, 50)
lda_sim_df_50 = slice_df_by_name(lda_sim_df, selected_names)

with open("display/lda_output.json", "w") as f:
    f.write(json.dumps(format_obj(lda_sim_df, lda_sim_clusters)))
with open("display/lda_output_50.json", "w") as f:
    f.write(json.dumps(format_obj(lda_sim_df_50, lda_sim_clusters)))

In [10]:
lda_mallet_sim = corpus_similarity(mallet_doc_topics, num_topics)
lda_mallet_sim_clusters, i = get_cluster(mallet_doc_topics, num_topics, cat_names, 10, method='birch')

lda_mallet_sim_df = pd.DataFrame(lda_mallet_sim)
lda_mallet_sim_df.index = cat_names
lda_mallet_sim_df.columns = cat_names
# lda_mallet_sim_df_50 = top_n(lda_sim_df, 50)
lda_mallet_sim_df_50 = slice_df_by_name(lda_mallet_sim_df, selected_names)

with open("display/lda_mallet_output.json", "w") as f:
    f.write(json.dumps(format_obj(lda_mallet_sim_df, lda_mallet_sim_clusters)))
with open("display/lda_mallet_output_50.json", "w") as f:
    f.write(json.dumps(format_obj(lda_mallet_sim_df_50, lda_mallet_sim_clusters)))    

In [11]:
# import pyLDAvis
# import pyLDAvis.gensim

# pyLDAvis.enable_notebook()
# vis = pyLDAvis.gensim.prepare(lda_model, tfidf_corpus, dictionary)
# vis

In [12]:
# largest_coherence = -1e20
# best_k = 0
# best_model = None
# for k in range(5, 150, 2):
#     model = models.LdaModel(tfidf_corpus, num_topics = k, id2word=dictionary)
#     cm = models.coherencemodel.CoherenceModel(model=model, corpus=tfidf_corpus, coherence='u_mass')
#     coherence = cm.get_coherence()
#     print("k=%d coherence=%f"%(k, coherence))
#     if (coherence > largest_coherence):
#         largest_coherence = coherence
#         best_model = model
#         best_k = k

# print("best_k:%d"%(best_k))
# for idx, topic in best_model.print_topics(-1):
#     print('Topic: {} Words: {}'.format(idx, topic))

In [13]:
# names_file = "cuisine_indices.txt"
# matrix_file = "cuisine_sim_matrix.csv"

# with open (names_file, 'r') as f:
#     names = f.read().split("\n")

# demo_data = pd.read_csv(matrix_file, header=None)
# demo_data.index = names
# demo_data.columns = names

# with open("display/demo_output.json", "w") as f:
#     f.write(json.dumps(format_obj(demo_data, np.ones(demo_data.shape[0]))))

In [14]:
# path2reviewdump = "reviews/reviews.dat"

# with open(path2reviewdump, "r") as f:
#     reviews = f.readlines()
# review_docs = [preprocess(text) for text in tqdm(reviews)]
# review_dictionary = corpora.Dictionary(review_docs)
# print("Before prunn:%d"%(len(review_dictionary)))
# review_dictionary.filter_extremes(no_below=15, no_above = 0.5)
# print("After prunn:%d"%(len(review_dictionary)))
# review_corpus = [review_dictionary.doc2bow(doc) for doc in review_docs]

In [15]:
# from time import time

# t0 = time()
# review_model = models.LdaModel(review_corpus, num_topics=100, id2word=review_dictionary,  eval_every=5, alpha='auto', gamma_threshold=0.01)
# print("done in %fs" % (time() - t0))

# for idx, topic in review_model.print_topics(-1):
#     print('Topic: {} Words: {}'.format(idx, topic))

In [16]:
# def combine_topics(cat_topics):
#     topics = {}
#     for _sub_topics in cat_topics:
#         for _topic, _value in _sub_topics:
#             if _topic in topics:
#                 topics[_topic] += _value
#             else:
#                 topics[_topic] = _value
    
#     return topics

# all_topics = []
# cat_names = []
# for i in tqdm(range(0, len(cat_list))):
#     cat_names.append(cat_list[i].replace("\\", "/").split('/')[-1][:-4].replace("_"," "))
#     with open(cat_list[i]) as f:
#         cat_docs = [preprocess(text) for text in f.readlines()]
#         cat_corpus = [review_dictionary.doc2bow(doc) for doc in cat_docs]
#         cat_topics = review_model[cat_corpus]
#         all_topics.append(combine_topics(cat_topics))

In [17]:
# lda_individual_sim = corpus_similarity([[(k, topic[k]) for k in topic] for topic in all_topics], len(review_dictionary))

# lda_individual_sim_df = pd.DataFrame(lda_individual_sim)
# lda_individual_sim_df.index = cat_names
# lda_individual_sim_df.columns = cat_names
# lda_individual_data = top_n(lda_individual_sim_df, 50)

# with open("display/lda_ind_output.json", "w") as f:
#     f.write(json.dumps(format_obj(lda_individual_data, np.ones(lda_individual_data.shape[0]))))

In [18]:
def dump_clusters(cluster1, cluster2, title):
    children_name = 'children'
    name_name = 'name'
    value_name = 'value'
    color_name = 'color'    

    _out = {name_name: title, children_name:[]}

#     name2cluster1 = dict()
#     for _group_id in range(0, len(cluster1)):
#         for _name in cluster1[_group_id]:
#             name2cluster1[_name]= _group_id

    name2cluster2 = dict()            
    for _group_id in range(0, len(cluster2)):
        for _name in cluster2[_group_id]:
            name2cluster2[_name]= _group_id
                
    for _group_id in range(0, len(cluster1)):
        _out[children_name].append({name_name: str(_group_id), children_name:[]})
        for _name_id in range(0, len(cluster1[_group_id])):
            _out[children_name][_group_id][children_name].append({name_name: cluster1[_group_id][_name_id],
                                                                  "cluster1": _group_id,
                                                                  "cluster2": name2cluster2[cluster1[_group_id][_name_id]]})
        
    return _out

In [19]:
t0 = time()
num_cluster = 10
sim_clusters_kmean_small, i = get_cluster(mallet_doc_topics, num_topics, cat_names, num_cluster)
sim_clusters_kmean, i = get_cluster(mallet_doc_topics, num_topics, cat_names)
with open("display/cluster_kmean_output.json", "w") as f:
    f.write(json.dumps(dump_clusters(sim_clusters_kmean, sim_clusters_kmean_small, "Cuisine Clustering by KMean")))
print("Clustering done in %fs" % (time() - t0))
  
t0 = time()
sim_clusters_birch_small, i = get_cluster(mallet_doc_topics, num_topics, cat_names, num_cluster, method='birch')
sim_clusters_birch, i = get_cluster(mallet_doc_topics, num_topics, cat_names, method='birch')
with open("display/cluster_birch_output.json", "w") as f:
    f.write(json.dumps(dump_clusters(sim_clusters_birch, sim_clusters_birch_small, "Cuisine Clustering by Birch")))
print("Clustering done in %fs" % (time() - t0))

Clustering done in 19.013090s




Clustering done in 1.275685s




In [20]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

with open("rest_reviews.txt") as f:
    review_docs = f.readlines()

processed_review_docs = [preprocess(doc) for doc in tqdm(review_docs)]
d2v_docs = [TaggedDocument(doc, [i]) for i, doc in tqdm(enumerate(processed_review_docs))]

FileNotFoundError: [Errno 2] No such file or directory: 'rest_reviews.txt'

In [None]:
vector_len = 100
%time d2v_model = Doc2Vec(d2v_docs, vector_size=vector_len, workers=4)
d2v = np.array([d2v_model.infer_vector(doc) for doc in tqdm(processed_docs)])
d2v_sim = sklearn.metrics.pairwise.cosine_similarity(d2v)

In [None]:
d2v_sim_clusters, i = get_cluster(d2v, vector_len, cat_names, 10, method='birch')

d2v_sim_df = pd.DataFrame(d2v_sim)
d2v_sim_df.index = cat_names
d2v_sim_df.columns = cat_names  
# lda_mallet_sim_df_50 = top_n(lda_sim_df, 50)
d2v_sim_df_50 = slice_df_by_name(d2v_sim_df, selected_names)

with open("display/d2v_output.json", "w") as f:
    f.write(json.dumps(format_obj(d2v_sim_df, d2v_sim_clusters)))
with open("display/d2v_output_50.json", "w") as f:
    f.write(json.dumps(format_obj(d2v_sim_df_50, d2v_sim_clusters)))    