In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook as tqdm
from tqdm._tqdm_notebook import tqdm_notebook
tqdm_notebook.pandas(desc='apply')
import dill
import os
import sys

from gensim.models import LsiModel
from gensim.models.coherencemodel import CoherenceModel
import matplotlib.pyplot as plt

In [7]:
def iter_create_gensim_lsa_model(files, number_of_topics, dictionary, save=False):
    """
    Input  : files            - iterable of feather file paths, each with a 'title' column
             number_of_topics - number of LSA topics (passed to LsiModel as num_topics)
             dictionary       - token/id mapping; used as id2word and to build bag-of-words
                                vectors through dictionary.doc2bow
             save             - if True, dill-dump the trained model to 'transforms/lsamodel.pkd'
    Purpose: create LSA model using gensim, trained incrementally one file at a time
    Output : return LSA model
    """
    # Start from an empty model so documents can be fed in file by file.
    lsamodel = LsiModel(corpus=None, num_topics=number_of_topics, id2word=dictionary)
    for path in tqdm(files, desc='feathers'):
        # Only the 'title' column is used, so do not load the others.
        df = pd.read_feather(path, columns=['title'])
        titles = df['title'].str.split()
        doc_term_matrix = [dictionary.doc2bow(text) for text in titles]
        # Inner progress bar tracks documents of the current file.
        lsamodel.add_documents(tqdm(doc_term_matrix, desc='model'))
    if save:
        # Binary write mode is required: the default mode is text read, which
        # makes dill.dump fail only after the whole training loop has finished.
        with open('transforms/lsamodel.pkd', 'wb') as out:
            dill.dump(lsamodel, out)
    return lsamodel

def create_gensim_lsa_model(doc_term_matrix, number_of_topics, dictionary, save=False):
    """
    Input  : doc_term_matrix  - corpus handed to LsiModel as `corpus`
             number_of_topics - handed to LsiModel as `num_topics`
             dictionary       - handed to LsiModel as `id2word`
             save             - if True, dill-dump the model to 'transforms/lsamodel.pkd'
    Purpose: create LSA model using gensim from a corpus that is already built
    Output : return LSA model
    """
    model = LsiModel(
        corpus=doc_term_matrix,
        num_topics=number_of_topics,
        id2word=dictionary,
    )
    # Nothing to persist: hand the model straight back.
    if not save:
        return model
    with open('transforms/lsamodel.pkd', 'wb') as sink:
        dill.dump(model, sink)
    return model

In [8]:
%%time

# Build the list of feather shard paths to train on.
# NOTE(review): the shard count is taken from len(os.listdir(folder)), so this
# assumes the folder holds exactly data_0.feather .. data_{N-1}.feather and
# nothing else; any extra or missing file yields a path that does not exist.
folder = 'cleaned_cache'
files = [os.path.join(folder, 'data_{}.feather'.format(i)) for i in range(len(os.listdir(folder)))]

# Load the dill-pickled dictionary produced elsewhere.
# NOTE(review): dill.load can execute arbitrary code; only load trusted files.
with open('transforms/dictionary.pkd', 'rb') as file:
    d = dill.load(file)
# Prune the vocabulary in place (keep_n=5000).
# NOTE(review): presumably a gensim Dictionary, in which case the default
# no_below / no_above thresholds of filter_extremes also apply — confirm.
d.filter_extremes(keep_n=5000)

# Train incrementally over all shards. `save` is left at its default (False),
# so this call does not write 'transforms/lsamodel.pkd'.
lsamodel = iter_create_gensim_lsa_model(files, number_of_topics=50, dictionary=d)

HBox(children=(IntProgress(value=0, description='feathers', max=41, style=ProgressStyle(description_width='ini…

  labels, = index.labels


HBox(children=(IntProgress(value=0, description='cleaned_cache/data_0.feather text', max=2683916, style=Progre…

HBox(children=(IntProgress(value=0, description='cleaned_cache/data_1.feather text', max=2581474, style=Progre…

HBox(children=(IntProgress(value=0, description='cleaned_cache/data_2.feather text', max=1494096, style=Progre…

HBox(children=(IntProgress(value=0, description='cleaned_cache/data_3.feather text', max=2920704, style=Progre…

HBox(children=(IntProgress(value=0, description='cleaned_cache/data_4.feather text', max=2829413, style=Progre…

HBox(children=(IntProgress(value=0, description='cleaned_cache/data_5.feather text', max=335030, style=Progres…

HBox(children=(IntProgress(value=0, description='cleaned_cache/data_6.feather text', max=1637789, style=Progre…

HBox(children=(IntProgress(value=0, description='cleaned_cache/data_7.feather text', max=2774958, style=Progre…

HBox(children=(IntProgress(value=0, description='cleaned_cache/data_8.feather text', max=1710598, style=Progre…

HBox(children=(IntProgress(value=0, description='cleaned_cache/data_9.feather text', max=2867479, style=Progre…

HBox(children=(IntProgress(value=0, description='cleaned_cache/data_10.feather text', max=2581474, style=Progr…

HBox(children=(IntProgress(value=0, description='cleaned_cache/data_11.feather text', max=2667549, style=Progr…

HBox(children=(IntProgress(value=0, description='cleaned_cache/data_12.feather text', max=1714149, style=Progr…

HBox(children=(IntProgress(value=0, description='cleaned_cache/data_13.feather text', max=2567327, style=Progr…

HBox(children=(IntProgress(value=0, description='cleaned_cache/data_14.feather text', max=2645516, style=Progr…

HBox(children=(IntProgress(value=0, description='cleaned_cache/data_15.feather text', max=2181246, style=Progr…

HBox(children=(IntProgress(value=0, description='cleaned_cache/data_16.feather text', max=1601517, style=Progr…

HBox(children=(IntProgress(value=0, description='cleaned_cache/data_17.feather text', max=1487397, style=Progr…

HBox(children=(IntProgress(value=0, description='cleaned_cache/data_18.feather text', max=335030, style=Progre…

HBox(children=(IntProgress(value=0, description='cleaned_cache/data_19.feather text', max=2567327, style=Progr…

HBox(children=(IntProgress(value=0, description='cleaned_cache/data_20.feather text', max=1361631, style=Progr…

HBox(children=(IntProgress(value=0, description='cleaned_cache/data_21.feather text', max=2383862, style=Progr…

HBox(children=(IntProgress(value=0, description='cleaned_cache/data_22.feather text', max=2771874, style=Progr…

HBox(children=(IntProgress(value=0, description='cleaned_cache/data_23.feather text', max=2388760, style=Progr…

HBox(children=(IntProgress(value=0, description='cleaned_cache/data_24.feather text', max=879895, style=Progre…

HBox(children=(IntProgress(value=0, description='cleaned_cache/data_25.feather text', max=2197294, style=Progr…

HBox(children=(IntProgress(value=0, description='cleaned_cache/data_26.feather text', max=2375352, style=Progr…

HBox(children=(IntProgress(value=0, description='cleaned_cache/data_27.feather text', max=2683916, style=Progr…

HBox(children=(IntProgress(value=0, description='cleaned_cache/data_28.feather text', max=2555366, style=Progr…

HBox(children=(IntProgress(value=0, description='cleaned_cache/data_29.feather text', max=2555649, style=Progr…

HBox(children=(IntProgress(value=0, description='cleaned_cache/data_30.feather text', max=727684, style=Progre…

HBox(children=(IntProgress(value=0, description='cleaned_cache/data_31.feather text', max=373657, style=Progre…

HBox(children=(IntProgress(value=0, description='cleaned_cache/data_32.feather text', max=2555366, style=Progr…

HBox(children=(IntProgress(value=0, description='cleaned_cache/data_33.feather text', max=1710598, style=Progr…

HBox(children=(IntProgress(value=0, description='cleaned_cache/data_34.feather text', max=2503773, style=Progr…

HBox(children=(IntProgress(value=0, description='cleaned_cache/data_35.feather text', max=1077185, style=Progr…

HBox(children=(IntProgress(value=0, description='cleaned_cache/data_36.feather text', max=2645286, style=Progr…

HBox(children=(IntProgress(value=0, description='cleaned_cache/data_37.feather text', max=2632243, style=Progr…

HBox(children=(IntProgress(value=0, description='cleaned_cache/data_38.feather text', max=1098258, style=Progr…

HBox(children=(IntProgress(value=0, description='cleaned_cache/data_39.feather text', max=2602043, style=Progr…

HBox(children=(IntProgress(value=0, description='cleaned_cache/data_40.feather text', max=2375352, style=Progr…

In [12]:
# Reload a dill-pickled model from disk into `test`.
# NOTE(review): the training call in this notebook uses save=False, so this
# file must have been written by some other run — confirm it is current.
# NOTE(review): dill.load can execute arbitrary code; only load trusted files.
with open('transforms/lsamodel.pkd', 'rb') as file:
    test = dill.load(file)

In [None]:
# Export every LSA topic as (num, keyword, weight) rows to a feather file.
#
# Keywords are resolved through the model's own id2word mapping instead of a
# notebook-level `dictionary` variable, which is not defined in this notebook
# (the loaded dictionary is named `d`) and raised NameError on a fresh kernel.
topics = []
for i, topic in enumerate(lsamodel.get_topics()):
    # topic[term_id] is the weight of term `term_id`, so look each keyword up
    # by id rather than relying on the iteration order of the dictionary,
    # which is not guaranteed to follow term ids.
    keywords = [lsamodel.id2word[term_id] for term_id in range(len(topic))]
    df = pd.DataFrame({'keyword': keywords, 'weight': topic, 'num': [i] * len(topic)})
    # Strongest positive weights first within each topic.
    df = df.sort_values('weight', ascending=False)
    topics.append(df)

topics = pd.concat(topics)
topics.set_index(['num', 'keyword'], inplace=True)
# Feather cannot store a custom index, so write a flat copy.
topics.reset_index().to_feather('transforms/topics.feather')