In [21]:
import numpy as np
import pandas as pd
import simplejson as sjson
import gensim
from gensim.parsing.preprocessing import STOPWORDS
from gensim.utils import simple_preprocess 
from gensim import corpora, models, similarities
from gensim.corpora import Dictionary  
import os

- **Inputs:** `fyear` (fiscal year), `n_topics`, `model_output_path`, `results_output_path`.
- **Output:** a tab-separated table with the columns `gvkey1`, `gvkey2`, `similarity`, keeping only pairs where `gvkey1 < gvkey2`.


In [4]:
def tokenize(text):
    """Return the tokens of ``text`` that are not stop words.

    The text is split with gensim's ``simple_preprocess``; every token that
    appears in gensim's ``STOPWORDS`` set is discarded. Token order is kept.
    """
    kept = []
    for word in simple_preprocess(text):
        if word in STOPWORDS:
            continue
        kept.append(word)
    return kept

In [30]:
def similarity(fyear, n_topics, model_output_path, results_output_path, random_state=None):
    """Fit an LDA model on one fiscal year of documents and write pairwise similarities.

    Reads ``../data/fyear2/<fyear>.txt`` (one JSON object per line providing
    the keys ``gvkey``, ``comp_name`` and ``item_1``), fits an
    ``LdaMulticore`` model on the tokenized ``item_1`` texts, saves the model
    to ``../<model_output_path>/lda_<fyear>_<n_topics>.model`` and writes the
    tab-separated file
    ``../<results_output_path>/similarity_<fyear>_<n_topics>.txt`` with the
    columns ``gvkey1``, ``gvkey2`` and ``similarity``.

    Only pairs with ``gvkey1 < gvkey2`` and a non-zero similarity are written.
    Rows are grouped by ``gvkey1`` in input order and, within each group,
    ordered by descending similarity.

    Parameters
    ----------
    fyear : int or str
        Fiscal year; selects the input file and is embedded in output names.
    n_topics : int
        Number of LDA topics.
    model_output_path : str
        Directory (relative to ``..``) that receives the saved LDA model.
    results_output_path : str
        Directory (relative to ``..``) that receives the similarity table.
    random_state : int or numpy.random.RandomState, optional
        Forwarded to ``LdaMulticore``. ``None`` (the default) keeps gensim's
        own default seeding.

    Returns
    -------
    None
        Results are written to disk.
    """
    # --- load data ---------------------------------------------------------
    file_path = '../data/fyear2/%s.txt' % fyear
    records = []
    with open(file_path) as f:
        for line in f:
            if not line.strip():
                continue  # tolerate blank lines (e.g. a trailing newline)
            entry = sjson.loads(line)
            records.append((entry['gvkey'], entry['comp_name'], entry['item_1']))
    data = pd.DataFrame(records, columns=['gvkey', 'comp_name', 'item1'])

    # --- dictionary and bag-of-words corpus ----------------------------------
    processed_docs = [tokenize(doc) for doc in data['item1']]
    dictionary = Dictionary(processed_docs)
    dictionary.filter_extremes(no_below=20, no_above=0.1)
    dictionary.compactify()
    corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

    # --- LDA model -----------------------------------------------------------
    lda_model = gensim.models.ldamulticore.LdaMulticore(
        corpus=corpus,
        num_topics=n_topics,
        id2word=dictionary,
        workers=4,
        passes=1,
        random_state=random_state,
    )
    lda_model.save(os.path.join('..', model_output_path, 'lda_%s_%s.model' % (fyear, n_topics)))

    # --- similarity index ----------------------------------------------------
    # Passing num_features explicitly spares gensim an extra scan of the
    # transformed corpus and guarantees one index column per topic.
    index = similarities.MatrixSimilarity(lda_model[corpus], num_features=n_topics)

    gvkeys = data['gvkey']
    gvkey_values = gvkeys.values
    frames = []
    # Walk documents by position so each one is queried with its own
    # bag-of-words. Re-using `corpus` keeps the query tokenization identical
    # to the one the dictionary and the model were built from.
    for gvkey1, bow in zip(gvkeys, corpus):
        sims = np.asarray(index[lda_model[bow]], dtype=float)
        keep = (gvkeys > gvkey1).values & (sims != 0)  # gvkey1 < gvkey2, drop zeros
        if not keep.any():
            continue
        kept_sims = sims[keep]
        order = np.argsort(-kept_sims, kind='stable')  # descending, ties keep input order
        frames.append(pd.DataFrame(
            {
                'gvkey1': gvkey1,
                'gvkey2': gvkey_values[keep][order],
                'similarity': kept_sims[order],
            },
            columns=['gvkey1', 'gvkey2', 'similarity'],
        ))

    if frames:
        df_similarity = pd.concat(frames, ignore_index=True)
    else:
        df_similarity = pd.DataFrame(columns=['gvkey1', 'gvkey2', 'similarity'])

    results_file = os.path.join(
        '..', results_output_path, "similarity_%s_%s.txt" % (fyear, n_topics)
    )
    df_similarity.to_csv(results_file, sep="\t", index=False)
    
    

In [31]:
# Run the pipeline for fiscal year 2014 with 20 topics; outputs go to ../model and ../result.
similarity(
    fyear=2014,
    n_topics=20,
    model_output_path='model',
    results_output_path='result',
)