# Topic Modeling -- NMF

In [54]:
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
import gensim
import time
import time

import nltk
import stanza
import gensim
import re

#topic modeling
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LinearRegression as lm

# viz
import matplotlib.collections as mcol
from matplotlib.legend_handler import HandlerLineCollection, HandlerTuple
from matplotlib.lines import Line2D



import seaborn as sns

import warnings
warnings.filterwarnings('ignore')


In [70]:
# Corpus used in this notebook: tokenized repository descriptions.
# Alternative inputs that were tried from the same directory:
#   - readme_lemma_try.pkl   (README lemmas; load with pd.read_pickle)
#   - tokenized_readmes.csv  (tokenized READMEs; load with pd.read_csv)
DESCRIPTION_CSV = "/home/zz3hs/git/dspg21oss/data/dspg21oss/tokenized_description.csv"
df = pd.read_csv(DESCRIPTION_CSV)

# Number of rows (documents) in the corpus; shown as the cell output.
n = df.shape[0]
n

In [56]:
# Preview the loaded frame (the notebook renders this bare expression as the cell output).
df

Unnamed: 0,slug,status,tokens
0,davidkpiano/flipping,Done,"flipping,awesome,animations"
1,dotpulse/Dotpulse.OutdatedBrowser,Done,"outdated,browser,neos,cms"
2,victronenergy/dbus-adc,Done,"bridge,venus,device,onboard,adc,dbus"
3,LODIFMO/lodGlossary,Done,"glossary,terms,ipm,deprecated"
4,jacobgumpert/MyArduinoCpp,Done,"simple,test,arduino,repositories"
...,...,...,...
53889,zekunyan/TTGSnackbar,Done,"ttgsnackbar,simple,message,action,button,botto..."
53890,JessYanCoding/MVPArt,Done,"android,mvp,architecture,此,框架,旨,在,解决,传统,mvp,类,..."
53891,cltl/multilingual-wiki-event-pipeline,Done,"project,aims,extract,information,incidents,typ..."
53892,mokha/verdd,Done,"veʹrdd,source,dictionary,editing,framework,foc..."


In [57]:
# Peek at one document: the "tokens" cell of the row labelled 1.
df.loc[1, "tokens"]

'outdated,browser,neos,cms'

In [58]:
# function slightly modified from https://nlpforhackers.io/topic-modeling/

def print_topics(topic_term_mat, vectorizer, top_n=10):
    """Print the ``top_n`` highest-weighted terms of every topic.

    Parameters
    ----------
    topic_term_mat : iterable of 1-D numpy arrays
        One row per topic (e.g. ``nmf_model.components_``).  Each row must
        support ``.argsort()`` and integer indexing.
    vectorizer : fitted vectorizer
        Object exposing ``get_feature_names_out()`` (scikit-learn >= 1.0) or
        the legacy ``get_feature_names()``.  Entry ``i`` of the returned
        vocabulary names column ``i`` of a topic row.
    top_n : int, default 10
        Number of terms printed per topic, strongest first.

    Returns
    -------
    None
        Everything is written to stdout: a ``Topic <idx>:`` header followed
        by one ``(term, weight)`` tuple per line.
    """
    # Look the vocabulary up once.  The call is loop-invariant, and the
    # original re-built the full vocabulary for every single printed term.
    # ``get_feature_names`` was deprecated in scikit-learn 1.0 and removed in
    # 1.2, so prefer the current accessor and fall back to the legacy one.
    get_names = getattr(vectorizer, "get_feature_names_out", None)
    if get_names is None:
        get_names = vectorizer.get_feature_names
    # Plain ``str`` entries keep the printed tuples free of numpy reprs.
    feature_names = [str(name) for name in get_names()]

    for idx, topic in enumerate(topic_term_mat):  # idx = topic number, topic = its row of weights
        print("\nTopic %d:" % (idx))
        # argsort is ascending, so slicing backwards from the end yields the
        # indices of the top_n largest weights, strongest first.
        for i in topic.argsort()[:-top_n - 1:-1]:
            print((feature_names[i], topic[i]))
            
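# Function to format topics as a list of strings: each topic becomes one ", "-joined string of its top words.
# Needed for the topic coherence function in Gensim (NOTE: that function works on per-topic word lists, so split each string on ", " before passing it on).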

# function modified from https://nlpforhackers.io/topic-modeling/

def str_topics(topic_term_mat, vectorizer, top_n=10):
    """Return every topic's top terms as one comma-separated string.

    Parameters
    ----------
    topic_term_mat : iterable of 1-D numpy arrays
        One row per topic (e.g. ``nmf_model.components_``).  Each row must
        support ``.argsort()``.
    vectorizer : fitted vectorizer
        Object exposing ``get_feature_names_out()`` (scikit-learn >= 1.0) or
        the legacy ``get_feature_names()``.  Entry ``i`` of the returned
        vocabulary names column ``i`` of a topic row.
    top_n : int, default 10
        How many words to list per topic, strongest first.  ``-1`` lists the
        whole vocabulary.

    Returns
    -------
    list of str
        One string per topic, with that topic's words joined by ``", "``.
    """
    # Look the vocabulary up once.  The call is loop-invariant, and the
    # original re-built the full vocabulary for every single word.
    # ``get_feature_names`` was deprecated in scikit-learn 1.0 and removed in
    # 1.2, so prefer the current accessor and fall back to the legacy one.
    get_names = getattr(vectorizer, "get_feature_names_out", None)
    if get_names is None:
        get_names = vectorizer.get_feature_names
    feature_names = [str(name) for name in get_names()]

    str_wds = []

    for topic in topic_term_mat:  # topic = one row of term weights
        order = topic.argsort()  # ascending by weight
        if top_n == -1:
            ranked = order[::-1]  # every term, strongest first
        else:
            ranked = order[:-top_n - 1:-1]  # only the top_n strongest terms
        str_wds.append(", ".join(feature_names[i] for i in ranked))

    return str_wds

In [59]:
# The scikit-learn models (LDA, NMF) are fed one string per document rather
# than a list of tokens; each "tokens" cell already is one comma-joined string.
abstract_tokens = df["tokens"]
text = list(abstract_tokens)

In [79]:
# Vocabulary pruning thresholds for the TF-IDF vectorizer.
a = 20   # min_df: keep a term only if it occurs in at least `a` repo descriptions
b = 0.6  # max_df: drop a term that occurs in more than 60% of the descriptions

# lowercase=False: the vectorizer does not lowercase the text itself.
tfidf_vectorizer = TfidfVectorizer(
    min_df=a,
    max_df=b,
    lowercase=False,
)

# Learn the vocabulary and build the TF-IDF feature matrix in one step.
tf_idf = tfidf_vectorizer.fit_transform(text)


In [80]:
# The feature matrix is (descriptions x vocabulary terms).
shape = tf_idf.shape
n_docs, n_terms = shape

print("There are", n_terms, "unique tokens in the corpus of", n_docs, "descriptions.")

There are 2155 unique tokens in the corpus of 53894 descriptions.


In [81]:
# create model

# Number of topics, passed to NMF as n_components.
num_topics = 20

# Wall-clock timestamps taken around the fit so its run time can be reported.
t1 = time.time()
nmf_model = NMF(n_components=num_topics, random_state = 1) #random_state seeds the random number generator (like set.seed in R) so repeated runs give the same factorization
doc_topic = nmf_model.fit_transform(tf_idf)  # document-topic weights: one row per description, one column per topic
t2 = time.time()
print(f"  Model run time: {t2-t1}")

# Topic-term weights (one row per topic); this is the matrix later handed to str_topics.
topic_term = nmf_model.components_

  Model run time: 2.5928163528442383


In [82]:
# Echo the fitted estimator so its hyperparameters are visible in the cell output.
nmf_model

NMF(n_components=20, random_state=1)

In [83]:
# Wrap the document-topic weights in a labelled frame: one row per document,
# one "Topic <k>" column per NMF component.
topic_labels = [f"Topic {k}" for k in range(num_topics)]
topic_frame = pd.DataFrame(doc_topic, columns=topic_labels)
topic_frame

Unnamed: 0,Topic 0,Topic 1,Topic 2,Topic 3,Topic 4,Topic 5,Topic 6,Topic 7,Topic 8,Topic 9,Topic 10,Topic 11,Topic 12,Topic 13,Topic 14,Topic 15,Topic 16,Topic 17,Topic 18,Topic 19
0,0.000000,0.000097,0.000246,0.000235,0.000167,0.000268,0.000106,0.000000,0.000098,0.000640,0.000118,0.000202,0.000053,0.000000,0.000386,0.000602,0.000844,0.000000,0.000000,0.000441
1,0.000010,0.000405,0.000000,0.001230,0.000622,0.000813,0.000000,0.000000,0.001283,0.002350,0.000000,0.000000,0.003082,0.000000,0.000014,0.007744,0.001975,0.001409,0.000000,0.000000
2,0.000000,0.000000,0.000021,0.000104,0.000048,0.000590,0.000000,0.000000,0.000499,0.000078,0.000827,0.000130,0.000264,0.000047,0.000279,0.000504,0.000373,0.000574,0.000343,0.000165
3,0.000000,0.000107,0.000672,0.000000,0.000122,0.000868,0.000432,0.000000,0.000046,0.000000,0.000344,0.000153,0.000504,0.000000,0.000000,0.000772,0.000343,0.001011,0.000650,0.001658
4,0.000000,0.000000,0.000274,0.000000,0.000000,0.000517,0.000000,0.000000,0.000561,0.000000,0.000000,0.000000,0.081833,0.000000,0.001719,0.000000,0.000000,0.000000,0.000000,0.001003
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53889,0.004034,0.000051,0.000215,0.000000,0.000115,0.002556,0.000000,0.000262,0.000642,0.002873,0.009198,0.002436,0.033315,0.000011,0.000326,0.001732,0.002645,0.005128,0.001419,0.000000
53890,0.002245,0.000000,0.000349,0.000032,0.000036,0.001293,0.000211,0.000284,0.000191,0.000735,0.011863,0.000000,0.000342,0.000000,0.000222,0.000125,0.000078,0.002843,0.001266,0.000000
53891,0.000000,0.000243,0.019797,0.000729,0.000021,0.001076,0.000246,0.000000,0.023677,0.000004,0.000565,0.000000,0.000000,0.000000,0.000647,0.000000,0.000352,0.000000,0.000000,0.000000
53892,0.000000,0.000472,0.000045,0.006141,0.000222,0.000964,0.000134,0.000000,0.000285,0.001674,0.002793,0.001681,0.015987,0.000686,0.000521,0.003910,0.000952,0.000128,0.002212,0.016463


In [84]:
# Summarise each topic as one comma-separated string of its 10 highest-weighted terms.
topic_words = str_topics(topic_term, tfidf_vectorizer, top_n=10)
topic_words

['com, google, exported, automatically, code, tools, support, frozenbubbleandroid, android, wordsearch',
 'website, official, com, personal, source, academic, public, hugo, static, portfolio',
 'project, final, udacity, driving, nanodegree, car, school, university, capstone, software',
 'web, application, based, framework, site, interface, service, page, development, applications',
 'blog, jekyll, theme, hugo, pages, system, gatsby, com, posts, based',
 'library, javascript, java, php, ruby, common, android, net, utility, parsing',
 'api, wrapper, rest, ruby, php, restful, sdk, interface, json, service',
 'de, para, la, en, sistema, proyecto, da, del, curso, el',
 'data, tool, analysis, tools, structures, science, based, system, files, package',
 'game, engine, based, jam, written, unity, multiplayer, board, html5, development',
 'app, android, ios, react, django, mobile, native, rails, demo, backend',
 'python, written, package, module, based, bindings, wrapper, interface, learning, s