### Apply LDA and NMF to the original articles that contain only nouns

In [33]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
import pandas as pd
import os
import re

# Accumulator for every topic found across all years and both models;
# starts empty with one column per recorded attribute.
df_Clusters = pd.DataFrame(columns=['Year', 'Model', 'Cluster', 'Features'])

def write_topics_to_file(year, model_name, model, feature_names, no_top_words):
    """Summarise each topic of a fitted model as one row of top-weighted features.

    Despite its name, this function does not write anything to disk: it
    builds and returns a DataFrame and prints a one-line summary.

    Args:
        year: Label stored in the 'Year' column of every row.
        model_name: Label stored in the 'Model' column (e.g. "NMF", "LDA").
        model: Fitted model exposing ``components_``, iterated as one weight
            vector per topic; each vector must support ``argsort()``.
        feature_names: Sequence mapping a feature index to its term.
        no_top_words: Number of highest-weighted terms to keep per topic.

    Returns:
        DataFrame with columns 'Year', 'Model', 'Cluster', 'Features' and one
        row per topic; 'Features' holds the top terms joined by single
        spaces, heaviest first.
    """
    columns = ['Year', 'Model', 'Cluster', 'Features']

    # Collect plain dicts and build the frame once: DataFrame.append copied
    # the whole frame on every call and no longer exists in pandas >= 2.0.
    rows = []
    for topic_idx, topic in enumerate(model.components_):
        # argsort() is ascending, so walk the tail backwards to obtain the
        # `no_top_words` largest weights in descending order.
        top_indices = topic.argsort()[:-no_top_words - 1:-1]
        top_features = " ".join(feature_names[i] for i in top_indices)
        rows.append({
            'Year': year,
            'Model': model_name,
            'Cluster': topic_idx,
            'Features': top_features,
        })

    # Passing `columns` keeps the expected header even when there are no topics.
    df_tmp = pd.DataFrame(rows, columns=columns)

    print("Year", year, "model", model_name, "number of topics", len(df_tmp))
    return df_tmp

# /Data_AllSources/PKLs/ contains 12 files; each holds the articles published in one particular year of the study range.

# Fit NMF and LDA topic models (5 to 10 topics each) on every yearly pickle
# and store the top terms of every topic in Clusters/Clusters.csv.
#
# NOTE(review): this uses the current scikit-learn API (n_components,
# alpha_W/alpha_H, get_feature_names_out), which needs scikit-learn >= 1.0.
pkl_dir = os.path.join(os.getcwd(), "Data_AllSources", "PKLs")

# Compiled once; the first run of digits in a file name is taken as the year.
year_pattern = re.compile(r'\d+')

no_features = 1000
no_top_words = 10

cluster_frames = []
total_rows = len(df_Clusters)

# os.listdir() returns entries in arbitrary order; sort so that the CSV rows
# and the progress output are reproducible between runs.
for filename in sorted(os.listdir(pkl_dir)):

    # get the year from the name of the file
    year_match = year_pattern.search(filename)
    if year_match is None:
        # Stray entries (no digits in the name) cannot be a yearly pickle.
        print("Skipping", filename, "- no year found in file name")
        continue
    year = year_match.group(0)

    df = pd.read_pickle(os.path.join(pkl_dir, filename))
    documents = df.text

    # NMF is able to use tf-idf
    tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
    tfidf = tfidf_vectorizer.fit_transform(documents)
    tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()

    # LDA is a probabilistic graphical model, so it needs raw term counts
    tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
    tf = tf_vectorizer.fit_transform(documents)
    tf_feature_names = tf_vectorizer.get_feature_names_out()

    # NOTE(review): the legacy `alpha=.1` argument of NMF was replaced by
    # `alpha_W` / `alpha_H`. Per the scikit-learn NMF documentation the new
    # penalty terms are additionally multiplied by n_features (for W) and
    # n_samples (for H); dividing by those factors is intended to keep the
    # regularisation strength of the original call - confirm against the
    # installed scikit-learn version.
    n_samples, n_features = tfidf.shape
    nmf_alpha = .1

    for no_topics in range(5, 11):

        # Run NMF
        nmf = NMF(n_components=no_topics, random_state=1,
                  alpha_W=nmf_alpha / n_features, alpha_H=nmf_alpha / n_samples,
                  l1_ratio=.5, init='nndsvd').fit(tfidf)

        # Run LDA (`n_topics` was renamed to `n_components`)
        lda = LatentDirichletAllocation(n_components=no_topics, max_iter=5, learning_method='online', learning_offset=50., random_state=0).fit(tf)

        for model_name, model, names in (("NMF", nmf, tfidf_feature_names),
                                         ("LDA", lda, tf_feature_names)):
            df_tmp = write_topics_to_file(year, model_name, model, names, no_top_words)
            cluster_frames.append(df_tmp)
            total_rows += len(df_tmp)

    # Progress report per processed file (running total of topic rows).
    print("File", filename, "number of lines", total_rows)

# Concatenate once at the end: DataFrame.append was removed in pandas 2.0 and
# copied the whole accumulator on every call.
if cluster_frames:
    df_Clusters = pd.concat(cluster_frames, ignore_index=True)

# Make sure the output directory exists before writing the results.
os.makedirs("Clusters", exist_ok=True)
df_Clusters.to_csv("Clusters/Clusters.csv")
    

Year 2006 model NMF number of topics 5
Year 2006 model LDA number of topics 5
Year 2006 model NMF number of topics 6
Year 2006 model LDA number of topics 6
Year 2006 model NMF number of topics 7
Year 2006 model LDA number of topics 7
Year 2006 model NMF number of topics 8
Year 2006 model LDA number of topics 8
Year 2006 model NMF number of topics 9
Year 2006 model LDA number of topics 9
Year 2006 model NMF number of topics 10
Year 2006 model LDA number of topics 10
Year 2007 model NMF number of topics 5
Year 2007 model LDA number of topics 5
Year 2007 model NMF number of topics 6
Year 2007 model LDA number of topics 6
Year 2007 model NMF number of topics 7
Year 2007 model LDA number of topics 7
Year 2007 model NMF number of topics 8
Year 2007 model LDA number of topics 8
Year 2007 model NMF number of topics 9
Year 2007 model LDA number of topics 9
Year 2007 model NMF number of topics 10
Year 2007 model LDA number of topics 10
Year 2008 model NMF number of topics 5
Year 2008 model LDA n