In [632]:
import os
import pandas as pd
import numpy as np
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

In [633]:
# Load Dataset
documents_list = []
with open( os.path.join("articles.txt") ,"r",  encoding='utf-8') as fin:
    for line in fin.readlines():
            text = line.strip()
            documents_list.append(text)

In [634]:
tokenizer = RegexpTokenizer(r'\w+')
tfidf = TfidfVectorizer(lowercase=True,
                        stop_words='english',
                        #ngram_range = (1,2),
                        tokenizer = tokenizer.tokenize)

train_data = tfidf.fit_transform(documents_list) #bag_of_words

In [635]:
Df1 = pd.DataFrame(train_data.toarray(), columns =tfidf.get_feature_names_out())
Df1

Unnamed: 0,0,00,000,0000,00002p,0001,00017,000ft,000km,000kr,...,šerbedžija,švankmajer,šárka,žantovský,želimir,žiga,žilnik,život,π,ツ
0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.014397,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.016772,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4546,0.0,0.0,0.016193,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4547,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4548,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4549,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [636]:
num_topics = 7
svd = TruncatedSVD(n_components=num_topics)
lsa = svd.fit_transform(train_data)
#Sigma = lsa.singular_values_ 
#V_transpose = lsa.components_.T

In [637]:
topic_encoded_df = pd.DataFrame(lsa, columns = ["topic " + str(i) for i in range(num_topics)])
topic_encoded_df["body"] = documents_list
topic_encoded_df = topic_encoded_df[ ['body'] + [ col for col in topic_encoded_df.columns if col != 'body' ]]
topic_encoded_df['abs_max'] = topic_encoded_df[list(topic_encoded_df.drop('body', axis=1))].abs().max(axis=1)
#topic_encoded_df['max'] = np.nanmax(topic_encoded_df.drop('body', axis=1).values, axis=1)
topic_encoded_df

Unnamed: 0,body,topic 0,topic 1,topic 2,topic 3,topic 4,topic 5,topic 6,abs_max
0,Barclays' defiance of US fines has merit Barcl...,0.115692,-0.059628,-0.021948,-0.088516,0.202537,-0.062323,0.012338,0.202537
1,How big is Hillary Clinton's lead in the presi...,0.192409,0.163622,-0.012582,0.009129,0.020097,0.009142,0.000501,0.192409
2,Zika’s greatest ally is human intransigence Th...,0.075472,-0.013817,0.006731,-0.047024,-0.020981,0.038978,0.001500,0.075472
3,Fight for the right: Cruz and Rubio spar in Ne...,0.280350,0.272369,0.000598,0.052851,0.025939,0.013592,-0.000275,0.280350
4,Voting day: America finally goes the polls Can...,0.336758,0.244177,-0.034515,0.008498,0.022718,0.000978,-0.002983,0.336758
...,...,...,...,...,...,...,...,...,...
4546,'The party I worked for died tonight': Republi...,0.408298,0.373303,-0.042374,0.024444,0.008978,0.000554,0.004032,0.408298
4547,The Republican women unfazed by Trump's sexism...,0.443965,0.300295,0.026420,-0.047537,-0.050043,-0.002043,0.018148,0.443965
4548,George I urged daughter to try smallpox inocul...,0.100233,-0.028457,0.030780,-0.035317,-0.024156,0.000901,0.006742,0.100233
4549,Europe and US remain divided by a common techn...,0.208727,-0.076862,-0.037973,-0.023192,-0.025836,-0.041809,-0.024591,0.208727


In [638]:
unpv_topic_encoded_df = pd.melt(topic_encoded_df, id_vars = ['body', 'abs_max',], var_name = 'topic_name') 
unpv_topic_encoded_df = unpv_topic_encoded_df[unpv_topic_encoded_df['abs_max'] == unpv_topic_encoded_df['value']] 
unpv_topic_encoded_df = unpv_topic_encoded_df[['body', 'topic_name']]
unpv_topic_encoded_df

Unnamed: 0,body,topic_name
1,How big is Hillary Clinton's lead in the presi...,topic 0
2,Zika’s greatest ally is human intransigence Th...,topic 0
3,Fight for the right: Cruz and Rubio spar in Ne...,topic 0
4,Voting day: America finally goes the polls Can...,topic 0
5,Twitter U-turns over banning white nationalist...,topic 0
...,...,...
30911,Norwich City 0-3 Sunderland: Premier League – ...,topic 6
30980,Arsenal 3-0 Chelsea: Premier League - as it ha...,topic 6
31101,West Brom v Manchester United: Premier League ...,topic 6
31156,Chelsea v West Ham United: Premier League – as...,topic 6


In [639]:
terms = tfidf.get_feature_names_out()

for index, component in enumerate(svd.components_):
    print(terms, component, "Topic "+str(index))

['0' '00' '000' ... 'život' 'π' 'ツ'] [2.33725530e-02 7.82489772e-04 3.79451915e-02 ... 3.32456630e-05
 3.88373020e-05 1.65533312e-04] Topic 0
['0' '00' '000' ... 'život' 'π' 'ツ'] [-2.86713653e-02 -5.21619024e-04 -1.94865643e-02 ... -2.43258786e-05
 -2.49849253e-05  1.97089329e-04] Topic 1
['0' '00' '000' ... 'život' 'π' 'ツ'] [ 3.61374708e-02 -5.06386159e-05 -1.26754159e-02 ...  6.16434451e-05
  6.43968543e-05 -4.27607548e-05] Topic 2
['0' '00' '000' ... 'život' 'π' 'ツ'] [ 4.44512479e-02 -1.25739791e-03 -3.33259234e-02 ... -8.36847609e-05
 -7.62685184e-05 -6.94257781e-05] Topic 3
['0' '00' '000' ... 'život' 'π' 'ツ'] [ 6.98490872e-02  1.99848804e-03  3.06443475e-02 ... -9.30885151e-05
 -8.27014864e-05  2.55530776e-04] Topic 4
['0' '00' '000' ... 'život' 'π' 'ツ'] [ 5.93626049e-03 -1.26367775e-03  1.70372946e-02 ... -1.49440580e-04
 -9.99363864e-05 -7.64976653e-05] Topic 5
['0' '00' '000' ... 'život' 'π' 'ツ'] [-1.61306279e-04  6.04042940e-04 -1.07021942e-02 ... -2.13234338e-05
  1.59732695

In [640]:
terms = tfidf.get_feature_names_out()
topics = dict()

for index, component in enumerate(svd.components_):
    zipped = zip(terms, component)
    top_terms_key=sorted(zipped, key = lambda t: t[1], reverse=True)[:7]
    top_terms_list=list(dict(top_terms_key).keys())
    topics["topic "+str(index)] = " ".join(top_terms_list)
    #print("Topic "+str(index)+": ",top_terms_list)
topics

{'topic 0': 's trump said eu t people clinton',
 'topic 1': 'trump clinton republican donald cruz hillary sanders',
 'topic 2': 's league season min leicester premier goal',
 'topic 3': 'eu league min season brexit leicester united',
 'topic 4': 'bank banks banking rbs financial customers trump',
 'topic 5': 'health nhs care mental patients doctors hospital',
 'topic 6': 'min ball corner yards goal shot kick'}

In [641]:
topics_df = pd.DataFrame.from_dict(topics, orient='index', columns = ['topics'])
topics_df.reset_index(inplace= True)
topics_df = topics_df.rename(columns={"index": "topic_name"})
topics_df

Unnamed: 0,topic_name,topics
0,topic 0,s trump said eu t people clinton
1,topic 1,trump clinton republican donald cruz hillary s...
2,topic 2,s league season min leicester premier goal
3,topic 3,eu league min season brexit leicester united
4,topic 4,bank banks banking rbs financial customers trump
5,topic 5,health nhs care mental patients doctors hospital
6,topic 6,min ball corner yards goal shot kick


In [642]:
result = pd.merge(unpv_topic_encoded_df, topics_df , how="inner", on=["topic_name"])
result

Unnamed: 0,body,topic_name,topics
0,How big is Hillary Clinton's lead in the presi...,topic 0,s trump said eu t people clinton
1,Zika’s greatest ally is human intransigence Th...,topic 0,s trump said eu t people clinton
2,Fight for the right: Cruz and Rubio spar in Ne...,topic 0,s trump said eu t people clinton
3,Voting day: America finally goes the polls Can...,topic 0,s trump said eu t people clinton
4,Twitter U-turns over banning white nationalist...,topic 0,s trump said eu t people clinton
...,...,...,...
4459,Norwich City 0-3 Sunderland: Premier League – ...,topic 6,min ball corner yards goal shot kick
4460,Arsenal 3-0 Chelsea: Premier League - as it ha...,topic 6,min ball corner yards goal shot kick
4461,West Brom v Manchester United: Premier League ...,topic 6,min ball corner yards goal shot kick
4462,Chelsea v West Ham United: Premier League – as...,topic 6,min ball corner yards goal shot kick


In [643]:
result.to_excel('res_lsa.xlsx')