In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from datetime import datetime
import numpy as np
from sklearn.preprocessing import LabelEncoder
from tmtoolkit.topicmod.evaluate import metric_coherence_gensim
import pyLDAvis
import pyLDAvis.sklearn

In [3]:
import warnings
warnings.filterwarnings("ignore")

In [4]:
UK_News = pd.read_pickle(r'UK_News_Modif1_labels_new.df')

# LDA 

### Data embedding 

In [5]:
# this TfidfVectorizer has been used for embedding tweets when has been
# performed by setting use_idf to 'False' or to 'True' depending of on which kind of embedding want to use
tfidf_UKNews_LDA = TfidfVectorizer(tokenizer=lambda x: x,
                                   preprocessor=lambda x: x,
                                   min_df=0,max_df=1.0,use_idf=False,
                                   smooth_idf=False) 
#fit the TfidfVectorizer on tokens and get Document frequency matrix
dtm_UKNews_LDA = tfidf_UKNews_LDA.fit_transform(UK_News['Tweets_C_tok12'])


In [14]:
print('The shape of Document Term Matrix is:',dtm_UKNews_LDA.shape)

The shape of Document Term Matrix is: (105622, 4352)


In [11]:
print('Nr of unique words is',len(tfidf_UKNews_LDA.get_feature_names()),'or Nr of words into the vocabulary is',len(tfidf_UKNews_LDA.vocabulary_))

Nr of unique words is 4352 or Nr of words into the vocabulary is 4352


In [None]:
# Define CountVectorizer 
countV_UKNews_LDA= CountVectorizer(tokenizer=lambda x: x,preprocessor=lambda x: x,min_df=0,max_df=1.0, )
#Fitting CountVectorizer on tokens and get the data matrix
countV_Fit_UKNews_LDA = countV_UKNews_LDA.fit_transform(UK_News['Tweets_C_tok12'])

In [400]:
data_dense = dtm_UKNews_LDA.todense()
#Since most cells in this matrix will be zero, I am interested in knowing what percentage of cells contain non-zero values
# Compute Sparsicity = Percentage of Non-Zero cells
print("Sparsicity: ", ((data_dense > 0).sum()/data_dense.size)*100, "%")

Sparsicity:  0.17394811840113522 %


### LDA clustering model 

Have been performed:
<br>
- LDA with use_idf and smooth_idf =True 
- LDA with use_idf and smooth_idf =False 
- LDA with CountVectorizer 
<br>
Each of these  models have been performed with 'n_components' parameter with several values: 13,14,15 and 16. 
<br> 
The ouput of the follwing cells are referingt to the last LDA model configuration that has been run

In [7]:
#Define LDA model 
#the follwoing code has been used for each LDA configuration by changing
#n_components parameter.
LDA_skl = LatentDirichletAllocation(n_components=14,random_state=11,
                                    max_iter=100,max_doc_update_iter=1000,
                                    n_jobs=-1) 

In [8]:
# Fitting the LDA model on embedded data 
LDA_skl.fit(dtm_UKNews_LDA ) 
topics_LDA_skl = LDA_skl.transform(dtm_UKNews_LDA)

In [631]:
# assigning LDA cluster to each tweet
UK_News['LDA_skl Topic'] = topics_LDA_skl.argmax(axis=1) 

In [446]:
for index,topic in enumerate(LDA_skl.components_):  
    print(f'THE TOP 15 WORDS FOR TOPIC #{index}')
    print([countV_UKNews_LDA.get_feature_names()[i] for i in topic.argsort()[-15:]])
    print('\n')

THE TOP 15 WORDS FOR TOPIC #0
['man', 'protester', 'anti', 'podcast', 'french', 'listen', 'officer', 'trump', 'people', 'protest', 'black', 'attack', 'white', 'house', 'police']


THE TOP 15 WORDS FOR TOPIC #1
['presidential', 'poll', 'result', 'campaign', 'america', 'state', 'vote', 'president', 'win', 'joe', 'donald', 'usa', 'biden', 'election', 'trump']


THE TOP 15 WORDS FOR TOPIC #2
['local', 'christmas', 'pm', 'area', 'national', 'secretary', 'burley', 'kay', 'government', 'rule', 'england', 'restriction', 'tier', 'coronavirus', 'lockdown']


THE TOP 15 WORDS FOR TOPIC #3
['court', 'boy', 'girl', 'meghan', 'life', 'care', 'son', 'die', 'family', 'baby', 'man', 'mum', 'old', 'home', 'woman']


THE TOP 15 WORDS FOR TOPIC #4
['uk', 'sell', 'spend', 'bn', 'sale', 'food', 'cut', 'company', 'dollar', 'pay', 'job', 'million', 'money', 'christmas', 'pound']


THE TOP 15 WORDS FOR TOPIC #5
['william', 'series', 'harry', 'film', 'kate', 'crown', 'princess', 'die', 'love', 'fan', 'queen', '

### Models coherence scores
Once a certain LDA model configuration has been run then the coherence score for each of them has been computed and taken note of the values since each LDA model configuration has been run one at time by changing and using the appropriate model variables and parameters values.

In [633]:
#countV_UKNews_LDA  countV_Fit_UKNews_LDA 
metric_coherence_gensim(measure='c_v', #u_mass, c_v
                        top_n=20, return_mean=True, #return_coh_model=False
                        topic_word_distrib=LDA_skl.components_, 
                        dtm=dtm_UKNews_LDA,  
                        vocab=np.array([x for x in tfidf_UKNews_LDA.vocabulary_.keys()]), 
                       texts=UK_News['Tweets_C_tok12'].values)


0.7646429214383883

#### Notes of coherence
1) with 15 clusters: 0.74(with idf=True), 0.75 (with idf=False),0.76 (with CountVect)
<br>
2) with 14 clusters: 0.76(with idf=True), 0.765(with idf=False),0.769 (with CountVect)
<br>
3) with 13 clusters: 0.76 (with idf=True), 0.77(with idf=False), 0.763 (with CountVect)
<br>
4) with 16 clusters: 0.75(with idf=True), 0.75(with idf=False), 0.754 (with CountVect)

### LDA clusters and clusters words relevance Visualization

In [9]:
pyLDAvis.enable_notebook()

In [None]:
dash = pyLDAvis.sklearn.prepare(LDA_skl, dtm_UKNews_LDA, tfidf_UKNews_LDA,mds='tsne', sort_topics=False) #mds='mmds' , mds='tsne'
dash

# Models Validation process
For each LDA model configuration have been examined each cluster most relevant words and based of such examination each cluster has been assigned to a topic label. During such process have been taken notes about cluster topics.
<br>
During the labeling process were used the following labels:
<br>
(since these labels, except 'other Labels', are the labels used during the human labeling of a sample of tweets)
<br>
Label1-covid, health
<br>
Labe2-election, usa, trump, biden,usa politics
<br>
Label3- crimes
<br>
Label4-royals
<br>
Label4_2-celebrity
<br>
Label5-not used. Wanted to be used for 'Life&People' topic but because of time consuming of the labeling process at the end it was not used but in the future can be used for such 
topic
<br>
Label6- sport
<br>
Label7-brexit
<br>
Label8-space
<br>
Label8_2-tech, digital companies. It has been calles 8_2 instead of just 9 because during 
the labeling process initially there was some indecision if to put it together with Label8
<br>
Label9-politics
<br>
Label10-economy, financial, markets
<br>
Label11-not used, the same as fo Label5 
<br>
Label12- international affairs, war, conflicts
<br>
Label13- meteo, environment conditions 
<br>
Plus 'other Labels' if other topics emerged or merge topics if they are emerged into a single cluster



#### Labeling LDA with idf=True and 13 clusters

In [636]:
#UK_News.insert(34, "LDA_skl Topic13 Label", " ")
labels_true15=['Label1','Label2','Label3','Label4','Label4_2','Label6','Label7','Label8','Label8_2,Label10','Label9']
labels_lda15=[2,          1,       0,      5,        8,           9,        7,        3,     4,                10]
#cluster 10 mixed, politics, crimes
#cluster 11 mostly about crime but also weather is whole included in it, label13 will not be considered
#label10 about economy is inside cluster 4 and 10, it is not well defined, label10 and 8_2 will be taken together as a cluster
#label 12 can not be related to a cluster so will not be considerated as cluster
for i, j in zip(labels_lda15, labels_true15):
    UK_News.loc[UK_News['LDA_skl Topic13'] == i, ['LDA_skl Topic13 Label']] =j
labels_true25=['Label1','Label3','Label1']
labels_lda25=[6,11,12]
for i, j in zip(labels_lda25, labels_true25):
    UK_News.loc[UK_News['LDA_skl Topic13'] == i, ['LDA_skl Topic13 Label']] =j

#### Labeling LDA with idf=True and 16 clusters

In [617]:
#UK_News.insert(33, "LDA_skl Topic16 Label", " ")
labels_true14=['Label1','Label2','Label3','Label4','Label4_2','Label6','Label7','Label8_2','Label9','Label10','Label12','Label13','Mix']
labels_lda14=[2,          1,       14,      5,        8,           9,        7,        4,     10,      13,       3,       11 ,0]
#cluster 0 mixed
#cluster 3 mixed: space, nuclear, royals, is not clear let assign it to label 12 while label8 is not cosidered
#cluster 11 mixed about crime and weather
for i, j in zip(labels_lda14, labels_true14):
    UK_News.loc[UK_News['LDA_skl Topic16'] == i, ['LDA_skl Topic16 Label']] =j
labels_true24=['Label1','Label1','Label3']
labels_lda24=[6,12,15]
for i, j in zip(labels_lda24, labels_true24):
    UK_News.loc[UK_News['LDA_skl Topic16'] == i, ['LDA_skl Topic16 Label']] =j

#### Labeling LDA with idf=True and 14 clusters

In [603]:
#UK_News.insert(32, "LDA_skl Topic14 Label", " ")
labels_true13=['Label1','Label2','Label3','Label4','Label4_2','Label6','Label7','Label8_2','Label9','Label10','Label12','Label13']
labels_lda13=[12,          0,       3,      5,      8,           9,        7,        4,       10,      13,       1,        11  ]
#cluster 6 mix crime and vaccine
#label8 about space is into cluster 3 that is about crime mostly so label8 will not be considered 
for i, j in zip(labels_lda13, labels_true13):
    UK_News.loc[UK_News['LDA_skl Topic14'] == i, ['LDA_skl Topic14 Label']] =j
#'Label9' -> in 10 plus 2 
labels_true23=['Label1','Label3']
labels_lda23=[2,6]
for i, j in zip(labels_lda23, labels_true23):
    UK_News.loc[UK_News['LDA_skl Topic14'] == i, ['LDA_skl Topic14 Label']] =j

#### Labeling LDA with idf=True and 15 clusters

In [234]:
#UK_News.insert(23, "LDA_skl Topic15 Label", " ")
labels_true1=['Label1','Label2','Label3','Label4','Label4_2','Label6','Label7','Label8','Label8_2','Label9','Label10','Label13']
labels_lda1=[6,           1,       14,      5,      8,        9,        7,        3,         4,         10 ,      13,         11  ]
for i, j in zip(labels_lda1, labels_true1):
    UK_News.loc[UK_News['LDA_skl Topic15'] == i, ['LDA_skl Topic15 Label']] =j
#'Label3'->14 plus 9 plus 
#'Label4'->5 plus 3; 'Label4_2'->8 plus 5
#'Label8' ->3 (in the cluster that is mix a bit of celeb/royal, space, international affairs/conflicts, eviroment events but seems the topic space is all included here and not splitted among other cluters too)
# so Label8 will not be assigned as cluster 
#'Label8_2' -> 4 and (a bit the term technology in 6)
#'Label9' -> in 10 plus 2 
labels_true2=['Label2','Label1', 'Label1']
labels_lda2=[0,2,12]
for i, j in zip(labels_lda2, labels_true2):
    UK_News.loc[UK_News['LDA_skl Topic15'] == i, ['LDA_skl Topic15 Label']] =j

#### Labeling LDA with idf=False and 16 clusters

In [588]:
#UK_News.insert(31, "LDA_skl_False_idf Topic16 Label", " ")
labels_true8=['Label1','Label2','Label3','Label4','Label4_2','Label6','Label7','Label8_2','Label9','Label10','Label13']
labels_lda8=[    2,         1,       6,      3,       5,        9,        7,         4,      10,      13,        11      ]
#clsuter 3 about royals plus space(secondary according to top words)
#cluster 8 pretty mixed
#cluster 11 pretty mixed but for sure almost all about weather is in it
for i, j in zip(labels_lda8, labels_true8):
    UK_News.loc[UK_News['LDA_skl_False_idf Topic16'] == i, ['LDA_skl_False_idf Topic16 Label']] =j

labels_true88=['Label2','Label4_2','Label1','Label3','Label1']
labels_lda88=[0,8,12,14,15]
for i, j in zip(labels_lda88, labels_true88):
    UK_News.loc[UK_News['LDA_skl_False_idf Topic16'] == i, ['LDA_skl_False_idf Topic16 Label']] =j

#### Labeling LDA with idf=False and 13 clusters

In [563]:
#UK_News.insert(30, "LDA_skl_False_idf Topic13 Label", " ")
labels_true7=['Label1','Label2','Label3','Label4,Label4_2','Label6','Label7','Label8','Label8_2,Label10','Label9','Label12','Label13']
labels_lda7=[    2,         1,       6,      5,               9,        7,       3,          4,            10,         0,      11      ]
#cluster 3 pretty mixe: space, life, negative/bad news
#cluster 7 about brexit but also vaccine, plus economy
#cluster 8 pretty mix about life and usa politics election
#cluster 11 about holiday and weather and other
#cluster 4 about companies and also financials, label8_2 and 10 are kespt together
for i, j in zip(labels_lda7, labels_true7):
    UK_News.loc[UK_News['LDA_skl_False_idf Topic13'] == i, ['LDA_skl_False_idf Topic13 Label']] =j

labels_true77=['Label2','Label1']
labels_lda77=[8,12]
for i, j in zip(labels_lda77, labels_true77):
    UK_News.loc[UK_News['LDA_skl_False_idf Topic13'] == i, ['LDA_skl_False_idf Topic13 Label']] =j

#### Labeling LDA with idf=False and 14 clusters

In [543]:
#UK_News.insert(29, "LDA_skl_False_idf Topic14 Label", " ")
labels_true7=['Label1','Label2','Label3','Label4,Label4_2','Label6','Label7','Label8','Label8_2','Label9','Label10','Label12','Label13']
labels_lda7=[    2,         0,       6,      5,               9,        7,       3,          4,      10,      13,         1,      11      ]
#cluster 3 pretty mixed - life, crime, space 
#cluster5 is about royals but also celebrities so will be kept together label4 and 4_2
for i, j in zip(labels_lda7, labels_true7):
    UK_News.loc[UK_News['LDA_skl_False_idf Topic14'] == i, ['LDA_skl_False_idf Topic14 Label']] =j
labels_true77=['Label1','Label4,Label4_2']
labels_lda77=[12,8]
for i, j in zip(labels_lda77, labels_true77):
    UK_News.loc[UK_News['LDA_skl_False_idf Topic14'] == i, ['LDA_skl_False_idf Topic14 Label']] =j

#### Labeling LDA with idf=False and 15 clusters

In [148]:

labels_true11=['Label1','Label2','Label3','Label4','Label4_2','Label6','Label7','Label8_2','Label9','Label10','Label13']
labels_lda11=[12, 1, 14,  3,  5, 9, 7,  4, 10 , 13,   11  ]
for i, j in zip(labels_lda11, labels_true11):
    UK_News.loc[UK_News['LDA_skl_False_idf Topic15'] == i, ['LDA_skl_False_idf Topic15 Label']] =j
#'Label1'->12 plus 2  #'Label2'->1 plus 0
#'Label3' -> 14 puls 6
#Label8->no cluster well defined, it is into cluster 3 that speaks about celebrities mostly
### ATTETION: IN CLUSTER 6 THERE ARE LABELS 3 AND 12 #Label12-> XX  so i will keep only label 3 for clusters 6 and 14 while the label 12
#can not be well definded as cluster.
# IN CLUSTER 3 THERE ARE MANILY LABEL4 BUT ALSO LABEL8, we will keep only label4 for cluster 3 while label8 can not be well defined as 
#cluster
labels_true22=['Label2','Label1','Label3','Label_people_life' ]
labels_lda22=[0,2,6,8]
for i, j in zip(labels_lda22, labels_true22):
    UK_News.loc[UK_News['LDA_skl_False_idf Topic15'] == i, ['LDA_skl_False_idf Topic15 Label']] =j


#### Labeling LDA with CountVect and 16 clusters

In [504]:
#UK_News.insert(28, "LDA_skl_CountV Topic16 Label", " ")
labels_true6=['Label1','Label2','Label3','Label4','Label4_2','Label6','Label7','Label8_2','Label9','Label10','Label13','Life_schoolMix']
labels_lda6=[    2,         1,       14,      3,       5,        9,        7,        4,      10 ,      13,          11,     8 ]
#cluster 0 pretty mixed
#cluster 3 about royals, plus space, plus others
#cluster 4 about economy, companies, tech companies, sells
#cluster 10 about politics, then about international issues and also about crime
#cluster 11 pretty mixed including weather
for i, j in zip(labels_lda6, labels_true6):
    UK_News.loc[UK_News['LDA_skl_CountV Topic16'] == i, ['LDA_skl_CountV Topic16 Label']] =j
labels_true6=['Label4_2','Label1','Life_schoolMix','Label1']
labels_lda6=[6,12,0,15]
for i, j in zip(labels_lda6, labels_true6):
    UK_News.loc[UK_News['LDA_skl_CountV Topic16'] == i, ['LDA_skl_CountV Topic16 Label']] =j

#### Labeling LDA with CountVect and 13 clusters

In [471]:
#UK_News.insert(27, "LDA_skl_CountV Topic13 Label", " ")
labels_true5=['Label1','Label2','Label3','Label4,Label4_2','Label6','Label7','Label8','Label8_2,Label10','Label9','Label13','Life_schoolMix']
labels_lda5=[    2,         1,       0,      5,               9,        7,      3,         4,              10 ,         11,     8 ]
# Cluster 0 very mixed
# Cluster 3 is about life and space
# Cluster 4 is about economy and tech companies will put label8_2 and label10 together for cluster 4
# Cluster 5 is about royals plus celebrity: Label4 and 4_2 together
# Cluster 6 is pretty mixed
# Cluster 7 is pretty mixed: brexit, politics, economy, international issues. The 
# So Label7 cluster 7 , while  label12 will not be considered since it can not be related to a cluster in a "evident way"
# Cluster 11 is pretty mixed: travel,sport, weather,  ecc
for i, j in zip(labels_lda5, labels_true5):
    UK_News.loc[UK_News['LDA_skl_CountV Topic13'] == i, ['LDA_skl_CountV Topic13 Label']] =j
labels_true5=['Label3','Label1']
labels_lda5=[6,12]
for i, j in zip(labels_lda5, labels_true5):
    UK_News.loc[UK_News['LDA_skl_CountV Topic13'] == i, ['LDA_skl_CountV Topic13 Label']] =j

#### Labeling LDA with CountVect and 14 clusters

In [423]:
#UK_News.insert(26, "LDA_skl_CountV Topic14 Label", " ")
labels_true4=['Label1','Label2','Label3','Label4,Label4_2','Label6','Label7','Label8','Label8_2','Label9, Label12','Label10','Label13','Life_schoolMix']
labels_lda4=[    2,         0,       6,      5,               9,        7,        3,         4,      10 ,            13,         11 ,  8 ]
#Label8 is abuout life and space
#Label7 about brexit plus politics
#Life_schoolMix for cluster 8
# Cluster 10 contains both label10 and 12 about politics and world conflicts
# Cluster 5 contains bothe label4 and 4_2, 
for i, j in zip(labels_lda4, labels_true4):
    UK_News.loc[UK_News['LDA_skl_CountV Topic14'] == i, ['LDA_skl_CountV Topic14 Label']] =j
labels_true33=['Label2','Label1']
labels_lda33=[1,12]
for i, j in zip(labels_lda33, labels_true33):
    UK_News.loc[UK_News['LDA_skl_CountV Topic14'] == i, ['LDA_skl_CountV Topic14 Label']] =j

#### Labeling LDA with CountVect and 15 clusters

In [406]:
#UK_News.insert(25, "LDA_skl_CountV Topic15 Label", " ")
labels_true3=['Label1','Label2','Label3','Label4','Label4_2','Label6','Label7','Label_life','Label8_2','Label9','Label10','Label13']
labels_lda3=[2,           1,       14,      3,      5,        0,        7,        8,         4,         10 ,      13,         11  ]
for i, j in zip(labels_lda3, labels_true3):
    UK_News.loc[UK_News['LDA_skl_CountV Topic15'] == i, ['LDA_skl_CountV Topic15 Label']] =j
labels_true33=['Label1','Label3','Label12']
labels_lda33=[12,6,9]
for i, j in zip(labels_lda33, labels_true33):
    UK_News.loc[UK_News['LDA_skl_CountV Topic15'] == i, ['LDA_skl_CountV Topic15 Label']] =j

In [676]:
#Save dataset with new columns labels
#UK_News.to_pickle('UK_News_LDA_labels.df')

# Accuracy - F1 score 

In [637]:
#Subset only the tweets that have 'True Topic' label that represent the 
#human assigned label.
UK_News_V = UK_News.loc[UK_News['True Topic'] != '']

### LDA with idf=True 

In [526]:
data_classes_true=['Label1','Label2','Label3','Label4','Label4_2','Label6','Label7','Label8','Label8_2','Label9','Label10','Label12','Label13']
data_classes_true_nr=[1,2,3,4,5,6,7, 8,9,10,11,  12,13]
d_true= dict(zip(data_classes_true, data_classes_true_nr))
labels_true_nr=UK_News_V['True Topic'].map(d_true, na_action='ignore')
data_classes_lda1_15=['Label1','Label2','Label3','Label4','Label4_2','Label6','Label7','Label8','Label8_2','Label9','Label10','Label13']
data_classes_lda1_15_nr=[1,2,3,4,5,6,7,8,9,10,11, 13]
d_lda1_15 = dict(zip(data_classes_lda1_15, data_classes_lda1_15_nr))
labels_lda1_15_nr=UK_News_V['LDA_skl Topic15 Label'].map(d_lda1_15, na_action='ignore')
from sklearn.metrics import f1_score, accuracy_score
print("Accuracy with idf=True 15 Clusters:",round(accuracy_score(list(labels_true_nr),list(labels_lda1_15_nr),  normalize=True)*100,2),"%")
print("F1 Score:",round(f1_score(list(labels_true_nr),list(labels_lda1_15_nr),  average='weighted')*100,2),"%")

Accuracy with idf=True 15 Clusters: 65.62 %
F1 Score: 61.86 %


In [606]:
data_classes_true0=['Label1','Label2','Label3','Label4','Label4_2','Label6','Label7','Label8','Label8_2','Label9','Label10','Label12','Label13']
data_classes_true_nr0=[1,      2,       3,       4,        5,        6,         7,       8,      9,        10,      11,       12,       13]
d_true0= dict(zip(data_classes_true0, data_classes_true_nr0))
labels_true_nr0=UK_News_V['True Topic'].map(d_true0, na_action='ignore')
data_classes_lda1_14=['Label1','Label2','Label3','Label4','Label4_2','Label6','Label7','Label8_2','Label9','Label10','Label12','Label13']
data_classes_lda1_14_nr=[1,      2,        3,      4,        5,        6,        7,         9,          10,     11,     12,         13]
d_lda1_14 = dict(zip(data_classes_lda1_14, data_classes_lda1_14_nr))
labels_lda1_14_nr=UK_News_V['LDA_skl Topic14 Label'].map(d_lda1_14, na_action='ignore')
print("Accuracy with idf=True 14 Clusters:",round(accuracy_score(list(labels_true_nr0),list(labels_lda1_14_nr),  normalize=True)*100,2),"%")
print("F1 Score:",round(f1_score(list(labels_true_nr0),list(labels_lda1_14_nr),  average='weighted')*100,2),"%")

Accuracy with idf=True 14 Clusters: 68.63 %
F1 Score: 66.74 %


In [620]:
data_classes_true00=['Label1','Label2','Label3','Label4','Label4_2','Label6','Label7','Label8','Label8_2','Label9','Label10','Label12','Label13']
data_classes_true_nr00=[1,      2,       3,       4,        5,        6,         7,       8,      9,        10,      11,       12,       13]
d_true00= dict(zip(data_classes_true00, data_classes_true_nr00))
labels_true_nr00=UK_News_V['True Topic'].map(d_true00, na_action='ignore')
data_classes_lda1_16=['Label1','Label2','Label3','Label4','Label4_2','Label6','Label7','Label8_2','Label9','Label10','Label12','Label13','Mix']
data_classes_lda1_16_nr=[1,      2,        3,      4,        5,        6,        7,         9,          10,     11,     12,         13,  14]
d_lda1_16 = dict(zip(data_classes_lda1_16, data_classes_lda1_16_nr))
labels_lda1_16_nr=UK_News_V['LDA_skl Topic16 Label'].map(d_lda1_16, na_action='ignore')
print("Accuracy with idf=True 16 Clusters:",round(accuracy_score(list(labels_true_nr00),list(labels_lda1_16_nr),  normalize=True)*100,2),"%")
print("F1 Score:",round(f1_score(list(labels_true_nr00),list(labels_lda1_16_nr),  average='weighted')*100,2),"%")

Accuracy with idf=True 16 Clusters: 63.21 %
F1 Score: 61.65 %


In [640]:
data_classes_true01=['Label1','Label2','Label3','Label4','Label4_2','Label6','Label7','Label8','Label8_2','Label9','Label10','Label12','Label13']
data_classes_true_nr01=[1,      2,       3,       4,        5,        6,         7,       8,      9,        10,      9,       12,       13]
d_true01= dict(zip(data_classes_true01, data_classes_true_nr01))
labels_true_nr01=UK_News_V['True Topic'].map(d_true01, na_action='ignore')
data_classes_lda1_13=['Label1','Label2','Label3','Label4','Label4_2','Label6','Label7','Label8','Label8_2,Label10','Label9']
data_classes_lda1_13_nr=[1,      2,        3,      4,        5,        6,        7,        8,          9,          10,]
d_lda1_13 = dict(zip(data_classes_lda1_13, data_classes_lda1_13_nr))
labels_lda1_13_nr=UK_News_V['LDA_skl Topic13 Label'].map(d_lda1_13, na_action='ignore')
print("Accuracy with idf=True 13 Clusters:",round(accuracy_score(list(labels_true_nr01),list(labels_lda1_13_nr),normalize=True)*100,2),"%")
print("F1 Score:",round(f1_score(list(labels_true_nr01),list(labels_lda1_13_nr),average='weighted')*100,2),"%")

Accuracy with idf=True 13 Clusters: 58.17 %
F1 Score: 52.99 %


### LDA with idf=False 

In [525]:
data_classes_lda2_15=['Label1','Label2','Label3','Label4','Label4_2','Label6','Label7','Label8_2','Label9','Label10','Label13','Label_people_life']
data_classes_lda2_15_nr=[1,2,3,4,5,6,7,9,10,11,13,14]
d_lda2_15 = dict(zip(data_classes_lda2_15, data_classes_lda2_15_nr))
labels_lda2_15_nr=UK_News_V['LDA_skl_False_idf Topic15 Label'].map(d_lda2_15, na_action='ignore')
print("Accuracy with idf=False 15 Clusters:",round(accuracy_score(list(labels_true_nr),list(labels_lda2_15_nr) )*100,2),"%") #,  normalize=True
print("F1 Score:",round(f1_score(list(labels_true_nr),list(labels_lda2_15_nr),average='weighted')*100,2),"%") #, 

Accuracy with idf=False 15 Clusters: 74.86 %
F1 Score: 71.18 %


In [546]:
data_classes_true5=['Label1','Label2','Label3','Label4','Label4_2','Label6','Label7','Label8','Label8_2','Label9','Label10','Label12','Label13']
data_classes_true_nr5=[1,       2,      3,       4,        4,         6,       7,       8,       9,         10,      11,        12,   13]
d_true5= dict(zip(data_classes_true5, data_classes_true_nr5))
labels_true_nr5=(UK_News_V['True Topic'].map(d_true5, na_action='ignore')).astype(int)
data_classes_lda2_14=['Label1','Label2','Label3','Label4,Label4_2','Label6','Label7','Label8','Label8_2','Label9','Label10','Label12','Label13']
data_classes_lda2_14_nr=[1,       2,       3,        4,               6,         7,     8,       9,       10,      11,         12,    13, ]
d_lda2_14 = dict(zip(data_classes_lda2_14, data_classes_lda2_14_nr))
labels_lda2_14_nr=UK_News_V['LDA_skl_False_idf Topic14 Label'].map(d_lda2_14, na_action='ignore')
print("Accuracy with idf=False 14 Clusters:",round(accuracy_score(list(labels_true_nr5),list(labels_lda2_14_nr) )*100,2),"%") #,  normalize=True
print("F1 Score:",round(f1_score(list(labels_true_nr5),list(labels_lda2_14_nr),average='weighted')*100,2),"%") #,  

Accuracy with idf=False 14 Clusters: 83.61 %
F1 Score: 83.63 %


In [589]:
data_classes_true6=['Label1','Label2','Label3','Label4','Label4_2','Label6','Label7','Label8','Label8_2','Label9','Label10','Label12','Label13']
data_classes_true_nr6=[1,       2,      3,       4,        4,         6,       7,       8,       9,         10,      9,        12,   13]
d_true6= dict(zip(data_classes_true6, data_classes_true_nr6))
labels_true_nr6=(UK_News_V['True Topic'].map(d_true6, na_action='ignore')).astype(int)
data_classes_lda2_13=['Label1','Label2','Label3','Label4,Label4_2','Label6','Label7','Label8','Label8_2,Label10','Label9','Label12','Label13']
data_classes_lda2_13_nr=[1,       2,       3,        4,               6,         7,     8,       9,                10,         12,    13, ]
d_lda2_13 = dict(zip(data_classes_lda2_13, data_classes_lda2_13_nr))
labels_lda2_13_nr=UK_News_V['LDA_skl_False_idf Topic13 Label'].map(d_lda2_13, na_action='ignore')
print("Accuracy with idf=False 13 Clusters:",round(accuracy_score(list(labels_true_nr6),list(labels_lda2_13_nr) )*100,2),"%") #,  normalize=True
print("F1 Score:",round(f1_score(list(labels_true_nr6),list(labels_lda2_13_nr),average='weighted')*100,2),"%") #,  

Accuracy with idf=False 13 Clusters: 78.76 %
F1 Score: 78.5 %


In [592]:
data_classes_true7=['Label1','Label2','Label3','Label4','Label4_2','Label6','Label7','Label8','Label8_2','Label9','Label10','Label12','Label13']
data_classes_true_nr7=[1,       2,      3,       4,        5,         6,       7,       8,       9,         10,      11,        12,   13]
d_true7= dict(zip(data_classes_true7, data_classes_true_nr7))
labels_true_nr7=(UK_News_V['True Topic'].map(d_true7, na_action='ignore')).astype(int)
data_classes_lda2_16=['Label1','Label2','Label3','Label4','Label4_2','Label6','Label7','Label8_2','Label9','Label10','Label13']
data_classes_lda2_16_nr=[1,       2,       3,        4,      5 ,        6,         7,      9,         10,    11,        13, ]
d_lda2_16 = dict(zip(data_classes_lda2_16, data_classes_lda2_16_nr))
labels_lda2_16_nr=UK_News_V['LDA_skl_False_idf Topic16 Label'].map(d_lda2_16, na_action='ignore')
print("Accuracy with idf=False 16 Clusters:",round(accuracy_score(list(labels_true_nr7),list(labels_lda2_16_nr) )*100,2),"%") #,  normalize=True
print("F1 Score:",round(f1_score(list(labels_true_nr7),list(labels_lda2_16_nr),average='weighted')*100,2),"%") #,  

Accuracy with idf=False 16 Clusters: 69.65 %
F1 Score: 65.3 %


### LDA with CountVectorizer

In [590]:
data_classes_true3=['Label1','Label2','Label3','Label4','Label4_2','Label6','Label7','Label8','Label8_2','Label9','Label10','Label12','Label13']
data_classes_true_nr3=[1,       2,      3,       4,        4,         6,       7,       8,       9,         10,      9,         11,     13]
d_true3= dict(zip(data_classes_true3, data_classes_true_nr3))
labels_true_nr3=(UK_News_V['True Topic'].map(d_true3, na_action='ignore'))
data_classes_lda3_13=['Label1','Label2','Label3','Label4,Label4_2','Label6','Label7','Label8','Label8_2,Label10','Label9','Label13','Life_schoolMix']
data_classes_lda3_13_nr=[1,       2,       3,        4,               6,         7,       8,        9,               10,        13,        14]
d_lda3_13 = dict(zip(data_classes_lda3_13, data_classes_lda3_13_nr))
labels_lda3_13_nr=UK_News_V['LDA_skl_CountV Topic13 Label'].map(d_lda3_13, na_action='ignore')
print("Accuracy with CountV 13 Clusters:",round(accuracy_score(list(labels_true_nr3),list(labels_lda3_13_nr) )*100,2),"%") #,  normalize=True
print("F1 Score:",round(f1_score(list(labels_true_nr3),list(labels_lda3_13_nr),average='weighted')*100,2),"%") #,  

Accuracy with CountV 13 Clusters: 74.52 %
F1 Score: 72.64 %


In [523]:
data_classes_true2=['Label1','Label2','Label3','Label4','Label4_2','Label6','Label7','Label8','Label8_2','Label9','Label10','Label12','Label13']
data_classes_true_nr2=[1,       2,      3,       4,        4,         6,       7,       8,       9,         10,      11,        10,   13]
d_true2= dict(zip(data_classes_true2, data_classes_true_nr2))
labels_true_nr2=(UK_News_V['True Topic'].map(d_true2, na_action='ignore')).astype(int)
data_classes_lda3_14=['Label1','Label2','Label3','Label4,Label4_2','Label6','Label7','Label8','Label8_2','Label9, Label12','Label10','Label13','Life_schoolMix']
data_classes_lda3_14_nr=[1,       2,       3,        4,               6,         7,    8,        9,         10,              11,        13,        14]
d_lda3_14 = dict(zip(data_classes_lda3_14, data_classes_lda3_14_nr))
labels_lda3_14_nr=UK_News_V['LDA_skl_CountV Topic14 Label'].map(d_lda3_14, na_action='ignore')
print("Accuracy with CountV 14 Clusters:",round(accuracy_score(list(labels_true_nr2),list(labels_lda3_14_nr) )*100,2),"%") #,  normalize=True
print("F1 Score:",round(f1_score(list(labels_true_nr2),list(labels_lda3_14_nr),average='weighted')*100,2),"%") #,  

Accuracy with CountV 14 Clusters: 78.28 %
F1 Score: 78.8 %


In [524]:
data_classes_lda3_15=['Label1','Label2','Label3','Label4','Label4_2','Label6','Label7','Label8_2','Label9','Label10','Label12','Label13','Label_life']
data_classes_lda3_15_nr=[1,2,3,4,5,6,7,9,10,11,12,13,14]
d_lda3_15 = dict(zip(data_classes_lda3_15, data_classes_lda3_15_nr))
labels_lda3_15_nr=UK_News_V['LDA_skl_CountV Topic15 Label'].map(d_lda3_15, na_action='ignore')
print("Accuracy with CountV 15 Clusters:",round(accuracy_score(list(labels_true_nr),list(labels_lda3_15_nr) )*100,2),"%") #,  normalize=True
print("F1 Score:",round(f1_score(list(labels_true_nr),list(labels_lda3_15_nr),average='weighted')*100,2),"%") #,  

Accuracy with CountV 15 Clusters: 64.13 %
F1 Score: 63.25 %


In [508]:
data_classes_true4=['Label1','Label2','Label3','Label4','Label4_2','Label6','Label7','Label8','Label8_2','Label9','Label10','Label12','Label13']
data_classes_true_nr4=[1,       2,      3,       4,        5,         6,       7,       8,       9,         10,      11,       12,        13]
d_true4= dict(zip(data_classes_true4, data_classes_true_nr4))
labels_true_nr4=(UK_News_V['True Topic'].map(d_true4, na_action='ignore'))
data_classes_lda3_16=['Label1','Label2','Label3','Label4','Label4_2','Label6','Label7','Label8_2','Label9','Label10','Label13','Life_schoolMix']
data_classes_lda3_16_nr=[1,       2,       3,        4,      5,         6,         7,         9,       10,      11,        13,        14]
d_lda3_16 = dict(zip(data_classes_lda3_16, data_classes_lda3_16_nr))
labels_lda3_16_nr=UK_News_V['LDA_skl_CountV Topic16 Label'].map(d_lda3_16, na_action='ignore')
print("Accuracy with CountV 16 Clusters:",round(accuracy_score(list(labels_true_nr4),list(labels_lda3_16_nr) )*100,2),"%") #,  normalize=True
print("F1 Score:",round(f1_score(list(labels_true_nr4),list(labels_lda3_16_nr),average='weighted')*100,2),"%") #,  


Accuracy with CountV 16 Clusters: 62.7 %
F1 Score: 59.63 %


In [None]:
#Saving dataset 
#UK_News_V.to_pickle('UK_News_Acc_LDA.df')
#UK_News_V = pd.read_pickle(r'UK_News_Acc_LDA.df')

## Accuracy and F1 Score - summarizing

In [672]:
#A for lda with idf=False
a_Acc=[round(accuracy_score(list(labels_true_nr6),list(labels_lda2_13_nr) )*100,2),
       round(accuracy_score(list(labels_true_nr5),list(labels_lda2_14_nr) )*100,2),
       round(accuracy_score(list(labels_true_nr),list(labels_lda2_15_nr) )*100,2),
       round(accuracy_score(list(labels_true_nr7),list(labels_lda2_16_nr) )*100,2)]
a_F1=[round(f1_score(list(labels_true_nr6),list(labels_lda2_13_nr),average='weighted')*100,2),
      round(f1_score(list(labels_true_nr5),list(labels_lda2_14_nr),average='weighted')*100,2),
      round(f1_score(list(labels_true_nr),list(labels_lda2_15_nr),average='weighted')*100,2),
      round(f1_score(list(labels_true_nr7),list(labels_lda2_16_nr),average='weighted')*100,2)]
#B for lda with CountV
b_Acc=[round(accuracy_score(list(labels_true_nr3),list(labels_lda3_13_nr) )*100,2), 
      round(accuracy_score(list(labels_true_nr2),list(labels_lda3_14_nr) )*100,2),
      round(accuracy_score(list(labels_true_nr),list(labels_lda3_15_nr) )*100,2),
      round(accuracy_score(list(labels_true_nr4),list(labels_lda3_16_nr) )*100,2)]

b_F1=[round(f1_score(list(labels_true_nr3),list(labels_lda3_13_nr),average='weighted')*100,2),
     round(f1_score(list(labels_true_nr2),list(labels_lda3_14_nr),average='weighted')*100,2),
     round(f1_score(list(labels_true_nr),list(labels_lda3_15_nr),average='weighted')*100,2),
     round(f1_score(list(labels_true_nr4),list(labels_lda3_16_nr),average='weighted')*100,2)]
#C for lda with idf=True
c_Acc=[round(accuracy_score(list(labels_true_nr01),list(labels_lda1_13_nr),normalize=True)*100,2),
       round(accuracy_score(list(labels_true_nr0),list(labels_lda1_14_nr),normalize=True)*100,2),
       round(accuracy_score(list(labels_true_nr),list(labels_lda1_15_nr), normalize=True)*100,2),
       round(accuracy_score(list(labels_true_nr00),list(labels_lda1_16_nr), normalize=True)*100,2)]
c_F1=[round(f1_score(list(labels_true_nr01),list(labels_lda1_13_nr),average='weighted')*100,2) 
      ,round(f1_score(list(labels_true_nr0),list(labels_lda1_14_nr),  average='weighted')*100,2),
      round(f1_score(list(labels_true_nr),list(labels_lda1_15_nr),  average='weighted')*100,2),
      round(f1_score(list(labels_true_nr00),list(labels_lda1_16_nr),  average='weighted')*100,2)]
topic=[13,14,15,16]
acc_df=pd.DataFrame(columns=['Nr.Topics','A_Accuracy', 'A_F1 Score','B_Accuracy','B_F1 Score', 'C_Accuracy','C_F1 Score'])
acc_df['Nr.Topics']=topic
acc_df['A_Accuracy']=a_Acc
acc_df['A_F1 Score']=a_F1
acc_df['B_Accuracy']=b_Acc
acc_df['B_F1 Score']=b_F1
acc_df['C_Accuracy']=c_Acc
acc_df['C_F1 Score']=c_F1

In [673]:
acc_df

Unnamed: 0,Nr.Topics,A_Accuracy,A_F1 Score,B_Accuracy,B_F1 Score,C_Accuracy,C_F1 Score
0,13,78.76,78.5,74.52,72.64,58.17,52.99
1,14,83.61,83.63,78.28,78.8,68.63,66.74
2,15,74.86,71.18,64.13,63.25,65.62,61.86
3,16,69.65,65.3,62.7,59.63,63.21,61.65


**We can note that the highest accuracy has been got by LDA with idf=Flase and using nr of topics =14, under letter 'A'-> 'LDA best model'**

# LDA  best model 

In [680]:
# Define and fit the best LDA model found out
LDA_skl_A_14 = LatentDirichletAllocation(n_components=14,random_state=11,max_iter=100,max_doc_update_iter=1000,n_jobs=-1) 
LDA_skl_A_14.fit(dtm_UKNews_LDA ) 
topics_LDA_skl_A_14 = LDA_skl_A_14.transform(dtm_UKNews_LDA)

In [682]:
# define some columns names that are useless for the saving of a new datset
#that contains only the best LDA model labels
to_remove=['LDA_skl_CountV Topic13 Label','LDA_skl_CountV Topic14 Label','LDA_skl_CountV Topic15 Label','LDA_skl_CountV Topic16 Label',
'LDA_skl_False_idf Topic13 Label','LDA_skl_False_idf Topic14 Label','LDA_skl_False_idf Topic15 Label','LDA_skl_False_idf Topic16 Label',
'LDA_skl Topic13 Label','LDA_skl Topic14 Label','LDA_skl Topic15 Label','LDA_skl Topic16 Label',
'LDA_skl_CountV Topic13','LDA_skl_CountV Topic14','LDA_skl_CountV Topic15','LDA_skl_CountV Topic16',
'LDA_skl_False_idf Topic13','LDA_skl_False_idf Topic14','LDA_skl_False_idf Topic15','LDA_skl_False_idf Topic16',
'LDA_skl Topic13','LDA_skl Topic14','LDA_skl Topic15','LDA_skl Topic16',
   'Topics_SOM_Gens',       ]

In [684]:
#make a copy of the dataframe and remove some column defined above
UK_News2=UK_News
UK_News2=UK_News2.drop(to_remove,axis=1)

In [686]:
#assigning to each tweet its LDA cluster
UK_News2['LDA_skl_A Topics14'] = topics_LDA_skl_A_14.argmax(axis=1) 
#Valisualization of the LDA best model clusters and clusters relevant words
dash_finalConfig = pyLDAvis.sklearn.prepare(LDA_skl_A_14, dtm_UKNews_LDA, tfidf_UKNews_LDA,mds='mmds', sort_topics=False) #mds='mmds' , mds='tsne'
dash_finalConfig 

After for each clsuter have been examined the most relevant words the
following topics and respective labels came out:
<br>
Label1=['coronavirus','lockdown','restrictions','vaccine']
<br>
Label2=['election','US politics and public people']
<br>
Label3=['crime','police','jail','terror','arrest']
<br>
Label4=['public people','school','social media']
<br>
Label5=['celebrities','royals','entratainment-film-tv-netflix-music']
<br>
Label6=['sport']
<br>
Label7=['brexit','european deals/issues']
<br>
Label8=['people&life','space'] 
<br>
Label9=['tech/digital companies','Christmas','shopping','sells','free-time']
<br>
Label10=['politics','labour party','mp','politic people']
<br>
Label11['financials','business','economy','market','trading','bank','companies','tax','workers','unemployment','investor']
<br>
Label12=['international affairs/issues politics and conflicts','climate change','war','nuclear']
<br>
Label13=['travel','holiday','weather','eviroment/climate condtions']

In [714]:
# Labeling LDA best model Clusters 
UK_News2.insert(20, "LDA_skl_A Topics14 Label", " ")
labels_trueF=['Label1','Label2','Label3','Label4','Label5','Label6','Label7','Label8','Label9','Label10','Label11','Label12','Label13']
labels_ldaF=[    2,         0,     6  ,    8,            5,     9,       7,       3,     4,     10,        13           ,1 ,    11  ]
for i, j in zip(labels_ldaF, labels_trueF):
    UK_News2.loc[UK_News2['LDA_skl_A Topics14'] == i, ['LDA_skl_A Topics14 Label']] =j
labels_trueFF=['Label1']
labels_ldaFF=[12]
for i, j in zip(labels_ldaFF, labels_trueFF):
    UK_News2.loc[UK_News2['LDA_skl_A Topics14'] == i, ['LDA_skl_A Topics14 Label']] =j

In [726]:
# Save the dataset with labels for the best LDA model found out
#UK_News2.to_pickle('UK_News_LDA_final_bestModel.df')

# Guided and GSDMM models
The code for these 2 models are reported just to indicate that there are other clustering models (linked to LDA) that one could perform. Such models were not validated as previously because after LDA have been performed another clustering models reported in another Jupyter Notebook.

## Guided LDA

In [19]:
from lda import guidedlda as glda

In [21]:
countV_UKNews_GLDA = CountVectorizer(tokenizer=lambda x: x,preprocessor=lambda x: x,min_df=0,max_df=1.0, ) #min_df=0.0001 
countV_Fit_UKNews_GLDA = countV_UKNews_GLDA.fit_transform(UK_News['Tweets_C_tok12'])

In [22]:
seed_topic_list = [['coronavirus', 'lockdown', 'restriction', 'infection', 'vaccine'],
                   ['nasa','mars','meteor'],
                   ['prince', 'crown', 'queen','royal'],
                   ['celebrity', 'netflix', 'season','series','movie'],
                  ['brexit','european','brussels'],
                  ['vote','election','senator','campaign','republican','elect','poll'],
                  ['money','company','economy','deal','company','economic','pound'],
                  ['sport','football','league'],
                  ['police','arrest','kill','attack','murder','terrorist','shoot'],
                  ['court','judge','politic'],
                  ['technology','tech','friday','amazon','apple','iphone'],
                  ['iran','china','russia'],['climate','weather']]

In [23]:
#Define GLDA model
model_glda = glda.GuidedLDA(n_topics=14, n_iter=1000, random_state=11, refresh=20)

In [24]:
seed_topics = {}
for t_id, st in enumerate(seed_topic_list):
    for word in st:
        seed_topics[countV_UKNews_GLDA.vocabulary_[word]] = t_id

In [26]:
model_glda.fit(countV_Fit_UKNews_GLDA, seed_topics=seed_topics, seed_confidence=0.25)

INFO:lda:n_documents: 105622
INFO:lda:vocab_size: 4352
INFO:lda:n_words: 814470
INFO:lda:n_topics: 14
INFO:lda:n_iter: 1000
INFO:lda:<0> log likelihood: -10731805
INFO:lda:<20> log likelihood: -6519524
INFO:lda:<40> log likelihood: -6290184
INFO:lda:<60> log likelihood: -6179220
INFO:lda:<80> log likelihood: -6123404
INFO:lda:<100> log likelihood: -6092707
INFO:lda:<120> log likelihood: -6070776
INFO:lda:<140> log likelihood: -6056995
INFO:lda:<160> log likelihood: -6045387
INFO:lda:<180> log likelihood: -6039550
INFO:lda:<200> log likelihood: -6033076
INFO:lda:<220> log likelihood: -6028644
INFO:lda:<240> log likelihood: -6025494
INFO:lda:<260> log likelihood: -6022488
INFO:lda:<280> log likelihood: -6020210
INFO:lda:<300> log likelihood: -6017622
INFO:lda:<320> log likelihood: -6016684
INFO:lda:<340> log likelihood: -6014525
INFO:lda:<360> log likelihood: -6013491
INFO:lda:<380> log likelihood: -6012318
INFO:lda:<400> log likelihood: -6011853
INFO:lda:<420> log likelihood: -6010988
I

<lda.guidedlda.GuidedLDA at 0x7fb07c5ff580>

In [29]:
vocab=countV_UKNews_GLDA.get_feature_names()
n_top_words = 20
topic_word = model_glda.topic_word_
for i, topic_dist in enumerate(topic_word):
    topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words+1):-1]
    print('Topic {}: {}'.format(i, ' '.join(topic_words)))


Topic 0: coronavirus lockdown tier restriction vaccine england boris johnson uk rule christmas government people pm area pub minister case national warn
Topic 1: uk world space sky weather man people hit dog water storm old nasa rescue life moon home britain sea migrant
Topic 2: prince royal harry queen meghan princess crown kate diana william markle family charles middleton morgan pier netflix celebrity interview duchess
Topic 3: celebrity star fan die wife son share break james host baby death daughter viewer age husband love ex mum smith
Topic 4: boris johnson brexit european union deal uk labour coronavirus tory minister starmer trade keir pm lockdown government leader politic prime
Topic 5: trump election biden usa donald joe president win vote state debate house white result presidential campaign voter coronavirus america victory
Topic 6: pound coronavirus uk pandemic job sunak bn dollar pay rishi government business percentage million money market economy rise company cut
Topic 

## GSDMM

In [None]:
#pip install GPyM-TM

In [None]:
from GPyM_TM import GSDMM
from GPyM_TM import GPM
nTopics = 13
corpus = UK_News['Tweets_C_tok12']
data_dmm = GSDMM.DMM(corpus, nTopics,nTopWords = 20) # Initialize the object, with default parameters.
data_dmm.topicAssigmentInitialise() # Performs the inital document assignments and counts
data_dmm.inference()
psi, theta, selected_psi, selected_theta = data_dmm.worddist() # Determines and stores the psi, theta and selected_psi and selected_theta values
finalAssignments = data_dmm.writeTopicAssignments() # Records the final topic assignments for the documents
coherence_topwords = data_dmm.writeTopTopicalWords(finalAssignments) # Record the top words for each document
score = data_dmm.coherence(coherence_topwords, len(finalAssignments)) #Calculates and stores the coherence
print("Final number of topics found: " + str(len(finalAssignments)))