In [1]:
#Base and Cleaning 
import sys
import seaborn as sns
from matplotlib import pyplot as plt
import pandas as pd
import numpy as np
import time
import pickle
from datetime import datetime
from datetime import date
import json
import requests
import emoji
import re
import string
import warnings
from pprint import pprint
warnings.filterwarnings("ignore")

In [2]:
# Gensim
import gensim, spacy, logging, warnings
import gensim.corpora as corpora
from gensim.models.coherencemodel import CoherenceModel
import pyLDAvis.gensim
warnings.filterwarnings("ignore")

# Notebook methodology
In this notebook, I use Gensim to perform topic modelling on the cleaned tweets. An initial model with the parameters topics = 4, passes = 5, iteration = 5 and alpha = symmetric was created, and it's coherence score noted. Hyperparameter tuning was then performed manually by adjusting from left to right each parameter. Unfortunately, a gridsearch to tune the hyperparameters was advised against by several guides I followed on LDA usage, so I performed it manually by evaluating which parameter provided the best improvement over the previous parameter, as well as represented a plateau in coherence.

In [33]:
df = pd.read_csv('../datasets/avlliv_sentiment.csv', converters = {'tokens': eval, 'sentiment_dict':eval})

In [34]:
df['datetime']=pd.to_datetime(df['datetime'])

In [5]:
df

Unnamed: 0,date,user,is_retweet,is_quote,text,quoted_text,mentions,hashtags,cleantext,lang,tokens,datetime,time,sentiment_dict,sentiment_score,sentiment_rating
0,4/10/2020 18:10,justincroser,False,False,come on reds!! 🔴🔴 have to sleep for work but h...,,,"['#AVLLIV', '#LFC', '#LFCFamily', '#YNWA']",come on reds have to sleep for work but hoping...,1,"[come, red, sleep, work, hope, win, go, watch,...",2020-10-04 18:10:00,18:10:00,"{'neg': 0.0, 'neu': 0.729, 'pos': 0.271, 'comp...",0.8720,pos
1,4/10/2020 18:10,LFCYNWA125,True,False,rt @lfc: jürgen klopp provides detail on the s...,,"['@LFC', '@Alissonbecker']","['#LFC', '#AVLLIV']",username jrgen klopp provides detail on the sh...,1,"[username, detail, shoulder_injury, sideline, ...",2020-10-04 18:10:00,18:10:00,"{'neg': 0.167, 'neu': 0.833, 'pos': 0.0, 'comp...",-0.4215,neg
2,4/10/2020 18:10,itstugenfinest,True,False,rt @skysportspl: 'i'm pretty sure he won't be ...,,['@SkySportsPL'],,username im pretty sure he wont be ready after...,1,"[username, pretty, sure, ready, set, week, sid...",2020-10-04 18:10:00,18:10:00,"{'neg': 0.079, 'neu': 0.714, 'pos': 0.207, 'co...",0.5252,pos
3,4/10/2020 18:10,guu_mendees,True,False,rt @ludovicofans: now follow the news l...,,['@ludovicofans'],,username now follow the news live live streami...,1,"[username, follow, news, live, live, streaming...",2020-10-04 18:10:00,18:10:00,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0000,neu
4,4/10/2020 18:10,justindivine5,True,True,rt @anfieldwatch: jurgen klopp: “it’s an inter...,Liverpool face an anxious wait on how long the...,['@AnfieldWatch'],,username jurgen klopp its an international bre...,1,"[username, jurgen, klopp, pretty, sure, ready,...",2020-10-04 18:10:00,18:10:00,"{'neg': 0.074, 'neu': 0.734, 'pos': 0.192, 'co...",0.5252,pos
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
207722,4/10/2020 20:20,notbitterbetter,False,False,villa were poor there should’ve scored 10 or 11.,,,,villa were poor there shouldve scored or,1,"[poor, score]",2020-10-04 20:20:00,20:20:00,"{'neg': 0.341, 'neu': 0.659, 'pos': 0.0, 'comp...",-0.4767,neg
207723,4/10/2020 20:20,artDante1,False,False,good time to be alive... manchester united lo...,,,['#AVLLFC'],good time to be alive manchester united loses ...,1,"[good, time, alive, manchester, united, lose, ...",2020-10-04 20:20:00,20:20:00,"{'neg': 0.184, 'neu': 0.495, 'pos': 0.321, 'co...",0.7184,pos
207724,4/10/2020 20:20,jonesy73,False,False,gutted that we couldn’t all be there together ...,,,,gutted that we couldnt all be there together t...,1,"[gutte, could, together, enjoy, tonight, trip,...",2020-10-04 20:20:00,20:20:00,"{'neg': 0.058, 'neu': 0.676, 'pos': 0.266, 'co...",0.9169,pos
207725,4/10/2020 20:20,ryanYNWA,True,False,rt @elliothackney: everyone: last season was b...,,['@ElliotHackney'],,username everyone last season was boring liver...,1,"[username, everyone, last, season, bore, run, ...",2020-10-04 20:20:00,20:20:00,"{'neg': 0.113, 'neu': 0.887, 'pos': 0.0, 'comp...",-0.3182,neg


In [6]:
data = df.tokens.to_list()

# Creating the base model

In [7]:
# Create Dictionary
id2word = corpora.Dictionary(data)

# Create Corpus: Term Document Frequency
corpus = [id2word.doc2bow(text) for text in data]

In [8]:
# Define a function to evaluate the model
def evaluator(model, dictionary=id2word, texts=data, corpus=corpus):
    base_perplexity = model.log_perplexity(corpus)
    print('\nPerplexity: ', base_perplexity) 

    # Compute Coherence Score
    coherence_model = CoherenceModel(model=model, texts=data, 
                                       dictionary=id2word, coherence='c_v')
    coherence_lda_model= coherence_model.get_coherence()
    print('\nCoherence Score: ', coherence_lda_model)

In [20]:
# Build LDA model
lda_model1_01 = gensim.models.ldamulticore.LdaMulticore(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=4, 
                                           workers=12,
                                           passes=5,
                                           random_state=1892,
                                           alpha='symmetric',
                                           iterations=50,
                                           per_word_topics=True)


In [21]:
pprint(lda_model1_01.print_topics())

[(0,
  '0.090*"username" + 0.027*"man" + 0.023*"united" + 0.019*"concede" + '
  '0.019*"goal" + 0.017*"lose" + 0.013*"manchester" + 0.013*"game" + '
  '0.013*"team" + 0.012*"utd"'),
 (1,
  '0.116*"username" + 0.066*"live" + 0.041*"vs" + 0.034*"stream" + '
  '0.028*"watch" + 0.021*"match" + 0.018*"league" + 0.016*"mane" + 0.014*"hd" '
  '+ 0.014*"free"'),
 (2,
  '0.040*"username" + 0.019*"pron" + 0.015*"league" + 0.014*"go" + 0.012*"get" '
  '+ 0.011*"play" + 0.011*"good" + 0.010*"goal" + 0.009*"premier" + '
  '0.009*"win"'),
 (3,
  '0.156*"username" + 0.048*"fan" + 0.027*"united" + 0.026*"man" + '
  '0.025*"watkins" + 0.024*"goal" + 0.018*"pron" + 0.018*"ollie" + 0.018*"utd" '
  '+ 0.015*"score"')]


In [22]:
evaluator(lda_model1_01)


Perplexity:  -6.581280774763232

Coherence Score:  0.4939216056115421


In [37]:
modelseries1results = [['model1_01',4,5,50,'symmetric',-6.5812,0.4939]]


In [35]:
pyLDAvis.enable_notebook()
pyLDAvis.gensim.prepare(lda_model1_01, corpus, id2word)

# Hyperparameter tuning
#### Testing for optimal model number

In [23]:
lda_model1_02 = gensim.models.ldamulticore.LdaMulticore(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=5, 
                                           workers=12,
                                           passes=5,
                                           random_state=1892,
                                           alpha='symmetric',
                                           iterations=50,
                                           per_word_topics=True)
pprint(lda_model1_02.print_topics())
evaluator(lda_model1_02)

[(0,
  '0.091*"username" + 0.062*"man" + 0.044*"fan" + 0.036*"united" + 0.036*"utd" '
  '+ 0.022*"manchester" + 0.019*"lose" + 0.013*"football" + 0.012*"team" + '
  '0.011*"city"'),
 (1,
  '0.095*"username" + 0.073*"live" + 0.049*"vs" + 0.037*"stream" + '
  '0.031*"watch" + 0.027*"league" + 0.022*"premier" + 0.021*"mane" + '
  '0.021*"match" + 0.019*"season"'),
 (2,
  '0.054*"username" + 0.034*"pron" + 0.022*"never" + 0.020*"walk" + '
  '0.019*"alone" + 0.017*"say" + 0.014*"league" + 0.012*"united" + '
  '0.012*"really" + 0.011*"goal"'),
 (3,
  '0.112*"username" + 0.034*"watkins" + 0.027*"ollie" + 0.024*"score" + '
  '0.023*"go" + 0.020*"goal" + 0.017*"first" + 0.016*"see" + 0.016*"united" + '
  '0.015*"fan"'),
 (4,
  '0.161*"username" + 0.038*"goal" + 0.026*"concede" + 0.016*"one" + '
  '0.015*"get" + 0.010*"win" + 0.009*"score" + 0.009*"game" + 0.008*"time" + '
  '0.008*"beat"')]

Perplexity:  -6.609614575216423

Coherence Score:  0.49306840581414113


In [39]:
modelseries1results.append(['model1_02',5,5,50,'symmetric',-6.6096,0.4930])

In [24]:
lda_model1_03 = gensim.models.ldamulticore.LdaMulticore(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=6, 
                                           workers=12,
                                           passes=5,
                                           random_state=1892,
                                           alpha='symmetric',
                                           iterations=50,
                                           per_word_topics=True)
pprint(lda_model1_03.print_topics())
evaluator(lda_model1_03)

[(0,
  '0.109*"username" + 0.083*"man" + 0.052*"utd" + 0.042*"fan" + 0.040*"united" '
  '+ 0.025*"manchester" + 0.024*"lose" + 0.020*"football" + 0.018*"city" + '
  '0.016*"concede"'),
 (1,
  '0.103*"live" + 0.091*"username" + 0.058*"vs" + 0.053*"stream" + '
  '0.044*"watch" + 0.030*"match" + 0.023*"hd" + 0.023*"free" + 0.017*"link" + '
  '0.014*"come"'),
 (2,
  '0.080*"username" + 0.055*"league" + 0.039*"premier" + 0.030*"goal" + '
  '0.028*"score" + 0.025*"walk" + 0.025*"never" + 0.023*"alone" + '
  '0.020*"since" + 0.018*"first"'),
 (3,
  '0.132*"username" + 0.055*"united" + 0.044*"fan" + 0.023*"go" + 0.022*"hold" '
  '+ 0.020*"say" + 0.019*"pron" + 0.018*"see" + 0.017*"goal" + 0.015*"let"'),
 (4,
  '0.189*"username" + 0.045*"goal" + 0.041*"watkins" + 0.030*"score" + '
  '0.029*"ollie" + 0.019*"concede" + 0.016*"one" + 0.014*"hattrick" + '
  '0.014*"first" + 0.013*"back"'),
 (5,
  '0.057*"username" + 0.022*"pron" + 0.018*"get" + 0.013*"go" + 0.013*"well" + '
  '0.012*"team" + 0.011*

In [41]:
modelseries1results.append(['model1_03',6,5,50,'symmetric',-6.5712,0.5382])

In [25]:
lda_model1_04 = gensim.models.ldamulticore.LdaMulticore(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=7, 
                                           workers=12,
                                           passes=5,
                                           random_state=1892,
                                           alpha='symmetric',
                                           iterations=50,
                                           per_word_topics=True)
pprint(lda_model1_04.print_topics())
evaluator(lda_model1_04)

[(0,
  '0.077*"username" + 0.039*"goal" + 0.038*"game" + 0.025*"first" + '
  '0.022*"united" + 0.020*"concede" + 0.018*"last" + 0.016*"league" + '
  '0.014*"manchester" + 0.013*"time"'),
 (1,
  '0.113*"live" + 0.097*"username" + 0.065*"vs" + 0.061*"stream" + '
  '0.043*"watch" + 0.034*"match" + 0.027*"free" + 0.026*"hd" + 0.019*"link" + '
  '0.014*"league"'),
 (2,
  '0.065*"league" + 0.055*"username" + 0.046*"premier" + 0.021*"season" + '
  '0.021*"everton" + 0.020*"goal" + 0.014*"go" + 0.014*"win" + 0.013*"watch" + '
  '0.011*"wtf"'),
 (3,
  '0.129*"username" + 0.062*"united" + 0.031*"never" + 0.030*"pron" + '
  '0.030*"walk" + 0.029*"alone" + 0.026*"score" + 0.024*"say" + 0.023*"fan" + '
  '0.023*"manchester"'),
 (4,
  '0.198*"username" + 0.040*"goal" + 0.037*"concede" + 0.022*"watkins" + '
  '0.020*"one" + 0.014*"grealish" + 0.012*"beat" + 0.012*"park" + '
  '0.010*"seven" + 0.010*"fpl"'),
 (5,
  '0.061*"username" + 0.024*"pron" + 0.019*"get" + 0.013*"well" + 0.013*"go" + '
  '0.012

In [42]:
modelseries1results.append(['model1_04',7,5,50,'symmetric',-6.6232,0.5236])

In [26]:
lda_model1_05 = gensim.models.ldamulticore.LdaMulticore(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=8, 
                                           workers=12,
                                           passes=5,
                                           random_state=1892,
                                           alpha='symmetric',
                                           iterations=50,
                                           per_word_topics=True)
pprint(lda_model1_05.print_topics())
evaluator(lda_model1_05)

[(0,
  '0.112*"username" + 0.058*"united" + 0.038*"manchester" + 0.031*"goal" + '
  '0.027*"hold" + 0.023*"game" + 0.022*"first" + 0.019*"champion" + '
  '0.019*"league" + 0.015*"man"'),
 (1,
  '0.122*"live" + 0.095*"username" + 0.070*"vs" + 0.063*"stream" + '
  '0.049*"watch" + 0.033*"match" + 0.027*"hd" + 0.027*"free" + 0.020*"link" + '
  '0.019*"league"'),
 (2,
  '0.086*"username" + 0.061*"never" + 0.054*"walk" + 0.051*"alone" + '
  '0.047*"pron" + 0.044*"league" + 0.039*"say" + 0.034*"united" + '
  '0.030*"premier" + 0.023*"really"'),
 (3,
  '0.110*"username" + 0.047*"watkins" + 0.034*"ollie" + 0.031*"go" + '
  '0.030*"goal" + 0.028*"hattrick" + 0.027*"first" + 0.027*"let" + '
  '0.017*"score" + 0.015*"player"'),
 (4,
  '0.188*"username" + 0.041*"concede" + 0.040*"goal" + 0.027*"united" + '
  '0.020*"lose" + 0.019*"one" + 0.017*"city" + 0.015*"grealish" + '
  '0.013*"manchester" + 0.012*"leicester"'),
 (5,
  '0.053*"username" + 0.024*"pron" + 0.019*"get" + 0.015*"well" + 0.013*"goo

In [43]:
modelseries1results.append(['model1_05',8,5,50,'symmetric',-6.6457,0.5241])

In [27]:
lda_model1_06 = gensim.models.ldamulticore.LdaMulticore(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=9, 
                                           workers=12,
                                           passes=5,
                                           random_state=1892,
                                           alpha='symmetric',
                                           iterations=50,
                                           per_word_topics=True)
pprint(lda_model1_06.print_topics())
evaluator(lda_model1_06)

[(0,
  '0.097*"username" + 0.063*"united" + 0.047*"manchester" + 0.039*"man" + '
  '0.039*"lose" + 0.025*"city" + 0.019*"goal" + 0.018*"tottenham" + '
  '0.015*"utd" + 0.015*"game"'),
 (1,
  '0.134*"live" + 0.095*"username" + 0.072*"vs" + 0.070*"stream" + '
  '0.056*"watch" + 0.035*"match" + 0.030*"hd" + 0.029*"free" + 0.021*"link" + '
  '0.018*"league"'),
 (2,
  '0.084*"username" + 0.060*"never" + 0.056*"league" + 0.052*"walk" + '
  '0.050*"alone" + 0.043*"pron" + 0.042*"premier" + 0.036*"say" + '
  '0.026*"united" + 0.024*"goal"'),
 (3,
  '0.130*"username" + 0.044*"united" + 0.038*"fan" + 0.027*"go" + 0.026*"hold" '
  '+ 0.025*"see" + 0.021*"laugh" + 0.019*"let" + 0.015*"man" + 0.014*"pron"'),
 (4,
  '0.210*"username" + 0.060*"goal" + 0.048*"concede" + 0.023*"one" + '
  '0.020*"grealish" + 0.015*"watkins" + 0.014*"time" + 0.013*"fpl" + '
  '0.011*"seven" + 0.011*"min"'),
 (5,
  '0.058*"username" + 0.027*"pron" + 0.022*"get" + 0.015*"well" + 0.013*"good" '
  '+ 0.012*"team" + 0.012*"p

In [44]:
modelseries1results.append(['model1_06',9,5,50,'symmetric',-6.6868,0.5455])

In [28]:
lda_model1_07 = gensim.models.ldamulticore.LdaMulticore(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=10, 
                                           workers=12,
                                           passes=5,
                                           random_state=1892,
                                           alpha='symmetric',
                                           iterations=50,
                                           per_word_topics=True)
pprint(lda_model1_07.print_topics())
evaluator(lda_model1_07)

[(0,
  '0.101*"username" + 0.074*"united" + 0.057*"man" + 0.045*"manchester" + '
  '0.032*"concede" + 0.030*"lose" + 0.026*"goal" + 0.025*"city" + 0.021*"utd" '
  '+ 0.017*"tottenham"'),
 (1,
  '0.134*"live" + 0.096*"username" + 0.071*"vs" + 0.071*"stream" + '
  '0.055*"watch" + 0.035*"match" + 0.031*"free" + 0.031*"hd" + 0.020*"league" '
  '+ 0.019*"link"'),
 (2,
  '0.086*"username" + 0.073*"never" + 0.064*"walk" + 0.061*"alone" + '
  '0.056*"pron" + 0.046*"say" + 0.042*"united" + 0.029*"really" + '
  '0.018*"embarrass" + 0.016*"league"'),
 (3,
  '0.117*"username" + 0.034*"fan" + 0.032*"go" + 0.030*"united" + 0.026*"let" '
  '+ 0.026*"laugh" + 0.021*"see" + 0.020*"goal" + 0.016*"game" + 0.015*"pron"'),
 (4,
  '0.235*"username" + 0.042*"goal" + 0.026*"one" + 0.021*"grealish" + '
  '0.020*"concede" + 0.017*"watkins" + 0.015*"fpl" + 0.013*"seven" + '
  '0.013*"two" + 0.012*"min"'),
 (5,
  '0.053*"username" + 0.028*"pron" + 0.020*"get" + 0.016*"well" + 0.015*"good" '
  '+ 0.012*"play" + 0

In [45]:
modelseries1results.append(['model1_07',10,5,50,'symmetric',-6.7240,0.5513])

In [48]:
lda_model1_08 = gensim.models.ldamulticore.LdaMulticore(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=11, 
                                           workers=12,
                                           passes=5,
                                           random_state=1892,
                                           alpha='symmetric',
                                           iterations=50,
                                           per_word_topics=True)
pprint(lda_model1_08.print_topics())
evaluator(lda_model1_08)

[(0,
  '0.095*"username" + 0.076*"united" + 0.053*"manchester" + 0.049*"man" + '
  '0.040*"lose" + 0.039*"concede" + 0.028*"city" + 0.026*"goal" + '
  '0.019*"tottenham" + 0.017*"season"'),
 (1,
  '0.140*"live" + 0.101*"username" + 0.084*"vs" + 0.073*"stream" + '
  '0.053*"watch" + 0.036*"match" + 0.031*"hd" + 0.031*"free" + 0.024*"league" '
  '+ 0.021*"link"'),
 (2,
  '0.084*"username" + 0.079*"never" + 0.074*"walk" + 0.070*"alone" + '
  '0.055*"pron" + 0.048*"say" + 0.036*"united" + 0.035*"really" + '
  '0.021*"embarrass" + 0.016*"league"'),
 (3,
  '0.143*"username" + 0.048*"united" + 0.044*"hold" + 0.036*"see" + '
  '0.035*"goal" + 0.028*"let" + 0.025*"laugh" + 0.024*"beer" + 0.023*"go" + '
  '0.015*"game"'),
 (4,
  '0.238*"username" + 0.045*"goal" + 0.027*"watkins" + 0.022*"grealish" + '
  '0.018*"one" + 0.017*"fpl" + 0.016*"concede" + 0.015*"seven" + 0.014*"min" + '
  '0.013*"assist"'),
 (5,
  '0.067*"username" + 0.022*"pron" + 0.017*"good" + 0.017*"one" + 0.016*"get" '
  '+ 0.013

In [50]:
modelseries1results.append(['model1_08',11,5,50,'symmetric',-6.7608,0.5392])

In [49]:
lda_model1_09 = gensim.models.ldamulticore.LdaMulticore(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=12, 
                                           workers=12,
                                           passes=5,
                                           random_state=1892,
                                           alpha='symmetric',
                                           iterations=50,
                                           per_word_topics=True)
pprint(lda_model1_09.print_topics())
evaluator(lda_model1_09)

[(0,
  '0.089*"username" + 0.038*"united" + 0.036*"team" + 0.026*"manchester" + '
  '0.026*"last" + 0.021*"season" + 0.020*"man" + 0.019*"city" + 0.019*"lose" + '
  '0.018*"today"'),
 (1,
  '0.150*"live" + 0.099*"username" + 0.079*"stream" + 0.079*"vs" + '
  '0.059*"watch" + 0.037*"match" + 0.034*"hd" + 0.033*"free" + 0.024*"league" '
  '+ 0.022*"link"'),
 (2,
  '0.086*"never" + 0.084*"username" + 0.073*"walk" + 0.069*"alone" + '
  '0.062*"pron" + 0.055*"say" + 0.042*"united" + 0.036*"really" + 0.019*"tell" '
  '+ 0.019*"embarrass"'),
 (3,
  '0.135*"username" + 0.062*"united" + 0.032*"hold" + 0.028*"see" + '
  '0.026*"let" + 0.026*"laugh" + 0.020*"go" + 0.017*"lose" + 0.016*"watkins" + '
  '0.016*"beer"'),
 (4,
  '0.230*"username" + 0.071*"goal" + 0.059*"concede" + 0.022*"one" + '
  '0.021*"grealish" + 0.019*"watkins" + 0.016*"fpl" + 0.013*"min" + '
  '0.013*"many" + 0.012*"seven"'),
 (5,
  '0.076*"username" + 0.019*"pron" + 0.019*"mane" + 0.015*"back" + 0.014*"get" '
  '+ 0.014*"playe

In [51]:
modelseries1results.append(['model1_09',12,5,50,'symmetric',-6.8001, 0.5278])

In [65]:
modeltracker = pd.DataFrame(modelseries1results, columns =['model_name', 'num_topics', 'passes','iterations','alpha','perplexity','coherence'], dtype = float) 

In [66]:
modeltracker

Unnamed: 0,model_name,num_topics,passes,iterations,alpha,perplexity,coherence
0,model1_01,4.0,5.0,50.0,symmetric,-6.5812,0.4939
1,model1_02,5.0,5.0,50.0,symmetric,-6.6096,0.493
2,model1_03,6.0,5.0,50.0,symmetric,-6.5712,0.5382
3,model1_04,7.0,5.0,50.0,symmetric,-6.6232,0.5236
4,model1_05,8.0,5.0,50.0,symmetric,-6.6457,0.5241
5,model1_06,9.0,5.0,50.0,symmetric,-6.6868,0.5455
6,model1_07,10.0,5.0,50.0,symmetric,-6.724,0.5513
7,model1_08,11.0,5.0,50.0,symmetric,-6.7608,0.5392
8,model1_09,12.0,5.0,50.0,symmetric,-6.8001,0.5278


In [54]:
lda_model2_01 = gensim.models.ldamulticore.LdaMulticore(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=10, 
                                           workers=12,
                                           passes=5,
                                           random_state=1892,
                                           alpha='symmetric',
                                           iterations=60,
                                           per_word_topics=True)
pprint(lda_model2_01.print_topics())
evaluator(lda_model2_01)

[(0,
  '0.097*"username" + 0.076*"united" + 0.053*"man" + 0.044*"manchester" + '
  '0.033*"lose" + 0.029*"concede" + 0.025*"city" + 0.022*"goal" + 0.018*"utd" '
  '+ 0.018*"tottenham"'),
 (1,
  '0.135*"live" + 0.097*"username" + 0.073*"vs" + 0.072*"stream" + '
  '0.051*"watch" + 0.033*"match" + 0.031*"hd" + 0.031*"free" + 0.020*"link" + '
  '0.019*"league"'),
 (2,
  '0.084*"username" + 0.065*"never" + 0.058*"walk" + 0.054*"alone" + '
  '0.053*"pron" + 0.042*"say" + 0.036*"united" + 0.026*"really" + '
  '0.018*"league" + 0.016*"see"'),
 (3,
  '0.116*"username" + 0.036*"go" + 0.027*"goal" + 0.027*"let" + 0.023*"united" '
  '+ 0.021*"watkins" + 0.020*"fan" + 0.019*"laugh" + 0.017*"game" + '
  '0.016*"give"'),
 (4,
  '0.236*"username" + 0.045*"goal" + 0.021*"grealish" + 0.020*"watkins" + '
  '0.018*"concede" + 0.018*"one" + 0.016*"fpl" + 0.016*"minute" + 0.014*"min" '
  '+ 0.013*"seven"'),
 (5,
  '0.058*"username" + 0.027*"pron" + 0.020*"get" + 0.016*"well" + 0.014*"good" '
  '+ 0.012*"tea

In [59]:
modelseries2results = [['model2_01',10,5,60,'symmetric',-6.7403,0.5484]]

In [55]:
lda_model2_02 = gensim.models.ldamulticore.LdaMulticore(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=10, 
                                           workers=12,
                                           passes=5,
                                           random_state=1892,
                                           alpha='symmetric',
                                           iterations=70,
                                           per_word_topics=True)
pprint(lda_model2_02.print_topics())
evaluator(lda_model2_02)

[(0,
  '0.087*"username" + 0.061*"united" + 0.045*"man" + 0.039*"manchester" + '
  '0.032*"lose" + 0.026*"city" + 0.023*"concede" + 0.023*"goal" + '
  '0.017*"tottenham" + 0.017*"utd"'),
 (1,
  '0.132*"live" + 0.099*"username" + 0.076*"vs" + 0.070*"stream" + '
  '0.054*"watch" + 0.035*"match" + 0.030*"hd" + 0.029*"free" + 0.020*"link" + '
  '0.019*"league"'),
 (2,
  '0.082*"username" + 0.073*"never" + 0.064*"walk" + 0.060*"alone" + '
  '0.056*"pron" + 0.047*"say" + 0.034*"united" + 0.031*"really" + '
  '0.017*"embarrass" + 0.016*"league"'),
 (3,
  '0.127*"username" + 0.056*"united" + 0.033*"hold" + 0.031*"go" + 0.030*"see" '
  '+ 0.030*"fan" + 0.024*"let" + 0.023*"laugh" + 0.018*"beer" + 0.014*"pron"'),
 (4,
  '0.224*"username" + 0.061*"goal" + 0.033*"concede" + 0.024*"one" + '
  '0.020*"grealish" + 0.017*"watkins" + 0.014*"fpl" + 0.013*"min" + '
  '0.012*"game" + 0.012*"minute"'),
 (5,
  '0.054*"username" + 0.027*"pron" + 0.019*"get" + 0.017*"well" + 0.015*"good" '
  '+ 0.012*"play" +

In [60]:
modelseries2results.append(['model2_02',10,5,70,'symmetric',-6.7151,0.5426])

In [56]:
lda_model2_03 = gensim.models.ldamulticore.LdaMulticore(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=10, 
                                           workers=12,
                                           passes=5,
                                           random_state=1892,
                                           alpha='symmetric',
                                           iterations=80,
                                           per_word_topics=True)
pprint(lda_model2_03.print_topics())
evaluator(lda_model2_03)

[(0,
  '0.105*"username" + 0.078*"united" + 0.056*"man" + 0.048*"manchester" + '
  '0.036*"concede" + 0.032*"lose" + 0.025*"city" + 0.024*"goal" + 0.019*"hold" '
  '+ 0.018*"utd"'),
 (1,
  '0.135*"live" + 0.097*"username" + 0.075*"vs" + 0.069*"stream" + '
  '0.056*"watch" + 0.035*"match" + 0.030*"hd" + 0.029*"free" + 0.021*"league" '
  '+ 0.020*"link"'),
 (2,
  '0.087*"username" + 0.076*"never" + 0.067*"walk" + 0.064*"alone" + '
  '0.060*"pron" + 0.051*"say" + 0.046*"united" + 0.031*"really" + 0.020*"see" '
  '+ 0.020*"embarrass"'),
 (3,
  '0.121*"username" + 0.035*"go" + 0.031*"let" + 0.025*"united" + '
  '0.024*"laugh" + 0.023*"fan" + 0.020*"watkins" + 0.015*"see" + 0.014*"give" '
  '+ 0.014*"ollie"'),
 (4,
  '0.237*"username" + 0.042*"goal" + 0.031*"one" + 0.022*"grealish" + '
  '0.017*"watkins" + 0.015*"fpl" + 0.015*"two" + 0.015*"seven" + 0.013*"min" + '
  '0.013*"three"'),
 (5,
  '0.055*"username" + 0.027*"pron" + 0.018*"get" + 0.017*"well" + 0.015*"good" '
  '+ 0.013*"play" + 0.

In [61]:
modelseries2results.append(['model2_03',10,5,80,'symmetric',-6.7209,0.5511])

In [57]:
lda_model2_04 = gensim.models.ldamulticore.LdaMulticore(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=10, 
                                           workers=12,
                                           passes=5,
                                           random_state=1892,
                                           alpha='symmetric',
                                           iterations=90,
                                           per_word_topics=True)
pprint(lda_model2_04.print_topics())
evaluator(lda_model2_04)

[(0,
  '0.095*"username" + 0.073*"united" + 0.054*"man" + 0.042*"manchester" + '
  '0.032*"lose" + 0.026*"concede" + 0.025*"goal" + 0.024*"city" + 0.020*"utd" '
  '+ 0.017*"tottenham"'),
 (1,
  '0.129*"live" + 0.100*"username" + 0.070*"vs" + 0.070*"stream" + '
  '0.051*"watch" + 0.033*"match" + 0.030*"hd" + 0.029*"free" + 0.022*"league" '
  '+ 0.019*"link"'),
 (2,
  '0.083*"username" + 0.071*"never" + 0.062*"walk" + 0.059*"alone" + '
  '0.055*"pron" + 0.046*"say" + 0.036*"united" + 0.030*"really" + '
  '0.018*"league" + 0.017*"embarrass"'),
 (3,
  '0.117*"username" + 0.034*"go" + 0.028*"united" + 0.025*"fan" + 0.024*"see" '
  '+ 0.023*"let" + 0.022*"laugh" + 0.021*"goal" + 0.019*"watkins" + '
  '0.014*"ollie"'),
 (4,
  '0.227*"username" + 0.044*"goal" + 0.027*"concede" + 0.026*"one" + '
  '0.019*"grealish" + 0.018*"watkins" + 0.016*"fpl" + 0.013*"min" + '
  '0.013*"seven" + 0.012*"minute"'),
 (5,
  '0.053*"username" + 0.028*"pron" + 0.018*"get" + 0.017*"well" + 0.015*"good" '
  '+ 0.01

In [62]:
modelseries2results.append(['model2_04',10,5,90,'symmetric',-6.7094,0.5507])

In [58]:
lda_model2_05 = gensim.models.ldamulticore.LdaMulticore(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=10, 
                                           workers=12,
                                           passes=5,
                                           random_state=1892,
                                           alpha='symmetric',
                                           iterations=100,
                                           per_word_topics=True)
pprint(lda_model2_05.print_topics())
evaluator(lda_model2_05)

[(0,
  '0.092*"username" + 0.067*"united" + 0.048*"man" + 0.046*"manchester" + '
  '0.031*"concede" + 0.025*"city" + 0.024*"goal" + 0.022*"lose" + '
  '0.019*"season" + 0.016*"last"'),
 (1,
  '0.136*"live" + 0.099*"username" + 0.072*"stream" + 0.072*"vs" + '
  '0.054*"watch" + 0.033*"match" + 0.031*"hd" + 0.030*"free" + 0.020*"league" '
  '+ 0.019*"link"'),
 (2,
  '0.081*"username" + 0.075*"never" + 0.065*"walk" + 0.061*"alone" + '
  '0.056*"pron" + 0.047*"say" + 0.038*"united" + 0.031*"really" + '
  '0.018*"embarrass" + 0.017*"league"'),
 (3,
  '0.123*"username" + 0.043*"united" + 0.031*"hold" + 0.030*"go" + 0.027*"see" '
  '+ 0.026*"fan" + 0.023*"laugh" + 0.020*"let" + 0.019*"lose" + 0.015*"beer"'),
 (4,
  '0.231*"username" + 0.045*"goal" + 0.022*"grealish" + 0.020*"watkins" + '
  '0.019*"concede" + 0.017*"one" + 0.016*"fpl" + 0.014*"seven" + 0.014*"min" + '
  '0.013*"minute"'),
 (5,
  '0.055*"username" + 0.027*"pron" + 0.018*"get" + 0.016*"well" + 0.015*"good" '
  '+ 0.012*"team" + 

In [63]:
modelseries2results.append(['model2_05',10,5,100,'symmetric',-6.7348,0.5452])

In [None]:
modelseries2results

In [80]:
modeltracker2 = pd.DataFrame(modelseries2results, columns =['model_name', 'num_topics', 'passes','iterations','alpha','perplexity','coherence'], dtype = float) 

In [81]:
modeltracker = modeltracker.append(modeltracker2, ignore_index=True)

In [70]:
lda_model3_01 = gensim.models.ldamulticore.LdaMulticore(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=10, 
                                           workers=12,
                                           passes=10,
                                           random_state=1892,
                                           alpha='symmetric',
                                           iterations=80,
                                           per_word_topics=True)
pprint(lda_model3_01.print_topics())
evaluator(lda_model3_01)

[(0,
  '0.097*"username" + 0.061*"man" + 0.053*"united" + 0.046*"concede" + '
  '0.036*"lose" + 0.033*"goal" + 0.029*"city" + 0.028*"manchester" + '
  '0.024*"season" + 0.024*"utd"'),
 (1,
  '0.138*"live" + 0.096*"username" + 0.091*"vs" + 0.076*"watch" + '
  '0.069*"stream" + 0.043*"match" + 0.030*"free" + 0.030*"hd" + 0.025*"league" '
  '+ 0.022*"link"'),
 (2,
  '0.098*"username" + 0.086*"never" + 0.074*"pron" + 0.073*"walk" + '
  '0.069*"alone" + 0.069*"say" + 0.049*"united" + 0.037*"really" + '
  '0.021*"embarrass" + 0.020*"tell"'),
 (3,
  '0.143*"username" + 0.086*"united" + 0.044*"hold" + 0.038*"fan" + '
  '0.032*"manchester" + 0.030*"laugh" + 0.028*"go" + 0.028*"let" + 0.027*"see" '
  '+ 0.025*"beer"'),
 (4,
  '0.235*"username" + 0.053*"goal" + 0.037*"one" + 0.026*"grealish" + '
  '0.023*"watkins" + 0.016*"fpl" + 0.016*"two" + 0.014*"min" + 0.014*"three" + '
  '0.014*"minute"'),
 (5,
  '0.051*"username" + 0.025*"pron" + 0.019*"well" + 0.017*"get" + 0.015*"good" '
  '+ 0.015*"play

In [71]:
modelseries3results = [['model3_01',10,10,80,'symmetric',-6.6738,0.5634]]

In [72]:
lda_model3_02 = gensim.models.ldamulticore.LdaMulticore(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=10, 
                                           workers=12,
                                           passes=15,
                                           random_state=1892,
                                           alpha='symmetric',
                                           iterations=80,
                                           per_word_topics=True)
pprint(lda_model3_02.print_topics())
evaluator(lda_model3_02)

[(0,
  '0.096*"username" + 0.071*"united" + 0.056*"man" + 0.048*"lose" + '
  '0.043*"manchester" + 0.035*"concede" + 0.028*"city" + 0.026*"season" + '
  '0.026*"goal" + 0.021*"league"'),
 (1,
  '0.144*"live" + 0.098*"username" + 0.095*"vs" + 0.077*"watch" + '
  '0.073*"stream" + 0.044*"match" + 0.031*"free" + 0.030*"hd" + 0.027*"league" '
  '+ 0.024*"link"'),
 (2,
  '0.090*"username" + 0.086*"pron" + 0.081*"never" + 0.081*"say" + '
  '0.066*"walk" + 0.062*"alone" + 0.046*"united" + 0.038*"really" + '
  '0.024*"tell" + 0.019*"see"'),
 (3,
  '0.131*"username" + 0.046*"united" + 0.042*"go" + 0.039*"fan" + 0.036*"let" '
  '+ 0.030*"laugh" + 0.027*"see" + 0.023*"love" + 0.022*"goal" + 0.021*"hold"'),
 (4,
  '0.233*"username" + 0.058*"goal" + 0.035*"one" + 0.032*"grealish" + '
  '0.023*"watkins" + 0.019*"fpl" + 0.017*"min" + 0.016*"minute" + '
  '0.016*"concede" + 0.014*"salah"'),
 (5,
  '0.048*"username" + 0.024*"pron" + 0.022*"get" + 0.019*"well" + 0.017*"good" '
  '+ 0.015*"play" + 0.013*

In [75]:
modelseries3results.append(['model3_02',10,15,80,'symmetric',-6.6300,0.5482])

In [73]:
lda_model3_03 = gensim.models.ldamulticore.LdaMulticore(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=10, 
                                           workers=12,
                                           passes=20,
                                           random_state=1892,
                                           alpha='symmetric',
                                           iterations=80,
                                           per_word_topics=True)
pprint(lda_model3_03.print_topics())
evaluator(lda_model3_03)

[(0,
  '0.094*"username" + 0.055*"united" + 0.055*"man" + 0.051*"lose" + '
  '0.039*"concede" + 0.033*"manchester" + 0.028*"city" + 0.026*"goal" + '
  '0.023*"team" + 0.021*"utd"'),
 (1,
  '0.145*"live" + 0.097*"username" + 0.096*"vs" + 0.081*"watch" + '
  '0.074*"stream" + 0.045*"match" + 0.031*"free" + 0.031*"hd" + 0.026*"league" '
  '+ 0.024*"link"'),
 (2,
  '0.095*"username" + 0.082*"pron" + 0.081*"never" + 0.073*"say" + '
  '0.067*"walk" + 0.063*"alone" + 0.044*"united" + 0.036*"really" + '
  '0.024*"tell" + 0.018*"embarrass"'),
 (3,
  '0.129*"username" + 0.068*"united" + 0.045*"fan" + 0.042*"go" + 0.037*"see" '
  '+ 0.033*"let" + 0.029*"hold" + 0.027*"laugh" + 0.021*"love" + 0.016*"man"'),
 (4,
  '0.241*"username" + 0.078*"goal" + 0.033*"one" + 0.033*"grealish" + '
  '0.021*"watkins" + 0.018*"fpl" + 0.017*"mins" + 0.016*"min" + 0.015*"minute" '
  '+ 0.014*"seven"'),
 (5,
  '0.047*"username" + 0.024*"pron" + 0.020*"well" + 0.019*"get" + 0.017*"good" '
  '+ 0.016*"play" + 0.014*"te

In [76]:
modelseries3results.append(['model3_03',10,20,80,'symmetric',-6.6291,0.5424])

In [74]:
lda_model3_04 = gensim.models.ldamulticore.LdaMulticore(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=10, 
                                           workers=12,
                                           passes=25,
                                           random_state=1892,
                                           alpha='symmetric',
                                           iterations=80,
                                           per_word_topics=True)
pprint(lda_model3_04.print_topics())
evaluator(lda_model3_04)

[(0,
  '0.103*"username" + 0.082*"united" + 0.076*"man" + 0.045*"lose" + '
  '0.044*"manchester" + 0.039*"concede" + 0.028*"city" + 0.026*"utd" + '
  '0.023*"goal" + 0.018*"tottenham"'),
 (1,
  '0.141*"live" + 0.108*"username" + 0.106*"vs" + 0.074*"watch" + '
  '0.072*"stream" + 0.044*"match" + 0.030*"free" + 0.030*"hd" + 0.026*"follow" '
  '+ 0.024*"league"'),
 (2,
  '0.094*"username" + 0.089*"say" + 0.088*"pron" + 0.086*"never" + '
  '0.072*"walk" + 0.068*"alone" + 0.055*"united" + 0.037*"really" + '
  '0.032*"see" + 0.024*"tell"'),
 (3,
  '0.122*"username" + 0.046*"go" + 0.040*"let" + 0.038*"fan" + 0.031*"laugh" + '
  '0.028*"united" + 0.026*"come" + 0.024*"love" + 0.019*"game" + 0.019*"see"'),
 (4,
  '0.226*"username" + 0.074*"goal" + 0.039*"one" + 0.034*"grealish" + '
  '0.022*"watkins" + 0.017*"fpl" + 0.017*"seven" + 0.015*"min" + '
  '0.015*"minute" + 0.015*"two"'),
 (5,
  '0.052*"username" + 0.023*"pron" + 0.020*"get" + 0.020*"well" + 0.017*"good" '
  '+ 0.015*"play" + 0.014*"t

In [78]:
modelseries3results.append(['model3_04',10,20,80,'symmetric',-6.6183,0.5635])

In [82]:
modeltracker3 = pd.DataFrame(modelseries3results, columns =['model_name', 'num_topics', 'passes','iterations','alpha','perplexity','coherence'], dtype = float) 
modeltracker = modeltracker.append(modeltracker3, ignore_index=True)
modeltracker

Unnamed: 0,model_name,num_topics,passes,iterations,alpha,perplexity,coherence
0,model1_01,4.0,5.0,50.0,symmetric,-6.5812,0.4939
1,model1_02,5.0,5.0,50.0,symmetric,-6.6096,0.493
2,model1_03,6.0,5.0,50.0,symmetric,-6.5712,0.5382
3,model1_04,7.0,5.0,50.0,symmetric,-6.6232,0.5236
4,model1_05,8.0,5.0,50.0,symmetric,-6.6457,0.5241
5,model1_06,9.0,5.0,50.0,symmetric,-6.6868,0.5455
6,model1_07,10.0,5.0,50.0,symmetric,-6.724,0.5513
7,model1_08,11.0,5.0,50.0,symmetric,-6.7608,0.5392
8,model1_09,12.0,5.0,50.0,symmetric,-6.8001,0.5278
9,model2_01,10.0,5.0,60.0,symmetric,-6.7403,0.5484


In [77]:
lda_model4_01 = gensim.models.ldamulticore.LdaMulticore(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=10, 
                                           workers=12,
                                           passes=10,
                                           random_state=1892,
                                           alpha='asymmetric',
                                           iterations=80,
                                           per_word_topics=True)
pprint(lda_model4_01.print_topics())
evaluator(lda_model4_01)

[(0,
  '0.099*"username" + 0.022*"united" + 0.019*"lose" + 0.014*"game" + '
  '0.012*"take" + 0.012*"goal" + 0.012*"happen" + 0.008*"say" + 0.007*"lead" + '
  '0.007*"get"'),
 (1,
  '0.105*"username" + 0.057*"go" + 0.052*"come" + 0.031*"back" + 0.026*"match" '
  '+ 0.020*"mane" + 0.018*"without" + 0.018*"fuck" + 0.014*"look" + '
  '0.012*"keep"'),
 (2,
  '0.090*"username" + 0.068*"pron" + 0.066*"never" + 0.055*"walk" + '
  '0.052*"alone" + 0.049*"say" + 0.047*"united" + 0.029*"really" + 0.019*"see" '
  '+ 0.016*"tell"'),
 (3,
  '0.126*"username" + 0.082*"score" + 0.079*"watkins" + 0.059*"ollie" + '
  '0.033*"yet" + 0.031*"let" + 0.020*"goal" + 0.019*"laugh" + 0.018*"love" + '
  '0.017*"see"'),
 (4,
  '0.196*"username" + 0.073*"goal" + 0.060*"concede" + 0.048*"united" + '
  '0.031*"manchester" + 0.023*"one" + 0.017*"man" + 0.015*"grealish" + '
  '0.014*"two" + 0.014*"fpl"'),
 (5,
  '0.050*"username" + 0.031*"pron" + 0.028*"get" + 0.019*"well" + 0.019*"good" '
  '+ 0.015*"play" + 0.014*"

In [85]:
modeltracker4 = pd.DataFrame([['model4_01',10,20,80,'asymmetric',-6.7089,0.5323]], columns =['model_name', 'num_topics', 'passes','iterations','alpha','perplexity','coherence'], dtype = float)

In [86]:
modeltracker = modeltracker.append(modeltracker4, ignore_index=True)

In [87]:
modeltracker

Unnamed: 0,model_name,num_topics,passes,iterations,alpha,perplexity,coherence
0,model1_01,4.0,5.0,50.0,symmetric,-6.5812,0.4939
1,model1_02,5.0,5.0,50.0,symmetric,-6.6096,0.493
2,model1_03,6.0,5.0,50.0,symmetric,-6.5712,0.5382
3,model1_04,7.0,5.0,50.0,symmetric,-6.6232,0.5236
4,model1_05,8.0,5.0,50.0,symmetric,-6.6457,0.5241
5,model1_06,9.0,5.0,50.0,symmetric,-6.6868,0.5455
6,model1_07,10.0,5.0,50.0,symmetric,-6.724,0.5513
7,model1_08,11.0,5.0,50.0,symmetric,-6.7608,0.5392
8,model1_09,12.0,5.0,50.0,symmetric,-6.8001,0.5278
9,model2_01,10.0,5.0,60.0,symmetric,-6.7403,0.5484


# Final step

In [91]:
lda_model3_01.print_topics()

[(0,
  '0.097*"username" + 0.061*"man" + 0.053*"united" + 0.046*"concede" + 0.036*"lose" + 0.033*"goal" + 0.029*"city" + 0.028*"manchester" + 0.024*"season" + 0.024*"utd"'),
 (1,
  '0.138*"live" + 0.096*"username" + 0.091*"vs" + 0.076*"watch" + 0.069*"stream" + 0.043*"match" + 0.030*"free" + 0.030*"hd" + 0.025*"league" + 0.022*"link"'),
 (2,
  '0.098*"username" + 0.086*"never" + 0.074*"pron" + 0.073*"walk" + 0.069*"alone" + 0.069*"say" + 0.049*"united" + 0.037*"really" + 0.021*"embarrass" + 0.020*"tell"'),
 (3,
  '0.143*"username" + 0.086*"united" + 0.044*"hold" + 0.038*"fan" + 0.032*"manchester" + 0.030*"laugh" + 0.028*"go" + 0.028*"let" + 0.027*"see" + 0.025*"beer"'),
 (4,
  '0.235*"username" + 0.053*"goal" + 0.037*"one" + 0.026*"grealish" + 0.023*"watkins" + 0.016*"fpl" + 0.016*"two" + 0.014*"min" + 0.014*"three" + 0.014*"minute"'),
 (5,
  '0.051*"username" + 0.025*"pron" + 0.019*"well" + 0.017*"get" + 0.015*"good" + 0.015*"play" + 0.013*"team" + 0.011*"think" + 0.011*"mane" + 0.010

In [171]:
pyLDAvis.enable_notebook()
pyLDAvis.gensim.prepare(lda_model3_01, corpus, id2word)

In [146]:
def topiclist(model, n):
    topiclist = []
    for i in range(0,n):
        modellist = [f'topic_{i+1}']
        words = []
        for i,j in model.show_topic(i):
            words.append(i)
        modellist.append(words)
        topiclist.append(modellist)
    return topiclist

In [147]:
modeldf = pd.DataFrame(topiclist(lda_model3_01, 10), columns = ['dominant topic', 'keywords'])

In [148]:
modeldf

Unnamed: 0,dominant topic,keywords
0,topic_1,"[username, man, united, concede, lose, goal, c..."
1,topic_2,"[live, username, vs, watch, stream, match, fre..."
2,topic_3,"[username, never, pron, walk, alone, say, unit..."
3,topic_4,"[username, united, hold, fan, manchester, laug..."
4,topic_5,"[username, goal, one, grealish, watkins, fpl, ..."
5,topic_6,"[username, pron, well, get, good, play, team, ..."
6,topic_7,"[username, fan, man, utd, football, make, big,..."
7,topic_8,"[username, score, barkley, win, watkins, yet, ..."
8,topic_9,"[username, happen, fucking, fuck, please, beat..."
9,topic_10,"[username, first, half, league, score, goal, p..."


In [88]:
topics = lda_model3_01.get_document_topics(corpus)
topics_csr = gensim.matutils.corpus2csc(topics)
topics_numpy = topics_csr.T.toarray()
topics_df = pd.DataFrame(topics_numpy)
topics_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.000000,0.281664,0.000000,0.000000,0.000000,0.341387,0.000000,0.000000,0.326916,0.000000
1,0.016671,0.016671,0.016671,0.016671,0.849961,0.016670,0.016671,0.016671,0.016673,0.016671
2,0.012503,0.012502,0.012502,0.887476,0.012503,0.012504,0.012502,0.012502,0.012503,0.012502
3,0.011117,0.774820,0.011113,0.011113,0.011113,0.011112,0.011113,0.011113,0.011112,0.136274
4,0.175313,0.011112,0.011113,0.735782,0.011113,0.011115,0.011112,0.011113,0.011113,0.011112
...,...,...,...,...,...,...,...,...,...,...
207722,0.033336,0.033337,0.033338,0.033336,0.033336,0.355397,0.033336,0.377900,0.033337,0.033346
207723,0.159785,0.000000,0.000000,0.262621,0.168200,0.277544,0.000000,0.000000,0.000000,0.102433
207724,0.000000,0.054090,0.054916,0.136425,0.119787,0.613933,0.000000,0.000000,0.000000,0.000000
207725,0.467087,0.010004,0.452890,0.010002,0.010002,0.010003,0.010003,0.010002,0.010002,0.010005


In [96]:
topics_df = topics_df.rename(columns={0:'topic_1', 1:'topic_2', 2:'topic_3', 3:'topic_4', 4:'topic_5', 5:'topic_6', 6:'topic_7', 7:'topic_8', 8:'topic_9', 9:'topic_10'})

In [103]:
topics_df.idxmax(axis=1)

0         topic_6
1         topic_5
2         topic_4
3         topic_2
4         topic_4
           ...   
207722    topic_8
207723    topic_6
207724    topic_6
207725    topic_1
207726    topic_5
Length: 207727, dtype: object

In [104]:
df['dominanttopic'] = topics_df.idxmax(axis=1)

In [172]:
df.to_csv('../datasets/avlliv_final.csv', index_label=False)

In [159]:
topiccount = pd.DataFrame(topics_df.idxmax(axis=1).value_counts())

In [160]:
modeldf = modeldf.join(topiccount, on='dominant topic').rename(columns = {0:'tweetcount'})

In [163]:
modeldf['topic_percent'] = modeldf['tweetcount'].apply(lambda x: x/207727)

In [164]:
modeldf

Unnamed: 0,dominant topic,keywords,tweetcount,topic_percent
0,topic_1,"[username, man, united, concede, lose, goal, c...",23933,0.115214
1,topic_2,"[live, username, vs, watch, stream, match, fre...",9993,0.048106
2,topic_3,"[username, never, pron, walk, alone, say, unit...",11041,0.053151
3,topic_4,"[username, united, hold, fan, manchester, laug...",16221,0.078088
4,topic_5,"[username, goal, one, grealish, watkins, fpl, ...",20527,0.098817
5,topic_6,"[username, pron, well, get, good, play, team, ...",51321,0.24706
6,topic_7,"[username, fan, man, utd, football, make, big,...",20232,0.097397
7,topic_8,"[username, score, barkley, win, watkins, yet, ...",23978,0.11543
8,topic_9,"[username, happen, fucking, fuck, please, beat...",13308,0.064065
9,topic_10,"[username, first, half, league, score, goal, p...",17173,0.082671


In [169]:
df[['dominanttopic','sentiment_rating','sentiment_score']].groupby('dominanttopic')['sentiment_score'].mean()

dominanttopic
topic_1     0.033907
topic_10    0.078353
topic_2     0.203907
topic_3     0.118642
topic_4     0.240113
topic_5     0.081051
topic_6     0.083259
topic_7    -0.004753
topic_8     0.091008
topic_9    -0.032897
Name: sentiment_score, dtype: float64

In [170]:
modeldf.to_csv('../datasets/avlliv_topics.csv', index_label=False)