## Libraries and Paths

In [49]:
import os
import re
import sys
import numpy as np
import pandas as pd
import string
import re


import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import RegexpTokenizer
from nltk.stem.wordnet import WordNetLemmatizer

from gensim import corpora, models
from gensim.models.coherencemodel import CoherenceModel
from gensim.test.utils import common_corpus, common_dictionary
import gensim


import preprocessor as p
from preprocessor.api import clean


from wordcloud import WordCloud


import matplotlib.pyplot as plt

# TODO: before saving the script try cleaning with RT @... and other punctiation stuff 
# may be # can stay it's a bit problematic


In [42]:
DATA_DIR = "../Data"
TWEETS_PATH = os.path.join(DATA_DIR, 'tweets')
TREND_PATH = os.path.join(DATA_DIR, 'trends')
SAVE_PATH = os.path.join(DATA_DIR, 'save')
os.listdir(SAVE_PATH)[:5]

['2019-07-26_trends.csv',
 '2019-07-18_trends.csv',
 '2019-07-20_trends.csv',
 '2019-07-07_trends.csv',
 '2019-07-12_trends.csv']

## Text Cleaning

In [3]:
df = pd.read_csv( os.path.join(SAVE_PATH,"2019-07-26_trends.csv"),
                header=0, usecols=[4,5,6,7], parse_dates=['trend_date'])
#Take only english ones
df = df[df.lang == "en"]
#Trend_date is not necessary now
df.drop(["lang","trend_date"], inplace=True, axis=1)

In [4]:
remove_digits = str.maketrans('', '', string.digits)
exclude = '[!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~]'
non_ascii = re.compile(r'[^\x00-\x7F]+')
#p.set_options(p.OPT.URL, p.OPT.EMOJI)

In [5]:
df['trend'] = df['trend'].map(lambda x : x.lower())
df['trend'] = df['trend'].map(lambda x : x.translate(remove_digits))
df['trend'] = df['trend'].map(lambda x : re.sub(str(exclude), '', x))    


df['text'] = df['text'].map(lambda x : x.lower())
df['text'] = df['text'].map(lambda x : clean(x))
df['text'] = df['text'].map(lambda x : x.translate(remove_digits))
df['text'] = df['text'].map(lambda x : re.sub(str(exclude), '', x))    
df['text'] = df['text'].map(lambda x : re.sub(non_ascii, '', x))

In [6]:
df = df.groupby(['trend'])['text']\
            .apply(lambda x: ','.join(x)).reset_index()
df

Unnamed: 0,trend,text
0,abel,rt abel and rihanna are doing everything but ...
1,adheera,"rt unveiling from on july th,rt hes back fro..."
2,adlive,rt adlive zero cast dates and locations annou...
3,aflbluescrows,night time is the right time to buy a for the ...
4,afleaglesnorth,rt problems for the cowboys already ezekiel e...
...,...,...
617,바이나인디어나인의우주,rt we by chart is here to help the trainees i...
618,세훈찬열부르면돼,rt sehun was the one who reached his hand out...
619,우리만믿어엑스원,"rt fighting for your debut,rt cant wait for ..."
620,유벤투스,rt camera founder found hidden camera now yo...


In [71]:
df.loc[2,:]['text']

'rt  adlive zero cast dates and locations announced for this years edition'

In [8]:
#nltk.download('punkt')
tokenized_df =  df.apply(lambda row: nltk.word_tokenize(row['text']), axis=1)
tokenized_df

0      [rt, abel, and, rihanna, are, doing, everythin...
1      [rt, unveiling, from, on, july, th, ,, rt, hes...
2      [rt, adlive, zero, cast, dates, and, locations...
3      [night, time, is, the, right, time, to, buy, a...
4      [rt, problems, for, the, cowboys, already, eze...
                             ...                        
617    [rt, we, by, chart, is, here, to, help, the, t...
618    [rt, sehun, was, the, one, who, reached, his, ...
619    [rt, fighting, for, your, debut, ,, rt, cant, ...
620    [rt, camera, founder, found, hidden, camera, n...
621    [boss, ,, rt, camera, founder, found, hidden, ...
Length: 622, dtype: object

In [9]:
def semmatize_text(text):
    return [ps.stem(w)  for w in text if len(w)>3]
ps = PorterStemmer() 
stemmed_dataset = tokenized_df.apply(semmatize_text)
stemmed_dataset

0      [abel, rihanna, do, everyth, drop, album, abel...
1      [unveil, from, juli, back, from, dead, unveil,...
2      [adliv, zero, cast, date, locat, announc, thi,...
3      [night, time, right, time, celebr, here, somet...
4      [problem, cowboy, alreadi, ezekiel, elliott, r...
                             ...                        
617    [chart, here, help, traine, achiev, their, dre...
618    [sehun, reach, hand, love, love, sehun, reach,...
619    [fight, your, debut, cant, wait, your, redebut...
620    [camera, founder, found, hidden, camera, easil...
621    [boss, camera, founder, found, hidden, camera,...
Length: 622, dtype: object

## Exploratory Analysis

In [13]:
dataset_words=''.join(list(str(stemmed_dataset.values)))
wordcloud = WordCloud(width = 800, height = 500, 
                background_color ='white',  
                min_font_size = 10).generate(dataset_words) 

# plt.figure(figsize = (5, 5), facecolor = None) 
# plt.imshow(wordcloud) 
# plt.axis("off") 
# plt.tight_layout(pad = 0) 
  
# plt.show()

In [15]:
dictionary_of_words = gensim.corpora.Dictionary(stemmed_dataset)
print(len(dictionary_of_words))

25155


In [35]:
word_corpus = [dictionary_of_words.doc2bow(word) for word in stemmed_dataset]

for corp in word_corpus[:1]:
    for id, freq in corp[:5]:
        print(dictionary_of_words[id],freq)

dictionary_of_words[15]

abel 34
advic 1
aint 1
album 25
analysi 1


'call'

## LDA Model Initial Trial

In [36]:
lda_model = models.ldamodel.LdaModel(corpus=word_corpus,
                                           id2word=dictionary_of_words,
                                           num_topics=30, 
                                           random_state=101,
                                           update_every=1,
                                           chunksize=100,
                                           passes=50,
                                           alpha='auto',
                                           per_word_topics=True)

In [38]:
# for idx, topic in lda_model.print_topics(-1):
#     print('Topic: {} \nWords: {}'.format(idx, topic))

In [43]:
coherence_val = CoherenceModel(model=lda_model, texts=stemmed_dataset, 
                               dictionary=dictionary_of_words, coherence='c_v').get_coherence()
print('Coherence Score: ', coherence_val)



Coherence Score:  0.5331569891775539


#### Best Score is at num_topics=20

In [44]:
lda_models=[]
coherence_values = []
for topic_number in range(5,50,5):
    lda_model = models.ldamodel.LdaModel(corpus=word_corpus,
                                           id2word=dictionary_of_words,
                                           num_topics=topic_number, 
                                           random_state=101,
                                           update_every=1,
                                           chunksize=100,
                                           passes=50,
                                           alpha='auto',
                                           per_word_topics=True)
    lda_models.append(lda_model)
    coherence_model_lda = CoherenceModel(model=lda_model, texts=stemmed_dataset, 
                                         dictionary=dictionary_of_words, coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()
    coherence_values.append(coherence_lda)
    print("number of topics ",topic_number,"coherence_value :" , coherence_lda)

number of topics  5 coherence_value : 0.4322777314653199
number of topics  10 coherence_value : 0.5775055519448571
number of topics  15 coherence_value : 0.5608834659197979
number of topics  20 coherence_value : 0.5734361394539241
number of topics  25 coherence_value : 0.5684044645318396
number of topics  30 coherence_value : 0.5331569891775539
number of topics  35 coherence_value : 0.5607330212208015
number of topics  40 coherence_value : 0.5323336344312884
number of topics  45 coherence_value : 0.5174372823206751


In [46]:
# Model with the best coherence_value
lda_model_20 = models.ldamodel.LdaModel(corpus=word_corpus,
                                           id2word=dictionary_of_words,
                                           num_topics=20, 
                                           random_state=1,
                                           update_every=1,
                                           chunksize=100,
                                           passes=50,
                                           alpha='auto',
                                           per_word_topics=True)

In [48]:
# Compute Coherence Score
cohr_val = CoherenceModel(model=lda_model_20, texts=stemmed_dataset, dictionary=dictionary_of_words, coherence='c_v').get_coherence()

print('\nCoherence Score: ', cohr_val)


Coherence Score:  0.5945144461036688


## LDAMulticore Model

In [52]:
lda_multicore_model = models.ldamulticore.LdaMulticore(corpus=word_corpus, 
                                                              num_topics=20, 
                                                              id2word=dictionary_of_words,                                                             
                                                              chunksize=100, 
                                                              passes=50,                                
                                                              alpha='symmetric',
                                                              eta=0.1,
                                                              decay=0.5, 
                                                              offset=1.0, 
                                                              gamma_threshold=0.001,
                                                              random_state=101,
                                                              minimum_probability=0.01,
                                                              minimum_phi_value=0.01,
                                                              per_word_topics=False)



In [53]:
# Compute Coherence Score
cohr_lda_multicore_model1 = CoherenceModel(model=lda_multicore_model, texts=stemmed_dataset, 
                                           dictionary=dictionary_of_words, coherence='c_v').get_coherence()
print('\nCoherence Score: ', cohr_lda_multicore_model1)


Coherence Score:  0.48426339501658616


## Evaluation

In [77]:
v = lda_model[word_corpus[2]]
print(type(lda_model[word_corpus[2]]))
z=sorted(v[0], key=lambda tup: -1*tup[1])
print(z)

<class 'tuple'>
[(21, 0.2869622), (4, 0.28240708), (16, 0.105208606), (25, 0.089200296), (28, 0.03805037), (17, 0.033370774), (31, 0.0281642), (44, 0.01557884), (26, 0.012868903), (18, 0.011408153)]


In [75]:
lda_model[word_corpus[2]][0]

[(4, 0.2822459),
 (16, 0.10521838),
 (17, 0.03337036),
 (18, 0.011408333),
 (21, 0.2871611),
 (25, 0.089150466),
 (26, 0.01286891),
 (28, 0.03805329),
 (31, 0.028163819),
 (44, 0.015578847)]

In [78]:
for  index,score in sorted(lda_model[word_corpus[2]][0], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 10)))


Score: 0.2864798605442047	 
Topic: 0.040*"follow" + 0.033*"enter" + 0.032*"thi" + 0.027*"music" + 0.023*"chanc" + 0.020*"your" + 0.020*"congratul" + 0.018*"villag" + 0.016*"with" + 0.015*"winner"

Score: 0.282893568277359	 
Topic: 0.023*"thi" + 0.020*"with" + 0.012*"have" + 0.011*"your" + 0.010*"more" + 0.009*"that" + 0.009*"will" + 0.008*"from" + 0.008*"time" + 0.007*"here"

Score: 0.10516836494207382	 
Topic: 0.057*"preview" + 0.045*"solo" + 0.040*"seoul" + 0.039*"mark" + 0.038*"fansign" + 0.035*"teaser" + 0.031*"event" + 0.029*"fancam" + 0.025*"tour" + 0.024*"with"

Score: 0.08927897363901138	 
Topic: 0.109*"cover" + 0.067*"edit" + 0.047*"champion" + 0.047*"from" + 0.043*"back" + 0.043*"girl" + 0.042*"small" + 0.039*"villag" + 0.034*"biggi" + 0.032*"come"

Score: 0.038033097982406616	 
Topic: 0.015*"that" + 0.015*"with" + 0.012*"trump" + 0.012*"from" + 0.010*"when" + 0.009*"about" + 0.008*"peopl" + 0.008*"thi" + 0.008*"will" + 0.007*"news"

Score: 0.03334563225507736	 
Topic: 0.025