In [1]:
import os
import re
import sys
import numpy as np
import pandas as pd
import string
import re

import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import RegexpTokenizer
from nltk.stem.wordnet import WordNetLemmatizer


from gensim import corpora, models
from gensim.models.coherencemodel import CoherenceModel
from gensim.test.utils import common_corpus, common_dictionary, datapath
import gensim


import preprocessor as p
from preprocessor.api import clean
from wordcloud import WordCloud
import matplotlib.pyplot as plt


# NLTK Stop words
from nltk.corpus import stopwords

In [2]:
DATA_DIR = "../Data"
TWEETS_PATH = os.path.join(DATA_DIR, 'tweets')
TREND_PATH = os.path.join(DATA_DIR, 'trends')
SAVE_PATH = os.path.join(DATA_DIR, 'save')
STATS_PATH = os.path.join(DATA_DIR, 'stats')
TOPICS_PATH = os.path.join(DATA_DIR, 'topics')

os.listdir(SAVE_PATH)[:5]

['lda_test_data',
 '2019-07-01_trends.csv',
 'oo-2019-08-30_trends.csv',
 '2019-07-02_trends.csv',
 'lda_train_data']

In [3]:
def semmatize_stop_words(w):
    ps = PorterStemmer()
    return ps.stem(w)


def get_stop_words():
    stop_words = stopwords.words('english')
    stop_words.extend(['from', 'subject', 're', 'edu', 'use', 'rt'])
    stop_words_stem = [semmatize_stop_words(x) for x in stop_words]
    stop_words.extend(stop_words_stem)
    stop_words = list(dict.fromkeys(stop_words))

    return stop_words

In [4]:
dfs_train =  pd.read_csv(os.path.join(SAVE_PATH, "lda_train_data"), header=0, parse_dates=['trend_date'])
dfs_test =  pd.read_csv(os.path.join(SAVE_PATH, "lda_test_data"), header=0, parse_dates=['trend_date'])

In [5]:
print(dfs_train.shape)
dfs_train.head()

(430015, 5)


Unnamed: 0,author_id,id,text,trend,trend_date
0,3244519976,1146083229075685377,rt the average height of the sixers lineup is,sixers,2019-07-01
1,2232937624,1146434208438657024,rt live feed of most people not yet realizing...,twitter dms,2019-07-03
2,800669560181387265,1146451136645357568,now down,nzveng,2019-07-03
3,951756622426144768,1145798976899309569,rt time for williamson being spoken in the sa...,dhoniatcwc,2019-06-30
4,3140403385,1146055852849131520,finals results come out tmr but im more worrie...,michael,2019-07-03


In [6]:
print(dfs_test.shape)
dfs_test.head()

(44, 5)


Unnamed: 0,author_id,id,text,trend,trend_date
0,1312460263,1145847739856707584,rt and just destroyed each other,mondaynightraw,2019-07-02
1,755539196,1145678327728021504,rt remember years ago when liberals were scre...,north korea,2019-06-30
2,161068986,1146192301917708289,annas face as soon as michael said interesting...,michael,2019-07-03
3,548849142,1146570959501111298,rt so joanna is copying her mans lingo immatu...,joanna,2019-07-02
4,363519310,1145686137522053120,rt antifa are the ones acting like fascists t...,antifa,2019-06-30


In [7]:
dfsLDA = dfs_train.loc[:,["trend","text"]]
dfsLDA.dropna(inplace=True)
trend_doc = dfsLDA.groupby(['trend'])['text'].apply(lambda x: ','.join(x)).reset_index()
trend_doc

Unnamed: 0,trend,text
0,acefamily,"rt is that yo sandwich,rt video of when pick..."
1,achilles,rt an achilles for a jersey the guilt is real...
2,acuña,its sad how an average player like acua plays ...
3,adviceforateenager,"from the mercedesamg a amp cla leaked,rt som..."
4,aewfyterfest,rt aew fyter fest recap highlights part han...
...,...,...
1146,여자친구열대야로여름을열때야,rt kpop stans listening to amp liking it then...
1147,욜라대충트친소,rt music is my life
1148,우리대장윤두준생일축하해,"rt happy dujun day,happiest birthday to my fi..."
1149,위버스,rt weverse is in korean means upper and me...


# LDA

In [8]:
def semmatize_text(text):
    ps = PorterStemmer() 
    return [ps.stem(w)  for w in text if len(w)>3]

def tokanize_text(trend_doc):
    return trend_doc.apply(lambda row: nltk.word_tokenize(row['text']), axis=1)

def remove_stopwords(texts):
    return [word for word in texts if word not in stop_words ]

def process_lda_format(trend_doc):
    tokenized_df = tokanize_text(trend_doc)
    stemmed_dataset = tokenized_df.apply(semmatize_text)
    stemmed_dataset = stemmed_dataset.map(lambda x: remove_stopwords(x))
    return stemmed_dataset

def initialize_corpus_and_dictionary(stemmed_dataset):
    
    dictionary_of_words = gensim.corpora.Dictionary(stemmed_dataset)
    word_corpus = [dictionary_of_words.doc2bow(word) for word in stemmed_dataset]
    
    return word_corpus, dictionary_of_words

def lda_datasets(trend_doc):
    stemmed_dataset = process_lda_format(trend_doc)
    corpus, dictionary = initialize_corpus_and_dictionary(stemmed_dataset)
    
    return stemmed_dataset, corpus, dictionary

In [9]:
def run_lda(topic_num):
    # Model with the best coherence_value
    lda_model = models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=topic_num,
                                            random_state=1, update_every=1, chunksize=100,
                                            passes=50, alpha='auto', per_word_topics=True)

    cwd = os.getcwd()
    temp_file = datapath(os.path.join(cwd, "models/lda_model_"+str(topic_num)))
    print('Model is saving... at', temp_file)
    lda_model.save(temp_file)

    
    # Compute Perplexity Score
    print('Perplexity Score: ', lda_model.log_perplexity(corpus)) 
    
    # Compute Coherence Score
    cohr_val = CoherenceModel(model=lda_model, texts=stemmed_dataset, dictionary=dictionary,
                                      coherence='c_v').get_coherence()

    print('Coherence Score: ', cohr_val)
    

In [77]:
# !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
# from gensim.corpora import MmCorpus
# from gensim.test.utils import get_tmpfile
# output_fname = get_tmpfile("try_corpus.mm")

# MmCorpus.serialize(output_fname, corpus)
# mm = MmCorpus(output_fname)  # `mm` document stream now has random access
# print(mm[0])  # retrieve document no. 42, etc.

In [12]:
stop_words = get_stop_words()
stemmed_dataset, corpus, dictionary = lda_datasets(trend_doc)

In [13]:
topic_num = 16
run_lda(topic_num)

Model is saving... at /home/lilith/Desktop/EPFL_courses/PROJE_TUGRULCAN/Trend_Topic_Analysis/Notebooks/models/lda_model_16
Perplexity Score:  -8.169396325989698
Coherence Score:  0.6634416559086657


In [85]:
topic_num = 20
run_lda(topic_num)

Model is saving... at /home/lilith/Desktop/EPFL_courses/PROJE_TUGRULCAN/Trend_Topic_Analysis/Notebooks/models/lda_model_20
Perplexity Score:  -8.684606187293076
Coherence Score:  0.6404153957118417


# Data Structures

Looking at the following:

>- corpus encodes words with (word_id, word_frequency) but not necessarily in the order
>- in this case. "video" coded as (17,2)

In [25]:
print(len(corpus), len(stemmed_dataset))

1151 1151


In [46]:
print(corpus[:1] , "\n")
# Human readable format of corpus (term-frequency)
print([[(dictionary[id], freq) for id, freq in cp] for cp in corpus[:1]], "\n")
print(stemmed_dataset[0], "\n")

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 2), (18, 1)]] 

[[('basketbal', 1), ('care', 1), ('chariti', 1), ('convo', 1), ('event', 1), ('everyth', 1), ('fan', 1), ('guy', 1), ('hear', 1), ('kill', 1), ('person', 1), ('pick', 1), ('proof', 1), ('sandwich', 1), ('sincer', 1), ('thank', 1), ('turn', 1), ('video', 2), ('volum', 1)]] 

['sandwich', 'video', 'pick', 'hear', 'convo', 'turn', 'volum', 'thank', 'everyth', 'proof', 'sincer', 'person', 'care', 'fan', 'basketbal', 'chariti', 'event', 'video', 'guy', 'kill'] 



In [33]:
from collections import Counter
words = stemmed_dataset[0]

Counter(words).keys() # equals to list(set(words))
Counter(words).values() # counts the elements' frequency

dict_values([1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [44]:
# Proof
dictionary[17]

'video'

# Check The Model

>- doc_lda where it transforms the corpus to TransformedCorpus as classified documents

In [88]:
# Get file path
topic_num = 20
cwd = os.getcwd()
temp_file = datapath(os.path.join(cwd, "models/lda_model_"+str(topic_num)))
# Load a potentially pretrained model from disk.
lda_model = models.ldamodel.LdaModel.load(temp_file)

In [96]:
from pprint import pprint
# pprint(lda_model.print_topics(num_words=3))
model_topics = lda_model.show_topics(formatted=False, num_topics=topic_num,num_words=3)
print(len(model_topics))
pprint(model_topics)

20
[(0, [('ariel', 0.05975069), ('hall', 0.05629759), ('mermaid', 0.04177357)]),
 (1, [('senat', 0.042167105), ('abbo', 0.034638327), ('elisha', 0.034416202)]),
 (2, [('happi', 0.03583244), ('tyler', 0.021117616), ('follow', 0.019993914)]),
 (3, [('light', 0.04861343), ('song', 0.026775075), ('offici', 0.02142202)]),
 (4, [('nike', 0.07964832), ('flag', 0.058787692), ('american', 0.033802193)]),
 (5, [('taylor', 0.06783093), ('swift', 0.039096177), ('justin', 0.035911713)]),
 (6, [('venu', 0.050858438), ('william', 0.04766493), ('gauff', 0.03955582)]),
 (7, [('club', 0.08978907), ('sign', 0.063360356), ('harri', 0.031851713)]),
 (8, [('kawhi', 0.047820732), ('laker', 0.04068579), ('warrior', 0.029701719)]),
 (9, [('morgan', 0.043510433), ('press', 0.037303764), ('alex', 0.03019603)]),
 (10, [('vote', 0.18516022), ('parti', 0.09345937), ('back', 0.076539636)]),
 (11, [('today', 0.006988071), ('make', 0.006485599), ('live', 0.006233082)]),
 (12, [('jason', 0.09143905), ('felt', 0.0522944

In [72]:
doc_lda = lda_model[corpus]
print(len(doc_lda[1]))
pprint(doc_lda[0][0])
# pprint(doc_lda[0][1])
# pprint(doc_lda[0][2])

3
[(8, 0.069758415), (11, 0.26580918), (12, 0.62151796)]


In [97]:
# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary)
vis

In [107]:
def format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=trend_doc.text):
    # Init output
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    for i, row in enumerate(ldamodel[corpus]):
        row = sorted(row[0], key=lambda x: (x[1]), reverse=True)

        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)


df_topic_sents_keywords = format_topics_sentences()

In [108]:
# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']

# Show
df_dominant_topic.head(10)

Unnamed: 0,Document_No,Dominant_Topic,Topic_Perc_Contrib,Keywords,Text
0,0,14.0,0.333,"like, love, dont, know, time, much, want, back...","rt is that yo sandwich,rt video of when pick..."
1,1,11.0,0.3453,"today, make, live, look, time, year, peopl, fi...",rt an achilles for a jersey the guilt is real...
2,2,13.0,0.2442,"trump, north, korea, presid, independ, juli, a...",its sad how an average player like acua plays ...
3,3,11.0,0.6444,"today, make, live, look, time, year, peopl, fi...","from the mercedesamg a amp cla leaked,rt som..."
4,4,11.0,0.5436,"today, make, live, look, time, year, peopl, fi...",rt aew fyter fest recap highlights part han...
5,5,11.0,0.5014,"today, make, live, look, time, year, peopl, fi...",day afghanistan vs westindies to win correct...
6,6,11.0,0.6522,"today, make, live, look, time, year, peopl, fi...",rt the reigning medallist and one of s revel...
7,7,11.0,0.563,"today, make, live, look, time, year, peopl, fi...",rt tj hockenson is rated one of the best rook...
8,8,11.0,0.5251,"today, make, live, look, time, year, peopl, fi...",southern daily echo saints confirm first team ...
9,9,5.0,0.5846,"taylor, swift, justin, scooter, support, bulli...",rt tts borrell christine lagarde https...


# LDA TEST

In [14]:
# Get file path
topic_num = 0
cwd = os.getcwd()
temp_file = datapath(os.path.join(cwd, "models/lda_model_"+str(topic_num)))
# Load a potentially pretrained model from disk.
lda_test = models.ldamodel.LdaModel.load(temp_file)

In [15]:
dfsLDA_test = dfs_test.loc[:,["trend","text"]]
dfsLDA_test.dropna(inplace=True)
test_doc = dfsLDA_test.groupby(['trend'])['text'].apply(lambda x: ','.join(x)).reset_index()

In [17]:
test_doc.head()

Unnamed: 0,trend,text
0,antifa,rt antifa are the ones acting like fascists t...
1,armyseicaday,rt fall in love with me
2,armyselcaday,"rt ,rt ..."
3,barrie,tyson barrie will be a ufa after this season
4,bbnaijaupdates,rt i actually like this you dey feel me guy s...


In [18]:
stemmed_test = process_lda_format(test_doc)
corpus_test = [dictionary.doc2bow(word) for word in stemmed_test]

In [19]:
# We will see

lda_test[corpus_test]

<gensim.interfaces.TransformedCorpus at 0x7f1309b316d8>

In [None]:
target_doc = pd.read_csv(os.path.join(DATA_DIR, 'categories'), header=0)
target_doc.head(2)

In [None]:
# Check the words

# words_match = re.compile(r'\"\w+\"')
# for idx, topic in lda.print_topics(-1):
#     print('Topic: {} \nWords: {}'.format(idx, topic))
#     topic_file = open(os.path.join(TOPICS_PATH, "topic-"+str(idx)+".txt"), "w+")
#     words = re.findall( words_match, topic)
#     topic_file.write( str(words) )
#     topic_file.close()