In [1]:
import os
import re
import string
import sys

import numpy as np
import pandas as pd

import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import RegexpTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
# NLTK Stop words
from nltk.corpus import stopwords

import gensim
from gensim import corpora, models
from gensim.models.coherencemodel import CoherenceModel
from gensim.test.utils import common_corpus, common_dictionary, datapath
from gensim.corpora import MmCorpus, Dictionary
from gensim.test.utils import get_tmpfile

from preprocessor.api import clean
from wordcloud import WordCloud

# Plotting tools
import matplotlib.pyplot as plt
import plotly.express as px
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
%matplotlib inline

from utils import load_lda_datasets, load_test_dataset, load_model, get_stop_words, load_raw_datasets

LOADING RAW DATA TREND-TEXT, LENGTH:  1151
LOADING CORPUS, LENGTH:  1151
LOADING DICTIONARY, LENGTH:  54164
LOADING DATASET, LENGTH:  1151


In [2]:
# Root folder of all project data, relative to the notebook's working directory.
DATA_DIR = "../Data"

# One sub-folder per artefact type, each resolved against DATA_DIR.
TWEETS_PATH, TREND_PATH, SAVE_PATH, STATS_PATH, TOPICS_PATH = (
    os.path.join(DATA_DIR, subdir)
    for subdir in ('tweets', 'trends', 'save', 'stats', 'topics')
)

# LOAD DATA STRUCTURES AND TEST DATA

In [3]:
# Raw inputs from the project helper in utils: dfs_train is previewed later with one
# row per tweet, trend_doc with one text entry per trend.
dfs_train, trend_doc = load_raw_datasets()
# LDA training artefacts from utils.load_lda_datasets. corpus and dictionary are passed
# to pyLDAvis below; corpus also feeds the dominant-topic extraction.
# NOTE(review): stemmed_dataset is not referenced again in the visible cells.
stemmed_dataset, corpus, dictionary = load_lda_datasets()

LOADING RAW DATA TREND-TEXT, LENGTH:  1151
LOADING CORPUS, LENGTH:  1151
LOADING DICTIONARY, LENGTH:  54164
LOADING DATASET, LENGTH:  1151


In [4]:
# Stop-word list from the project helper.
# NOTE(review): stop_words is not referenced again in the visible cells.
stop_words = get_stop_words()
# Test split in three parallel forms: the raw frame, the stemmed token lists and the
# encoded corpus (the next cell prints the first entry of each).
test_doc, stemmed_test, corpus_test = load_test_dataset()

In [5]:
# Sanity check on the first test document: its stemmed tokens, its cleaned text,
# and its encoded corpus entry (printed as a one-element slice).
print(stemmed_test[0])
print(test_doc.loc[0].text)
print(corpus_test[:1])

['antifa', 'one', 'act', 'like', 'fascist']
rt  antifa are the ones acting like fascists the end
[[(509, 1), (2241, 1), (2493, 1), (6103, 1), (6710, 1)]]


# LOAD MODEL AND TOPIC LIST

In [6]:
# Load the pretrained LDA model through the project helper.
# NOTE(review): 10 is presumably the number of topics (it matches the ten labels
# assigned to topics 0-9 further down) — confirm against utils.load_model.
lda_model = load_model(10)

In [7]:
# Category lookup table read from the extension-less file <DATA_DIR>/categories;
# the first line of the file is used as the header row.
target_doc = pd.read_csv(os.path.join(DATA_DIR, 'categories'), header=0)
# Preview the first three categories.
target_doc.head(3)

Unnamed: 0,Category ID,Category Name
0,0,Art & Design
1,1,Books
2,2,Business


In [8]:
# Hand-assigned category label for each LDA topic; the position in the list is the topic id.
# NOTE(review): 'Sports' labels both topic 2 and topic 8 — confirm this is intentional.
k_list_10 = ['Fashion','Art & Design','Sports','Technology','Politics','News','Entertainment','Books','Sports','Music']

# enumerate() keeps the mapping in step with the list length (no separate hard-coded 10
# that zip would silently truncate against) and yields plain int keys, so the printed
# dict reads {0: ..., 1: ...} regardless of how NumPy formats its integer scalars.
k_topics_classified = dict(enumerate(k_list_10))
print(k_topics_classified)

{0: 'Fashion', 1: 'Art & Design', 2: 'Sports', 3: 'Technology', 4: 'Politics', 5: 'News', 6: 'Entertainment', 7: 'Books', 8: 'Sports', 9: 'Music'}


# VISUALIZE PRETRAINED MODEL

In [9]:
# Visualize the topics
# Switch pyLDAvis to inline notebook rendering before building the view.
pyLDAvis.enable_notebook()
# Build the visualization data from the pretrained model, the training corpus
# and the dictionary loaded earlier.
vis = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary)
# Bare last expression so the notebook renders the visualization as the cell output.
vis

In [10]:
def format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=trend_doc.text):
    """Find the dominant topic of every document and pair it with the document text.

    Parameters
    ----------
    ldamodel : topic model
        Indexing it with ``corpus`` must yield one entry per document, and it
        must provide ``show_topic(topic_id)`` returning ``(word, weight)`` pairs.
    corpus : iterable
        The documents, in whatever representation ``ldamodel`` accepts.
    texts : sequence or pandas.Series
        Original text of each document; attached to the result by row index.

    Returns
    -------
    pandas.DataFrame
        Columns ``Dominant_Topic`` (integer topic id), ``Perc_Contribution``
        (weight of that topic, rounded to 4 decimals) and ``Topic_Keywords``
        (comma-separated words from ``show_topic``), followed by the text column.

    Notes
    -----
    The default arguments are bound when this cell is executed, so re-run the
    cell after reloading ``lda_model``, ``corpus`` or ``trend_doc``.
    """
    columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']
    keywords_by_topic = {}  # keyword string per topic id, formatted once and reused
    records = []

    for doc_topics in ldamodel[corpus]:
        # A tuple entry carries the topic distribution in its first slot (gensim's
        # LdaModel returns such a tuple when built with per_word_topics=True);
        # otherwise the entry already is the list of (topic_id, weight) pairs.
        if isinstance(doc_topics, tuple):
            doc_topics = doc_topics[0]
        if not doc_topics:
            # No topic reported for this document: it contributes no row.
            continue

        # Dominant topic = highest weight; on ties the first one listed wins.
        topic_num, prop_topic = max(doc_topics, key=lambda pair: pair[1])
        topic_num = int(topic_num)
        if topic_num not in keywords_by_topic:
            keywords_by_topic[topic_num] = ", ".join(
                word for word, _ in ldamodel.show_topic(topic_num)
            )
        records.append((topic_num, round(float(prop_topic), 4), keywords_by_topic[topic_num]))

    # Build the frame in one go: DataFrame.append no longer exists in pandas >= 2.0,
    # and growing a frame row by row copies it on every iteration.
    # Passing the column names here also gives a well-formed empty frame for an empty corpus.
    sent_topics_df = pd.DataFrame(records, columns=columns)

    # Add original text to the end of the output (aligned on the row index)
    contents = pd.Series(texts)
    return pd.concat([sent_topics_df, contents], axis=1)


# Run with the default arguments: the pretrained model, the training corpus
# and the per-trend texts.
df_topic_sents_keywords = format_topics_sentences()

In [11]:
# Expose the row position as an explicit document number and give every column a readable name
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']

# Replace each numeric topic id with its hand-assigned category label
df_dominant_topic['Dominant_Topic'] = [
    k_topics_classified[topic_id] for topic_id in df_dominant_topic['Dominant_Topic']
]

# Preview the first ten documents
df_dominant_topic.head(10)

Unnamed: 0,Document_No,Dominant_Topic,Topic_Perc_Contrib,Keywords,Text
0,0,Entertainment,0.869,"venu, william, love, happi, like, gauff, time,...","rt is that yo sandwich,rt video of when pick..."
1,1,Sports,0.6377,"warrior, kawhi, sign, laker, knick, net, deal,...",rt an achilles for a jersey the guilt is real...
2,2,Entertainment,0.4832,"venu, william, love, happi, like, gauff, time,...",its sad how an average player like acua plays ...
3,3,Entertainment,0.767,"venu, william, love, happi, like, gauff, time,...","from the mercedesamg a amp cla leaked,rt som..."
4,4,Entertainment,0.668,"venu, william, love, happi, like, gauff, time,...",rt aew fyter fest recap highlights part han...
5,5,Entertainment,0.6129,"venu, william, love, happi, like, gauff, time,...",day afghanistan vs westindies to win correct...
6,6,Technology,0.3849,"whatsapp, yix, kyungsoo, instagram, light, twi...",rt the reigning medallist and one of s revel...
7,7,Entertainment,0.4424,"venu, william, love, happi, like, gauff, time,...",rt tj hockenson is rated one of the best rook...
8,8,Sports,0.3893,"warrior, kawhi, sign, laker, knick, net, deal,...",southern daily echo saints confirm first team ...
9,9,News,0.7373,"taylor, swift, scooter, justin, support, biebe...",rt tts borrell christine lagarde https...


In [12]:
# Put each trend's dominant-topic label next to the trend and its text.
# concat(axis=1) aligns rows on the index, so this relies on both frames sharing the
# same row index — NOTE(review): confirm trend_doc uses a default 0..n-1 index.
trend_doc_topic = pd.concat([trend_doc, df_dominant_topic[['Dominant_Topic']]], axis=1)
trend_doc_topic.head(5)

Unnamed: 0,trend,text,Dominant_Topic
0,acefamily,"rt is that yo sandwich,rt video of when pick...",Entertainment
1,achilles,rt an achilles for a jersey the guilt is real...,Sports
2,acuña,its sad how an average player like acua plays ...,Entertainment
3,adviceforateenager,"from the mercedesamg a amp cla leaked,rt som...",Entertainment
4,aewfyterfest,rt aew fyter fest recap highlights part han...,Entertainment


# VISUALIZATION: DOMINANT TOPICS PER DAY

In [13]:
# Preview the raw per-tweet frame; the next cell merges it with the topic labels on 'trend'.
dfs_train.head(5)

Unnamed: 0,author_id,id,text,trend,trend_date
0,3244519976,1146083229075685377,rt the average height of the sixers lineup is,sixers,2019-07-01
1,2232937624,1146434208438657024,rt live feed of most people not yet realizing...,twitter dms,2019-07-03
2,800669560181387265,1146451136645357568,now down,nzveng,2019-07-03
3,951756622426144768,1145798976899309569,rt time for williamson being spoken in the sa...,dhoniatcwc,2019-06-30
4,3140403385,1146055852849131520,finals results come out tmr but im more worrie...,michael,2019-07-03


In [14]:
# Tag every tweet with the dominant topic of the trend it was collected for
# (inner join on the shared 'trend' column).
joined = trend_doc_topic[['trend', 'Dominant_Topic']].merge(dfs_train, on='trend')
joined

Unnamed: 0,trend,Dominant_Topic,author_id,id,text,trend_date
0,acefamily,Entertainment,1486824241,1145810083390971904,rt is that yo sandwich,2019-06-30
1,acefamily,Entertainment,740225179318509568,1145613529929539584,rt video of when picked me up you can hear ou...,2019-06-30
2,acefamily,Entertainment,4727708973,1145682018698768386,rt thank you for everything proof that he is ...,2019-06-30
3,acefamily,Entertainment,999371881714077696,1145726516099026949,here is our basketball charity event video you...,2019-06-30
4,achilles,Sports,1631550686,1145802563016712192,rt an achilles for a jersey the guilt is real,2019-07-01
...,...,...,...,...,...,...
430010,우리대장윤두준생일축하해,Entertainment,999646206912479234,1146539628066795520,rt more than my birthday our album is more im...,2019-07-03
430011,위버스,Technology,821285006676893696,1145854530438889473,rt weverse is in korean means upper and me...,2019-07-01
430012,음악곡으로취향을드러내보자,Entertainment,931876456732401664,1145958209435271168,rt billie eilish bad guy luis fonsi despaci...,2019-07-02
430013,음악곡으로취향을드러내보자,Entertainment,1059434566950121473,1145839514851536898,acdc back in black acdc shoot to thrill a...,2019-07-02


In [61]:
# Distinct trends seen for every (day, topic) pair
topic_by_time = (
    joined[['Dominant_Topic', 'trend_date', 'trend']]
    .groupby(['trend_date', 'Dominant_Topic'])['trend']
    .apply(set)
    .reset_index()
)

# Frequency = number of different trends assigned to the topic on that day
topic_by_time['Frequency'] = topic_by_time['trend'].apply(len)

# Keep the three most frequent topics of each day
plot_df = (
    topic_by_time
    .groupby(['trend_date'])
    .apply(lambda day_rows: day_rows.nlargest(3, 'Frequency'))
    .reset_index(drop=True)
)
plot_df

Unnamed: 0,trend_date,Dominant_Topic,trend,Frequency
0,2019-06-30,Entertainment,"{kedar, hiltonbetyaşında, miorgulloes, eurosub...",197
1,2019-06-30,Sports,"{north korea, asapnatinto, kenny omega, middle...",46
2,2019-06-30,Music,"{leclerc, vettel, ben stokes, the cure, versta...",17
3,2019-07-01,Entertainment,"{keepsmilingyunhyeong, clawstnt, prialcantaran...",162
4,2019-07-01,Sports,"{klay, rip tyler, iguodala, brooklyn, survivor...",91
5,2019-07-01,Technology,"{여자친구열대야로여름을열때야, kinpri, buenlunes, my ot, exo...",19
6,2019-07-02,Entertainment,"{baba rahman, keepsmilingyunhyeong, jordan aye...",167
7,2019-07-02,Music,"{houghton, rapinoe, jonathan pearce, millie br...",40
8,2019-07-02,Sports,"{coys, pablo sarabia, smibukabukaanblbi, borre...",30
9,2019-07-03,Entertainment,"{ลับลวงใจep, snappingstwin, tldenttverilir, ec...",132


In [42]:
# Grouped bars: the three leading topics of each day, measured by the number of
# distinct trends assigned to the topic. plotly.express is imported as px in the
# notebook's top import cell.
fig = px.bar(
    plot_df,
    x="trend_date",
    y="Frequency",
    color='Dominant_Topic',
    barmode='group',
    height=400,
    width=900,
    # Title and axis labels so the figure can be read on its own.
    title="Top 3 topics per day by number of distinct trends",
    labels={
        "trend_date": "Trend date",
        "Frequency": "Distinct trends",
        "Dominant_Topic": "Dominant topic",
    },
)
fig.show()

In [75]:
# Distinct authors and distinct trends for every (day, topic) pair
topic_by_author = (
    joined[['Dominant_Topic', 'trend_date', 'trend', 'author_id']]
    .groupby(['trend_date', 'Dominant_Topic'])
    .agg({'author_id': [set], 'trend': [set]})
    .reset_index()
)
# The list-valued aggregation spec yields two-level column labels; flatten them
topic_by_author.columns = ['trend_date', 'Dominant_Topic', 'author_id', 'trend']

# Frequency = number of different authors who tweeted under the topic that day;
# the author sets themselves are not needed once they have been counted
topic_by_author['Frequency'] = topic_by_author['author_id'].apply(len)
topic_by_author = topic_by_author.drop(columns=['author_id'])

# Keep the three topics with the most authors for each day
plot_df2 = (
    topic_by_author
    .groupby(['trend_date'])
    .apply(lambda day_rows: day_rows.nlargest(3, 'Frequency'))
    .reset_index(drop=True)
)
plot_df2

Unnamed: 0,trend_date,Dominant_Topic,trend,Frequency
0,2019-06-30,Entertainment,"{kedar, hiltonbetyaşında, miorgulloes, eurosub...",8939
1,2019-06-30,Sports,"{north korea, asapnatinto, kenny omega, middle...",8933
2,2019-06-30,Art & Design,"{مليونهيونيو, antifa, andy ngo, stepdownpastor}",5096
3,2019-07-01,Entertainment,"{keepsmilingyunhyeong, clawstnt, prialcantaran...",26569
4,2019-07-01,Sports,"{klay, rip tyler, iguodala, brooklyn, survivor...",23559
5,2019-07-01,Technology,"{여자친구열대야로여름을열때야, kinpri, buenlunes, my ot, exo...",21652
6,2019-07-02,Entertainment,"{baba rahman, keepsmilingyunhyeong, jordan aye...",30795
7,2019-07-02,Books,"{dubas, kerfoot, michael, ovie, sylvia mulinge...",18312
8,2019-07-02,Technology,"{ddos, marmitinhasdojustin, taokaenoixsehun, b...",12841
9,2019-07-03,Entertainment,"{ลับลวงใจep, snappingstwin, tldenttverilir, ec...",14349


In [76]:
# Grouped bars: the three leading topics of each day, measured by the number of
# distinct authors tweeting under the topic. plotly.express is imported as px in the
# notebook's top import cell.
fig = px.bar(
    plot_df2,
    x="trend_date",
    y="Frequency",
    color='Dominant_Topic',
    barmode='group',
    height=400,
    width=900,
    # Title and axis labels so the figure can be read on its own.
    title="Top 3 topics per day by number of distinct authors",
    labels={
        "trend_date": "Trend date",
        "Frequency": "Distinct authors",
        "Dominant_Topic": "Dominant topic",
    },
)
fig.show()