In [11]:
import os
import re
import sys
import numpy as np
import pandas as pd
import string
import re

import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

In [12]:
# Root of the project's data tree; every path below is a direct sub-folder of it.
DATA_DIR = "../Data"

# Build all sub-folder paths in one pass. The targets unpack in the same order
# as the folder names listed in the tuple.
(TWEETS_PATH, TREND_PATH, SAVE_PATH,
 STATS_PATH, TOPICS_PATH, MEDIA_PATH) = (
    os.path.join(DATA_DIR, folder)
    for folder in ('tweets', 'trends', 'save', 'stats', 'topics', 'media')
)

# LOAD DATA STRUCTURES AND TEST DATA

In [13]:
# Project-local helper module that provides the dataset and model loaders used in this notebook.
import utils as ut
# from utils import load_raw_datasets, load_lda_datasets, load_test_dataset, load_model, get_stop_words
# Raw data. Later cells read the `trend`, `trend_date` and `author_id` columns of
# dfs_train, and the `trend` / `text` columns of trend_doc.
dfs_train, trend_doc = ut.load_raw_datasets()
# LDA inputs. `corpus` and `dictionary` are passed to the LDA model and to pyLDAvis below.
# NOTE(review): stemmed_dataset is not referenced again in the visible cells — confirm it is still needed.
stemmed_dataset, corpus, dictionary = ut.load_lda_datasets()

LOADING RAW DATA TREND-TEXT, LENGTH:  6737
LOADING CORPUS, LENGTH:  6737
LOADING DICTIONARY, LENGTH:  239733
LOADING DATASET, LENGTH:  6737


In [14]:
# Load the test documents. `dictionary` is passed in — presumably so the test corpus is
# encoded with the training vocabulary (TODO confirm in utils.load_test_dataset).
test_doc, stemmed_test, corpus_test = ut.load_test_dataset(dictionary)

Test dataset is loaded, LENGHT:  401
Test corpus is created, LENGTH:  301


In [15]:
# Sanity check: show the first test document in each of its three loaded forms.
print(stemmed_test[0])       # first entry of the stemmed test set
print(test_doc.loc[0].text)  # `text` field of the row labelled 0
print(corpus_test[:1])       # first entry of the test corpus, kept as a one-element slice

['sibl', 'look', 'second']
rt  my sibling looks at me for seconds me
[[(35, 1), (1109, 1), (1374, 1)]]


In [6]:
# Load a pretrained LDA model through the project helper.
# NOTE(review): 19 presumably selects the 19-topic model (19 topic labels are defined
# for it further down) — confirm against utils.load_model.
lda_model_19 = ut.load_model(19)

# VISUALIZE PRETRAINED MODEL 19

In [9]:
# Visualize the topics
# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline


pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model_19, corpus, dictionary)
vis

In [20]:
# Human-readable label for each topic of the 19-topic model: position i labels topic id i.
# NOTE(review): the captured preview of df_dominant_topic pairs keywords such as
# "trump, racist, america, countri" with 'Art & Design' and "daniel, club, coach, sign, player"
# with 'Business'. That suggests these labels are not aligned with the model's topic ids —
# possibly they were assigned from the pyLDAvis view, which renumbers topics by prevalence.
# Verify the order before relying on the labels.
k_list_19 = ['Funny/Daily','Technology','News (world)','Music','Sports (tennis)','News (politics)','Celebrities','Politics','Cinema','Entertainment',
             'Art & Design','Business','Sports (futball)','News','News (law)','Fashion','Entertainment','Sports (basketball)','Charity']
# enumerate() keys every label by its position with plain Python ints, so the mapping
# always covers the whole list instead of depending on a separately hard-coded length.
k_topics_classified = dict(enumerate(k_list_19))
print(k_topics_classified)

{0: 'Funny/Daily', 1: 'Technology', 2: 'News (world)', 3: 'Music', 4: 'Sports (tennis)', 5: 'News (politics)', 6: 'Celebrities', 7: 'Politics', 8: 'Cinema', 9: 'Entertainment', 10: 'Art & Design', 11: 'Business', 12: 'Sports (futball)', 13: 'News', 14: 'News (law)', 15: 'Fashion', 16: 'Entertainment', 17: 'Sports (basketball)', 18: 'Charity'}


In [18]:
def format_topics_sentences(ldamodel=lda_model_19, corpus=corpus, texts=trend_doc.text):
    """Build a per-document table of the dominant LDA topic.

    Parameters
    ----------
    ldamodel
        Topic model supporting ``ldamodel[corpus]`` and ``ldamodel.show_topic(topic_id)``.
    corpus
        Documents in the form accepted by ``ldamodel[...]``.
    texts
        Original text of each document; attached as the last column of the result.

    Returns
    -------
    pandas.DataFrame
        Columns ``Dominant_Topic`` (int topic id), ``Perc_Contribution`` (topic weight
        rounded to 4 decimals) and ``Topic_Keywords`` (comma-separated words from
        ``show_topic``), followed by one column holding ``texts``. The text column is
        aligned on the index, so ``texts`` is expected to carry a default 0..n-1 index.
    """
    columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']
    keywords_by_topic = {}  # topic id -> keyword string, computed once per topic
    records = []

    for doc_topics in ldamodel[corpus]:
        # Models built with per-word topics return a tuple whose first element is the
        # (topic, weight) list; otherwise the list is returned directly.
        if isinstance(doc_topics, tuple):
            doc_topics = doc_topics[0]
        # A document with no assigned topic contributes no row.
        if not doc_topics:
            continue

        # Dominant topic = highest weight; on ties the first one listed wins.
        topic_num, prop_topic = max(doc_topics, key=lambda pair: pair[1])
        topic_num = int(topic_num)
        if topic_num not in keywords_by_topic:
            keywords_by_topic[topic_num] = ", ".join(
                word for word, _ in ldamodel.show_topic(topic_num)
            )
        records.append((topic_num, round(float(prop_topic), 4), keywords_by_topic[topic_num]))

    # Build the frame once from the collected rows; row-by-row DataFrame.append is
    # quadratic and no longer exists in pandas 2.x.
    sent_topics_df = pd.DataFrame(records, columns=columns)

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return sent_topics_df


df_topic_sents_keywords = format_topics_sentences()

In [21]:
# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']
df_dominant_topic['Dominant_Topic'] = df_dominant_topic['Dominant_Topic'].map(lambda x: k_topics_classified[x])
# Show
df_dominant_topic.head(10)

Unnamed: 0,Document_No,Dominant_Topic,Topic_Perc_Contrib,Keywords,Text
0,0,Entertainment,0.7017,"happi, follow, today, thank, time, juli, like,...","flows,flows,rt flows that shit go hard,price ..."
1,1,Technology,0.3744,"paul, trade, zion, russel, first, angel, georg...","rt hrs vs,everyone complaining that since vla..."
2,2,Art & Design,0.7076,"trump, racist, america, countri, american, bac...",republicans voted to condemn i mean thats more...
3,3,Sports (tennis),0.8221,"like, back, dont, know, love, want, look, time...",me i dont even like blueberries also me devour...
4,4,Entertainment,0.2847,"happi, follow, today, thank, time, juli, like,...",then now and forever the thackerays aa...
5,5,Entertainment,0.5608,"happi, follow, today, thank, time, juli, like,...",hey guys are so always is our future chief min...
6,6,Sports (tennis),0.5744,"like, back, dont, know, love, want, look, time...",aaron boone gets onegame suspension for savage...
7,7,Business,0.2676,"daniel, club, coach, sign, player, assist, spu...","aaron donald,rt top players in the nfl voted ..."
8,8,Business,0.4094,"daniel, club, coach, sign, player, assist, spu...","aaron hicks omg,aaron hicks is a liability,rt ..."
9,9,Technology,0.4076,"paul, trade, zion, russel, first, angel, georg...",rt well aaron judge first career games hr rbi...


# GRAPHS

In [24]:
# Place each trend document side by side with its dominant-topic label.
# The two frames are aligned on their index.
trend_doc_topic = pd.concat(
    [trend_doc, df_dominant_topic[['Dominant_Topic']]],
    axis=1,
)
# Attach the topic label to every training row that shares the same 'trend' value.
joined = trend_doc_topic[['trend', 'Dominant_Topic']].merge(
    dfs_train,
    left_on='trend',
    right_on='trend',
)

In [25]:
# For every (day, topic) pair, gather the distinct trends that fall into it.
topic_by_time = (
    joined[['Dominant_Topic', 'trend_date', 'trend']]
    .groupby(['trend_date', 'Dominant_Topic'])['trend']
    .apply(set)
    .reset_index()
)

# Frequency = number of distinct trends in the pair.
topic_by_time['Frequency'] = topic_by_time['trend'].apply(lambda trends: len(trends))
# Keep only the three most frequent topics of each day.
plot_df = (
    topic_by_time
    .groupby(['trend_date'])
    .apply(lambda day_rows: day_rows.nlargest(3, 'Frequency'))
    .reset_index(drop=True)
)

In [36]:
# Grouped bar chart: one bar per (day, topic), bar height = number of distinct trends.
fig = px.bar(plot_df, x="trend_date", y="Frequency", color='Dominant_Topic', barmode='group',
             height=600, width=1800 )

# NOTE(review): the title says "by Tweet" while the bars count distinct trends
# (the y-axis is "#Trends") — confirm the intended wording.
fig.update_layout(title_text="Daily Dominant Category by Tweet")
# Set x-axis title
fig.update_xaxes(title_text="Days")
# Set y-axes titles
fig.update_yaxes(title_text="#Trends")

# write_image does not create missing folders, so make sure the target exists first.
os.makedirs(MEDIA_PATH, exist_ok=True)
fig.write_image(os.path.join(MEDIA_PATH, "topic_by_time.png"))
fig.show()

In [32]:
# For every (day, topic) pair, gather the distinct authors and the distinct trends.
topic_by_author = (
    joined[['Dominant_Topic', 'trend_date', 'trend', 'author_id']]
    .groupby(['trend_date', 'Dominant_Topic'])
    .agg({'author_id': [set], 'trend': [set]})
    .reset_index()
)
# The list-valued aggregation yields two-level column labels; flatten them.
topic_by_author.columns = ['trend_date', 'Dominant_Topic', 'author_id', 'trend']
# Frequency = number of distinct authors in the pair. The author sets themselves
# are not needed afterwards.
topic_by_author['Frequency'] = topic_by_author['author_id'].apply(lambda authors: len(authors))
topic_by_author = topic_by_author.drop(['author_id'], axis=1)

# Keep only the three topics with the most authors for each day.
plot_df2 = (
    topic_by_author
    .groupby(['trend_date'])
    .apply(lambda day_rows: day_rows.nlargest(3, 'Frequency'))
    .reset_index(drop=True)
)

In [37]:
# Grouped bar chart: one bar per (day, topic), bar height = number of distinct authors.
fig = px.bar(plot_df2, x="trend_date", y="Frequency", color='Dominant_Topic', barmode='group',
             height=600, width=1800 )

fig.update_layout(title_text="Daily Dominant Category by Author")
# Set x-axis title
fig.update_xaxes(title_text="Days")
# Set y-axes titles
fig.update_yaxes(title_text="#Authors")

# write_image does not create missing folders, so make sure the target exists first.
os.makedirs(MEDIA_PATH, exist_ok=True)
fig.write_image(os.path.join(MEDIA_PATH, "topic_by_author.png"))
fig.show()