In [1]:
import pandas as pd
import numpy as np

import re
import string

import spacy

import gensim
from gensim import corpora

# libraries for visualization
import pyLDAvis
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline


import warnings
warnings.filterwarnings('ignore')


In [2]:
# Raw string keeps every backslash literal. The previous literal mixed escaped
# backslashes with the invalid escape sequence "\C", which newer Python
# versions warn about; the resulting path value is unchanged.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
data1 = pd.read_csv(r"C:\Users\CHAITANYA AGRAWAL\OneDrive - IIT Delhi\Desktop\Projects\Toronto\Scraping the data\Final script and data\US\US_2020.csv")

# English stop-word list from NLTK, stored at module level for the cleaning step.
from nltk.corpus import stopwords
stop_words_list = stopwords.words('english')

In [3]:
# Load the 'en_core_web_md' spaCy pipeline with the parser and NER components
# disabled; the module-level `nlp` object is what lemmatization() calls.
nlp = spacy.load('en_core_web_md', disable=['parser', 'ner'])

def lemmatization(texts, allowed_postags=('NOUN', 'ADJ')):
    """Lemmatize every text and keep only tokens with an allowed part of speech.

    Args:
        texts: Iterable of strings, one per document.
        allowed_postags: Collection of tags compared against ``token.pos_``;
            tokens whose tag is not in it are dropped. Defaults to nouns and
            adjectives (an immutable tuple, so the default cannot be shared
            and mutated between calls).

    Returns:
        A list with one entry per input text, in input order; each entry is the
        list of ``token.lemma_`` strings of the tokens that were kept.
    """
    # nlp.pipe streams the texts through the module-level pipeline in batches
    # instead of invoking the pipeline once per text; documents come back in
    # the same order as the inputs.
    return [
        [token.lemma_ for token in doc if token.pos_ in allowed_postags]
        for doc in nlp.pipe(texts)
    ]

def preprocessing_text(table):
    """Clean the ``text`` column of ``table`` in place and return ``table``.

    Steps, in order: lower-case; drop whitespace-separated words that appear in
    the module-level ``stop_words_list``; remove @mentions, links starting with
    ``http`` or ``www``, digit runs, and the punctuation characters listed in
    the final pattern (the apostrophe is not among them).

    Args:
        table: DataFrame with a ``text`` column. The column is overwritten, so
            callers that ignore the return value still see the cleaned text.

    Returns:
        The same ``table`` object that was passed in.
    """
    # Missing or non-string values would make the word-level filter below fail
    # (a float NaN has no .split), so they are coerced to strings first, with
    # missing values becoming empty text.
    text = table['text'].fillna('').astype(str).str.lower()

    # Remove stop words. A set gives constant-time membership tests instead of
    # scanning the whole list for every word.
    stop_words = set(stop_words_list)
    text = text.apply(lambda x: ' '.join(word for word in x.split() if word not in stop_words))

    # Contraction expansion and removal of the 'rt' retweet marker were
    # disabled in this pipeline and remain so.

    # Each pattern is deleted in turn; the order matches the original steps.
    patterns = (
        r'@\w+',                                  # mentions of @UserNames
        r'http\S+',                               # links starting with http
        r'www.[^ ]+',                             # links starting with www
        r'[0-9]+',                                # numbers
        r'[!"#$%&()*+,-./:;<=>?@[\]^_`{|}~]',     # special characters and punctuation marks
    )
    for pattern in patterns:
        text = text.replace(pattern, '', regex=True)

    table['text'] = text
    return table

# Clean data1['text']; the function overwrites the column on the frame it is
# given, so the return value is not captured here.
preprocessing_text(data1)
# Pull the cleaned texts into a plain list and lemmatize them with the default
# part-of-speech filter.
text_list_1=data1['text'].tolist()
tokenized_reviews_1 = lemmatization(text_list_1)
# Spot-check: print the kept lemmas of the document at index 1.
print(tokenized_reviews_1[1])

['power', 'amp', 'superbowl', 'time', 'heartbroken', 'chiefskingdom', 'chief']


In [4]:
# Build the gensim Dictionary from the tokenized documents; it is used below
# both for doc2bow conversion and as the model's id2word mapping.
dictionary_1 = corpora.Dictionary(tokenized_reviews_1)

In [5]:
# Convert every tokenized document to its bag-of-words form via the dictionary.
doc_term_matrix_1 = list(map(dictionary_1.doc2bow, tokenized_reviews_1))

# Short alias for gensim's LDA model class.
LDA = gensim.models.ldamodel.LdaModel

# Fit a 10-topic model; random_state is fixed so repeated runs use the same seed.
lda_model_1 = LDA(
    corpus=doc_term_matrix_1,
    id2word=dictionary_1,
    num_topics=10,
    random_state=100,
    update_every=1,
    chunksize=100,
    passes=50,
    iterations=100,
)

In [6]:
# Display each topic as a weighted list of its words.
lda_model_1.print_topics()

[(0,
  '0.091*"part" + 0.040*"enough" + 0.015*"center" + 0.014*"joke" + 0.012*"art" + 0.011*"past" + 0.009*"meme" + 0.009*"lesson" + 0.008*"cup" + 0.008*"sauce"'),
 (1,
  '0.144*"superbowl" + 0.099*"halftime" + 0.083*"show" + 0.049*"day" + 0.041*"year" + 0.034*"next" + 0.032*"commercial" + 0.029*"fan" + 0.021*"good" + 0.019*"first"'),
 (2,
  '0.130*"super" + 0.125*"bowl" + 0.054*"people" + 0.027*"week" + 0.024*"thing" + 0.023*"weekend" + 0.020*"way" + 0.018*"happy" + 0.015*"night" + 0.014*"❤"'),
 (3,
  '0.048*"white" + 0.038*"dance" + 0.032*"hour" + 0.023*"song" + 0.022*"ticket" + 0.019*"patrickmahome" + 0.017*"✔" + 0.015*"😍" + 0.015*"hip" + 0.012*"beautiful"'),
 (4,
  '0.225*"super" + 0.221*"bowl" + 0.026*"chief" + 0.025*"year" + 0.013*"party" + 0.012*"performance" + 0.012*"national" + 0.012*"anthem" + 0.011*"game" + 0.010*"er"'),
 (5,
  '0.042*"superbowlliv" + 0.039*"man" + 0.030*"work" + 0.029*"lot" + 0.027*"rock" + 0.026*"hard" + 0.025*"city" + 0.023*"sport" + 0.017*"little" + 0.01

In [7]:
def format_topics_sentences(ldamodel=lda_model_1, corpus=doc_term_matrix_1, texts=tokenized_reviews_1):
    """Build a per-document table of dominant topic, its weight, keywords and text.

    Args:
        ldamodel: Fitted LDA model; indexed with ``corpus`` and queried with
            ``show_topic``.
        corpus: Bag-of-words corpus, one entry per document.
        texts: Per-document content to attach as the last column.

    Returns:
        DataFrame with columns ``Dominant_Topic``, ``Perc_Contribution`` and
        ``Topic_Keywords`` plus one unnamed column (label ``0``) holding
        ``texts``, aligned by position.
    """
    records = []
    keyword_cache = {}  # topic id -> joined keyword string, computed once per topic

    for row_list in ldamodel[corpus]:
        # When per_word_topics is enabled the (topic, weight) pairs are taken
        # from the first element of each item.
        row = row_list[0] if ldamodel.per_word_topics else row_list

        if not row:
            # No topic reported for this document: keep a placeholder row so
            # later rows stay aligned with `texts`.
            records.append((None, None, None))
            continue

        # Dominant topic = pair with the highest weight (first one wins ties,
        # matching a stable descending sort).
        topic_num, prop_topic = max(row, key=lambda pair: pair[1])
        topic_num = int(topic_num)

        if topic_num not in keyword_cache:
            keyword_cache[topic_num] = ", ".join(word for word, _ in ldamodel.show_topic(topic_num))

        records.append((topic_num, round(float(prop_topic), 10), keyword_cache[topic_num]))

    # Build the frame once; row-by-row DataFrame.append is quadratic and is
    # no longer available in pandas 2.x.
    sent_topics_df = pd.DataFrame(
        records, columns=['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']
    )

    # Add original text to the end of the output
    contents = pd.Series(texts)
    return pd.concat([sent_topics_df, contents], axis=1)


# One row per document: dominant topic, its weight, topic keywords and the text.
df_topic_sents_keywords = format_topics_sentences(
    ldamodel=lda_model_1,
    corpus=doc_term_matrix_1,
    texts=tokenized_reviews_1,
)

# Turn the positional index into a document-number column, then relabel all columns.
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = [
    'Document_No',
    'Dominant_Topic',
    'Topic_Perc_Contrib',
    'Keywords',
    'Text',
]

In [8]:
# Display the per-document dominant-topic table.
df_dominant_topic

Unnamed: 0,Document_No,Dominant_Topic,Topic_Perc_Contrib,Keywords,Text
0,0,4,0.619997,"super, bowl, chief, year, party, performance, ...","[super, bowl, party, part]"
1,1,9,0.586792,"amp, half, superbowl, thank, chiefskingdom, 🏼,...","[power, amp, superbowl, time, heartbroken, chi..."
2,2,1,0.427634,"superbowl, halftime, show, day, year, next, co...","[cheap, pizza, beer, super, bowl, night, life,..."
3,3,4,0.699993,"super, bowl, chief, year, party, performance, ...","[super, bowl]"
4,4,9,0.366668,"amp, half, superbowl, thank, chiefskingdom, 🏼,...","[thank, ❤]"
...,...,...,...,...,...
14943,14943,4,0.420000,"super, bowl, chief, year, party, performance, ...","[super, bowl, commercial, lol]"
14944,14944,9,0.439320,"amp, half, superbowl, thank, chiefskingdom, 🏼,...","[question, schlock, superbowl, half, time, sure]"
14945,14945,1,0.198250,"superbowl, halftime, show, day, year, next, co...","[enough, brother, half, time, show, woman, but..."
14946,14946,8,0.513895,"time, ad, parade, amazing, world, lol, kid, fu...","[fking, ad, super, bowl, ad]"


In [9]:
# Display setting to show more characters in column
pd.options.display.max_colwidth = 100

sent_topics_sorteddf_mallet = pd.DataFrame()
sent_topics_outdf_grpd = df_topic_sents_keywords.groupby('Dominant_Topic')

for i, grp in sent_topics_outdf_grpd:
    sent_topics_sorteddf_mallet = pd.concat([sent_topics_sorteddf_mallet, 
                                             grp.sort_values(['Perc_Contribution'], ascending=False).head(1)], 
                                            axis=0)

# Reset Index    
sent_topics_sorteddf_mallet.reset_index(drop=True, inplace=True)

# Format
sent_topics_sorteddf_mallet.columns = ['Topic_Num', "Topic_Perc_Contrib", "Keywords", "Representative Text"]

# Show
sent_topics_sorteddf_mallet.head(10)

Unnamed: 0,Topic_Num,Topic_Perc_Contrib,Keywords,Representative Text
0,0,0.871126,"part, enough, center, joke, art, past, meme, lesson, cup, sauce","[hace, americana, pura, visto, señora, doblemoral]"
1,1,0.871427,"superbowl, halftime, show, day, year, next, commercial, fan, good, first","[superbowl, halftime, show, terrible, next, day]"
2,2,0.899998,"super, bowl, people, week, thing, weekend, way, happy, night, ❤","[super, bowl, 😭, 😭, 😭, 😭, 😭, 😭]"
3,3,0.864936,"white, dance, hour, song, ticket, patrickmahome, ✔, 😍, hip, beautiful","[dice, mil, dólare, por, solo, tocar]"
4,4,0.935707,"super, bowl, chief, year, party, performance, national, anthem, game, er","[election, year, playoff, election, year, playoff, game, election, year, super, bowl, election, ..."
5,5,0.850715,"superbowlliv, man, work, lot, rock, hard, city, sport, little, home","[legend, celebrity, celebration, hard, rock, longlivelove]"
6,6,0.777287,"team, great, last, guy, woman, video, coach, eagle, many, real","[missy, music, rap, producer, choreographer]"
7,7,0.888124,"football, favorite, love, event, much, bet, coverage, cowboy, call, u","[vote, ummm, 🧐, 🧐, 🧐, 🧐, 🧐, 🧐]"
8,8,0.765412,"time, ad, parade, amazing, world, lol, kid, full, fun, black","[backpack, br, time]"
9,9,0.872143,"amp, half, superbowl, thank, chiefskingdom, 🏼, tomorrow, 👏, 🔥, folk","[superbowl, goodness, amp, 🔥, 🔥, 🔥, 🔥, 🔥, 🔥, 🔥, 🔥, 🔥, 🔥, 🔥, fucken, good]"
