In [1]:
import pandas as pd
import numpy as np

import re
import string

import spacy

import gensim
from gensim import corpora

# libraries for visualization
import pyLDAvis
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline


import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including deprecation warnings from pandas/gensim used in later cells.
warnings.filterwarnings('ignore')


In [2]:
from nltk.corpus import stopwords

# Raw string: the previous literal mixed "\\" with a bare "\C", which is an
# invalid escape sequence (SyntaxWarning on Python 3.12+). The resulting path
# is character-for-character the same as before.
# NOTE(review): absolute, machine-specific path — the notebook only runs on
# the author's machine until this is pointed at a shared/relative location.
DATA1_CSV_PATH = r"C:\Users\CHAITANYA AGRAWAL\OneDrive - IIT Delhi\Desktop\Projects\Toronto\Scraping the data\Final script and data\US\US_2021.csv"

# Raw table; its 'text' column is cleaned in place by a later cell.
data1 = pd.read_csv(DATA1_CSV_PATH)

# English stop words consulted by preprocessing_text().
stop_words_list = stopwords.words('english')

In [3]:
# Load spaCy's 'en_core_web_md' pipeline with the parser and NER components
# disabled; lemmatization() below only reads token.lemma_ and token.pos_.
nlp = spacy.load('en_core_web_md', disable=['parser', 'ner'])

def lemmatization(texts,allowed_postags=['NOUN', 'ADJ']):
    """Reduce every text to the lemmas of its tokens with an allowed POS tag.

    Parameters
    ----------
    texts : iterable of str
        Texts to run through the module-level ``nlp`` pipeline, one at a time.
    allowed_postags : list of str
        Coarse POS tags (``token.pos_``) to keep. The default list is only
        read, never modified.

    Returns
    -------
    list of list of str
        One list of lemmas per input text, in input order.
    """
    def kept_lemmas(sentence):
        # Lemmas of the tokens whose POS tag is whitelisted.
        return [tok.lemma_ for tok in nlp(sentence) if tok.pos_ in allowed_postags]

    return [kept_lemmas(sentence) for sentence in texts]

def preprocessing_text(table):
    """Normalize the tweets in ``table['text']`` ahead of lemmatization.

    Steps, in order: lower-case, drop stop words (``stop_words_list``),
    strip @mentions, strip links (``http...`` and ``www....``), strip digits,
    strip ASCII punctuation. Missing values are treated as empty strings.

    Parameters
    ----------
    table : pandas.DataFrame
        Frame with a ``text`` column. The column is overwritten IN PLACE
        (the calling cell relies on this) and the same frame is returned.

    Returns
    -------
    pandas.DataFrame
        ``table`` itself, with the cleaned ``text`` column.
    """
    # Set membership is constant-time; scanning the list for every word was not.
    stop_words = set(stop_words_list)

    # NaN/None rows would crash ``x.split()`` below, so blank them first.
    text = table['text'].fillna('').str.lower()

    # NOTE(review): stop words are dropped before punctuation is stripped, so
    # a token such as "the," is not recognised as a stop word at this point.
    text = text.apply(lambda x: ' '.join(word for word in x.split() if word not in stop_words))

    patterns = (
        r'@\w+',        # @UserName mentions
        r'http\S+',     # http/https links
        r'www\.[^ ]+',  # bare www links; dot escaped so "awwwww" is not eaten
        r'[0-9]+',      # numbers
        # Punctuation; '-' and '[' are escaped for clarity, the matched set is
        # unchanged (apostrophe and backslash are still left alone).
        r'[!"#$%&()*+,\-./:;<=>?@\[\]^_`{|}~]',
    )
    for pattern in patterns:
        text = text.replace(pattern, '', regex=True)

    table['text'] = text
    return table

# Clean data1['text'] in place; the function returns the same frame, so the
# result is not rebound here. This must run before the column is read below.
preprocessing_text(data1)
text_list_1=data1['text'].tolist()
# One list of noun/adjective lemmas per cleaned text.
tokenized_reviews_1 = lemmatization(text_list_1)
# Spot-check the tokens of the second document.
print(tokenized_reviews_1[1])

['excited', 'teacher', 'cheerleader', 'student']


In [4]:
# gensim dictionary built from the tokenized documents; used below for
# doc2bow() and passed to the LDA model as id2word.
dictionary_1 = corpora.Dictionary(tokenized_reviews_1)

In [77]:
# Bag-of-words corpus: one doc2bow() result per tokenized document.
doc_term_matrix_1 = [dictionary_1.doc2bow(rev) for rev in tokenized_reviews_1]

# Creating the object for LDA model using gensim library
LDA = gensim.models.ldamodel.LdaModel

# Build LDA model
# 10 topics, with a fixed random_state so repeated runs are comparable.
# NOTE(review): num_topics, chunksize, passes and iterations are hard-coded
# with no recorded rationale — confirm they were tuned for this corpus.
lda_model_1 = LDA(corpus=doc_term_matrix_1, id2word=dictionary_1, num_topics=10, random_state=100,update_every=1,
                chunksize=100, passes=50,iterations=100)

In [78]:
# Show each topic as a weighted list of its top words.
lda_model_1.print_topics()

[(0,
  '0.185*"bowl" + 0.183*"super" + 0.071*"year" + 0.041*"chief" + 0.029*"ad" + 0.022*"champion" + 0.014*"shit" + 0.011*"life" + 0.011*"champ" + 0.009*"work"'),
 (1,
  '0.280*"super" + 0.278*"bowl" + 0.034*"good" + 0.030*"team" + 0.028*"time" + 0.013*"buccaneer" + 0.013*"football" + 0.009*"weekend" + 0.007*"many" + 0.007*"history"'),
 (2,
  '0.116*"people" + 0.083*"fan" + 0.053*"bad" + 0.042*"man" + 0.040*"today" + 0.033*"bucs" + 0.031*"coach" + 0.020*"medium" + 0.020*"stadium" + 0.018*"free"'),
 (3,
  '0.055*"party" + 0.036*"mask" + 0.030*"top" + 0.028*"state" + 0.024*"🇸" + 0.023*"low" + 0.023*"number" + 0.020*"ball" + 0.017*"hope" + 0.016*"case"'),
 (4,
  '0.124*"game" + 0.058*"thing" + 0.037*"big" + 0.028*"tv" + 0.025*"event" + 0.024*"point" + 0.023*"last" + 0.021*"🏆" + 0.019*"lot" + 0.018*"reason"'),
 (5,
  '0.055*"way" + 0.042*"much" + 0.026*"loss" + 0.025*"whole" + 0.023*"superbowllv" + 0.020*"🏾" + 0.019*"post" + 0.015*"smh" + 0.014*"ready" + 0.013*"one"'),
 (6,
  '0.061*"half

In [79]:
def format_topics_sentences(ldamodel=lda_model_1, corpus=doc_term_matrix_1, texts=tokenized_reviews_1):
    """Build a per-document table of the dominant LDA topic.

    Parameters
    ----------
    ldamodel : model object
        Indexed as ``ldamodel[corpus]``; must expose ``per_word_topics`` and
        ``show_topic(topic_id)``. Defaults to ``lda_model_1``.
    corpus : iterable
        Documents handed to ``ldamodel[...]``. Defaults to
        ``doc_term_matrix_1``.
    texts : sequence
        One entry per document; attached as the last column (named ``0``).
        Defaults to ``tokenized_reviews_1``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Dominant_Topic``, ``Perc_Contribution``, ``Topic_Keywords``
        and ``0`` (the texts), one row per document in ``corpus``.

    Notes
    -----
    The defaults are bound when this ``def`` executes, so re-training the
    model requires re-running this cell or passing the arguments explicitly.
    """
    # show_topic() depends only on the topic id, so compute each keyword
    # string once instead of once per document.
    keywords_by_topic = {}

    # Rows are collected in a list and turned into a frame once:
    # DataFrame.append was removed in pandas 2.0 and copied the whole frame
    # on every call.
    records = []
    for row_list in ldamodel[corpus]:
        # With per_word_topics enabled the model returns a tuple whose first
        # element is the (topic, probability) list.
        row = row_list[0] if ldamodel.per_word_topics else row_list

        if not row:
            # Keep a placeholder so the rows stay aligned with ``texts``.
            records.append((np.nan, np.nan, ''))
            continue

        # Dominant topic = highest probability; max() returns the first of
        # any tied entries, like the stable descending sort it replaces.
        topic_num, prop_topic = max(row, key=lambda pair: pair[1])
        topic_num = int(topic_num)

        if topic_num not in keywords_by_topic:
            keywords_by_topic[topic_num] = ", ".join(
                word for word, _ in ldamodel.show_topic(topic_num)
            )
        records.append((topic_num, round(prop_topic, 10), keywords_by_topic[topic_num]))

    sent_topics_df = pd.DataFrame(
        records, columns=['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']
    )

    # Add original text to the end of the output
    contents = pd.Series(texts)
    return pd.concat([sent_topics_df, contents], axis=1)


# Dominant topic, its contribution and its keywords for every document.
df_topic_sents_keywords = format_topics_sentences(ldamodel=lda_model_1, corpus=doc_term_matrix_1, texts=tokenized_reviews_1 )

# Format
# reset_index() turns the row index into the first column, which is then
# labelled Document_No together with friendlier names for the other four.
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']

In [80]:
# Display the per-document dominant-topic table.
df_dominant_topic

Unnamed: 0,Document_No,Dominant_Topic,Topic_Perc_Contrib,Keywords,Text
0,0,6,0.700402,"halftime, show, next, mahome, week, performance, sport, love, streaker, line","[superbowl, week, enough, attention, fact, entire, professional, sport, team, due, covid, protocol]"
1,1,0,0.775570,"bowl, super, year, chief, ad, champion, shit, life, champ, work","[excited, teacher, cheerleader, student]"
2,2,1,0.820218,"super, bowl, good, team, time, buccaneer, football, weekend, many, history","[good, evening, dante, super, bowl]"
3,3,4,0.470985,"game, thing, big, tv, event, point, last, 🏆, lot, reason","[super, bowl, takeout, wing, special, lbs, bone, lb, boneless, takeout, mix, kitchen, order, wing]"
4,4,1,0.775329,"super, bowl, good, team, time, buccaneer, football, weekend, many, history","[tenth, super, bowl, weekend]"
...,...,...,...,...,...
16881,16881,1,0.619995,"super, bowl, good, team, time, buccaneer, football, weekend, many, history","[super, bowl, w, cowboy]"
16882,16882,1,0.294637,"super, bowl, good, team, time, buccaneer, football, weekend, many, history","[dak, super, bowl, ring, first, cowboy, dak]"
16883,16883,1,0.619999,"super, bowl, good, team, time, buccaneer, football, weekend, many, history","[event, super, bowl, weekend]"
16884,16884,0,0.584032,"bowl, super, year, chief, ad, champion, shit, life, champ, work","[divisional, round, super, bowl, fucking, guy]"


In [81]:
# Display setting to show more characters in column
pd.options.display.max_colwidth = 100

# Group the per-document table by dominant topic.
sent_topics_outdf_grpd = df_topic_sents_keywords.groupby('Dominant_Topic')

# For each topic keep the single document with the highest contribution.
# The winners are gathered first and concatenated once, instead of growing a
# frame inside the loop starting from an empty DataFrame (pandas has
# deprecated letting empty entries take part in concat), and the index is
# reset without inplace=True.
# NOTE(review): the "_mallet" suffix is kept only because this name may be
# used elsewhere; the model built in this notebook is gensim's LdaModel.
top_doc_per_topic = [
    grp.sort_values(['Perc_Contribution'], ascending=False).head(1)
    for _, grp in sent_topics_outdf_grpd
]
sent_topics_sorteddf_mallet = pd.concat(top_doc_per_topic, axis=0).reset_index(drop=True)

# Format
sent_topics_sorteddf_mallet.columns = ['Topic_Num', "Topic_Perc_Contrib", "Keywords", "Representative Text"]

# Show
sent_topics_sorteddf_mallet.head(10)

Unnamed: 0,Topic_Num,Topic_Perc_Contrib,Keywords,Representative Text
0,0,0.906416,"bowl, super, year, chief, ad, champion, shit, life, champ, work","[glory, tgbtg, 🏼, congratulation, super, bowl, champ, 🙏, 🏼]"
1,1,0.949999,"super, bowl, good, team, time, buccaneer, football, weekend, many, history","[super, super, super, super, super, super, super, super, super, super, super, super, super, supe..."
2,2,0.769534,"people, fan, bad, man, today, bucs, coach, medium, stadium, free","[rocket, mortgage, ®, superbowl, square, sweepstake, chance, big, super, bowl, score, change, gr..."
3,3,0.866875,"party, mask, top, state, 🇸, low, number, ball, hope, case","[party, spam, oven, potato, wedge, cheesy, focaccia, bread, marinara]"
4,4,0.777851,"game, thing, big, tv, event, point, last, 🏆, lot, reason","[sound, superspreader, event, crossconnection]"
5,5,0.774992,"way, much, loss, whole, superbowllv, 🏾, post, smh, ready, one","[ready, kelce, pat]"
6,6,0.874109,"halftime, show, next, mahome, week, performance, sport, love, streaker, line","[fresh, road, tap, keyword, theater, friday‼️, newmovie]"
7,7,0.943745,"superbowl, last, year, win, amp, ring, guy, rating, well, money","[🖤, 🖤, 🖤, 🖤, 🖤, 🖤, 🖤, 🖤, 🖤, 🖤, 🖤, 🖤, 🖤, 🖤, superbowl]"
8,8,0.774995,"commercial, great, second, home, little, lv, moment, different, flag, check","[home, lv, gobuc]"
9,9,0.796822,"night, day, season, first, part, yesterday, field, beach, parade, right","[existence, positive, affirmation, beach]"
