# Clustering BYU football and basketball tweets and their comments scraped from Twitter

## Import modules and read in data

In [1]:
# Import modules
import pandas as pd
import numpy as np

In [2]:
# Load the joined tweets-with-comments export.
# Only the tweet text and the comment text columns are read.
text_columns = ['Tweet_Content', 'Comment_Content']
data = pd.read_csv(
    './SportsTwitterClean_JoinedGameData - SportsTwitterClean.csv',
    usecols=text_columns,
)

In [3]:
# Store each row's starting position (0..n-1) in its own column so the
# original row order can be recovered later.
row_count = data.shape[0]
data['originalIndex'] = np.arange(row_count)
data.head()

Unnamed: 0,Tweet_Content,Comment_Content,originalIndex
0,Cody Fueger isnt as well-known as BYU basketba...,Thanks Jay for the great article. I owe it all...,0
1,Cody Fueger isnt as well-known as BYU basketba...,Cody Fueger isnt as well-known as BYU basketba...,1
2,Cody Fueger isnt as well-known as BYU basketba...,We love Coach Fueger!,2
3,Cody Fueger isnt as well-known as BYU basketba...,Anybody that dont know the name didnt play any...,3
4,Cody Fueger isnt as well-known as BYU basketba...,is the man! Ive been a huge fan of his since h...,4


## Pre-Processing

In [4]:
# Null check: drop every row with a missing value in any column, then
# compare the row counts before and after.
new_data = data.dropna(how='any', axis=0)
new_data.head()  # result is discarded: not the last expression of the cell

rows_before = len(data)
rows_after = len(new_data)
print("Old data frame length:", rows_before)
print("New data frame length:", rows_after)
print("Number of rows with at least 1 NA value: ", (rows_before - rows_after))

# Recorded result of this check: 3278 rows before, 3278 rows after,
# 0 rows with at least one NA value -> no NA values in the data.


Old data frame length: 3278
New data frame length: 3278
Number of rows with at least 1 NA value:  0


In [5]:
# Strip basic punctuation and lowercase the tweet and comment text.
# NOTE: only the characters , . ! ? are removed here. Twitter symbols such as
# '#' and '@' are NOT removed by this cell and remain in the text.
import re

# Raw-string pattern. The previous '[,\.!?]' put the invalid escape '\.' in a
# normal string literal, which newer Python versions warn about; inside a
# character class the dot needs no escaping, so the matched set is unchanged.
punctuation_re = re.compile(r'[,.!?]')


def strip_punct_and_lower(text):
    """Return ``str(text)`` with , . ! ? removed and converted to lowercase."""
    return punctuation_re.sub('', str(text)).lower()


# One pass per column (previously two: remove punctuation, then lowercase).
tweets_processed = data['Tweet_Content'].map(strip_punct_and_lower)
comments_processed = data['Comment_Content'].map(strip_punct_and_lower)

comments_processed.head()

0    thanks jay for the great article i owe it all ...
1    cody fueger isnt as well-known as byu basketba...
2                                 we love coach fueger
3    anybody that dont know the name didnt play any...
4    is the man ive been a huge fan of his since he...
Name: Comment_Content, dtype: object

In [6]:
# Split every cleaned tweet and comment into word tokens with NLTK.
import nltk
nltk.download('punkt')  # tokenizer data required by nltk.word_tokenize
tweet_token_list = tweets_processed.map(nltk.word_tokenize)
comment_token_list = comments_processed.map(nltk.word_tokenize)

# Preview the first 20 rows of each Series. The len() values are the number
# of rows (one token list per row).
print("Comment Token List : ",comment_token_list[:20])
print("\n Total Comment Tokens : ",len(comment_token_list))

print("Tweet Token List : ",tweet_token_list[:20])
print("\n Total Tweet Tokens : ",len(tweet_token_list))

[nltk_data] Downloading package punkt to C:\Users\Colby
[nltk_data]     Nelson\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Comment Token List :  0     [thanks, jay, for, the, great, article, i, owe...
1     [cody, fueger, isnt, as, well-known, as, byu, ...
2                             [we, love, coach, fueger]
3     [anybody, that, dont, know, the, name, didnt, ...
4     [is, the, man, ive, been, a, huge, fan, of, hi...
5                                               [facts]
6                        [ehhh, hes, alright, i, guess]
7                                     [nice, hi, serge]
8                                           [go, gaels]
9     [its, all, good, but, just, a, few, notes, of,...
10    [go, gaels, were, excited, to, be, the, presen...
11       [at, least, the, hair, cut, was, better, then]
12          [rare, picture, of, a, young, zac, seljaas]
13                            [aye, it, worked, though]
14    [this, is, during, any, byu, basketball, game,...
15    [byu, football, and, basketball, are, both, cu...
16    [they, just, said, on, radio, for, 2, weeks, h...
17    [are, you, freaking,

In [7]:
# Use WordNet library to lemmatize
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()


def lemmatize_tokens(tokens):
    """Return a new list with every token passed through the WordNet lemmatizer."""
    return [lemmatizer.lemmatize(word) for word in tokens]


# Series.map builds a new Series row by row. This replaces the manual `i`
# counter loops, which wrote lists back into the Series by label and so only
# worked while the index was exactly 0..n-1.
comment_token_list = comment_token_list.map(lemmatize_tokens)
tweet_token_list = tweet_token_list.map(lemmatize_tokens)

print("Comment token list after Lemmatization : ", comment_token_list[:20])
print("\nTotal comment tokens after Lemmatization : ", len(comment_token_list))
print("Tweet token list after Lemmatization : ", tweet_token_list[:20])
print("\nTotal tweet tokens after Lemmatization : ", len(tweet_token_list))

# load nltk's SnowballStemmer as variable 'stemmer'
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")


def stem_tokens(tokens):
    """Return a new list with every token reduced to its Snowball (English) stem."""
    return [stemmer.stem(token) for token in tokens]


# Stemming is applied on top of the lemmatized tokens, as before.
comment_token_list = comment_token_list.map(stem_tokens)
tweet_token_list = tweet_token_list.map(stem_tokens)

[nltk_data] Downloading package wordnet to C:\Users\Colby
[nltk_data]     Nelson\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Comment token list after Lemmatization :  0     [thanks, jay, for, the, great, article, i, owe...
1     [cody, fueger, isnt, a, well-known, a, byu, ba...
2                             [we, love, coach, fueger]
3     [anybody, that, dont, know, the, name, didnt, ...
4     [is, the, man, ive, been, a, huge, fan, of, hi...
5                                                [fact]
6                         [ehhh, he, alright, i, guess]
7                                     [nice, hi, serge]
8                                            [go, gael]
9     [it, all, good, but, just, a, few, note, of, d...
10    [go, gael, were, excited, to, be, the, present...
11        [at, least, the, hair, cut, wa, better, then]
12          [rare, picture, of, a, young, zac, seljaas]
13                            [aye, it, worked, though]
14    [this, is, during, any, byu, basketball, game,...
15    [byu, football, and, basketball, are, both, cu...
16    [they, just, said, on, radio, for, 2, week, he...
17    

### Remove duplicated tweets in dataframe for LDA analysis

In [8]:
# Collapse each run of identical *consecutive* tweet token lists into a single
# entry for the LDA analysis. The full-length Series stays available under
# tweet_token_list_with_dups.
tweet_token_list_with_dups = tweet_token_list

import itertools
tweet_token_list = [
    tokens for tokens, _run in itertools.groupby(tweet_token_list_with_dups)
]

print("Tweet token list after dup removal : ", tweet_token_list[:20])
print("\nTotal tweet tokens after dup removal : ", len(tweet_token_list))

Tweet token list after dup removal :  [['codi', 'fueger', 'isnt', 'a', 'well-known', 'a', 'byu', 'basketbal', 'coach', 'mark', 'pope', 'other', 'two', 'assist', 'but', 'he', 'just', 'a', 'valuabl', 'stori', 'by'], ['go', 'gael', 'were', 'excit', 'to', 'be', 'the', 'present', 'sponsor', 'of', 'the', 'v', 'byu', 'basketbal', 'game', 'tonight', 'and', 'to', 'celebr', 'the', 'newli', 'name', 'ucu', 'pavilion', '#', 'beatbyu', '#', 'ucupavilion', '#', 'gaelsris'], ['rare', 'pictur', 'of', 'a', 'young', 'zac', 'seljaa'], ['this', 'is', 'dure', 'ani', 'byu', 'basketbal', 'game', 'when', 'were', 'losingc', 'do', 'it', 'got', 'ta', 'leav', 'man', 'it', 'our', 'onli', 'hope', 'this', 'chief', 'fan', 'realli', 'left', 'in', 'the', '1st', 'quarter', '(', 'via', '@', 'cpenn4thewin', ')'], ['are', 'you', 'freak', 'kid', 'me', 'is', 'byu', 'basketbal', 'just', 'curs', 'gosh', 'damn', 'byus', 'yoeli', 'child', 'sidelin', 'for', 'saint', 'mari', 'game', 'due', 'to', 'injur', 'finger', 'stori', 'by', '@

### Process words with Gensim package for final dataset

In [9]:
# Gensim
import gensim, spacy, logging, warnings
import gensim.corpora as corpora
from gensim.utils import lemmatize, simple_preprocess
from gensim.models import CoherenceModel

# Phrase detection trained on the tweet token lists.
# Bigram model first (a higher threshold yields fewer phrases), plus its Phraser wrapper.
bigram = gensim.models.Phrases(tweet_token_list, min_count=5, threshold=100)
bigram_mod = gensim.models.phrases.Phraser(bigram)

# Trigram model is trained on the bigram-transformed tweets, then wrapped the same way.
trigram = gensim.models.Phrases(bigram[tweet_token_list], threshold=100)
trigram_mod = gensim.models.phrases.Phraser(trigram)

#### TODO: add more custom stopwords for this analysis

In [10]:
# Set Stopwords
from nltk.corpus import stopwords
nltk.download('stopwords')  # make sure the NLTK stopword corpus is available locally
# NLTK's English stopword list; process_words uses it as its default `stop_words`
stop_words = stopwords.words('english')
# Add Custom stopwords as a list (stop_words.extend(['sfd','sdf']))
# TODO: no custom stopwords are added yet; the line below is a placeholder
# stop_words.extend()

[nltk_data] Downloading package stopwords to C:\Users\Colby
[nltk_data]     Nelson\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [11]:
def process_words(texts, stop_words=stop_words, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Remove stopwords, form bigrams/trigrams, lemmatize and filter by part of speech.

    Parameters
    ----------
    texts : iterable
        Documents to process. Each document is converted with ``str()`` and
        tokenized by gensim's ``simple_preprocess``, so both token lists and
        plain strings are accepted.
    stop_words : iterable of str
        Words dropped before phrase detection and again after lemmatization.
    allowed_postags : iterable of str
        spaCy coarse part-of-speech tags (``token.pos_``) to keep. The default
        list is only read, never mutated.

    Returns
    -------
    list of list of str
        One token list per input document, in input order. A document can
        come back as an empty list when none of its tokens survive.

    Notes
    -----
    Relies on the module-level ``bigram_mod`` and ``trigram_mod`` phrase models.
    """
    # Set membership is O(1); the filter below runs once per token, twice per document.
    stop_set = set(stop_words)

    def tokenize_without_stopwords(doc):
        """Tokenize ``str(doc)`` with simple_preprocess and drop stopwords."""
        return [word for word in simple_preprocess(str(doc)) if word not in stop_set]

    texts = [tokenize_without_stopwords(doc) for doc in texts]
    texts = [bigram_mod[doc] for doc in texts]
    texts = [trigram_mod[bigram_mod[doc]] for doc in texts]

    # Load spaCy without the parser and NER components, which are not needed here.
    # The 'en' shortcut is tried first so existing setups behave exactly as before.
    # spaCy 3 no longer resolves shortcut names, so fall back to the package name.
    # NOTE(review): assumes the small English model is the one installed -- confirm.
    try:
        nlp = spacy.load('en', disable=['parser', 'ner'])
    except OSError:
        nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent))
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])

    # Remove stopwords once more: lemmatization can turn a token into a stopword.
    return [tokenize_without_stopwords(doc) for doc in texts_out]

# Run the same cleaning pipeline on both corpora.
# NOTE: tweet_token_list has already had consecutive duplicates collapsed, so
# tweet_data_ready has one entry per kept tweet, not one entry per row of `data`;
# comment_data_ready has one entry per row.
tweet_data_ready = process_words(tweet_token_list)  # processed Tweet Data!
comment_data_ready = process_words(comment_token_list)  # processed Comment Data!

In [12]:
# Spot-check the first three processed documents of each corpus
print(tweet_data_ready[:3])
print(comment_data_ready[:3])

[['fueger', 'well', 'know', 'valuabl', 'stori'], ['go', 'excit', 'present', 'sponsor', 'tonight', 'name', 'pavilion'], ['rare']]
[['thank', 'work', 'staff', 'player'], ['fueger', 'well', 'know', 'valuabl', 'stori'], ['love', 'coach', 'fueger']]


## Tweet analysis section
#### Need to test different numbers of topics to see which are most relevant (esp. where merchandise is relevant)

In [13]:
# Import pprint
from pprint import pprint

# Vocabulary: one integer id per distinct token in the processed tweets
id2word = corpora.Dictionary(tweet_data_ready)

# Bag-of-words corpus: each tweet becomes a list of (token id, count) pairs
corpus = [id2word.doc2bow(tokens) for tokens in tweet_data_ready]

# Topic count for this run -- edit and re-run the cell to compare other values
num_topics = 6

# Fit LDA on the tweet corpus; random_state is fixed to seed the model
lda_model = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=num_topics,
    alpha='symmetric',
    passes=10,
    iterations=500,
    chunksize=10,
    update_every=1,
    per_word_topics=True,
    random_state=100,
)

# Unformatted topics are stored in ldatopics; formatted topics are printed
ldatopics = lda_model.show_topics(formatted=False)
pprint(lda_model.print_topics())

[(0,
  '0.043*"week" + 0.034*"end" + 0.033*"play" + 0.027*"scarf" + 0.027*"sell" + '
  '0.027*"still" + 0.024*"great" + 0.022*"know" + 0.021*"ask" + 0.018*"care"'),
 (1,
  '0.120*"go" + 0.053*"get" + 0.045*"walk" + 0.037*"good" + 0.031*"guy" + '
  '0.024*"make" + 0.016*"side" + 0.016*"parti" + 0.016*"pink" + 0.016*"dress"'),
 (2,
  '0.053*"footbal" + 0.051*"team" + 0.048*"fan" + 0.036*"year" + 0.032*"win" + '
  '0.031*"look" + 0.026*"would" + 0.024*"come" + 0.020*"see" + 0.019*"follow"'),
 (3,
  '0.123*"game" + 0.032*"player" + 0.030*"byu" + 0.027*"coach" + 0.025*"let" + '
  '0.023*"big" + 0.021*"may" + 0.020*"find" + 0.018*"kid" + 0.016*"low"'),
 (4,
  '0.067*"time" + 0.046*"season" + 0.033*"first" + 0.032*"watch" + 0.029*"top" '
  '+ 0.029*"think" + 0.025*"mean" + 0.024*"start" + 0.023*"basketbal" + '
  '0.020*"rank"'),
 (5,
  '0.045*"take" + 0.038*"back" + 0.032*"nation" + 0.032*"love" + 0.024*"also" '
  '+ 0.023*"lot" + 0.022*"drop" + 0.022*"state" + 0.021*"point" + '
  '0.018*"fee

In [14]:
# Interactive pyLDAvis view of the model currently held in lda_model / corpus
# (the tweet model when the notebook is run top to bottom).
# NOTE(review): newer pyLDAvis releases appear to expose this module as
# pyLDAvis.gensim_models -- confirm against the installed version.
import pyLDAvis.gensim

pyLDAvis.enable_notebook()  # render the visualisation inline in the notebook
vis = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary=lda_model.id2word)
vis

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [21]:
def format_topics_sentences(ldamodel=None, corpus=corpus, texts=data):
    """Build a table with the dominant topic of every document in ``corpus``.

    Parameters
    ----------
    ldamodel : gensim LDA model
        Trained model. Required despite the ``None`` default.
    corpus : iterable
        Bag-of-words corpus indexed as ``ldamodel[corpus]``. The default is the
        module-level ``corpus`` as it existed when this function was defined,
        so pass it explicitly.
    texts : sequence
        One entry per document; wrapped in ``pd.Series`` and appended as the
        last column of the result.

    Returns
    -------
    pandas.DataFrame
        Columns ``Dominant_Topic`` (topic id), ``Perc_Contribution`` (topic
        weight rounded to 4 decimals), ``Topic_Keywords`` (comma-separated
        words from ``ldamodel.show_topic``) and the ``texts`` column.

    Raises
    ------
    TypeError
        If ``ldamodel`` is None.
    """
    if ldamodel is None:
        raise TypeError("format_topics_sentences() requires a trained ldamodel")

    keywords_by_topic = {}  # topic id -> keyword string, computed once per topic
    records = []

    # Get main topic in each document
    for row_list in ldamodel[corpus]:
        # With per_word_topics the model returns a tuple whose first item is
        # the list of (topic id, weight) pairs for the document.
        row = row_list[0] if ldamodel.per_word_topics else row_list
        if not row:
            # Keep one output row per document so rows stay aligned with `texts`.
            records.append((np.nan, np.nan, ''))
            continue

        # max() returns the first pair with the highest weight, the same pair a
        # stable descending sort would put first.
        topic_num, prop_topic = max(row, key=lambda pair: pair[1])
        if topic_num not in keywords_by_topic:
            keywords_by_topic[topic_num] = ", ".join(
                word for word, _weight in ldamodel.show_topic(topic_num)
            )
        records.append(
            (int(topic_num), round(float(prop_topic), 4), keywords_by_topic[topic_num])
        )

    # Build the frame once instead of appending a row per document: repeated
    # DataFrame.append is quadratic and is not available in pandas 2.x.
    sent_topics_df = pd.DataFrame(
        records, columns=['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']
    )

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return sent_topics_df

# Dominant topic of every tweet in the model's corpus
df_topic_sents_keywords = format_topics_sentences(
    ldamodel=lda_model,
    corpus=corpus,
    texts=tweet_data_ready,
)

# Format: put the topic table next to `data` and give the topic columns
# tweet-specific names.
# This does not include multiple entries for each tweet; the values still need
# to be duplicated for each row of a repeated tweet.
df_dominant_topic = pd.concat(
    [df_topic_sents_keywords.reset_index(), data],
    axis=1,
).rename(columns={'Dominant_Topic':'Tweet_Dom_Topic','Perc_Contribution':'Tweet_Perc_Contribution','Topic_Keywords':'Tweet_Topic_Keywords'})
df_dominant_topic




Unnamed: 0,index,Tweet_Dom_Topic,Tweet_Perc_Contribution,Tweet_Topic_Keywords,0,Tweet_Content,Comment_Content,originalIndex,Tweet_Dom_Topic.1,Tweet_Perc_Contribution.1,Tweet_Topic_Keywords.1,Comment_Dom_Topic,Comment_Perc_Contribution,Comment_Topic_Keywords
0,0,2.0,0.6346,"good, man, want, look, player, love, thing, li...","[fueger, well, know, valuabl, stori]",Cody Fueger isnt as well-known as BYU basketba...,Thanks Jay for the great article. I owe it all...,0,0.0,0.8593,"week, end, play, scarf, sell, still, great, kn...",2.0,0.6346,"good, man, want, look, player, love, thing, li..."
1,1,5.0,0.5779,"would, year, know, way, play, could, back, may...","[go, excit, present, sponsor, tonight, name, p...",Cody Fueger isnt as well-known as BYU basketba...,Cody Fueger isnt as well-known as BYU basketba...,1,0.0,0.8593,"week, end, play, scarf, sell, still, great, kn...",5.0,0.5779,"would, year, know, way, play, could, back, may..."
2,2,2.0,0.4226,"good, man, want, look, player, love, thing, li...",[rare],Cody Fueger isnt as well-known as BYU basketba...,We love Coach Fueger!,2,0.0,0.8593,"week, end, play, scarf, sell, still, great, kn...",2.0,0.4226,"good, man, want, look, player, love, thing, li..."
3,3,5.0,0.7709,"would, year, know, way, play, could, back, may...","[game, losingc, get, realli, leave, quarter]",Cody Fueger isnt as well-known as BYU basketba...,Anybody that dont know the name didnt play any...,3,0.0,0.8593,"week, end, play, scarf, sell, still, great, kn...",5.0,0.7709,"would, year, know, way, play, could, back, may..."
4,4,1.0,0.4752,"see, time, come, fan, win, half, woman, first,...","[cur, game, due, finger, sport, basketball, yo...",Cody Fueger isnt as well-known as BYU basketba...,is the man! Ive been a huge fan of his since h...,4,0.0,0.8593,"week, end, play, scarf, sell, still, great, kn...",1.0,0.4753,"see, time, come, fan, win, half, woman, first,..."
5,5,1.0,0.5833,"see, time, come, fan, win, half, woman, first,...","[make, shot, keep, throw]",Cody Fueger isnt as well-known as BYU basketba...,Facts,5,0.0,0.8593,"week, end, play, scarf, sell, still, great, kn...",1.0,0.5833,"see, time, come, fan, win, half, woman, first,..."
6,6,4.0,0.7194,"think, team, make, still, point, byu, great, l...","[onli, team, hear, leagu]",Cody Fueger isnt as well-known as BYU basketba...,ehhh hes alright I guess,6,0.0,0.8593,"week, end, play, scarf, sell, still, great, kn...",4.0,0.7194,"think, team, make, still, point, byu, great, l..."
7,7,0.0,0.1667,"game, get, say, week, better, bad, mean, give,...","[get, good, get, text, today, someon, meet, gc...",Go Gaels! Were excited to be the presenting sp...,Nice! Hi Serge!,7,1.0,0.7491,"go, get, walk, good, guy, make, side, parti, p...",0.0,0.1667,"game, get, say, week, better, bad, mean, give,..."
8,8,3.0,0.5833,"go, well, watch, much, let, need, also, wear, ...","[need, crummi, way, run, thing, holmo, come]",Go Gaels! Were excited to be the presenting sp...,Go Gaels!,8,1.0,0.7491,"go, get, walk, good, guy, make, side, parti, p...",3.0,0.5833,"go, well, watch, much, let, need, also, wear, ..."
9,9,2.0,0.6943,"good, man, want, look, player, love, thing, li...","[fan, actual, enjoy, sport, onli, win, footbal...",Go Gaels! Were excited to be the presenting sp...,Its all good! But just a few notes of distinct...,9,1.0,0.7491,"go, get, walk, good, guy, make, side, parti, p...",2.0,0.6943,"good, man, want, look, player, love, thing, li..."


In [16]:
# Duplicate LDA values for each tweet
#
# The tweet topic rows in df_dominant_topic are in the order of the
# de-duplicated tweet list, which was built by collapsing runs of identical
# consecutive *token lists* with itertools.groupby. The same grouping is
# replayed here on tweet_token_list_with_dups to find, for every row of
# `data`, which run (and therefore which topic row) it belongs to.
# Comparing raw Tweet_Content strings instead would drift out of step as soon
# as two adjacent, differently written tweets reduce to the same tokens.
import itertools

run_ids = []
for run_id, (_tokens, run) in enumerate(itertools.groupby(tweet_token_list_with_dups)):
    run_ids.extend([run_id] * sum(1 for _ in run))

assert len(run_ids) == len(data), "token lists and data must have the same number of rows"

# Positional lookup of each row's topic values
tweet_dom_topic = df_dominant_topic['Tweet_Dom_Topic'].iloc[run_ids].tolist()
tweet_perc_contribution = df_dominant_topic['Tweet_Perc_Contribution'].iloc[run_ids].tolist()
tweet_topic_keywords = df_dominant_topic['Tweet_Topic_Keywords'].iloc[run_ids].tolist()

data['Tweet_Dom_Topic'] = tweet_dom_topic
data['Tweet_Perc_Contribution'] = tweet_perc_contribution
data['Tweet_Topic_Keywords'] = tweet_topic_keywords
data

Unnamed: 0,Tweet_Content,Comment_Content,originalIndex,Tweet_Dom_Topic,Tweet_Perc_Contribution,Tweet_Topic_Keywords
0,Cody Fueger isnt as well-known as BYU basketba...,Thanks Jay for the great article. I owe it all...,0,0.0,0.8593,"week, end, play, scarf, sell, still, great, kn..."
1,Cody Fueger isnt as well-known as BYU basketba...,Cody Fueger isnt as well-known as BYU basketba...,1,0.0,0.8593,"week, end, play, scarf, sell, still, great, kn..."
2,Cody Fueger isnt as well-known as BYU basketba...,We love Coach Fueger!,2,0.0,0.8593,"week, end, play, scarf, sell, still, great, kn..."
3,Cody Fueger isnt as well-known as BYU basketba...,Anybody that dont know the name didnt play any...,3,0.0,0.8593,"week, end, play, scarf, sell, still, great, kn..."
4,Cody Fueger isnt as well-known as BYU basketba...,is the man! Ive been a huge fan of his since h...,4,0.0,0.8593,"week, end, play, scarf, sell, still, great, kn..."
5,Cody Fueger isnt as well-known as BYU basketba...,Facts,5,0.0,0.8593,"week, end, play, scarf, sell, still, great, kn..."
6,Cody Fueger isnt as well-known as BYU basketba...,ehhh hes alright I guess,6,0.0,0.8593,"week, end, play, scarf, sell, still, great, kn..."
7,Go Gaels! Were excited to be the presenting sp...,Nice! Hi Serge!,7,1.0,0.7491,"go, get, walk, good, guy, make, side, parti, p..."
8,Go Gaels! Were excited to be the presenting sp...,Go Gaels!,8,1.0,0.7491,"go, get, walk, good, guy, make, side, parti, p..."
9,Go Gaels! Were excited to be the presenting sp...,Its all good! But just a few notes of distinct...,9,1.0,0.7491,"go, get, walk, good, guy, make, side, parti, p..."


## This is the same process for the comments (except for the duplicate handling)
#### Need to test different numbers of topics to see which are most relevant (esp. where merchandise is relevant)

In [17]:
# Import pprint
from pprint import pprint

# Vocabulary: one integer id per distinct token in the processed comments
# (this rebinds id2word, corpus and lda_model to the comment data)
id2word = corpora.Dictionary(comment_data_ready)

# Bag-of-words corpus: each comment becomes a list of (token id, count) pairs
corpus = [id2word.doc2bow(tokens) for tokens in comment_data_ready]

# Topic count for this run -- edit and re-run the cell to compare other values
num_topics = 6

# Fit LDA on the comment corpus; random_state is fixed to seed the model
lda_model = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=num_topics,
    alpha='symmetric',
    passes=10,
    iterations=500,
    chunksize=10,
    update_every=1,
    per_word_topics=True,
    random_state=100,
)

# Unformatted topics are stored in ldatopics; formatted topics are printed
ldatopics = lda_model.show_topics(formatted=False)
pprint(lda_model.print_topics())

[(0,
  '0.096*"game" + 0.058*"get" + 0.041*"say" + 0.034*"week" + 0.032*"better" + '
  '0.031*"bad" + 0.026*"mean" + 0.024*"give" + 0.022*"kid" + 0.022*"day"'),
 (1,
  '0.065*"see" + 0.055*"time" + 0.049*"come" + 0.045*"fan" + 0.041*"win" + '
  '0.028*"half" + 0.025*"woman" + 0.024*"first" + 0.018*"gocoug" + '
  '0.018*"nation"'),
 (2,
  '0.057*"good" + 0.051*"man" + 0.044*"want" + 0.043*"look" + 0.034*"player" + '
  '0.032*"love" + 0.030*"thing" + 0.018*"live" + 0.018*"old" + 0.017*"thank"'),
 (3,
  '0.137*"go" + 0.067*"well" + 0.041*"watch" + 0.031*"much" + 0.027*"let" + '
  '0.022*"need" + 0.020*"also" + 0.019*"wear" + 0.019*"alway" + 0.016*"side"'),
 (4,
  '0.050*"think" + 0.047*"team" + 0.042*"make" + 0.040*"still" + 0.032*"point" '
  '+ 0.032*"byu" + 0.032*"great" + 0.030*"lose" + 0.025*"state" + '
  '0.025*"take"'),
 (5,
  '0.100*"would" + 0.064*"year" + 0.035*"know" + 0.035*"way" + 0.033*"play" + '
  '0.029*"could" + 0.027*"back" + 0.024*"may" + 0.021*"actual" + '
  '0.019*"alm

In [18]:
# Interactive pyLDAvis view of the model currently held in lda_model / corpus
# (the comment model when the notebook is run top to bottom).
# NOTE(review): newer pyLDAvis releases appear to expose this module as
# pyLDAvis.gensim_models -- confirm against the installed version.
import pyLDAvis.gensim

pyLDAvis.enable_notebook()  # render the visualisation inline in the notebook
vis = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary=lda_model.id2word)
vis

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [19]:
def format_topics_sentences(ldamodel=None, corpus=corpus, texts=data):
    """Build a table with the dominant topic of every document in ``corpus``.

    Parameters
    ----------
    ldamodel : gensim LDA model
        Trained model. Required despite the ``None`` default.
    corpus : iterable
        Bag-of-words corpus indexed as ``ldamodel[corpus]``. The default is the
        module-level ``corpus`` as it existed when this function was defined,
        so pass it explicitly.
    texts : sequence
        One entry per document; wrapped in ``pd.Series`` and appended as the
        last column of the result.

    Returns
    -------
    pandas.DataFrame
        Columns ``Dominant_Topic`` (topic id), ``Perc_Contribution`` (topic
        weight rounded to 4 decimals), ``Topic_Keywords`` (comma-separated
        words from ``ldamodel.show_topic``) and the ``texts`` column.

    Raises
    ------
    TypeError
        If ``ldamodel`` is None.
    """
    if ldamodel is None:
        raise TypeError("format_topics_sentences() requires a trained ldamodel")

    keywords_by_topic = {}  # topic id -> keyword string, computed once per topic
    records = []

    # Get main topic in each document
    for row_list in ldamodel[corpus]:
        # With per_word_topics the model returns a tuple whose first item is
        # the list of (topic id, weight) pairs for the document.
        row = row_list[0] if ldamodel.per_word_topics else row_list
        if not row:
            # Keep one output row per document so rows stay aligned with `texts`.
            records.append((np.nan, np.nan, ''))
            continue

        # max() returns the first pair with the highest weight, the same pair a
        # stable descending sort would put first.
        topic_num, prop_topic = max(row, key=lambda pair: pair[1])
        if topic_num not in keywords_by_topic:
            keywords_by_topic[topic_num] = ", ".join(
                word for word, _weight in ldamodel.show_topic(topic_num)
            )
        records.append(
            (int(topic_num), round(float(prop_topic), 4), keywords_by_topic[topic_num])
        )

    # Build the frame once instead of appending a row per document: repeated
    # DataFrame.append is quadratic and is not available in pandas 2.x.
    sent_topics_df = pd.DataFrame(
        records, columns=['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']
    )

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return sent_topics_df

# Dominant topic of every comment in the model's corpus
df_topic_sents_keywords = format_topics_sentences(
    ldamodel=lda_model,
    corpus=corpus,
    texts=comment_data_ready,
)

# Format: put the topic table next to `data` and give the topic columns
# comment-specific names.
df_dominant_topic = pd.concat(
    [df_topic_sents_keywords.reset_index(), data],
    axis=1,
).rename(columns={'Dominant_Topic':'Comment_Dom_Topic','Perc_Contribution':'Comment_Perc_Contribution','Topic_Keywords':'Comment_Topic_Keywords'})
df_dominant_topic

Unnamed: 0,index,Comment_Dom_Topic,Comment_Perc_Contribution,Comment_Topic_Keywords,0,Tweet_Content,Comment_Content,originalIndex,Tweet_Dom_Topic,Tweet_Perc_Contribution,Tweet_Topic_Keywords
0,0,2.0,0.6346,"good, man, want, look, player, love, thing, li...","[thank, work, staff, player]",Cody Fueger isnt as well-known as BYU basketba...,Thanks Jay for the great article. I owe it all...,0,0.0,0.8593,"week, end, play, scarf, sell, still, great, kn..."
1,1,5.0,0.5779,"would, year, know, way, play, could, back, may...","[fueger, well, know, valuabl, stori]",Cody Fueger isnt as well-known as BYU basketba...,Cody Fueger isnt as well-known as BYU basketba...,1,0.0,0.8593,"week, end, play, scarf, sell, still, great, kn..."
2,2,2.0,0.4226,"good, man, want, look, player, love, thing, li...","[love, coach, fueger]",Cody Fueger isnt as well-known as BYU basketba...,We love Coach Fueger!,2,0.0,0.8593,"week, end, play, scarf, sell, still, great, kn..."
3,3,5.0,0.7709,"would, year, know, way, play, could, back, may...","[know, name, play, back, back, back, know]",Cody Fueger isnt as well-known as BYU basketba...,Anybody that dont know the name didnt play any...,3,0.0,0.8593,"week, end, play, scarf, sell, still, great, kn..."
4,4,1.0,0.4753,"see, time, come, fan, win, half, woman, first,...","[huge, fan, sinc, coach, summer, camp, come]",Cody Fueger isnt as well-known as BYU basketba...,is the man! Ive been a huge fan of his since h...,4,0.0,0.8593,"week, end, play, scarf, sell, still, great, kn..."
5,5,1.0,0.5833,"see, time, come, fan, win, half, woman, first,...",[fact],Cody Fueger isnt as well-known as BYU basketba...,Facts,5,0.0,0.8593,"week, end, play, scarf, sell, still, great, kn..."
6,6,4.0,0.7194,"think, team, make, still, point, byu, great, l...","[alright, guess]",Cody Fueger isnt as well-known as BYU basketba...,ehhh hes alright I guess,6,0.0,0.8593,"week, end, play, scarf, sell, still, great, kn..."
7,7,0.0,0.1667,"game, get, say, week, better, bad, mean, give,...",[],Go Gaels! Were excited to be the presenting sp...,Nice! Hi Serge!,7,1.0,0.7491,"go, get, walk, good, guy, make, side, parti, p..."
8,8,3.0,0.5833,"go, well, watch, much, let, need, also, wear, ...",[go],Go Gaels! Were excited to be the presenting sp...,Go Gaels!,8,1.0,0.7491,"go, get, walk, good, guy, make, side, parti, p..."
9,9,2.0,0.6943,"good, man, want, look, player, love, thing, li...","[good, distinct, man, woman, athlet, refer, co...",Go Gaels! Were excited to be the presenting sp...,Its all good! But just a few notes of distinct...,9,1.0,0.7491,"go, get, walk, good, guy, make, side, parti, p..."


In [20]:
# Copy the three comment-topic columns onto the final data set
# (column assignment aligns the values on the row index).
for column in ('Comment_Dom_Topic', 'Comment_Perc_Contribution', 'Comment_Topic_Keywords'):
    data[column] = df_dominant_topic[column]
data

Unnamed: 0,Tweet_Content,Comment_Content,originalIndex,Tweet_Dom_Topic,Tweet_Perc_Contribution,Tweet_Topic_Keywords,Comment_Dom_Topic,Comment_Perc_Contribution,Comment_Topic_Keywords
0,Cody Fueger isnt as well-known as BYU basketba...,Thanks Jay for the great article. I owe it all...,0,0.0,0.8593,"week, end, play, scarf, sell, still, great, kn...",2.0,0.6346,"good, man, want, look, player, love, thing, li..."
1,Cody Fueger isnt as well-known as BYU basketba...,Cody Fueger isnt as well-known as BYU basketba...,1,0.0,0.8593,"week, end, play, scarf, sell, still, great, kn...",5.0,0.5779,"would, year, know, way, play, could, back, may..."
2,Cody Fueger isnt as well-known as BYU basketba...,We love Coach Fueger!,2,0.0,0.8593,"week, end, play, scarf, sell, still, great, kn...",2.0,0.4226,"good, man, want, look, player, love, thing, li..."
3,Cody Fueger isnt as well-known as BYU basketba...,Anybody that dont know the name didnt play any...,3,0.0,0.8593,"week, end, play, scarf, sell, still, great, kn...",5.0,0.7709,"would, year, know, way, play, could, back, may..."
4,Cody Fueger isnt as well-known as BYU basketba...,is the man! Ive been a huge fan of his since h...,4,0.0,0.8593,"week, end, play, scarf, sell, still, great, kn...",1.0,0.4753,"see, time, come, fan, win, half, woman, first,..."
5,Cody Fueger isnt as well-known as BYU basketba...,Facts,5,0.0,0.8593,"week, end, play, scarf, sell, still, great, kn...",1.0,0.5833,"see, time, come, fan, win, half, woman, first,..."
6,Cody Fueger isnt as well-known as BYU basketba...,ehhh hes alright I guess,6,0.0,0.8593,"week, end, play, scarf, sell, still, great, kn...",4.0,0.7194,"think, team, make, still, point, byu, great, l..."
7,Go Gaels! Were excited to be the presenting sp...,Nice! Hi Serge!,7,1.0,0.7491,"go, get, walk, good, guy, make, side, parti, p...",0.0,0.1667,"game, get, say, week, better, bad, mean, give,..."
8,Go Gaels! Were excited to be the presenting sp...,Go Gaels!,8,1.0,0.7491,"go, get, walk, good, guy, make, side, parti, p...",3.0,0.5833,"go, well, watch, much, let, need, also, wear, ..."
9,Go Gaels! Were excited to be the presenting sp...,Its all good! But just a few notes of distinct...,9,1.0,0.7491,"go, get, walk, good, guy, make, side, parti, p...",2.0,0.6943,"good, man, want, look, player, love, thing, li..."
