In [1]:
#Base and Cleaning 
import sys
import seaborn as sns
from matplotlib import pyplot as plt
import pandas as pd
import numpy as np
import time
import pickle
from datetime import datetime
from datetime import date
import json
import requests
import emoji
import re
import string
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Gensim
import gensim, spacy, logging, warnings
import gensim.corpora as corpora
from gensim.utils import lemmatize, simple_preprocess

# Problem Statement
How can we use social media as an aggregator of topics and sentiment in user discussions? To investigate this, I used tweets posted over the course of a real-time event and checked how sentiment and topic occurrence changed in response to smaller discrete events over that period.

# Executive Summary
Sports fans generate a great deal of opinionated data in response to what is happening during various sporting events. In this data there are both discrete topics and sentiments associated with each topic. To investigate the ability of NLP to model and provide insights into collected social media sentiment, I collected and examined topics and sentiment during a specific football event. The data was cleaned to leave only basic English characters, and the words were stripped of stopwords, lemmatized, and tokenized. The tweets were then assigned a sentiment score based on the tokens identified, before being run through an unsupervised latent Dirichlet allocation (LDA) model to identify and assign topics to the tweets.
The results provide insight into fan sentiment during the chosen event and show that my approach was effective in answering the initial problem statement.

#### Methodology Summary Of This Notebook
1. The dataset(s) were imported and assigned to df
2. Mentions in the raw tweet text body were extracted into their own column; hashtags were already provided as a column in the dataset.
3. The tweet text was cleaned to leave only plain English words.
4. The tweet search query was restricted to English-language tweets, so language detection (langdetect) was not needed.
5. Because tweets often misspell or elongate player names, regex was used to collapse repeated letters so that player names are preserved as a topic. For example, 'WATKINSSSS' returned 'watkins' after regex.
6. The cleaned text was tokenized and lemmatized, then saved to a dataframe.
7. Term tokens were also saved for later visualisation.

# Creating a combined dataset

In [3]:
df=pd.read_csv('../datasets/muntotcombined.csv')

In [4]:
print(df.shape)
df.head()

(38701, 11)


Unnamed: 0,username,text,tweetcreatedts,hashtags,retweetcount,likecount,acctdesc,location,followers,totaltweets,usercreatedts
0,ManUtd,Under way at Old Trafford — come on United! 🔴⚪...,2020-10-04 15:30:16,"['MUFC', 'MUNTOT']",262,1917,Official #MUFC account. @ManUtd_ES 🇪🇸 | @ManUt...,"Manchester, England",23298880,61791,2012-04-20 15:17:43
1,NelsonNelli2,Ready for battle.\n\n#MUFC #MUNTOT https://t.c...,2020-10-04 15:30:16,"['MUFC', 'MUNTOT']",757,0,I'm jovial and fun to be with,"Port Harcourt, Nigeria",1,45,2020-09-25 07:43:39
2,Dcfox82,Cmon #mufc 3pts today please,2020-10-04 15:30:16,['mufc'],0,0,"work for nhs, Husband, dad of 3, student of li...","Derby,England",595,4873,2010-10-18 16:11:48
3,lynger2000,COME UNITED!!!!!!!!!!!! 🔴🔴 #MUNTOT #MUFC,2020-10-04 15:30:16,"['MUNTOT', 'MUFC']",0,0,A Scottish lass who will always and forever be...,"Leeds, England",3337,44646,2012-11-14 20:15:43
4,URMySolskjaer,"Telles and Cavani now done, #MUFC will have o...",2020-10-04 15:30:16,['MUFC'],432,0,,,43,2738,2015-11-22 17:14:36


In [5]:
# Adjusting the time

In [6]:
df['datetime']=pd.to_datetime(df['tweetcreatedts'])

In [7]:
# Truncate every timestamp to the start of its minute so tweets can later be
# grouped per minute. dt.floor is vectorised (no per-row Python lambda) and
# also clears any sub-second component, which replace(second=0) would keep.
df['datetime'] = df['datetime'].dt.floor('min')

In [8]:
df['time'] = df['datetime'].dt.time

In [9]:
df

Unnamed: 0,username,text,tweetcreatedts,hashtags,retweetcount,likecount,acctdesc,location,followers,totaltweets,usercreatedts,datetime,time
0,ManUtd,Under way at Old Trafford — come on United! 🔴⚪...,2020-10-04 15:30:16,"['MUFC', 'MUNTOT']",262,1917,Official #MUFC account. @ManUtd_ES 🇪🇸 | @ManUt...,"Manchester, England",23298880,61791,2012-04-20 15:17:43,2020-10-04 15:30:00,15:30:00
1,NelsonNelli2,Ready for battle.\n\n#MUFC #MUNTOT https://t.c...,2020-10-04 15:30:16,"['MUFC', 'MUNTOT']",757,0,I'm jovial and fun to be with,"Port Harcourt, Nigeria",1,45,2020-09-25 07:43:39,2020-10-04 15:30:00,15:30:00
2,Dcfox82,Cmon #mufc 3pts today please,2020-10-04 15:30:16,['mufc'],0,0,"work for nhs, Husband, dad of 3, student of li...","Derby,England",595,4873,2010-10-18 16:11:48,2020-10-04 15:30:00,15:30:00
3,lynger2000,COME UNITED!!!!!!!!!!!! 🔴🔴 #MUNTOT #MUFC,2020-10-04 15:30:16,"['MUNTOT', 'MUFC']",0,0,A Scottish lass who will always and forever be...,"Leeds, England",3337,44646,2012-11-14 20:15:43,2020-10-04 15:30:00,15:30:00
4,URMySolskjaer,"Telles and Cavani now done, #MUFC will have o...",2020-10-04 15:30:16,['MUFC'],432,0,,,43,2738,2015-11-22 17:14:36,2020-10-04 15:30:00,15:30:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3995,__DANEJO,Define Manchester United in 1 - 3 words 📽📽 #MU...,2020-10-04 17:21:27,['MUNTOT'],215,0,ᴬʲᵉᵖᵃᵏᵒ • ᴵ'ᵐ ᴼⁿˡʸ ᴴᵘᵐᵃⁿ ☺ • ᴬᵐᵇⁱᵛᵉʳᵗ • ᴱˣᵖˡᵒ...,North 🇳🇬,9140,34660,2014-06-26 11:30:26,2020-10-04 17:21:00,17:21:00
3996,Chachi97357115,Pogba and Shaw showing they are breaking. Ne...,2020-10-04 17:21:27,['MUNTOT'],0,0,,,1,3,2020-09-29 22:56:29,2020-10-04 17:21:00,17:21:00
3997,MiesterBob,PENALTY to Spurs...\n\n#MUNTOT,2020-10-04 17:21:27,['MUNTOT'],329,0,extreme,"Botswana, Gaborone",2,7,2019-02-27 08:29:10,2020-10-04 17:21:00,17:21:00
3998,2020Kuami,Who is a better defender? \n\n1. Retweet for M...,2020-10-04 17:21:27,['MUNTOT'],34,0,Neva give up bro 🙌🙌🙌🙌,"Greater Accra, Ghana",248,1789,2020-10-01 17:13:59,2020-10-04 17:21:00,17:21:00


# Initial Processing

In [10]:
def add_mentions(text):
    """Extract the @mentions contained in a tweet.

    Parameters
    ----------
    text : str
        Tweet body to scan.

    Returns
    -------
    list of str or float
        Every ``@handle`` found (leading ``@`` included), in order of
        appearance. ``np.nan`` is returned when there are no mentions or when
        ``text`` is not a string (e.g. NaN for a missing tweet body).
    """
    if not isinstance(text, str):
        # re.findall raises TypeError on non-string input such as NaN.
        return np.nan
    # \B requires the '@' not to follow a word character, so tokens such as
    # "name@example" are not picked up as mentions.
    mentionlist = re.findall(r'\B@\w+', text)
    # np.nan, not np.NaN: the capitalised alias was removed in NumPy 2.0.
    return mentionlist if mentionlist else np.nan

def add_hashtags(text):
    """Extract the #hashtags contained in a tweet.

    Parameters
    ----------
    text : str
        Tweet body to scan.

    Returns
    -------
    list of str or float
        Every hashtag found (leading ``#`` included, ASCII letters and digits
        only), in order of appearance. ``np.nan`` is returned when there are no
        hashtags or when ``text`` is not a string (e.g. NaN for a missing
        tweet body).
    """
    if not isinstance(text, str):
        # re.findall raises TypeError on non-string input such as NaN.
        return np.nan
    # \B keeps a '#' that directly follows a word character from matching;
    # the trailing \b requires the tag to end at a word boundary.
    hashtaglist = re.findall(r'\B(\#[a-zA-Z0-9]+\b)', text)
    # np.nan, not np.NaN: the capitalised alias was removed in NumPy 2.0.
    return hashtaglist if hashtaglist else np.nan

In [11]:
# Store the @mentions found in each tweet (NaN when a tweet has none). This runs
# before the text is lower-cased, so handles keep their original casing.
df['mentions'] = df['text'].apply(add_mentions)
# Hashtag extraction is left disabled - presumably because the loaded data
# already has a `hashtags` column (TODO confirm before re-enabling).
# df['hashtags'] = df['text'].apply(add_hashtags)

In [12]:
df['text'] = df['text'].str.lower()

In [13]:
df.head(10)

Unnamed: 0,username,text,tweetcreatedts,hashtags,retweetcount,likecount,acctdesc,location,followers,totaltweets,usercreatedts,datetime,time,mentions
0,ManUtd,under way at old trafford — come on united! 🔴⚪...,2020-10-04 15:30:16,"['MUFC', 'MUNTOT']",262,1917,Official #MUFC account. @ManUtd_ES 🇪🇸 | @ManUt...,"Manchester, England",23298880,61791,2012-04-20 15:17:43,2020-10-04 15:30:00,15:30:00,
1,NelsonNelli2,ready for battle.\n\n#mufc #muntot https://t.c...,2020-10-04 15:30:16,"['MUFC', 'MUNTOT']",757,0,I'm jovial and fun to be with,"Port Harcourt, Nigeria",1,45,2020-09-25 07:43:39,2020-10-04 15:30:00,15:30:00,
2,Dcfox82,cmon #mufc 3pts today please,2020-10-04 15:30:16,['mufc'],0,0,"work for nhs, Husband, dad of 3, student of li...","Derby,England",595,4873,2010-10-18 16:11:48,2020-10-04 15:30:00,15:30:00,
3,lynger2000,come united!!!!!!!!!!!! 🔴🔴 #muntot #mufc,2020-10-04 15:30:16,"['MUNTOT', 'MUFC']",0,0,A Scottish lass who will always and forever be...,"Leeds, England",3337,44646,2012-11-14 20:15:43,2020-10-04 15:30:00,15:30:00,
4,URMySolskjaer,"telles and cavani now done, #mufc will have o...",2020-10-04 15:30:16,['MUFC'],432,0,,,43,2738,2015-11-22 17:14:36,2020-10-04 15:30:00,15:30:00,
5,brunopolo22,manchester united and porto have reached an ag...,2020-10-04 15:30:16,[],23284,0,,,263,18152,2014-05-07 15:56:28,2020-10-04 15:30:00,15:30:00,
6,GeekCHarmin,solskjaer decided to have small sense starting...,2020-10-04 15:30:16,['MUNTOT'],0,0,Mourinho FC,Nigeria,595,8620,2011-09-11 11:27:08,2020-10-04 15:30:00,15:30:00,
7,TJellyn,c'mon united! #mufc,2020-10-04 15:30:16,['MUFC'],2,0,Living In Lodebar.,"Toronto, Canada",904,140370,2009-06-23 23:31:15,2020-10-04 15:30:00,15:30:00,
8,Oloche69,cavani in manchester. medical to come. \ntelle...,2020-10-04 15:30:16,[],442,0,Trying to elevate small talk to medium talk. E...,,1695,357181,2015-03-28 23:38:27,2020-10-04 15:30:00,15:30:00,
9,BabaGroovy,final fee for alex telles will be around €15m ...,2020-10-04 15:30:16,[],3317,0,"Proud Father, All round cool dude.\n\n🎤 Podcas...",Earth 2,1011,131442,2017-01-14 16:17:34,2020-10-04 15:30:00,15:30:00,


# Text Cleaning

In [14]:
def twitter_cleaner(text):
    """Reduce a (lower-cased) tweet to plain ASCII letters and spaces.

    Steps, in order:
      1. replace @mentions with the placeholder word ``username``
      2. drop the stand-alone retweet marker ``rt``
      3. drop every whitespace-separated token that contains an emoji
         (this also collapses runs of whitespace/newlines to single spaces)
      4. drop URLs
      5. drop ``#`` symbols (the hashtag word itself is kept)
      6. drop punctuation
      7. drop words containing digits
      8. drop any remaining character that is not an ASCII letter, digit or space

    Parameters
    ----------
    text : str
        Tweet body; the ``rt`` marker is only matched in lower case.

    Returns
    -------
    str
        The cleaned text, or ``''`` when ``text`` is not a string (e.g. NaN).
    """
    if not isinstance(text, str):
        # A missing tweet body would otherwise crash re.sub with a TypeError.
        return ''

    # Replace usernames with a placeholder word.
    text = re.sub(r'\B@\w+', 'username', text)
    # Remove the retweet marker. The leading \b matters: without it the
    # pattern also strips the "rt" ending of ordinary words ("start" -> "sta").
    text = re.sub(r'\brt\b', '', text)

    # NOTE(review): this relies on emoji.UNICODE_EMOJI being a flat mapping
    # keyed by emoji characters; later releases of the emoji package are
    # believed to have restructured/removed it - confirm the pinned version.
    emoji_chars = {char for char in text if char in emoji.UNICODE_EMOJI}
    # Drop whole tokens that contain an emoji, not just the emoji character.
    text = ' '.join(
        token for token in text.split()
        if not any(char in token for char in emoji_chars)
    )

    text = re.sub(r'http\S+', '', text)  # Remove urls
    text = text.replace('#', '')  # Remove the '#' symbol, keep the tag word
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)  # Remove all punctuation
    text = re.sub(r'\w*\d\w*', '', text)  # Remove words containing numbers
    text = re.sub(r'[^a-zA-Z 0-9]', '', text)  # Remove all non letters or numbers
    return text


In [15]:
df['cleantext'] = df['text'].apply(twitter_cleaner)

In [16]:
df.head(10)

Unnamed: 0,username,text,tweetcreatedts,hashtags,retweetcount,likecount,acctdesc,location,followers,totaltweets,usercreatedts,datetime,time,mentions,cleantext
0,ManUtd,under way at old trafford — come on united! 🔴⚪...,2020-10-04 15:30:16,"['MUFC', 'MUNTOT']",262,1917,Official #MUFC account. @ManUtd_ES 🇪🇸 | @ManUt...,"Manchester, England",23298880,61791,2012-04-20 15:17:43,2020-10-04 15:30:00,15:30:00,,under way at old trafford come on united mufc...
1,NelsonNelli2,ready for battle.\n\n#mufc #muntot https://t.c...,2020-10-04 15:30:16,"['MUFC', 'MUNTOT']",757,0,I'm jovial and fun to be with,"Port Harcourt, Nigeria",1,45,2020-09-25 07:43:39,2020-10-04 15:30:00,15:30:00,,ready for battle mufc muntot
2,Dcfox82,cmon #mufc 3pts today please,2020-10-04 15:30:16,['mufc'],0,0,"work for nhs, Husband, dad of 3, student of li...","Derby,England",595,4873,2010-10-18 16:11:48,2020-10-04 15:30:00,15:30:00,,cmon mufc today please
3,lynger2000,come united!!!!!!!!!!!! 🔴🔴 #muntot #mufc,2020-10-04 15:30:16,"['MUNTOT', 'MUFC']",0,0,A Scottish lass who will always and forever be...,"Leeds, England",3337,44646,2012-11-14 20:15:43,2020-10-04 15:30:00,15:30:00,,come united muntot mufc
4,URMySolskjaer,"telles and cavani now done, #mufc will have o...",2020-10-04 15:30:16,['MUFC'],432,0,,,43,2738,2015-11-22 17:14:36,2020-10-04 15:30:00,15:30:00,,telles and cavani now done mufc will have one ...
5,brunopolo22,manchester united and porto have reached an ag...,2020-10-04 15:30:16,[],23284,0,,,263,18152,2014-05-07 15:56:28,2020-10-04 15:30:00,15:30:00,,manchester united and porto have reached an ag...
6,GeekCHarmin,solskjaer decided to have small sense starting...,2020-10-04 15:30:16,['MUNTOT'],0,0,Mourinho FC,Nigeria,595,8620,2011-09-11 11:27:08,2020-10-04 15:30:00,15:30:00,,solskjaer decided to have small sense starting...
7,TJellyn,c'mon united! #mufc,2020-10-04 15:30:16,['MUFC'],2,0,Living In Lodebar.,"Toronto, Canada",904,140370,2009-06-23 23:31:15,2020-10-04 15:30:00,15:30:00,,cmon united mufc
8,Oloche69,cavani in manchester. medical to come. \ntelle...,2020-10-04 15:30:16,[],442,0,Trying to elevate small talk to medium talk. E...,,1695,357181,2015-03-28 23:38:27,2020-10-04 15:30:00,15:30:00,,cavani in manchester medical to come telles cl...
9,BabaGroovy,final fee for alex telles will be around €15m ...,2020-10-04 15:30:16,[],3317,0,"Proud Father, All round cool dude.\n\n🎤 Podcas...",Earth 2,1011,131442,2017-01-14 16:17:34,2020-10-04 15:30:00,15:30:00,,final fee for alex telles will be around with...


In [17]:
df.to_csv('../datasets/muntot_cleaned.csv', index_label=False)

# Twitter-Specific Processing

In [18]:
df = pd.read_csv('../datasets/muntot_cleaned.csv')

In [19]:
df.reset_index(inplace = True, drop = True)

In [20]:
with open('pickle/muntot_playerlist.pickle', 'rb') as f:
    playerlist = pickle.load(f)

In [21]:
def clean_players(text, playerlist=None):
    """Collapse elongated spellings of player names to their canonical form.

    For every name in ``playerlist`` a pattern is built in which each letter
    may be repeated and may be upper or lower case, so e.g. ``'WATKINSSSS'``
    is rewritten to ``'watkins'``. The substitutions are applied one after
    another to the same running string, so every player is normalised.

    Parameters
    ----------
    text : str
        Cleaned tweet text.
    playerlist : iterable of str, optional
        Canonical player names. When omitted, the notebook-level
        ``playerlist`` variable is looked up at call time.

    Returns
    -------
    str
        ``text`` with stretched player names normalised; ``''`` when ``text``
        is not a string (e.g. NaN after a CSV round trip).

    Notes
    -----
    NOTE(review): each pattern is anchored only at the start of a word, so a
    longer word that begins with a stretched name is partly rewritten as
    well - confirm this is intended.
    """
    if playerlist is None:
        # Late binding: pick up the current notebook-level list instead of
        # freezing whatever object existed when this cell was executed.
        playerlist = globals()['playerlist']
    if not isinstance(text, str):
        return ''

    clean_text = text
    for name in playerlist:
        if not name:
            continue  # an empty name would yield a pattern matching every word start
        # One group per letter, e.g. 'k' -> ([Kk]+); re.escape guards against
        # names containing regex metacharacters such as '-' or ']'.
        pattern = r'\b' + ''.join(
            '([' + re.escape(letter.upper()) + re.escape(letter.lower()) + ']+)'
            for letter in name.lower()
        )
        # Feed the result of the previous substitution into the next one;
        # substituting on the original text kept only the last player's fix.
        clean_text = re.sub(pattern, name, clean_text)
    return clean_text

In [22]:
df['cleantext'] = df['cleantext'].apply(lambda x: clean_players(x))

# Tokenizing and Lemmatizing



In [23]:
# NLTK Stop words
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

In [24]:
# Load the extra terms to treat as stop words (presumably match-specific words
# such as team names and hashtags - confirm against the pickle's contents).
# NOTE: pickle.load can execute arbitrary code; only load pickles produced by
# this project.
with open('pickle/muntot.pickle', 'rb') as f:
    matchterms = pickle.load(f)
# Append only the terms that are not present yet, so re-running this cell does
# not keep growing stop_words with duplicates.
for term in matchterms:
    if term not in stop_words:
        stop_words.append(term)

In [25]:
def tokenize_lemmatize(df=df, textcolumn='cleantext'):
    """Tokenize, phrase-merge, stop-word-filter and lemmatize a text column.

    The text in ``df[textcolumn]`` is tokenized with gensim's
    ``simple_preprocess``, bigram/trigram phrase models are fitted on those
    tokens, stop words (the notebook-level ``stop_words`` list) are removed,
    the phrase models are applied, and the result is lemmatized with the
    spaCy ``en_core_web_lg`` model. The final token lists are written to a
    new ``tokens`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text. The default is the notebook-level ``df`` as it
        existed when this ``def`` was executed (default arguments are
        evaluated once, at definition time).
    textcolumn : str
        Name of the column to process.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in; it is modified in place by
        adding/overwriting the ``tokens`` column.
    """
    tweetlist = []
    # str() lets simple_preprocess accept non-string cell values as well.
    for tweet in df[textcolumn].values.tolist():
        tweet = simple_preprocess(str(tweet), deacc=True)
        tweetlist.append(tweet)
    
    # Instantiate and build the bigram and trigram models
    # (the models are fitted on the tokens before stop words are removed)
    bigram = gensim.models.Phrases(tweetlist, min_count=5, threshold=100) # higher threshold fewer phrases.
    trigram = gensim.models.Phrases(bigram[tweetlist], threshold=100)  
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    trigram_mod = gensim.models.phrases.Phraser(trigram)
    
    # Form bigrams and trigrams with the tweet content
    # NOTE(review): `tweet` is already a list of tokens here, so str(tweet) is
    # the list's repr being re-tokenized by simple_preprocess - confirm this is
    # intended rather than filtering the existing tokens directly.
    tweetlist = [[word for word in simple_preprocess(str(tweet)) if word not in stop_words] for tweet in tweetlist]
    tweetlist = [bigram_mod[tweet] for tweet in tweetlist]
    # NOTE(review): bigram_mod is applied a second time here, to tweets that
    # the previous line already passed through it - confirm this is intended.
    tweetlist = [trigram_mod[bigram_mod[tweet]] for tweet in tweetlist]
    
    output = []
    # Only lemmas are read below, so the parser and NER components are disabled.
    nlp = spacy.load('en_core_web_lg', disable=['parser', 'ner'])
    
    # Despite its name, `word` is one tweet's full token list; the tokens are
    # joined back into a single string so spaCy can process the tweet.
    for word in tweetlist:
        fulltweet = nlp(" ".join(word)) 
        output.append([token.lemma_ for token in fulltweet])
        
    # Final pass to filter the tokens for stopwords
    # (the lemma lists are again passed through str() + simple_preprocess)
    output = [[word for word in simple_preprocess(str(tweet)) if word not in stop_words] for tweet in output]
    df['tokens'] = output
    return df

df = tokenize_lemmatize()
    

In [26]:
df.to_csv('../datasets/muntot_cleaned_tokenized.csv', index_label=False)

In [27]:
df['tokens']

0                               [way, old, trafford, come]
1                                          [ready, battle]
2                                    [cmon, today, please]
3                                                   [come]
4        [telles, cavani, one, final, push, see, finall...
                               ...                        
38696                                       [define, word]
38697           [pogba, shaw, show, break, needless, foul]
38698                                            [penalty]
38699                     [well, defender, maguire, zouma]
38700    [evra, see, right, catastrophic, fan, go, play...
Name: tokens, Length: 38701, dtype: object

In [28]:
df

Unnamed: 0,username,text,tweetcreatedts,hashtags,retweetcount,likecount,acctdesc,location,followers,totaltweets,usercreatedts,datetime,time,mentions,cleantext,tokens
0,ManUtd,under way at old trafford — come on united! 🔴⚪...,2020-10-04 15:30:16,"['MUFC', 'MUNTOT']",262,1917,Official #MUFC account. @ManUtd_ES 🇪🇸 | @ManUt...,"Manchester, England",23298880,61791,2012-04-20 15:17:43,2020-10-04 15:30:00,15:30:00,,under way at old trafford come on united mufc...,"[way, old, trafford, come]"
1,NelsonNelli2,ready for battle.\n\n#mufc #muntot https://t.c...,2020-10-04 15:30:16,"['MUFC', 'MUNTOT']",757,0,I'm jovial and fun to be with,"Port Harcourt, Nigeria",1,45,2020-09-25 07:43:39,2020-10-04 15:30:00,15:30:00,,ready for battle mufc muntot,"[ready, battle]"
2,Dcfox82,cmon #mufc 3pts today please,2020-10-04 15:30:16,['mufc'],0,0,"work for nhs, Husband, dad of 3, student of li...","Derby,England",595,4873,2010-10-18 16:11:48,2020-10-04 15:30:00,15:30:00,,cmon mufc today please,"[cmon, today, please]"
3,lynger2000,come united!!!!!!!!!!!! 🔴🔴 #muntot #mufc,2020-10-04 15:30:16,"['MUNTOT', 'MUFC']",0,0,A Scottish lass who will always and forever be...,"Leeds, England",3337,44646,2012-11-14 20:15:43,2020-10-04 15:30:00,15:30:00,,come united muntot mufc,[come]
4,URMySolskjaer,"telles and cavani now done, #mufc will have o...",2020-10-04 15:30:16,['MUFC'],432,0,,,43,2738,2015-11-22 17:14:36,2020-10-04 15:30:00,15:30:00,,telles and cavani now done mufc will have one ...,"[telles, cavani, one, final, push, see, finall..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38696,__DANEJO,define manchester united in 1 - 3 words 📽📽 #mu...,2020-10-04 17:21:27,['MUNTOT'],215,0,ᴬʲᵉᵖᵃᵏᵒ • ᴵ'ᵐ ᴼⁿˡʸ ᴴᵘᵐᵃⁿ ☺ • ᴬᵐᵇⁱᵛᵉʳᵗ • ᴱˣᵖˡᵒ...,North 🇳🇬,9140,34660,2014-06-26 11:30:26,2020-10-04 17:21:00,17:21:00,,define manchester united in words muntot,"[define, word]"
38697,Chachi97357115,pogba and shaw showing they are breaking. ne...,2020-10-04 17:21:27,['MUNTOT'],0,0,,,1,3,2020-09-29 22:56:29,2020-10-04 17:21:00,17:21:00,,pogba and shaw showing they are breaking needl...,"[pogba, shaw, show, break, needless, foul]"
38698,MiesterBob,penalty to spurs...\n\n#muntot,2020-10-04 17:21:27,['MUNTOT'],329,0,extreme,"Botswana, Gaborone",2,7,2019-02-27 08:29:10,2020-10-04 17:21:00,17:21:00,,penalty to spurs muntot,[penalty]
38699,2020Kuami,who is a better defender? \n\n1. retweet for m...,2020-10-04 17:21:27,['MUNTOT'],34,0,Neva give up bro 🙌🙌🙌🙌,"Greater Accra, Ghana",248,1789,2020-10-01 17:13:59,2020-10-04 17:21:00,17:21:00,,who is a better defender retweet for maguire ...,"[well, defender, maguire, zouma]"


In [29]:
terms = df[['datetime','time','tokens']].explode('tokens')

In [30]:
terms.to_csv('../datasets/muntot_terms.csv', index_label = False)