In [1]:
#Base and Cleaning 
import json
import requests
import pandas as pd
import numpy as np
import emoji
import regex
import re
import string
from collections import Counter
import warnings

In [2]:
#Visualizations
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt 
import pyLDAvis.gensim
import chart_studio
import chart_studio.plotly as py 
import chart_studio.tools as tls

In [3]:
#Natural Language Processing (NLP)
warnings.filterwarnings("ignore")
import spacy
import gensim
from spacy.tokenizer import Tokenizer
from gensim.corpora import Dictionary
from gensim.models.ldamulticore import LdaMulticore
from gensim.models.coherencemodel import CoherenceModel
from gensim.parsing.preprocessing import STOPWORDS as SW
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from pprint import pprint
from wordcloud import STOPWORDS
stopwords = set(STOPWORDS)


  and should_run_async(code)
  from collections import Mapping, defaultdict
scipy.sparse.sparsetools is a private module for scipy.sparse, and should not be used.
  _deprecated()
  from collections import namedtuple, defaultdict, Iterable


# Dataset inspection

In [5]:
df=pd.read_csv('../datasets/chelseapalacetweetscombined.csv')

In [6]:
df.head()

Unnamed: 0,username,text,tweetcreatedts,hashtags,acctdesc,location,followers,totaltweets,usercreatedts,retweetcount
0,titomar_758,Chelsea vs Crystal palace. \n\n• Mendy clean s...,2020-10-03 13:32:07,[],,Canaries st.lucia,973,52301,2014-02-01 20:02:37,77
1,Kennyroja191,"Wait, this same Crystal Palace flogged Man Utd...",2020-10-03 13:32:07,[],,"Osun, Nigeria",36,877,2019-12-30 17:20:58,59
2,John38466297,Seems like beating Crystal Palace at home this...,2020-10-03 13:32:07,[],,"Maryland, Lagos",37,251,2020-07-15 11:37:58,994
3,damyyllare,🏴󠁧󠁢󠁥󠁮󠁧󠁿 Ben Chilwell vs Crystal Palace\n\n90 m...,2020-10-03 13:32:07,[],I am a blessing for my generation... I love fi...,Nigeria,416,4410,2016-03-15 12:51:50,65
4,Younguzumaki1,I love how Havertz constantly roams around the...,2020-10-03 13:32:06,[],,"Federal Capital Territory, Nig",1346,50678,2012-07-21 15:12:44,7


In [7]:
df.duplicated().value_counts()

False    9703
dtype: int64

# Data Pre-Processing

In [8]:
def give_emoji_free_text(text):
    """
    Removes emoji's from tweets
    Accepts:
        Text (tweets)
    Returns:
        Text (emoji free tweets)
    """
    emoji_list = [c for c in text if c in emoji.UNICODE_EMOJI]
    clean_text = ' '.join([str for str in text.split() if not any(i in str for i in emoji_list)])
    return clean_text

def url_free_text(text):
    '''
    Cleans text from urls
    '''
    text = re.sub(r'http\S+', '', text)
    return text

def remove_usernames(text):
    text=re.sub(r'\B@\w+', 'USERNAME', text)
    return text

In [9]:
# Apply the function above and get tweets free of emoji's
call_emoji_free = lambda x: give_emoji_free_text(x)

# Apply `call_emoji_free` which calls the function to remove all emoji's
df['emoji_free_tweets'] = df['text'].apply(call_emoji_free)

#Create a new column with url free tweets
df['url_free_tweets'] = df['emoji_free_tweets'].apply(url_free_text)

#create new column with username free tweets
df['remove_usernames'] = df['url_free_tweets'].apply(remove_usernames)

In [10]:
df

Unnamed: 0,username,text,tweetcreatedts,hashtags,acctdesc,location,followers,totaltweets,usercreatedts,retweetcount,emoji_free_tweets,url_free_tweets,remove_usernames
0,titomar_758,Chelsea vs Crystal palace. \n\n• Mendy clean s...,2020-10-03 13:32:07,[],,Canaries st.lucia,973,52301,2014-02-01 20:02:37,77,Chelsea vs Crystal palace. • Mendy clean sheet...,Chelsea vs Crystal palace. • Mendy clean sheet...,Chelsea vs Crystal palace. • Mendy clean sheet...
1,Kennyroja191,"Wait, this same Crystal Palace flogged Man Utd...",2020-10-03 13:32:07,[],,"Osun, Nigeria",36,877,2019-12-30 17:20:58,59,"Wait, this same Crystal Palace flogged Man Utd...","Wait, this same Crystal Palace flogged Man Utd...","Wait, this same Crystal Palace flogged Man Utd..."
2,John38466297,Seems like beating Crystal Palace at home this...,2020-10-03 13:32:07,[],,"Maryland, Lagos",37,251,2020-07-15 11:37:58,994,Seems like beating Crystal Palace at home this...,Seems like beating Crystal Palace at home this...,Seems like beating Crystal Palace at home this...
3,damyyllare,🏴󠁧󠁢󠁥󠁮󠁧󠁿 Ben Chilwell vs Crystal Palace\n\n90 m...,2020-10-03 13:32:07,[],I am a blessing for my generation... I love fi...,Nigeria,416,4410,2016-03-15 12:51:50,65,Ben Chilwell vs Crystal Palace 90 minutes play...,Ben Chilwell vs Crystal Palace 90 minutes play...,Ben Chilwell vs Crystal Palace 90 minutes play...
4,Younguzumaki1,I love how Havertz constantly roams around the...,2020-10-03 13:32:06,[],,"Federal Capital Territory, Nig",1346,50678,2012-07-21 15:12:44,7,I love how Havertz constantly roams around the...,I love how Havertz constantly roams around the...,I love how Havertz constantly roams around the...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9698,schoolBoyfrm5th,Ben Chilwell has lost possession (5) more time...,2020-10-03 13:14:15,[],yooohooooo\n bastard of the east blue,village hidden in the tits,1906,36980,2017-06-29 16:20:52,587,Ben Chilwell has lost possession (5) more time...,Ben Chilwell has lost possession (5) more time...,Ben Chilwell has lost possession (5) more time...
9699,Tamar__lee,Is this the crystal palace that beat Man U?,2020-10-03 13:14:14,[],"watched by angels.,protected by God",Nigeria,1006,18683,2012-12-06 17:06:31,0,Is this the crystal palace that beat Man U?,Is this the crystal palace that beat Man U?,Is this the crystal palace that beat Man U?
9700,I_am_Tangeni,"Ano, TF is up with Palace? Can’t believe we go...",2020-10-03 13:14:14,['CHECRY'],"Beer Drinkers Hall of Fame, #Father #Son #Brot...",Namibia,1543,64100,2011-12-11 08:18:19,1,"Ano, TF is up with Palace? Can’t believe we go...","Ano, TF is up with Palace? Can’t believe we go...","Ano, TF is up with Palace? Can’t believe we go..."
9701,Arsenal_LINY,It just keeps getting worst for Crystal Palace...,2020-10-03 13:14:13,['checry'],Supporting Arsenal Football Club #COYG,"Long Island, NY",386,11668,2016-12-11 18:32:21,0,It just keeps getting worst for Crystal Palace...,It just keeps getting worst for Crystal Palace...,It just keeps getting worst for Crystal Palace...


In [11]:
# Load spacy
nlp = spacy.load('en_core_web_lg')

# First pass

In [12]:
# Tokenizer
tokenizer = Tokenizer(nlp.vocab)

# Custom stopwords
custom_stopwords = ['hi','\n','\n\n', '&amp;', '&gt;', '&lt', ' ', '.', '-', 'got', "it's", 'it’s', "i'm", 'i’m', 'im', 'want', 'like', '$', '@']

# Customize stop words by adding to the default list
STOP_WORDS = nlp.Defaults.stop_words.union(custom_stopwords)

# ALL_STOP_WORDS = spacy + gensim + wordcloud
ALL_STOP_WORDS = STOP_WORDS.union(SW).union(stopwords)


tokens = []

for doc in tokenizer.pipe(df['remove_usernames'], batch_size=500):
    doc_tokens = []    
    for token in doc: 
        if token.text.lower() not in STOP_WORDS:
            doc_tokens.append(token.text.lower())   
    tokens.append(doc_tokens)

# Makes tokens column
df['tokens'] = tokens

In [13]:
# Make tokens a string again
df['tokens_back_to_text'] = [' '.join(map(str, l)) for l in df['tokens']]

def get_lemmas(text):
    '''Used to lemmatize the processed tweets'''
    lemmas = []
    
    doc = nlp(text)
    
    # Something goes here :P
    for token in doc: 
        if ((token.is_stop == False) and (token.is_punct == False)) and (token.pos_ != 'PRON'):
            lemmas.append(token.lemma_)
    
    return lemmas

df['lemmas'] = df['tokens_back_to_text'].apply(get_lemmas)

In [14]:
# Make lemmas a string again
df['lemmas_back_to_text'] = [' '.join(map(str, l)) for l in df['lemmas']]

# Tokenizer function
def tokenize(text):
    """
    Parses a string into a list of semantic units (words)
    Args:
        text (str): The string that the function will tokenize.
    Returns:
        list: tokens parsed out
    """
    # Removing url's
    pattern = r"http\S+"
    
    tokens = re.sub(pattern, "", text) # https://www.youtube.com/watch?v=O2onA4r5UaY
    tokens = re.sub('[^a-zA-Z 0-9]', '', text)
    tokens = re.sub('[%s]' % re.escape(string.punctuation), '', text) # Remove punctuation
    tokens = re.sub('\w*\d\w*', '', text) # Remove words containing numbers
    tokens = re.sub('@*!*\$*', '', text) # Remove @ ! $
    tokens = tokens.strip(',') # TESTING THIS LINE
    tokens = tokens.strip('?') # TESTING THIS LINE
    tokens = tokens.strip('!') # TESTING THIS LINE
    tokens = tokens.strip("'") # TESTING THIS LINE
    tokens = tokens.strip(".") # TESTING THIS LINE

    tokens = tokens.lower().split() # Make text lowercase and split it
    
    return tokens

# Apply tokenizer
df['lemma_tokens'] = df['lemmas_back_to_text'].apply(tokenize)

In [15]:
# Create a id2word dictionary
id2word = Dictionary(df['lemma_tokens'])
print(len(id2word))

4136


In [16]:
# Filtering Extremes
id2word.filter_extremes(no_below=2, no_above=.99)
print(len(id2word))

2408


In [17]:
# Creating a corpus object 
corpus = [id2word.doc2bow(d) for d in df['lemma_tokens']]

In [18]:
# Instantiating a Base LDA model 
base_model = LdaMulticore(corpus=corpus, num_topics=5, id2word=id2word, workers=12, passes=5)

In [19]:
# Filtering for words 
words = [re.findall(r'"([^"]*)"',t[1]) for t in base_model.print_topics()]

In [20]:
# Create Topics
topics = [' '.join(t[0:10]) for t in words]

In [21]:
# Getting the topics
for id, t in enumerate(topics): 
    print(f"------ Topic {id} ------")
    print(t, end="\n\n")

------ Topic 0 ------
palace crystal chelsea checry 0 beat goal home 4 season

------ Topic 1 ------
palace crystal chelsea vs watch league live retweet mobile match

------ Topic 2 ------
palace chelsea crystal live ◉ checry vs today vs. username

------ Topic 3 ------
palace ◉ crystal 2 = chelsea win 0 havertz goal

------ Topic 4 ------
palace crystal username chilwell time lose player minute 10 open



In [22]:
# Compute Perplexity
# a measure of how good the model is. lower the better
base_perplexity = base_model.log_perplexity(corpus)
print('\nPerplexity: ', base_perplexity) 

# Compute Coherence Score
coherence_model = CoherenceModel(model=base_model, texts=df['lemma_tokens'], 
                                   dictionary=id2word, coherence='c_v')
coherence_lda_model_base = coherence_model.get_coherence()
print('\nCoherence Score: ', coherence_lda_model_base)


Perplexity:  -5.2932225154451

Coherence Score:  0.3740063227699808


# References

This notebook is built with the help of this article

https://towardsdatascience.com/twitter-topic-modeling-e0e3315b12e2

# Further References

https://towardsdatascience.com/latent-dirichlet-allocation-lda-9d1cd064ffa2

https://radimrehurek.com/gensim/models/ldamodel.html

https://stackoverflow.com/questions/44177986/replacing-twitter-usernames-with-username-how-to/44178977#44178977