In [1]:
#Base and Cleaning 
import json
import requests
import pandas as pd
import numpy as np
import emoji
import regex
import re
import string
from collections import Counter
import warnings
warnings.filterwarnings("ignore")

In [2]:
#Visualizations
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt 
import pyLDAvis.gensim
import chart_studio
import chart_studio.plotly as py 
import chart_studio.tools as tls

In [3]:
#Natural Language Processing (NLP)
import spacy
import gensim
from spacy.tokenizer import Tokenizer
from gensim.corpora import Dictionary
from gensim.models.ldamulticore import LdaMulticore
from gensim.models.coherencemodel import CoherenceModel
from gensim.parsing.preprocessing import STOPWORDS as SW
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from pprint import pprint
from wordcloud import STOPWORDS
stopwords = set(STOPWORDS)
warnings.filterwarnings("ignore")

  and should_run_async(code)
  from collections import Mapping, defaultdict
scipy.sparse.sparsetools is a private module for scipy.sparse, and should not be used.
  _deprecated()
  from collections import namedtuple, defaultdict, Iterable


# Dataset inspection

In [4]:
df=pd.read_csv('../datasets/tweetstreamresults.csv')

In [5]:
df.head()

Unnamed: 0,date,user,is_retweet,is_quote,text,quoted_text
0,4/10/2020 18:10,justincroser,False,False,COME ON REDS!! 🔴🔴 Have to sleep for work but h...,
1,4/10/2020 18:10,LFCYNWA125,True,False,RT @LFC: Jürgen Klopp provides detail on the s...,
2,4/10/2020 18:10,itstugenfinest,True,False,RT @SkySportsPL: 'I'm pretty sure he won't be ...,
3,4/10/2020 18:10,guu_mendees,True,False,RT @ludovicofans: Now follow the news L...,
4,4/10/2020 18:10,justindivine5,True,True,RT @AnfieldWatch: Jurgen Klopp: “It’s an inter...,Liverpool face an anxious wait on how long the...


In [6]:
df.duplicated().value_counts()

False    309532
True        133
dtype: int64

In [7]:
df['is_retweet']

0         False
1          True
2          True
3          True
4          True
          ...  
309660     True
309661    False
309662     True
309663    False
309664     True
Name: is_retweet, Length: 309665, dtype: bool

In [8]:
# Setting the dataframe to only look at original tweets for now
df = df.loc[df['is_retweet']==False]

In [9]:
villa = ['dean smith', 'martinez', 'emi','cash', 'konsa', 'ming', 'targett', 'luiz', 'mcginn', 'grealish', 'trezeguet', 'barkley', 'watkins','traore']
liverpool = ['klopp', 'salah', 'mane', 'firmino', 'adrian', 'vvd', 'van dijk', 'gomez', 'robertson', 'robbo', 'wijnaldum', 'gini', 'minamino', 'trent', 'taa', 'keita', 'fabinho', 'jones', 'milner']
playerlist=villa+liverpool

# Data Pre-Processing

In [10]:
def give_emoji_free_text(text):
    """
    Removes emoji's from tweets
    Accepts:
        Text (tweets)
    Returns:
        Text (emoji free tweets)
    """
    emoji_list = [c for c in text if c in emoji.UNICODE_EMOJI]
    clean_text = ' '.join([str for str in text.split() if not any(i in str for i in emoji_list)])
    return clean_text

def url_free_text(text):
    '''
    Cleans text from urls
    '''
    text = re.sub(r'http\S+', '', text)
    return text

def remove_usernames(text):
    text=re.sub(r'\B@\w+', 'username', text)
    return text

def player_regex(text, playerlist = playerlist):
    output=[]
    for name in playerlist:
        characters=['\\b']
        for letter in list(name.lower()):
            characters.append('([')
            characters.append(letter.upper())
            characters.append(letter.lower())
            characters.append(']+)')
        entry=''.join(characters)
        output.append(entry)
    for i,j in zip(output,playerlist):
        text = re.sub(i, j, text)
    return text

In [11]:
# Apply the function above and get tweets free of emoji's
call_emoji_free = lambda x: give_emoji_free_text(x)

# Apply `call_emoji_free` which calls the function to remove all emoji's
df['cleantext'] = df['text'].apply(call_emoji_free).apply(url_free_text).apply(remove_usernames).apply(player_regex)

#Create a new column with url free tweets
#df['url_free_tweets'] = df['emoji_free_tweets'].apply(url_free_text)

#create new column with username free tweets
#df['remove_usernames'] = df['url_free_tweets'].apply(remove_usernames)

In [12]:
df

Unnamed: 0,date,user,is_retweet,is_quote,text,quoted_text,cleantext
0,4/10/2020 18:10,justincroser,False,False,COME ON REDS!! 🔴🔴 Have to sleep for work but h...,,COME ON REDS!! Have to sleep for work but hopi...
5,4/10/2020 18:10,GlazersOutSzn,False,True,@samuelluckhurst #GlazersOut,+/- 7 years SEVEN YEARS AFTER SAF retired wh...,username #GlazersOut
6,4/10/2020 18:10,iSuperFrank,False,True,المتعة مع جاكي بوي و المربع,🟣 𝗧 𝗘 𝗔 𝗠 𝗡 𝗘 𝗪 𝗦 🟣 @RBarkley8 makes his Ast...,المتعة مع جاكي بوي و المربع
9,4/10/2020 18:10,KRSNQ1,False,True,LOL WE ARE CONCEDING GOALS TODAY PMDS 😤😤😤 IF W...,Jürgen Klopp provides detail on the shoulder i...,LOL WE ARE CONCEDING GOALS TODAY PMDS IF WE DO...
11,4/10/2020 18:10,WIANDJO,False,False,&gt;&gt;&gt; WATCH Aston Villa vs Liverpool LI...,,&gt;&gt;&gt; WATCH Aston Villa vs Liverpool LI...
...,...,...,...,...,...,...,...
309656,4/10/2020 20:20,benihime_sensei,False,False,Liverpool vient de se prendre 7 buts par Aston...,,Liverpool vient de se prendre 7 buts par Aston...
309658,4/10/2020 20:20,notbitterbetter,False,False,Villa were poor there should’ve scored 10 or 11.,,Villa were poor there should’ve scored 10 or 11.
309659,4/10/2020 20:20,artDante1,False,False,Good time to be alive... Manchester United lo...,,Good time to be alive... Manchester United los...
309661,4/10/2020 20:20,jonesy73,False,False,Gutted that we couldn’t all be there together ...,,Gutted that we couldn’t all be there together ...


In [13]:
# Load spacy
nlp = spacy.load('en_core_web_lg')

# First pass

In [31]:
# Tokenizer
tokenizer = Tokenizer(nlp.vocab)

# Custom stopwords
socialmedia_stopwords = ['hi','\n','\n\n', '&amp;', '&gt;', '&lt', ' ', '.', '-', 'got', "it's", 'it’s', "i'm", 'i’m', 'im', 'want', 'like', '$', '@']
spanish_stopwords = ['de', 'que', 'el', 'o', 'la', 'en', 'al','del','dey', 'ini',' bu', 'ya', 'et', 'je', 'los', 'lo', 'por', 'le', 'se', 'es']
match_terms = ['liverpool', 'avfc', 'aston', 'villa', 'lfc', 'avlliv']
custom_stopwords = socialmedia_stopwords + spanish_stopwords + match_terms

# Customize stop words by adding to the default list
stop_words = nlp.Defaults.stop_words.union(custom_stopwords)

# ALL_STOP_WORDS = spacy + gensim + wordcloud
final_stop_words = stop_words.union(SW).union(stopwords)

tokens = []

for doc in tokenizer.pipe(df['cleantext'], batch_size=500):
    doc_tokens = []    
    for token in doc: 
        if token.text.lower() not in final_stop_words:
            doc_tokens.append(token.text.lower())   
    tokens.append(doc_tokens)

# Makes tokens column
df['tokens'] = tokens

In [32]:
# Make tokens a string again
df['tokens_back_to_text'] = [' '.join(map(str, l)) for l in df['tokens']]

def get_lemmas(text):
    '''Used to lemmatize the processed tweets'''
    lemmas = []
    
    doc = nlp(text)
    
    # Something goes here :P
    for token in doc: 
        if ((token.is_stop == False) and (token.is_punct == False)) and (token.pos_ != 'PRON'):
            lemmas.append(token.lemma_)
    
    return lemmas

df['lemmas'] = df['tokens_back_to_text'].apply(get_lemmas)

In [33]:
# Make lemmas a string again
df['lemmas_back_to_text'] = [' '.join(map(str, l)) for l in df['lemmas']]

# Tokenizer function
def tokenize(text):
    """
    Parses a string into a list of semantic units (words)
    Args:
        text (str): The string that the function will tokenize.
    Returns:
        list: tokens parsed out
    """
    # Removing url's
    pattern = r"http\S+"
    
    tokens = re.sub(pattern, "", text) # https://www.youtube.com/watch?v=O2onA4r5UaY
    tokens = re.sub('[^a-zA-Z 0-9]', '', text)
    tokens = re.sub('[%s]' % re.escape(string.punctuation), '', text) # Remove punctuation
    tokens = re.sub('\w*\d\w*', '', text) # Remove words containing numbers
    tokens = re.sub('@*!*\$*', '', text) # Remove @ ! $
    tokens = tokens.strip(',') # TESTING THIS LINE
    tokens = tokens.strip('?') # TESTING THIS LINE
    tokens = tokens.strip('!') # TESTING THIS LINE
    tokens = tokens.strip("'") # TESTING THIS LINE
    tokens = tokens.strip(".") # TESTING THIS LINE

    tokens = tokens.lower().split() # Make text lowercase and split it
    
    return tokens

# Apply tokenizer
df['lemma_tokens'] = df['lemmas_back_to_text'].apply(tokenize)

In [34]:
# Create a id2word dictionary
id2word = Dictionary(df['lemma_tokens'])
print(len(id2word))

87507


In [35]:
# Filtering Extremes
id2word.filter_extremes(no_below=2, no_above=.99)
print(len(id2word))

32217


In [36]:
# Creating a corpus object 
corpus = [id2word.doc2bow(d) for d in df['lemma_tokens']]

In [37]:
# Instantiating a Base LDA model 
base_model = LdaMulticore(corpus=corpus, num_topics=10, id2word=id2word, workers=12, passes=5)

In [38]:
# Filtering for words 
words = [re.findall(r'"([^"]*)"',t[1]) for t in base_model.print_topics()]

In [39]:
# Create Topics
topics = [' '.join(t[0:10]) for t in words]

In [40]:
# Getting the topics
for id, t in enumerate(topics): 
    print(f"------ Topic {id} ------")
    print(t, end="\n\n")

------ Topic 0 ------
username game watch grass 2 avlliv world lfc liverpool souness

------ Topic 1 ------
avlliv liverpool avfc goal watkins come score lfc barkley salah

------ Topic 2 ------
username villa lol wow klopp avlliv go mean come need

------ Topic 3 ------
1 6 2 lose 4 5 league win man 7

------ Topic 4 ------
y liverpool ni para premier mu está pero una si

------ Topic 5 ------
2 7 fuck 8 0 avlliv villa liverpool adrian goal

------ Topic 6 ------
go username fan bad game liverpool look fucking adrian need

------ Topic 7 ------
e é liverpool não 7 united tá mais um username

------ Topic 8 ------
fan united man username play team utd manchester walk today

------ Topic 9 ------
username vs live liverpool watch 2020 10 false villa avlliv



In [41]:
# Compute Perplexity
# a measure of how good the model is. lower the better
base_perplexity = base_model.log_perplexity(corpus)
print('\nPerplexity: ', base_perplexity) 

# Compute Coherence Score
coherence_model = CoherenceModel(model=base_model, texts=df['lemma_tokens'], 
                                   dictionary=id2word, coherence='c_v')
coherence_lda_model_base = coherence_model.get_coherence()
print('\nCoherence Score: ', coherence_lda_model_base)


Perplexity:  -7.978118537881098

Coherence Score:  0.5638913819659411


# Second Pass

# References

This notebook is built with the help of this article

https://towardsdatascience.com/twitter-topic-modeling-e0e3315b12e2

# Further References

https://towardsdatascience.com/latent-dirichlet-allocation-lda-9d1cd064ffa2

https://radimrehurek.com/gensim/models/ldamodel.html

https://stackoverflow.com/questions/44177986/replacing-twitter-usernames-with-username-how-to/44178977#44178977