In [1]:
#Base and Cleaning 
import sys
import seaborn as sns
from matplotlib import pyplot as plt
import pandas as pd
import numpy as np
import time
import pickle
from datetime import datetime
from datetime import date
import json
import requests
import emoji
import re
import string
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Gensim
import gensim, spacy, logging, warnings
import gensim.corpora as corpora
from gensim.utils import lemmatize, simple_preprocess

# Creating a combined dataset

In [12]:
df=pd.read_csv('../datasets/tweetstreamresults.csv')

In [13]:
print(df.shape)
df.head()

(309665, 6)


Unnamed: 0,date,user,is_retweet,is_quote,text,quoted_text
0,4/10/2020 18:10,justincroser,False,False,COME ON REDS!! 🔴🔴 Have to sleep for work but h...,
1,4/10/2020 18:10,LFCYNWA125,True,False,RT @LFC: Jürgen Klopp provides detail on the s...,
2,4/10/2020 18:10,itstugenfinest,True,False,RT @SkySportsPL: 'I'm pretty sure he won't be ...,
3,4/10/2020 18:10,guu_mendees,True,False,RT @ludovicofans: Now follow the news L...,
4,4/10/2020 18:10,justindivine5,True,True,RT @AnfieldWatch: Jurgen Klopp: “It’s an inter...,Liverpool face an anxious wait on how long the...


# Initial Processing

In [14]:
def add_mentions(text):
    mentionlist = re.findall(r'\B@\w+', text)
    if not mentionlist:
        mentionlist = np.NaN
    return mentionlist

def add_hashtags(text):
    hashtaglist = re.findall(r'\B(\#[a-zA-Z0-9]+\b)',text)
    if not hashtaglist:
        hashtaglist = np.NaN
    return hashtaglist

In [15]:
df['mentions'] = df['text'].apply(add_mentions)
df['hashtags'] = df['text'].apply(add_hashtags)

In [16]:
df['text'] = df['text'].str.lower()

In [17]:
df.head(10)

Unnamed: 0,date,user,is_retweet,is_quote,text,quoted_text,mentions,hashtags
0,4/10/2020 18:10,justincroser,False,False,come on reds!! 🔴🔴 have to sleep for work but h...,,,"[#AVLLIV, #LFC, #LFCFamily, #YNWA]"
1,4/10/2020 18:10,LFCYNWA125,True,False,rt @lfc: jürgen klopp provides detail on the s...,,"[@LFC, @Alissonbecker]","[#LFC, #AVLLIV]"
2,4/10/2020 18:10,itstugenfinest,True,False,rt @skysportspl: 'i'm pretty sure he won't be ...,,[@SkySportsPL],
3,4/10/2020 18:10,guu_mendees,True,False,rt @ludovicofans: now follow the news l...,,[@ludovicofans],
4,4/10/2020 18:10,justindivine5,True,True,rt @anfieldwatch: jurgen klopp: “it’s an inter...,Liverpool face an anxious wait on how long the...,[@AnfieldWatch],
5,4/10/2020 18:10,GlazersOutSzn,False,True,@samuelluckhurst #glazersout,+/- 7 years SEVEN YEARS AFTER SAF retired wh...,[@samuelluckhurst],[#GlazersOut]
6,4/10/2020 18:10,iSuperFrank,False,True,المتعة مع جاكي بوي و المربع,🟣 𝗧 𝗘 𝗔 𝗠 𝗡 𝗘 𝗪 𝗦 🟣 @RBarkley8 makes his Ast...,,
7,4/10/2020 18:10,Sir_EbubeEleazu,True,False,rt @lfc: 📋 tonight's team news... two change...,,"[@LFC, @premierleague]",
8,4/10/2020 18:10,avfcnewsgossip,True,False,rt @villaandproud: with kick-off less than 30m...,,"[@VillaAndProud, @LFC_LGBT]",
9,4/10/2020 18:10,KRSNQ1,False,True,lol we are conceding goals today pmds 😤😤😤 if w...,Jürgen Klopp provides detail on the shoulder i...,,


# Text Cleaning

In [18]:
def twitter_cleaner(text):
    text = re.sub(r'\B@\w+', 'username', text) # Replace usernames with placeholder
    text = re.sub(r'(rt)\b', '', text) # Remove retweets
    emoji_list = [c for c in text if c in emoji.UNICODE_EMOJI]
    text = ' '.join([str for str in text.split() if not any(i in str for i in emoji_list)]) # Remove emojis
    text = re.sub(r'http\S+', '', text) # Remove urls
    text = re.sub(r'#*', '', text) # Remove hashtags
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text) # Remove all punctuation
    text = re.sub('\w*\d\w*', '', text) # Remove words containing numbers
    text = re.sub('[^a-zA-Z 0-9]', '', text) # Remove all non letters or numbers
    return text   


In [19]:
df['cleantext'] = df['text'].apply(twitter_cleaner)

In [20]:
df.head(10)

Unnamed: 0,date,user,is_retweet,is_quote,text,quoted_text,mentions,hashtags,cleantext
0,4/10/2020 18:10,justincroser,False,False,come on reds!! 🔴🔴 have to sleep for work but h...,,,"[#AVLLIV, #LFC, #LFCFamily, #YNWA]",come on reds have to sleep for work but hoping...
1,4/10/2020 18:10,LFCYNWA125,True,False,rt @lfc: jürgen klopp provides detail on the s...,,"[@LFC, @Alissonbecker]","[#LFC, #AVLLIV]",username jrgen klopp provides detail on the sh...
2,4/10/2020 18:10,itstugenfinest,True,False,rt @skysportspl: 'i'm pretty sure he won't be ...,,[@SkySportsPL],,username im pretty sure he wont be ready after...
3,4/10/2020 18:10,guu_mendees,True,False,rt @ludovicofans: now follow the news l...,,[@ludovicofans],,username now follow the news live live streami...
4,4/10/2020 18:10,justindivine5,True,True,rt @anfieldwatch: jurgen klopp: “it’s an inter...,Liverpool face an anxious wait on how long the...,[@AnfieldWatch],,username jurgen klopp its an international bre...
5,4/10/2020 18:10,GlazersOutSzn,False,True,@samuelluckhurst #glazersout,+/- 7 years SEVEN YEARS AFTER SAF retired wh...,[@samuelluckhurst],[#GlazersOut],username glazersout
6,4/10/2020 18:10,iSuperFrank,False,True,المتعة مع جاكي بوي و المربع,🟣 𝗧 𝗘 𝗔 𝗠 𝗡 𝗘 𝗪 𝗦 🟣 @RBarkley8 makes his Ast...,,,
7,4/10/2020 18:10,Sir_EbubeEleazu,True,False,rt @lfc: 📋 tonight's team news... two change...,,"[@LFC, @premierleague]",,username tonights team news two changes from o...
8,4/10/2020 18:10,avfcnewsgossip,True,False,rt @villaandproud: with kick-off less than 30m...,,"[@VillaAndProud, @LFC_LGBT]",,username with kickoff less than away we would...
9,4/10/2020 18:10,KRSNQ1,False,True,lol we are conceding goals today pmds 😤😤😤 if w...,Jürgen Klopp provides detail on the shoulder i...,,,lol we are conceding goals today pmds if we do...


# Language Processing for Non-english tweets

In [21]:
# Dealing with non english tweets
from langdetect import detect
def isenglish(text):
    try:
        if detect(text) == 'en':
            return 1
        else:
            return 0
    except:
        return 0

In [22]:
df['lang'] = df['cleantext'].apply(isenglish)

In [23]:
df.head(10)

Unnamed: 0,date,user,is_retweet,is_quote,text,quoted_text,mentions,hashtags,cleantext,lang
0,4/10/2020 18:10,justincroser,False,False,come on reds!! 🔴🔴 have to sleep for work but h...,,,"[#AVLLIV, #LFC, #LFCFamily, #YNWA]",come on reds have to sleep for work but hoping...,1
1,4/10/2020 18:10,LFCYNWA125,True,False,rt @lfc: jürgen klopp provides detail on the s...,,"[@LFC, @Alissonbecker]","[#LFC, #AVLLIV]",username jrgen klopp provides detail on the sh...,1
2,4/10/2020 18:10,itstugenfinest,True,False,rt @skysportspl: 'i'm pretty sure he won't be ...,,[@SkySportsPL],,username im pretty sure he wont be ready after...,1
3,4/10/2020 18:10,guu_mendees,True,False,rt @ludovicofans: now follow the news l...,,[@ludovicofans],,username now follow the news live live streami...,1
4,4/10/2020 18:10,justindivine5,True,True,rt @anfieldwatch: jurgen klopp: “it’s an inter...,Liverpool face an anxious wait on how long the...,[@AnfieldWatch],,username jurgen klopp its an international bre...,1
5,4/10/2020 18:10,GlazersOutSzn,False,True,@samuelluckhurst #glazersout,+/- 7 years SEVEN YEARS AFTER SAF retired wh...,[@samuelluckhurst],[#GlazersOut],username glazersout,1
6,4/10/2020 18:10,iSuperFrank,False,True,المتعة مع جاكي بوي و المربع,🟣 𝗧 𝗘 𝗔 𝗠 𝗡 𝗘 𝗪 𝗦 🟣 @RBarkley8 makes his Ast...,,,,0
7,4/10/2020 18:10,Sir_EbubeEleazu,True,False,rt @lfc: 📋 tonight's team news... two change...,,"[@LFC, @premierleague]",,username tonights team news two changes from o...,1
8,4/10/2020 18:10,avfcnewsgossip,True,False,rt @villaandproud: with kick-off less than 30m...,,"[@VillaAndProud, @LFC_LGBT]",,username with kickoff less than away we would...,1
9,4/10/2020 18:10,KRSNQ1,False,True,lol we are conceding goals today pmds 😤😤😤 if w...,Jürgen Klopp provides detail on the shoulder i...,,,lol we are conceding goals today pmds if we do...,1


In [24]:
df['lang'].value_counts()

1    207727
0    101938
Name: lang, dtype: int64

In [25]:
df.to_csv('../datasets/avlliv_cleaned.csv', index_label=False)

# Processing for Twitter Specific 

In [26]:
df = pd.read_csv('../datasets/avlliv_cleaned.csv')

In [27]:
# Using only the tweets identified as English
df = df.loc[df['lang'] == 1]

In [28]:
df.reset_index(inplace = True, drop = True)

In [29]:
with open('pickle/avlliv_playerlist.pickle', 'rb') as f:
    playerlist = pickle.load(f)

In [30]:
def clean_players(text, playerlist = playerlist):
    output=[]
    try:
        for name in playerlist:
            characters=['\\b']
            for letter in list(name.lower()):
                characters.append('([')
                characters.append(letter.upper())
                characters.append(letter.lower())
                characters.append(']+)')
            entry=''.join(characters)
            output.append(entry)
        for i,j in zip(output,playerlist):
            clean_text = re.sub(i, j, text)
    except TypeError:
        clean_text= ''
    return clean_text

In [31]:
df['cleantext'] = df['cleantext'].apply(lambda x: clean_players(x))

# Tokenizing and Lemmatizing



In [32]:
# NLTK Stop words
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

In [33]:
with open('pickle/avlliv.pickle', 'rb') as f:
    matchterms = pickle.load(f)
stop_words.extend(matchterms)

In [34]:
def tokenize_lemmatize(df=df, textcolumn='cleantext'):
    tweetlist = []
    for tweet in df[textcolumn].values.tolist():
        tweet = simple_preprocess(str(tweet), deacc=True)
        tweetlist.append(tweet)
    
    # Instantiate and build the bigram and trigram models
    bigram = gensim.models.Phrases(tweetlist, min_count=5, threshold=100) # higher threshold fewer phrases.
    trigram = gensim.models.Phrases(bigram[tweetlist], threshold=100)  
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    trigram_mod = gensim.models.phrases.Phraser(trigram)
    
    # Form bigrams and trigrams with the tweet content
    tweetlist = [[word for word in simple_preprocess(str(tweet)) if word not in stop_words] for tweet in tweetlist]
    tweetlist = [bigram_mod[tweet] for tweet in tweetlist]
    tweetlist = [trigram_mod[bigram_mod[tweet]] for tweet in tweetlist]
    
    output = []
    nlp = spacy.load('en_core_web_lg', disable=['parser', 'ner'])
    
    for word in tweetlist:
        fulltweet = nlp(" ".join(word)) 
        output.append([token.lemma_ for token in fulltweet])
        
    # Final pass to filter the tokens for stopwords
    output = [[word for word in simple_preprocess(str(tweet)) if word not in stop_words] for tweet in output]
    df['tokens'] = output
    return df

df = tokenize_lemmatize()
    

In [None]:
# Adjusting the time

In [85]:
df['datetime']=pd.to_datetime(df['date'],format='%d/%m/%Y %H:%M')

In [87]:
df['time'] = df['datetime'].dt.time

In [89]:
df.to_csv('../datasets/avlliv_cleaned_tokenized.csv', index_label=False)

In [37]:
df['tokens']

['username', 'jurgen', 'klopp', 'pretty', 'sure', 'ready', 'see', 'week']

In [88]:
df

Unnamed: 0,date,user,is_retweet,is_quote,text,quoted_text,mentions,hashtags,cleantext,lang,tokens,datetime,time
0,4/10/2020 18:10,justincroser,False,False,come on reds!! 🔴🔴 have to sleep for work but h...,,,"['#AVLLIV', '#LFC', '#LFCFamily', '#YNWA']",come on reds have to sleep for work but hoping...,1,"[come, red, sleep, work, hope, win, go, watch,...",2020-10-04 18:10:00,18:10:00
1,4/10/2020 18:10,LFCYNWA125,True,False,rt @lfc: jürgen klopp provides detail on the s...,,"['@LFC', '@Alissonbecker']","['#LFC', '#AVLLIV']",username jrgen klopp provides detail on the sh...,1,"[username, detail, shoulder_injury, sideline, ...",2020-10-04 18:10:00,18:10:00
2,4/10/2020 18:10,itstugenfinest,True,False,rt @skysportspl: 'i'm pretty sure he won't be ...,,['@SkySportsPL'],,username im pretty sure he wont be ready after...,1,"[username, pretty, sure, ready, set, week, sid...",2020-10-04 18:10:00,18:10:00
3,4/10/2020 18:10,guu_mendees,True,False,rt @ludovicofans: now follow the news l...,,['@ludovicofans'],,username now follow the news live live streami...,1,"[username, follow, news, live, live, streaming...",2020-10-04 18:10:00,18:10:00
4,4/10/2020 18:10,justindivine5,True,True,rt @anfieldwatch: jurgen klopp: “it’s an inter...,Liverpool face an anxious wait on how long the...,['@AnfieldWatch'],,username jurgen klopp its an international bre...,1,"[username, jurgen, klopp, pretty, sure, ready,...",2020-10-04 18:10:00,18:10:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...
207722,4/10/2020 20:20,notbitterbetter,False,False,villa were poor there should’ve scored 10 or 11.,,,,villa were poor there shouldve scored or,1,"[poor, score]",2020-10-04 20:20:00,20:20:00
207723,4/10/2020 20:20,artDante1,False,False,good time to be alive... manchester united lo...,,,['#AVLLFC'],good time to be alive manchester united loses ...,1,"[good, time, alive, manchester, united, lose, ...",2020-10-04 20:20:00,20:20:00
207724,4/10/2020 20:20,jonesy73,False,False,gutted that we couldn’t all be there together ...,,,,gutted that we couldnt all be there together t...,1,"[gutte, could, together, enjoy, tonight, trip,...",2020-10-04 20:20:00,20:20:00
207725,4/10/2020 20:20,ryanYNWA,True,False,rt @elliothackney: everyone: last season was b...,,['@ElliotHackney'],,username everyone last season was boring liver...,1,"[username, everyone, last, season, bore, run, ...",2020-10-04 20:20:00,20:20:00


In [92]:
terms = df[['datetime','time','tokens']].explode('tokens')

In [93]:
terms.to_csv('../datasets/avlliv_terms.csv', index_label = False)