<a href="https://colab.research.google.com/github/dbckz/dissertation/blob/master/notebooks/regression_prep.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Regression prep

In [17]:
import pandas as pd
import numpy as np
import ast
import os
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import dateutil
from tqdm import tqdm
from google.colab import drive
import plotly.graph_objects as go

In [18]:
# Attach Google Drive to this Colab runtime; the data paths used below live under it.
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [19]:
# Load regression_table.csv and turn its 'Y'-coded flag columns into booleans
root_path = "/content/drive/MyDrive/University/Dissertation"
regression_path = "/regression"
regression_file = "/regression_table.csv"

print("Loading regression table CSV...")
reg_df = pd.read_csv(f"{root_path}{regression_path}{regression_file}",
                     parse_dates=['date'])

# A cell equal to 'Y' becomes True; anything else (including missing) becomes False.
for flag_column in ('featured',
                    'featured_in_previous_game',
                    'matchday',
                    'red_card',
                    'penalty',
                    'penalty_outcome'):
  reg_df[flag_column] = reg_df[flag_column].eq('Y')

Loading regression table CSV...


In [20]:
# Handles whose columns are read from the tweet CSVs. Each handle has a column of
# its own plus a matching '<handle>_offensive' column, so only the handles are
# listed here and the second set of names is derived from them.
PLAYER_HANDLES = (
  'JPickford1', 'kylewalker2', 'LukeShaw23', '_DeclanRice', 'HarryMaguire93',
  'JackGrealish', 'JHenderson', 'HKane', 'sterling7', 'MarcusRashford',
  'trippier2', 'deanhenderson', 'Kalvinphillips', 'OfficialTM_3', 'Sanchooo10',
  'CalvertLewin14', 'masonmount_10', 'PhilFoden', 'BenChilwell', 'ben6white',
  'samjohnstone50', 'reecejames_24', 'BukayoSaka87', 'BellinghamJude',
  'joel_veltman', 'mdeligt_04', 'NathanAke', 'Stefandevrij', 'GWijnaldum',
  'LuukdeJong9', 'Memphis', 'QPromes', 'pvanaanholt', 'TimKrul',
  'DavyKlaassen', 'Dirono', 'RGravenberch', 'BlindDaley', 'DeJongFrenkie21',
  'DenzelJMD2',
  'Manuel_Neuer', 'ToniRuediger', 'MatzeGinter', 'matshummels', 'kaihavertz29',
  'ToniKroos', 'KeVolland', 'SergeGnabry', 'Bernd_Leno', 'JamalMusiala',
  'lukaskl96', 'leongoretzka_', 'leroy_sane', 'IlkayGuendogan', 'emrecan_',
  'RobinKoch25', 'esmuellert_',
  'MarshallDavid23', 'sodonnell15', 'andrewrobertso5', 'mctominay10',
  'granthanley5', 'kierantierney1', 'jmcginn7', 'Callummcgregor8',
  'Lyndon_Dykes', 'CheAdams_', 'CraigGordon01', 'declang31', 'LiamCooper__',
  '10DavidTurnbull', 'kevinnisbet16', 'np4tterson', 'billygilmourrr',
  'Jack_Hendry2', 'Scottmckenna3',
  'BenPavard28', 'kimpembe_3', 'raphaelvarane', 'clement_lenglet', 'paulpogba',
  'AntoGriezmann', '_OlivierGiroud_', 'KMbappe', 'CorentinTolisso', 'nglkante',
  'KurtZouma', 'SteveMandanda', 'MoussaSissoko', 'LucasDigne', 'Benzema',
  'LucasHernandez', 'WissBenYedder', 'mmseize', 'leodubois15', 'jkeey4',
  'MarcusThuram',
  'thibautcourtois', 'AlderweireldTob', 'thomasvermaelen', 'JanVertonghen',
  'axelwitsel28', 'DeBruyneKev', 'RomeluLukaku9', 'hazardeden10', 'CarrascoY21',
  'SMignolet', 'dries_mertens14', 'ThomMills', 'HazardThorgan8', 'VanakenHans',
  'Jasondenayer', 'chrisbenteke', 'NChadli', 'mbatshuayi', 'LTrossard',
  'JeremyDoku', 'dennispraet',
)


def load_tweet_data(path):
  """Read the timestamp and per-handle columns of a tweet CSV.

  Only 'created_at', one column per entry of PLAYER_HANDLES and the matching
  '<handle>_offensive' columns are loaded; any other column in the file is
  ignored.

  Args:
    path: Location of the CSV file (anything pandas.read_csv accepts).

  Returns:
    DataFrame in which 'created_at' holds timezone-naive datetimes. A UTC
    offset present in the text is dropped, not converted, so the wall-clock
    time written in the file is kept. All other columns are as parsed by
    pandas.

  Raises:
    ValueError: raised by pandas.read_csv when an expected column is missing.
  """
  # Imported here because a bare `import dateutil` does not guarantee that the
  # `dateutil.parser` submodule has been loaded.
  from dateutil import parser as dateutil_parser

  wanted_columns = ['created_at']
  wanted_columns.extend(PLAYER_HANDLES)
  wanted_columns.extend(handle + '_offensive' for handle in PLAYER_HANDLES)

  tweets = pd.read_csv(path, usecols=wanted_columns)

  # Timestamps are parsed after reading instead of through read_csv's
  # `date_parser` argument, which has been deprecated since pandas 2.0.
  # Missing values are skipped and end up as NaT.
  parsed = tweets['created_at'].map(
      lambda raw: dateutil_parser.parse(raw, ignoretz=True),
      na_action='ignore')
  tweets['created_at'] = pd.to_datetime(parsed)
  return tweets

In [21]:
# Tweet table that later feeds the *_perspective count columns of reg_df
persp_df = load_tweet_data(f"{root_path}/data_collection/regression_tweets.csv")

In [22]:
# Tweet table that later feeds the total_tweets / tweets_containing_slurs columns of reg_df
hb_df = load_tweet_data(f"{root_path}/data_collection/regression_tweets_hb.csv")

In [23]:
# Summarise the loaded frame (row count, column count, dtypes, memory use)
persp_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1046319 entries, 0 to 1046318
Columns: 237 entries, created_at to dennispraet_offensive
dtypes: bool(236), datetime64[ns](1)
memory usage: 243.5 MB


In [24]:
# Calculate appropriate tweet counts, and add them to the dataframe

def _aggregate_daily(tweets, start='2021-06-20 00:00:00', end='2021-07-17 00:00:00'):
  """Sum every column of `tweets` per calendar day within [start, end).

  Args:
    tweets: DataFrame with a datetime 'created_at' column.
    start: Inclusive lower bound on 'created_at'.
    end: Exclusive upper bound on 'created_at'.

  Returns:
    DataFrame with one row per day, 'created_at' restored as a regular column
    and every other column holding that day's sum.
  """
  in_window = (tweets['created_at'] >= start) & (tweets['created_at'] < end)
  daily = tweets[in_window].resample(pd.offsets.Day(1), on='created_at').sum()
  return daily.reset_index()

# Aggregate tweets per day
persp_df = _aggregate_daily(persp_df)
hb_df = _aggregate_daily(hb_df)

# Get rid of 19th June as we don't have the full day's tweets.
# .copy() makes reg_df an independent frame, so the column assignments in the
# following cells do not write into a slice of the original.
reg_df = reg_df[reg_df['date'] > '2021-06-19'].copy()


In [25]:
# Index the daily aggregates by day so every (day, handle) lookup returns a
# single scalar instead of a one-element Series cut out by a boolean filter.
persp_by_day = persp_df.set_index('created_at')
hb_by_day = hb_df.set_index('created_at')

# One (date, handle) pair per row of reg_df, in row order.
day_handle_pairs = list(zip(reg_df['date'], reg_df['handle']))

# Whole columns are assigned at once. A date that is missing from the daily
# frames raises KeyError here rather than leaving an ill-defined value behind.
reg_df['total_perspective_tweets'] = [
    persp_by_day.at[day, handle] for day, handle in day_handle_pairs]
reg_df['tweets_flagged_perspective'] = [
    persp_by_day.at[day, handle + '_offensive'] for day, handle in day_handle_pairs]
reg_df['total_tweets'] = [
    hb_by_day.at[day, handle] for day, handle in day_handle_pairs]
reg_df['tweets_containing_slurs'] = [
    hb_by_day.at[day, handle + '_offensive'] for day, handle in day_handle_pairs]

In [26]:
# Store the four tweet-count columns as 32-bit integers
for count_column in ('total_perspective_tweets',
                     'tweets_flagged_perspective',
                     'total_tweets',
                     'tweets_containing_slurs'):
  reg_df[count_column] = reg_df[count_column].astype(np.int32)

In [27]:
# Preview the first rows, including the tweet-count columns filled in above
reg_df.head()

Unnamed: 0,name,country,country_ranking_points,club,club_coefficient,handle,ethnicity,date,days_since_last_game,featured,opponent,player_rating,matchday,result,featured_in_previous_game,player_rating_in_previous_game,result_in_previous_game,pen_in_previous_game,round,red_card,penalty,penalty_outcome,pen,total_tweets,total_perspective_tweets,tweets_containing_slurs,tweets_flagged_perspective
118,Thibaut Courtois,Belgium,1783,Spain Real Madrid,127.0,thibautcourtois,white,2021-06-20,,False,,,False,,True,7.58,W,0,,False,False,False,0,30,30,0,0
119,Toby Alderweireld,Belgium,1783,England Tottenham Hotspur,88.0,AlderweireldTob,white,2021-06-20,,False,,,False,,True,6.78,W,0,,False,False,False,0,3,3,0,0
120,Thomas Vermaelen,Belgium,1783,Japan Vissel Kobe,,thomasvermaelen,white,2021-06-20,,False,,,False,,True,6.0,W,0,,False,False,False,0,10,10,0,0
121,Jan Vertonghen,Belgium,1783,Portugal Benfica,58.0,JanVertonghen,white,2021-06-20,,False,,,False,,True,6.97,W,0,,False,False,False,0,2,2,0,0
122,Axel Witsel,Belgium,1783,Germany Borussia Dortmund,90.0,axelwitsel28,non_white,2021-06-20,,False,,,False,,True,6.26,W,0,,False,False,False,0,27,27,0,0


In [28]:
# Save the enriched table under a *NEW* file name so the input CSV is not overwritten
new_regression_file = "/regression_table_with_persp.csv"
output_file = "".join([root_path, regression_path, new_regression_file])
reg_df.to_csv(path_or_buf=output_file, index=False)
