<a href="https://colab.research.google.com/github/dbckz/dissertation/blob/master/notebooks/descriptive_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

In [222]:
import pandas as pd
import numpy as np
import ast
import os
import matplotlib.pyplot as plt
import statsmodels.api as sm
from wordcloud import WordCloud
import dateutil
from tqdm import tqdm
from google.colab import drive
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
from sklearn.linear_model import LinearRegression
from statsmodels.genmod.generalized_estimating_equations import GEE
from statsmodels.genmod.cov_struct import (Exchangeable,
    Independence,Autoregressive)
from statsmodels.genmod.families import Poisson

In [223]:
# Mount Google Drive at /content/drive (Colab only); the CSV paths used below live under this mount point.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [224]:
# Base folder on the mounted Drive and the sub-folder holding the regression inputs.
root_path = "/content/drive/MyDrive/University/Dissertation"
regression_path = "/regression"

# Read pens.csv, parsing the `date` column as datetimes.
pens_csv = f"{root_path}{regression_path}/pens.csv"
pens_df = pd.read_csv(pens_csv, parse_dates=['date'])

In [20]:
# Share of each row's tweets that contain a Hatebase slur / were flagged by Perspective.
pens_df['hatebase_proportion'] = pens_df['tweets_containing_slurs'] / pens_df['total_tweets']
pens_df['perspective_proportion'] = pens_df['tweets_flagged_perspective'] / pens_df['total_perspective_tweets']

# Replace missing values with 0.0. For the two proportions this covers rows
# where no tweets were received (0 / 0 evaluates to NaN).
# The result is assigned back to the frame: calling fillna(inplace=True) on a
# column selection is chained assignment, which pandas has deprecated and which
# leaves the frame unchanged once copy-on-write is enabled.
zero_filled_columns = [
    'hatebase_proportion',
    'perspective_proportion',
    'player_rating',
    'player_rating_in_previous_game',
    'club_coefficient',
]
pens_df[zero_filled_columns] = pens_df[zero_filled_columns].fillna(0.0)

In [21]:
# Keep rows at most two days after the defeat where `featured` is True.
# Both conditions go into a single mask: chaining two boolean selections applied
# a mask built on the unfiltered frame to the already-filtered one, which made
# pandas reindex it and emit "Boolean Series key will be reindexed".
pens_df = pens_df[(pens_df['days_since_defeat'] <= 2) & (pens_df['featured'] == True)]


Boolean Series key will be reindexed to match DataFrame index.



In [22]:
# Collapse the filtered rows to one row per player by summing the numeric columns.
# numeric_only=True keeps the sum away from the parsed `date` column and the
# text columns; since pandas 2.0 group-by reductions no longer drop such
# columns silently, and summing datetimes raises a TypeError.
agged = pens_df.groupby(['name']).sum(numeric_only=True)

# Proportions must be rebuilt from the summed counts (a sum of per-day ratios is not a ratio).
agged['hatebase_proportion'] = agged['tweets_containing_slurs'] / agged['total_tweets']
agged['perspective_proportion'] = agged['tweets_flagged_perspective'] / agged['total_perspective_tweets']

In [23]:
# Bar colour per player, keyed on the summed `pen` value: -1 -> red, 0 -> orange, 1 -> green.
# Any other `pen` value is carried through unchanged (as text).
agged['color'] = agged['pen']
for pen_value, bar_colour in ((-1, 'red'), (0, 'orange'), (1, 'green')):
    agged['color'] = np.where(agged['pen'] == pen_value, bar_colour, agged['color'])


In [24]:
# Players ranked from highest to lowest share of tweets containing Hatebase slurs.
agged.sort_values('hatebase_proportion', ascending=False)[['pen', 'hatebase_proportion', 'color']]

Unnamed: 0_level_0,pen,hatebase_proportion,color
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Kyle Walker,0,0.009449,orange
Jordan Henderson,0,0.007511,orange
Jadon Sancho,-1,0.006977,red
Marcus Rashford,-1,0.006443,red
Raheem Sterling,0,0.005474,orange
Bukayo Saka,-1,0.005314,red
Jack Grealish,0,0.005304,orange
Harry Maguire,1,0.004996,green
Harry Kane,1,0.00458,green
Jordan Pickford,0,0.003025,orange


In [25]:
# Players with a non-zero Hatebase share, highest first. Filtered and sorted
# once, then reused for the x values, bar heights and bar colours.
hatebase_share_ranked = agged.loc[
    agged['hatebase_proportion'] > 0,
    ['pen', 'hatebase_proportion', 'color'],
].sort_values('hatebase_proportion', ascending=False)

data = go.Bar(
    x=hatebase_share_ranked.index,
    y=hatebase_share_ranked['hatebase_proportion'],
    marker_color=hatebase_share_ranked['color'],
)

layout = go.Layout(yaxis_title="Proportion of tweets containing Hatebase slurs")

figure = go.Figure(data=data, layout=layout)
figure.show()

In [26]:
# Players with a non-zero Perspective-flagged share, highest first. Filtered
# and sorted once, then reused for the x values, bar heights and bar colours.
perspective_share_ranked = agged.loc[
    agged['perspective_proportion'] > 0,
    ['pen', 'perspective_proportion', 'color'],
].sort_values('perspective_proportion', ascending=False)

data = go.Bar(
    x=perspective_share_ranked.index,
    y=perspective_share_ranked['perspective_proportion'],
    marker_color=perspective_share_ranked['color'],
)

layout = go.Layout(yaxis_title="Proportion of tweets flagged by Perspective")

figure = go.Figure(data=data, layout=layout)
figure.show()

In [27]:
# Players with at least one tweet containing a Hatebase slur, highest count
# first. Filtered and sorted once, then reused for every bar attribute.
hatebase_count_ranked = agged.loc[
    agged['tweets_containing_slurs'] > 0,
    ['pen', 'tweets_containing_slurs', 'color'],
].sort_values('tweets_containing_slurs', ascending=False)

data = go.Bar(
    x=hatebase_count_ranked.index,
    y=hatebase_count_ranked['tweets_containing_slurs'],
    marker_color=hatebase_count_ranked['color'],
)

layout = go.Layout(yaxis_title="Number of tweets containing Hatebase slurs")

figure = go.Figure(data=data, layout=layout)
figure.show()

In [28]:
# Players with at least one Perspective-flagged tweet, highest count first.
# Filtered and sorted once, then reused for every bar attribute.
perspective_count_ranked = agged.loc[
    agged['tweets_flagged_perspective'] > 0,
    ['pen', 'tweets_flagged_perspective', 'color'],
].sort_values('tweets_flagged_perspective', ascending=False)

data = go.Bar(
    x=perspective_count_ranked.index,
    y=perspective_count_ranked['tweets_flagged_perspective'],
    marker_color=perspective_count_ranked['color'],
)

layout = go.Layout(yaxis_title="Number of tweets flagged by Perspective")

figure = go.Figure(data=data, layout=layout)
figure.show()

# Pens

In [None]:
# Print, for day 0 and day 1 after the defeat, each player's Perspective-flagged
# tweet count, highest first. range(2) does not reach day 2.
for day in range(2):
  flagged_on_day = pens_df.loc[
      pens_df['days_since_defeat'] == day,
      ['name', 'pen', 'tweets_flagged_perspective'],
  ]
  print(flagged_on_day.sort_values('tweets_flagged_perspective', ascending=False))

                  name  pen  tweets_flagged_perspective
9      Marcus Rashford   -1                         667
22         Bukayo Saka   -1                         202
14        Jadon Sancho   -1                          99
151      Kylian Mbappé   -1                          59
8      Raheem Sterling    0                          52
7           Harry Kane    1                          38
2            Luke Shaw    0                          25
148         Paul Pogba    1                          22
4        Harry Maguire    1                          20
0      Jordan Pickford    0                          17
5        Jack Grealish    0                          16
158      Karim Benzema    0                          16
3          Declan Rice    0                          12
6     Jordan Henderson    0                           9
12     Kalvin Phillips    0                           6
147    Clément Lenglet    0                           2
16         Mason Mount    0                     

In [142]:
# One line per player: Perspective-flagged share against days since the defeat.
fig = px.line(
    pens_df,
    x="days_since_defeat",
    y="perspective_proportion",
    color='name',
)
fig.show()

# Race

In [306]:
# Read the regression table, parsing the `date` column as datetimes.
reg_df = pd.read_csv(root_path + regression_path + "/regression_table_with_persp.csv",
                     parse_dates=['date'])

# Share of each row's tweets that contain a Hatebase slur / were flagged by Perspective.
reg_df['hatebase_proportion'] = reg_df['tweets_containing_slurs'] / reg_df['total_tweets']
reg_df['perspective_proportion'] = reg_df['tweets_flagged_perspective'] / reg_df['total_perspective_tweets']

# Replace missing values with 0.0. For the two proportions this covers rows
# where no tweets were received (0 / 0 evaluates to NaN).
# The result is assigned back to the frame: calling fillna(inplace=True) on a
# column selection is chained assignment, which pandas has deprecated and which
# leaves the frame unchanged once copy-on-write is enabled.
zero_filled_columns = [
    'hatebase_proportion',
    'perspective_proportion',
    'player_rating',
    'player_rating_in_previous_game',
    'club_coefficient',
]
reg_df[zero_filled_columns] = reg_df[zero_filled_columns].fillna(0.0)

In [307]:
# Keep only rows from the day of a game and the two days after it.
reg_df = reg_df[reg_df['days_since_last_game'].isin([0, 1, 2])]

# Spot-check the remaining rows for a single player.
reg_df.loc[reg_df['name'].eq('Thibaut Courtois')]

Unnamed: 0,name,country,country_ranking_points,club,club_coefficient,handle,ethnicity,date,days_since_last_game,featured,opponent,player_rating,matchday,result,featured_in_previous_game,player_rating_in_previous_game,result_in_previous_game,pen_in_previous_game,round,red_card,penalty,penalty_outcome,pen,total_tweets,total_perspective_tweets,tweets_containing_slurs,tweets_flagged_perspective,hatebase_proportion,perspective_proportion
118,Thibaut Courtois,Belgium,1783,Spain Real Madrid,127.0,thibautcourtois,white,2021-06-21,0.0,True,Finland,6.6,True,W,True,7.58,W,0,group_stage,False,False,False,0,238,238,0,0,0.0,0.0
236,Thibaut Courtois,Belgium,1783,Spain Real Madrid,127.0,thibautcourtois,white,2021-06-22,1.0,False,Finland,0.0,False,,True,6.6,W,0,,False,False,False,0,23,23,0,0,0.0,0.0
354,Thibaut Courtois,Belgium,1783,Spain Real Madrid,127.0,thibautcourtois,white,2021-06-23,2.0,False,Finland,0.0,False,,True,6.6,W,0,,False,False,False,0,116,116,0,0,0.0,0.0
826,Thibaut Courtois,Belgium,1783,Spain Real Madrid,127.0,thibautcourtois,white,2021-06-27,0.0,True,Portugal,7.82,True,W,True,6.6,W,0,round_of_16,False,False,False,0,579,579,1,0,0.001727,0.0
944,Thibaut Courtois,Belgium,1783,Spain Real Madrid,127.0,thibautcourtois,white,2021-06-28,1.0,False,Portugal,0.0,False,,True,7.82,W,0,,False,False,False,0,208,208,0,1,0.0,0.004808
1062,Thibaut Courtois,Belgium,1783,Spain Real Madrid,127.0,thibautcourtois,white,2021-06-29,2.0,False,Portugal,0.0,False,,True,7.82,W,0,,False,False,False,0,61,61,0,1,0.0,0.016393
1416,Thibaut Courtois,Belgium,1783,Spain Real Madrid,127.0,thibautcourtois,white,2021-07-02,0.0,True,Italy,5.86,True,L,True,7.82,W,0,quarter_final,False,False,False,0,339,339,0,0,0.0,0.0
1534,Thibaut Courtois,Belgium,1783,Spain Real Madrid,127.0,thibautcourtois,white,2021-07-03,1.0,False,Italy,0.0,False,,True,5.86,L,0,,False,False,False,0,120,120,1,0,0.008333,0.0
1652,Thibaut Courtois,Belgium,1783,Spain Real Madrid,127.0,thibautcourtois,white,2021-07-04,2.0,False,Italy,0.0,False,,True,5.86,L,0,,False,False,False,0,38,38,0,1,0.0,0.026316


In [308]:
# Keep only England, Netherlands and Belgium, as they both won and lost games,
# by dropping the Scotland, France and Germany rows.
# NOTE(review): assumes these six are the only countries in the table - confirm.
sco = reg_df.loc[reg_df['country'] == 'Scotland'].index
fra = reg_df.loc[reg_df['country'] == 'France'].index
ger = reg_df.loc[reg_df['country'] == 'Germany'].index
reg_df.drop(sco.append([fra, ger]), inplace=True)

# Also drop every row for Saka, Rashford and Sancho.
saka = reg_df.loc[reg_df['name'] == 'Bukayo Saka'].index
rashford = reg_df.loc[reg_df['name'] == 'Marcus Rashford'].index
sancho = reg_df.loc[reg_df['name'] == 'Jadon Sancho'].index
reg_df.drop(saka.append([rashford, sancho]), inplace=True)

In [309]:
# Rows with no result get the placeholder 'A'.
# NOTE(review): 'A' sorts before 'L' and 'W', so taking `max` over `result`
# within a group would return the real result when one exists - confirm that is the intent.
reg_df['result'] = reg_df['result'].fillna('A')

In [310]:
# Collapse to one row per (player, opponent): counts are summed, while
# `ethnicity` and `result` take the group maximum.
# Aggregations are named by string; passing the builtins `sum` / `max` is
# deprecated in recent pandas. `total_perspective_tweets` is summed as well so
# the Perspective share below has the right denominator.
reg_df = reg_df.groupby(['name', 'opponent']).agg({
    'total_tweets': 'sum',
    'total_perspective_tweets': 'sum',
    'tweets_containing_slurs': 'sum',
    'tweets_flagged_perspective': 'sum',
    # The two means are placeholders that fix the column order; both are
    # overwritten with ratios of the summed counts just below.
    'hatebase_proportion': 'mean',
    'perspective_proportion': 'mean',
    'ethnicity': 'max',
    'result': 'max',
})

# Rebuild the shares from the summed counts. The Perspective share is divided
# by the number of tweets Perspective scored, as it is everywhere else in this
# notebook, not by the overall tweet count.
# NOTE(review): groups with a zero denominator become NaN here and are skipped
# by later means; they are not reset to 0 as in the per-row frames.
reg_df['hatebase_proportion'] = reg_df['tweets_containing_slurs'] / reg_df['total_tweets']
reg_df['perspective_proportion'] = reg_df['tweets_flagged_perspective'] / reg_df['total_perspective_tweets']


In [311]:
# (player, opponent) groups whose result is a loss.
tmp_L = reg_df.loc[reg_df['result'] == 'L']

# Overall means: raw counts first, then the two shares as percentages.
for count_column in ('tweets_containing_slurs', 'tweets_flagged_perspective'):
    print(tmp_L[count_column].mean())
for share_column in ('hatebase_proportion', 'perspective_proportion'):
    print(tmp_L[share_column].mean() * 100)

# The same four means, split by ethnicity.
loss_metrics = [
    'tweets_containing_slurs',
    'tweets_flagged_perspective',
    'hatebase_proportion',
    'perspective_proportion',
]
ethnicity_df_L = tmp_L.groupby('ethnicity').agg(dict.fromkeys(loss_metrics, 'mean'))

13.362068965517242
8.568965517241379
0.9107059636116089
0.2115524531236946


In [312]:
# Display labels for the chart axis; index values without a label are kept as they are.
ethnicity_labels = {'white': 'White', 'non_white': 'Other than White'}
ethnicity_df_L['ethnicity_text'] = [
    ethnicity_labels.get(group, group) for group in ethnicity_df_L.index
]

ethnicity_df_L

Unnamed: 0_level_0,tweets_containing_slurs,tweets_flagged_perspective,hatebase_proportion,perspective_proportion,ethnicity_text
ethnicity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
non_white,15.65,9.85,0.011404,0.003828,Other than White
white,12.157895,7.894737,0.007831,0.001164,White


In [313]:
# Scale factor that would map the largest group share to 100 (not referenced again in this cell).
normalising_factor = 100 / ethnicity_df_L['perspective_proportion'].max()

# Horizontal bars: mean Perspective-flagged share per ethnicity, as a percentage, for lost games.
# The bar labels read from ethnicity_df_L, the same frame as the bar lengths;
# the previous `ethnicity_df` is not defined anywhere in this notebook.
data = go.Bar(
    y = ethnicity_df_L.ethnicity_text,
    x = ethnicity_df_L.perspective_proportion * 100,
    text = ethnicity_df_L.perspective_proportion * 100,
    textposition = 'outside',
    texttemplate = '%{text:.5f}',
    orientation='h',
    width=[0.5]*2
)

# Tick labels are hidden; the value is printed next to each bar instead.
layout = go.Layout(xaxis = dict(
        showticklabels = False,
        title = "Proportion of tweets flagged by Perspective"
    ))

figure = go.Figure(data = data, layout = layout)

figure.update_traces(marker_color='green')
figure.show()

# This shows that, for games their team lost, non-white players received a higher proportion of Perspective-flagged tweets than white players (roughly three times as high in the table above). This omits the penalty missers Saka, Sancho, Mbappé and Rashford.

In [314]:
# Scale factor that would map the largest group share to 100 (not referenced again in this cell).
normalising_factor = 100 / ethnicity_df_L['hatebase_proportion'].max()

# Mean Hatebase share per ethnicity for lost games, as a percentage; used for
# both the bar length and the bar label.
hatebase_pct_L = ethnicity_df_L['hatebase_proportion'] * 100

data = go.Bar(
    x=hatebase_pct_L,
    y=ethnicity_df_L['ethnicity_text'],
    orientation='h',
    width=[0.5, 0.5],
    text=hatebase_pct_L,
    textposition='outside',
    texttemplate='%{text:.5f}',
)

# Tick labels are hidden; the value is printed next to each bar instead.
layout = go.Layout(
    xaxis={
        'showticklabels': False,
        'title': "Proportion of tweets containing Hatebase slurs",
    }
)

figure = go.Figure(data=data, layout=layout)
figure.show()

# This shows that, for games their team lost, non-white players received a higher proportion of tweets containing Hatebase slurs than white players (roughly one and a half times as high in the table above). This omits the penalty missers Saka, Sancho, Mbappé and Rashford.

In [315]:
# Mean number of tweets containing Hatebase slurs per ethnicity for lost games;
# used for both the bar length and the bar label.
slur_mean_L = ethnicity_df_L['tweets_containing_slurs']

data = go.Bar(
    x=slur_mean_L,
    y=ethnicity_df_L['ethnicity_text'],
    orientation='h',
    width=[0.5, 0.5],
    text=slur_mean_L,
    textposition='outside',
    texttemplate='%{text:.1f}',
)

# Tick labels are hidden; the value is printed next to each bar instead.
layout = go.Layout(
    xaxis={
        'showticklabels': False,
        'title': "Number of tweets containing Hatebase slurs (per player)",
    }
)

figure = go.Figure(data=data, layout=layout)
figure.show()

# This needs to be done per player, as there are more white than non-white players in the sample.

In [316]:
# Mean number of Perspective-flagged tweets per ethnicity for lost games;
# used for both the bar length and the bar label.
flagged_mean_L = ethnicity_df_L['tweets_flagged_perspective']

data = go.Bar(
    x=flagged_mean_L,
    y=ethnicity_df_L['ethnicity_text'],
    orientation='h',
    width=[0.5, 0.5],
    text=flagged_mean_L,
    textposition='outside',
    texttemplate='%{text:.1f}',
)

# Tick labels are hidden; the value is printed next to each bar instead.
layout = go.Layout(
    xaxis={
        'showticklabels': False,
        'title': "Number of tweets flagged by Perspective (per player)",
    }
)

figure = go.Figure(data=data, layout=layout)
figure.update_traces(marker_color='green')
figure.show()



In [318]:
# (player, opponent) groups whose result is a win.
tmp_W = reg_df.loc[reg_df['result'] == 'W']

# Overall means: raw counts first, then the two shares as percentages.
for count_column in ('tweets_containing_slurs', 'tweets_flagged_perspective'):
    print(tmp_W[count_column].mean())
for share_column in ('hatebase_proportion', 'perspective_proportion'):
    print(tmp_W[share_column].mean() * 100)

# The same four means, split by ethnicity.
win_metrics = [
    'tweets_containing_slurs',
    'tweets_flagged_perspective',
    'hatebase_proportion',
    'perspective_proportion',
]
ethnicity_df_W = tmp_W.groupby('ethnicity').agg(dict.fromkeys(win_metrics, 'mean'))

8.556338028169014
6.753521126760563
0.44411684194821777
0.1221615696400339


In [319]:
# Display labels for the chart axis; index values without a label are kept as they are.
ethnicity_labels = {'white': 'White', 'non_white': 'Other than White'}
ethnicity_df_W['ethnicity_text'] = [
    ethnicity_labels.get(group, group) for group in ethnicity_df_W.index
]

ethnicity_df_W

Unnamed: 0_level_0,tweets_containing_slurs,tweets_flagged_perspective,hatebase_proportion,perspective_proportion,ethnicity_text
ethnicity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
non_white,11.466667,8.488889,0.007281,0.000921,Other than White
white,7.206186,5.948454,0.003124,0.001361,White


In [320]:
# Scale factor that would map the largest group share to 100 (not referenced again in this cell).
normalising_factor = 100 / ethnicity_df_W['perspective_proportion'].max()

# Mean Perspective-flagged share per ethnicity for won games, as a percentage;
# used for both the bar length and the bar label.
perspective_pct_W = ethnicity_df_W['perspective_proportion'] * 100

data = go.Bar(
    x=perspective_pct_W,
    y=ethnicity_df_W['ethnicity_text'],
    orientation='h',
    width=[0.5, 0.5],
    text=perspective_pct_W,
    textposition='outside',
    texttemplate='%{text:.5f}',
)

# Tick labels are hidden; the value is printed next to each bar instead.
layout = go.Layout(
    xaxis={
        'showticklabels': False,
        'title': "Proportion of tweets flagged by Perspective",
    }
)

figure = go.Figure(data=data, layout=layout)
figure.update_traces(marker_color='green')

figure.show()

In [322]:
# Scale factor that would map the largest group share to 100 (not referenced again in this cell).
normalising_factor = 100 / ethnicity_df_W['hatebase_proportion'].max()

# Mean Hatebase share per ethnicity for won games, as a percentage; used for
# both the bar length and the bar label.
hatebase_pct_W = ethnicity_df_W['hatebase_proportion'] * 100

data = go.Bar(
    x=hatebase_pct_W,
    y=ethnicity_df_W['ethnicity_text'],
    orientation='h',
    width=[0.5, 0.5],
    text=hatebase_pct_W,
    textposition='outside',
    texttemplate='%{text:.5f}',
)

# Tick labels are hidden; the value is printed next to each bar instead.
layout = go.Layout(
    xaxis={
        'showticklabels': False,
        'title': "Proportion of tweets containing Hatebase slurs",
    }
)

figure = go.Figure(data=data, layout=layout)

figure.show()

In [323]:
# Mean number of Perspective-flagged tweets per ethnicity for won games;
# used for both the bar length and the bar label.
flagged_mean_W = ethnicity_df_W['tweets_flagged_perspective']

data = go.Bar(
    x=flagged_mean_W,
    y=ethnicity_df_W['ethnicity_text'],
    orientation='h',
    width=[0.5, 0.5],
    text=flagged_mean_W,
    textposition='outside',
    texttemplate='%{text:.1f}',
)

# Tick labels are hidden; the value is printed next to each bar instead.
layout = go.Layout(
    xaxis={
        'showticklabels': False,
        'title': "Number of tweets flagged by Perspective (per player)",
    }
)

figure = go.Figure(data=data, layout=layout)
figure.update_traces(marker_color='green')
figure.show()



In [324]:
# Mean number of tweets containing Hatebase slurs per ethnicity for won games;
# used for both the bar length and the bar label.
slur_mean_W = ethnicity_df_W['tweets_containing_slurs']

data = go.Bar(
    x=slur_mean_W,
    y=ethnicity_df_W['ethnicity_text'],
    orientation='h',
    width=[0.5, 0.5],
    text=slur_mean_W,
    textposition='outside',
    texttemplate='%{text:.1f}',
)

# Tick labels are hidden; the value is printed next to each bar instead.
layout = go.Layout(
    xaxis={
        'showticklabels': False,
        'title': "Number of tweets containing Hatebase slurs (per player)",
    }
)

figure = go.Figure(data=data, layout=layout)
figure.show()



In [325]:
# (player, opponent) groups whose result is either a win or a loss.
tmp_WL = reg_df.loc[(reg_df['result'] == 'W') | (reg_df['result'] == 'L')]

# Overall means: raw counts first, then the two shares as percentages.
for count_column in ('tweets_containing_slurs', 'tweets_flagged_perspective'):
    print(tmp_WL[count_column].mean())
for share_column in ('hatebase_proportion', 'perspective_proportion'):
    print(tmp_WL[share_column].mean() * 100)

# The same four means, split by ethnicity.
win_loss_metrics = [
    'tweets_containing_slurs',
    'tweets_flagged_perspective',
    'hatebase_proportion',
    'perspective_proportion',
]
ethnicity_df_WL = tmp_WL.groupby('ethnicity').agg(dict.fromkeys(win_loss_metrics, 'mean'))

9.95
7.28
0.5760814420146313
0.1474438397162208


In [326]:
# Show the per-ethnicity means for wins and losses combined.
ethnicity_df_WL

Unnamed: 0_level_0,tweets_containing_slurs,tweets_flagged_perspective,hatebase_proportion,perspective_proportion
ethnicity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
non_white,12.753846,8.907692,0.008549,0.001815
white,8.6,6.496296,0.004398,0.001308


In [327]:
# Display labels for the chart axis; index values without a label are kept as they are.
ethnicity_labels = {'white': 'White', 'non_white': 'Other than White'}
ethnicity_df_WL['ethnicity_text'] = [
    ethnicity_labels.get(group, group) for group in ethnicity_df_WL.index
]

ethnicity_df_WL

Unnamed: 0_level_0,tweets_containing_slurs,tweets_flagged_perspective,hatebase_proportion,perspective_proportion,ethnicity_text
ethnicity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
non_white,12.753846,8.907692,0.008549,0.001815,Other than White
white,8.6,6.496296,0.004398,0.001308,White


In [328]:
# Scale factor that would map the largest group share to 100 (not referenced again in this cell).
normalising_factor = 100 / ethnicity_df_WL['perspective_proportion'].max()

# Mean Perspective-flagged share per ethnicity for wins and losses combined, as
# a percentage; used for both the bar length and the bar label.
perspective_pct_WL = ethnicity_df_WL['perspective_proportion'] * 100

data = go.Bar(
    x=perspective_pct_WL,
    y=ethnicity_df_WL['ethnicity_text'],
    orientation='h',
    width=[0.5, 0.5],
    text=perspective_pct_WL,
    textposition='outside',
    texttemplate='%{text:.5f}',
)

# Tick labels are hidden; the value is printed next to each bar instead.
layout = go.Layout(
    xaxis={
        'showticklabels': False,
        'title': "Proportion of tweets flagged by Perspective",
    }
)

figure = go.Figure(data=data, layout=layout)
figure.update_traces(marker_color='green')

figure.show()

In [329]:
# Scale factor that would map the largest group share to 100 (not referenced again in this cell).
normalising_factor = 100 / ethnicity_df_WL['hatebase_proportion'].max()

# Mean Hatebase share per ethnicity for wins and losses combined, as a
# percentage; used for both the bar length and the bar label.
hatebase_pct_WL = ethnicity_df_WL['hatebase_proportion'] * 100

data = go.Bar(
    x=hatebase_pct_WL,
    y=ethnicity_df_WL['ethnicity_text'],
    orientation='h',
    width=[0.5, 0.5],
    text=hatebase_pct_WL,
    textposition='outside',
    texttemplate='%{text:.5f}',
)

# Tick labels are hidden; the value is printed next to each bar instead.
layout = go.Layout(
    xaxis={
        'showticklabels': False,
        'title': "Proportion of tweets containing Hatebase slurs",
    }
)

figure = go.Figure(data=data, layout=layout)

figure.show()

In [330]:
# Mean number of Perspective-flagged tweets per ethnicity for wins and losses
# combined; used for both the bar length and the bar label.
flagged_mean_WL = ethnicity_df_WL['tweets_flagged_perspective']

data = go.Bar(
    x=flagged_mean_WL,
    y=ethnicity_df_WL['ethnicity_text'],
    orientation='h',
    width=[0.5, 0.5],
    text=flagged_mean_WL,
    textposition='outside',
    texttemplate='%{text:.1f}',
)

# Tick labels are hidden; the value is printed next to each bar instead.
layout = go.Layout(
    xaxis={
        'showticklabels': False,
        'title': "Number of tweets flagged by Perspective (per player)",
    }
)

figure = go.Figure(data=data, layout=layout)
figure.update_traces(marker_color='green')
figure.show()



In [331]:
# Mean number of tweets containing Hatebase slurs per ethnicity for wins and
# losses combined; used for both the bar length and the bar label.
slur_mean_WL = ethnicity_df_WL['tweets_containing_slurs']

data = go.Bar(
    x=slur_mean_WL,
    y=ethnicity_df_WL['ethnicity_text'],
    orientation='h',
    width=[0.5, 0.5],
    text=slur_mean_WL,
    textposition='outside',
    texttemplate='%{text:.1f}',
)

# Tick labels are hidden; the value is printed next to each bar instead.
layout = go.Layout(
    xaxis={
        'showticklabels': False,
        'title': "Number of tweets containing Hatebase slurs (per player)",
    }
)

figure = go.Figure(data=data, layout=layout)
figure.show()

