<a href="https://colab.research.google.com/github/dbckz/crossing-the-line/blob/master/notebooks/descriptive_analysis_england.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

In [1]:
import pandas as pd
import numpy as np
import ast
import os
import matplotlib.pyplot as plt
import statsmodels.api as sm
from wordcloud import WordCloud
import dateutil
from tqdm import tqdm
from google.colab import drive
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
from sklearn.linear_model import LinearRegression
from statsmodels.genmod.generalized_estimating_equations import GEE
from statsmodels.genmod.cov_struct import (Exchangeable,
    Independence,Autoregressive)
from statsmodels.genmod.families import Poisson

In [2]:
# Mount Google Drive at /content/drive (Colab only); the CSV inputs read below and the
# exported figure live under this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Project folder on the mounted Drive, and the sub-folder holding the regression inputs.
root_path = "/content/drive/MyDrive/University/Dissertation"
regression_path = "/regression"

# Penalty-analysis table; the 'date' column is parsed into datetimes on load.
pens_csv = f"{root_path}{regression_path}/pens.csv"
pens_df = pd.read_csv(pens_csv, parse_dates=['date'])

In [4]:
# Share of each row's tweets that contain slurs / were flagged by Perspective.
pens_df['hatebase_proportion'] = pens_df['tweets_containing_slurs'] / pens_df['total_tweets']
pens_df['perspective_proportion'] = pens_df['tweets_flagged_perspective'] / pens_df['total_perspective_tweets']

# Set to 0 where there's 0 tweets received (0 / 0 yields NaN), and zero-fill missing
# ratings and club coefficients.
# The filled columns are assigned back to the frame: `pens_df[col].fillna(..., inplace=True)`
# operates on a temporary Series, which emits a chained-assignment warning on pandas 2.x and
# leaves the frame untouched once copy-on-write is active.
zero_filled_columns = [
    'hatebase_proportion',
    'perspective_proportion',
    'player_rating',
    'player_rating_in_previous_game',
    'club_coefficient',
]
pens_df[zero_filled_columns] = pens_df[zero_filled_columns].fillna(0.0)

In [5]:
# Keep only rows within two days of the defeat for players who featured.
# Both conditions are combined into one mask over the same frame; indexing twice
# (`df[mask_a][mask_b]`) applies a full-length mask to an already filtered frame and
# triggers pandas' "Boolean Series key will be reindexed" warning.
recent_and_featured = (pens_df['days_since_defeat'] <= 2) & pens_df['featured'].eq(True)
pens_df = pens_df[recent_and_featured]

  pens_df = pens_df[pens_df['days_since_defeat'] <= 2][pens_df['featured'] == True]


In [6]:
# One row per player: sum every numeric/boolean column across that player's rows.
# numeric_only=True is explicit because pandas >= 2.0 no longer drops non-numeric columns
# silently: summing would concatenate string columns and raise on the parsed 'date' column.
agged = pens_df.groupby(['name']).sum(numeric_only=True)

# Summed per-row proportions are meaningless, so recompute them from the summed counts.
agged['hatebase_proportion'] = agged['tweets_containing_slurs'] / agged['total_tweets']
agged['perspective_proportion'] = agged['tweets_flagged_perspective'] / agged['total_perspective_tweets']

# Exclude these players from the per-player view.
# NOTE(review): the names look like the France squad members present in pens.csv — confirm.
# drop() raises KeyError if any listed name is absent, which flags an unexpected input file.
excluded_players = [
    'Antoine Griezmann', 'Benjamin Pavard', 'Clément Lenglet', 'Karim Benzema',
    'Kylian Mbappé', 'Marcus Thuram', 'Moussa Sissoko', "N'Golo Kanté",
    'Olivier Giroud', 'Paul Pogba', 'Presnel Kimpembe', 'Raphaël Varane',
]
agged = agged.drop(index=excluded_players)

In [7]:
# Bar colour per player, keyed on the `pen` code: -1 -> red, 0 -> orange, 1 -> green.
# NOTE(review): judging by the table output, -1 is a missed penalty, 0 no penalty taken and
# 1 a scored penalty — confirm against the data dictionary.
# A lookup table replaces the chained np.where calls, whose fallback copied the raw `pen`
# value (as text, e.g. '-2') into the colour column; that is not a colour Plotly accepts.
# Any code outside the table is now drawn in neutral grey instead.
pen_colours = {-1: 'red', 0: 'orange', 1: 'green'}
agged['color'] = agged['pen'].map(pen_colours).fillna('grey')

In [29]:
# Display the per-player aggregate to inspect the summed counts, recomputed proportions
# and the colour assigned to each player.
agged

Unnamed: 0_level_0,country_ranking_points,club_coefficient,days_since_defeat,featured,player_rating,matchday,featured_in_previous_game,player_rating_in_previous_game,pen_in_previous_game,red_card,penalty,penalty_outcome,pen,total_tweets,total_perspective_tweets,tweets_containing_slurs,tweets_flagged_perspective,hatebase_proportion,perspective_proportion,color
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
Bukayo Saka,1687,99.0,0,1,5.79,1,1,6.18,0,0,1,0,-1,44790,44790,238,202,0.005314,0.00451,red
Declan Rice,1687,1.5,0,1,7.5,1,1,6.36,0,0,0,0,0,2666,2666,8,12,0.003001,0.004501,orange
Harry Kane,1687,88.0,0,1,7.2,1,1,7.46,-1,0,1,1,1,7206,7206,33,38,0.00458,0.005273,green
Harry Maguire,1687,113.0,0,1,7.11,1,1,8.13,0,0,1,1,1,4604,4604,23,20,0.004996,0.004344,green
Jack Grealish,1687,0.0,0,1,6.46,1,1,6.38,0,0,0,0,0,4525,4525,24,16,0.005304,0.003536,orange
Jadon Sancho,1687,90.0,0,1,5.66,1,0,0.0,0,0,1,0,-1,15909,15909,111,99,0.006977,0.006223,red
Jordan Henderson,1687,101.0,0,1,6.21,1,1,6.28,0,0,0,0,0,1864,1864,14,9,0.007511,0.004828,orange
Jordan Pickford,1687,3.0,0,1,7.73,1,1,5.84,0,0,0,0,0,3636,3636,11,17,0.003025,0.004675,orange
Kalvin Phillips,1687,0.0,0,1,6.46,1,1,6.75,0,0,0,0,0,2505,2505,2,6,0.000798,0.002395,orange
Kieran Trippier,1687,115.0,0,1,7.04,1,1,5.96,0,0,0,0,0,514,514,1,0,0.001946,0.0,orange


In [30]:
# Players with at least one Perspective-flagged tweet, most-flagged first.
# The selection is built once and reused for the x values, bar heights and colours
# (it was previously filtered and sorted three separate times).
flagged_by_player = (
    agged.loc[agged['tweets_flagged_perspective'] > 0,
              ['pen', 'tweets_flagged_perspective', 'color']]
    .sort_values('tweets_flagged_perspective', ascending=False)
)

data = go.Bar(
    x = flagged_by_player.index,
    y = flagged_by_player['tweets_flagged_perspective'],
    marker_color = flagged_by_player['color']
)

layout = go.Layout(yaxis_title="Number of tweets flagged by Perspective")

figure = go.Figure(data = data, layout = layout)
# Export next to the other project files: the path is derived from root_path rather than
# repeating the absolute Drive location. plotly.js is referenced from the CDN, not embedded.
figure.write_html(root_path + "/images/figure4d.html", include_plotlyjs='cdn')
figure.show()

# Pens

In [31]:
# For day 0 and day 1 after the defeat, print players ranked by Perspective-flagged tweets.
# NOTE(review): pens_df was filtered to days_since_defeat <= 2, but range(2) never reaches
# day 2 — confirm that skipping it is intended.
shown_columns = ['name', 'pen', 'tweets_flagged_perspective']
for day in range(2):
  on_this_day = pens_df['days_since_defeat'] == day
  ranked = pens_df.loc[on_this_day, shown_columns].sort_values('tweets_flagged_perspective', ascending=False)
  print(ranked)

                  name  pen  tweets_flagged_perspective
9      Marcus Rashford   -1                         667
22         Bukayo Saka   -1                         202
14        Jadon Sancho   -1                          99
151      Kylian Mbappé   -1                          59
8      Raheem Sterling    0                          52
7           Harry Kane    1                          38
2            Luke Shaw    0                          25
148         Paul Pogba    1                          22
4        Harry Maguire    1                          20
0      Jordan Pickford    0                          17
5        Jack Grealish    0                          16
158      Karim Benzema    0                          16
3          Declan Rice    0                          12
6     Jordan Henderson    0                           9
12     Kalvin Phillips    0                           6
147    Clément Lenglet    0                           2
16         Mason Mount    0                     

In [32]:
# One line per player: Perspective-flagged tweets against days since the defeat.
fig = px.line(
    data_frame=pens_df,
    x="days_since_defeat",
    y="tweets_flagged_perspective",
    color="name",
)
fig.show()

# Race

In [33]:
# Regression table including the Perspective counts; 'date' is parsed into datetimes.
reg_df = pd.read_csv(root_path + regression_path + "/regression_table_with_persp.csv",
                     parse_dates=['date'])

# Share of each row's tweets that contain slurs / were flagged by Perspective.
reg_df['hatebase_proportion'] = reg_df['tweets_containing_slurs'] / reg_df['total_tweets']
reg_df['perspective_proportion'] = reg_df['tweets_flagged_perspective'] / reg_df['total_perspective_tweets']

# Set to 0 where there's 0 tweets received (0 / 0 yields NaN), and zero-fill missing
# ratings and club coefficients.
# The filled columns are assigned back to the frame: `reg_df[col].fillna(..., inplace=True)`
# operates on a temporary Series, which emits a chained-assignment warning on pandas 2.x and
# leaves the frame untouched once copy-on-write is active.
zero_filled_columns = [
    'hatebase_proportion',
    'perspective_proportion',
    'player_rating',
    'player_rating_in_previous_game',
    'club_coefficient',
]
reg_df[zero_filled_columns] = reg_df[zero_filled_columns].fillna(0.0)

In [34]:
# Keep only the day of a game and the two days after it.
within_two_days = reg_df['days_since_last_game'].isin([0, 1, 2])
reg_df = reg_df[within_two_days]

# Spot-check one player's rows after filtering.
reg_df[reg_df['name'].eq('Thibaut Courtois')]

Unnamed: 0,name,country,country_ranking_points,club,club_coefficient,handle,ethnicity,date,days_since_last_game,featured,...,red_card,penalty,penalty_outcome,pen,total_tweets,total_perspective_tweets,tweets_containing_slurs,tweets_flagged_perspective,hatebase_proportion,perspective_proportion
118,Thibaut Courtois,Belgium,1783,Spain Real Madrid,127.0,thibautcourtois,white,2021-06-21,0.0,True,...,False,False,False,0,238,238,0,0,0.0,0.0
236,Thibaut Courtois,Belgium,1783,Spain Real Madrid,127.0,thibautcourtois,white,2021-06-22,1.0,False,...,False,False,False,0,23,23,0,0,0.0,0.0
354,Thibaut Courtois,Belgium,1783,Spain Real Madrid,127.0,thibautcourtois,white,2021-06-23,2.0,False,...,False,False,False,0,116,116,0,0,0.0,0.0
826,Thibaut Courtois,Belgium,1783,Spain Real Madrid,127.0,thibautcourtois,white,2021-06-27,0.0,True,...,False,False,False,0,579,579,1,0,0.001727,0.0
944,Thibaut Courtois,Belgium,1783,Spain Real Madrid,127.0,thibautcourtois,white,2021-06-28,1.0,False,...,False,False,False,0,208,208,0,1,0.0,0.004808
1062,Thibaut Courtois,Belgium,1783,Spain Real Madrid,127.0,thibautcourtois,white,2021-06-29,2.0,False,...,False,False,False,0,61,61,0,1,0.0,0.016393
1416,Thibaut Courtois,Belgium,1783,Spain Real Madrid,127.0,thibautcourtois,white,2021-07-02,0.0,True,...,False,False,False,0,339,339,0,0,0.0,0.0
1534,Thibaut Courtois,Belgium,1783,Spain Real Madrid,127.0,thibautcourtois,white,2021-07-03,1.0,False,...,False,False,False,0,120,120,1,0,0.008333,0.0
1652,Thibaut Courtois,Belgium,1783,Spain Real Madrid,127.0,thibautcourtois,white,2021-07-04,2.0,False,...,False,False,False,0,38,38,0,1,0.0,0.026316


In [35]:
# Remove every row belonging to the national teams listed here; all other countries stay.
excluded_countries = ['Scotland', 'France', 'Germany', 'Netherlands', 'Belgium']
reg_df = reg_df[~reg_df['country'].isin(excluded_countries)]


# Disabled alternative: additionally exclude the three players listed below
# (each has pen == -1 in the penalty table).
# excluded_players = ['Bukayo Saka', 'Marcus Rashford', 'Jadon Sancho']
# reg_df = reg_df[~reg_df['name'].isin(excluded_players)]

In [36]:
# Give rows with no recorded result the placeholder 'A'.
# 'A' sorts before the result codes filtered on later ('L', 'W'), so the max() aggregation
# over a player's rows picks a real result over the placeholder whenever one exists.
# NOTE(review): what 'A' stands for is not shown here — confirm.
reg_df['result'] = reg_df['result'].fillna('A')

In [37]:
# Collapse the daily rows into one row per (player, opponent): counts are summed, and the
# lexicographically largest ethnicity / result string in each group is kept.
# Aggregations are named by string: passing the builtins `sum` / `max` to agg() is deprecated
# (FutureWarning in pandas 2.1+) because pandas will stop translating them to its own reducers.
# Per-day proportions are not aggregated here; they are recomputed from the summed counts below.
reg_df = reg_df.groupby(['name', 'opponent']).agg({
    'total_tweets': 'sum',
    'total_perspective_tweets': 'sum',
    'tweets_containing_slurs': 'sum',
    'tweets_flagged_perspective': 'sum',
    'ethnicity': 'max',
    'result': 'max',
})

reg_df['hatebase_proportion'] = reg_df['tweets_containing_slurs'] / reg_df['total_tweets']
# Perspective-flagged tweets are divided by the number of tweets Perspective scored, matching
# how perspective_proportion is defined on the un-aggregated frames (previously total_tweets).
# NOTE(review): groups with zero tweets give NaN here and are skipped by later .mean() calls.
reg_df['perspective_proportion'] = reg_df['tweets_flagged_perspective'] / reg_df['total_perspective_tweets']


In [38]:
# Rows whose result is 'L': print the overall means, then average per ethnicity.
tmp_L = reg_df[reg_df['result'] == 'L']

count_columns = ['tweets_containing_slurs', 'tweets_flagged_perspective']
proportion_columns = ['hatebase_proportion', 'perspective_proportion']

for column in count_columns:
    print(tmp_L[column].mean())
for column in proportion_columns:
    # Proportions are printed as percentages.
    print(tmp_L[column].mean() * 100)

# Mean of each metric within every ethnicity group.
ethnicity_df_L = tmp_L.groupby('ethnicity')[count_columns + proportion_columns].mean()

98.25
86.83333333333333
0.3237935653209099
0.24622241478249543


In [39]:
# Axis labels for the charts; an ethnicity code without an entry is shown unchanged.
ethnicity_labels = {'white': 'White', 'non_white': 'Other than White'}
ethnicity_df_L['ethnicity_text'] = [
    ethnicity_labels.get(code, code) for code in ethnicity_df_L.index
]

ethnicity_df_L

Unnamed: 0_level_0,tweets_containing_slurs,tweets_flagged_perspective,hatebase_proportion,perspective_proportion,ethnicity_text
ethnicity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
non_white,217.111111,199.222222,0.004197,0.003307,Other than White
white,26.933333,19.4,0.002663,0.001955,White


In [40]:
# Factor that would scale the largest proportion to 100; the chart below does not use it.
normalising_factor = 100 / ethnicity_df_L['perspective_proportion'].max()

# Horizontal bars: percentage of tweets flagged by Perspective per ethnicity group,
# for rows with result 'L'.
flagged_pct = ethnicity_df_L['perspective_proportion'] * 100
data = go.Bar(
    orientation='h',
    y=ethnicity_df_L['ethnicity_text'],
    x=flagged_pct,
    text=flagged_pct,
    texttemplate='%{text:.5f}',
    textposition='outside',
    width=[0.5, 0.5],
)

# Axis tick labels are hidden; each bar carries its own value as text instead.
layout = go.Layout(xaxis={
    'showticklabels': False,
    'title': "Proportion of tweets flagged by Perspective",
})

figure = go.Figure(data=data, layout=layout)

figure.update_traces(marker_color='green')
figure.show()

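# This shows that, for matches they lose, a larger share of the tweets sent to non-white players is flagged by Perspective than for white players: about 0.33% versus 0.20% of tweets, i.e. roughly 1.7 times as much abuse proportionally.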

In [41]:
# Horizontal bars: mean number of Perspective-flagged tweets per player in each
# ethnicity group, for rows with result 'L'.
flagged_mean = ethnicity_df_L['tweets_flagged_perspective']
data = go.Bar(
    orientation='h',
    y=ethnicity_df_L['ethnicity_text'],
    x=flagged_mean,
    text=flagged_mean,
    texttemplate='%{text:.1f}',
    textposition='outside',
    width=[0.5, 0.5],
)

# Axis tick labels are hidden; each bar carries its own value as text instead.
layout = go.Layout(xaxis={
    'showticklabels': False,
    'title': "Number of tweets flagged by Perspective (per player)",
})

figure = go.Figure(data=data, layout=layout)
figure.update_traces(marker_color='green')
figure.show()



In [42]:
# Rows whose result is 'W': print the overall means, then average per ethnicity.
tmp_W = reg_df[reg_df['result'] == 'W']

count_columns = ['tweets_containing_slurs', 'tweets_flagged_perspective']
proportion_columns = ['hatebase_proportion', 'perspective_proportion']

for column in count_columns:
    print(tmp_W[column].mean())
for column in proportion_columns:
    # Proportions are printed as percentages.
    print(tmp_W[column].mean() * 100)

# Mean of each metric within every ethnicity group.
ethnicity_df_W = tmp_W.groupby('ethnicity')[count_columns + proportion_columns].mean()

11.645833333333334
10.34375
0.28336080886638265
0.15698201324446812


In [43]:
# Axis labels for the charts; an ethnicity code without an entry is shown unchanged.
ethnicity_labels = {'white': 'White', 'non_white': 'Other than White'}
ethnicity_df_W['ethnicity_text'] = [
    ethnicity_labels.get(code, code) for code in ethnicity_df_W.index
]

ethnicity_df_W

Unnamed: 0_level_0,tweets_containing_slurs,tweets_flagged_perspective,hatebase_proportion,perspective_proportion,ethnicity_text
ethnicity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
non_white,13.055556,12.555556,0.002728,0.001318,Other than White
white,10.8,9.016667,0.002897,0.001721,White


In [44]:
# Factor that would scale the largest proportion to 100; the chart below does not use it.
normalising_factor = 100 / ethnicity_df_W['perspective_proportion'].max()

# Horizontal bars: percentage of tweets flagged by Perspective per ethnicity group,
# for rows with result 'W'.
flagged_pct = ethnicity_df_W['perspective_proportion'] * 100
data = go.Bar(
    orientation='h',
    y=ethnicity_df_W['ethnicity_text'],
    x=flagged_pct,
    text=flagged_pct,
    texttemplate='%{text:.5f}',
    textposition='outside',
    width=[0.5, 0.5],
)

# Axis tick labels are hidden; each bar carries its own value as text instead.
layout = go.Layout(xaxis={
    'showticklabels': False,
    'title': "Proportion of tweets flagged by Perspective",
})

figure = go.Figure(data=data, layout=layout)
figure.update_traces(marker_color='green')

figure.show()

In [54]:
# Horizontal bars: mean number of Perspective-flagged tweets per player in each
# ethnicity group, for rows with result 'W'.
flagged_mean = ethnicity_df_W['tweets_flagged_perspective']
data = go.Bar(
    orientation='h',
    y=ethnicity_df_W['ethnicity_text'],
    x=flagged_mean,
    text=flagged_mean,
    texttemplate='%{text:.1f}',
    textposition='outside',
    width=[0.5, 0.5],
)

# Axis tick labels are hidden; each bar carries its own value as text instead.
layout = go.Layout(xaxis={
    'showticklabels': False,
    'title': "Number of tweets flagged by Perspective (per player)",
})

figure = go.Figure(data=data, layout=layout)
figure.update_traces(marker_color='green')
figure.show()



In [47]:
# Rows whose result is 'W' or 'L': print the overall means, then average per ethnicity.
tmp_WL = reg_df[reg_df['result'].isin(['W', 'L'])]

count_columns = ['tweets_containing_slurs', 'tweets_flagged_perspective']
proportion_columns = ['hatebase_proportion', 'perspective_proportion']

for column in count_columns:
    print(tmp_WL[column].mean())
for column in proportion_columns:
    # Proportions are printed as percentages.
    print(tmp_WL[column].mean() * 100)

# Mean of each metric within every ethnicity group.
ethnicity_df_WL = tmp_WL.groupby('ethnicity')[count_columns + proportion_columns].mean()

28.966666666666665
25.641666666666666
0.2914473601572881
0.1748300935520736


In [48]:
# Display the per-ethnicity means for rows with result 'W' or 'L' (before labels are added).
ethnicity_df_WL

Unnamed: 0_level_0,tweets_containing_slurs,tweets_flagged_perspective,hatebase_proportion,perspective_proportion
ethnicity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
non_white,53.866667,49.888889,0.003021,0.001716
white,14.026667,11.093333,0.00285,0.001768


In [49]:
# Axis labels for the charts; an ethnicity code without an entry is shown unchanged.
ethnicity_labels = {'white': 'White', 'non_white': 'Other than White'}
ethnicity_df_WL['ethnicity_text'] = [
    ethnicity_labels.get(code, code) for code in ethnicity_df_WL.index
]

ethnicity_df_WL

Unnamed: 0_level_0,tweets_containing_slurs,tweets_flagged_perspective,hatebase_proportion,perspective_proportion,ethnicity_text
ethnicity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
non_white,53.866667,49.888889,0.003021,0.001716,Other than White
white,14.026667,11.093333,0.00285,0.001768,White


In [51]:
# Horizontal bars: mean number of Perspective-flagged tweets per player in each
# ethnicity group, for rows with result 'W' or 'L'.
flagged_mean = ethnicity_df_WL['tweets_flagged_perspective']
data = go.Bar(
    orientation='h',
    y=ethnicity_df_WL['ethnicity_text'],
    x=flagged_mean,
    text=flagged_mean,
    texttemplate='%{text:.1f}',
    textposition='outside',
    width=[0.5, 0.5],
)

# Axis tick labels are hidden; each bar carries its own value as text instead.
layout = go.Layout(xaxis={
    'showticklabels': False,
    'title': "Number of tweets flagged by Perspective (per player)",
})

figure = go.Figure(data=data, layout=layout)
figure.update_traces(marker_color='green')
figure.show()

