<a href="https://colab.research.google.com/github/dbckz/dissertation/blob/master/notebooks/zinb_regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Regression

In [1]:
import pandas as pd
import numpy as np
import os
import statsmodels.api as sm
from google.colab import drive
from statsmodels.genmod import families
import statsmodels.discrete.count_model as reg_models

  import pandas.util.testing as tm


In [2]:
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
root_path = "/content/drive/MyDrive/University/Dissertation"
regression_path = "/regression"
regression_file = "/regression_table_with_persp.csv"

reg_df = pd.read_csv(root_path + regression_path + regression_file,
                     parse_dates=['date'])

In [4]:
reg_df['hatebase_proportion'] = reg_df['tweets_containing_slurs'] / reg_df['total_tweets']
reg_df['perspective_proportion'] = reg_df['tweets_flagged_perspective'] / reg_df['total_perspective_tweets']

# Set to 0 where there's 0 tweets received
reg_df['hatebase_proportion'].fillna(0.0, inplace=True)
reg_df['perspective_proportion'].fillna(0.0, inplace=True)
reg_df['player_rating'].fillna(0.0, inplace=True)
reg_df['player_rating_in_previous_game'].fillna(0.0, inplace=True)
reg_df['club_coefficient'].fillna(0.0, inplace=True)

In [5]:
reg_df.head()

Unnamed: 0,name,country,country_ranking_points,club,club_coefficient,handle,ethnicity,date,days_since_last_game,featured,opponent,player_rating,matchday,result,featured_in_previous_game,player_rating_in_previous_game,result_in_previous_game,pen_in_previous_game,round,red_card,penalty,penalty_outcome,pen,total_tweets,total_perspective_tweets,tweets_containing_slurs,tweets_flagged_perspective,hatebase_proportion,perspective_proportion
0,Thibaut Courtois,Belgium,1783,Spain Real Madrid,127.0,thibautcourtois,white,2021-06-20,,False,,0.0,False,,True,7.58,W,0,,False,False,False,0,30,30,0,0,0.0,0.0
1,Toby Alderweireld,Belgium,1783,England Tottenham Hotspur,88.0,AlderweireldTob,white,2021-06-20,,False,,0.0,False,,True,6.78,W,0,,False,False,False,0,3,3,0,0,0.0,0.0
2,Thomas Vermaelen,Belgium,1783,Japan Vissel Kobe,0.0,thomasvermaelen,white,2021-06-20,,False,,0.0,False,,True,6.0,W,0,,False,False,False,0,10,10,0,0,0.0,0.0
3,Jan Vertonghen,Belgium,1783,Portugal Benfica,58.0,JanVertonghen,white,2021-06-20,,False,,0.0,False,,True,6.97,W,0,,False,False,False,0,2,2,0,0,0.0,0.0
4,Axel Witsel,Belgium,1783,Germany Borussia Dortmund,90.0,axelwitsel28,non_white,2021-06-20,,False,,0.0,False,,True,6.26,W,0,,False,False,False,0,27,27,0,0,0.0,0.0


In [6]:
reg_df['day_of_week'] = reg_df['date'].dt.day_name()
reg_df["featured"] = reg_df["featured"].astype(int)
reg_df["featured_in_previous_game"] = reg_df["featured_in_previous_game"].astype(int)
reg_df["matchday"] = reg_df["matchday"].astype(int)
reg_df["red_card"] = reg_df["red_card"].astype(int)
reg_df["penalty"] = reg_df["penalty"].astype(int)
reg_df["penalty_outcome"] = reg_df["penalty_outcome"].astype(int)

reg_df = pd.get_dummies(reg_df, columns=['ethnicity'])
reg_df = pd.get_dummies(reg_df, columns=['result'])
reg_df = pd.get_dummies(reg_df, columns=['result_in_previous_game'])
reg_df = pd.get_dummies(reg_df, columns=['day_of_week'])
reg_df = pd.get_dummies(reg_df, columns=['country'])
reg_df.drop(columns=['country_ranking_points', 'club', 'name', 'date', 'opponent', 'round', 'ethnicity_white', 'result_D', 'result_W'], inplace=True)

In [7]:
print('Hatebase absolute: Mean='+str(np.mean(reg_df['tweets_containing_slurs'])) + ' Variance='+str(np.var(reg_df['tweets_containing_slurs'])))
print('Hatebase proportion: Mean='+str(np.mean(reg_df['hatebase_proportion'])) + ' Variance='+str(np.var(reg_df['hatebase_proportion'])))
print('Perspective absolute: Mean='+str(np.mean(reg_df['tweets_flagged_perspective'])) + ' Variance='+str(np.var(reg_df['tweets_flagged_perspective'])))
print('Perspective proportion: Mean='+str(np.mean(reg_df['perspective_proportion'])) + ' Variance='+str(np.var(reg_df['perspective_proportion'])))


Hatebase absolute: Mean=1.610483364720653 Variance=160.09090077075075
Hatebase proportion: Mean=0.00283838413158561 Variance=0.0005021105139132655
Perspective absolute: Mean=1.3433772755806654 Variance=222.44392506608068
Perspective proportion: Mean=0.0013648968183750713 Variance=4.561495769822913e-05


In [8]:
num_obs = len(reg_df)

h_zeroes_abs = len(reg_df[reg_df['tweets_containing_slurs'] == 0])
h_zeroes_prop = len(reg_df[reg_df['hatebase_proportion'] == 0])
p_zeroes_abs = len(reg_df[reg_df['tweets_flagged_perspective'] == 0])
p_zeroes_prop = len(reg_df[reg_df['perspective_proportion'] == 0])

print(f'Total observations: {num_obs}')
print(f'Hatebase absolute: {h_zeroes_abs} zeroes, {100*(h_zeroes_abs / num_obs)}%')
print(f'Hatebase proportion: {h_zeroes_prop} zeroes, {100*(h_zeroes_prop / num_obs)}%')
print(f'Perspective absolute: {p_zeroes_abs} zeroes, {100*(p_zeroes_abs / num_obs)}%')
print(f'Perspective proportion: {p_zeroes_prop} zeroes, {100*(p_zeroes_prop / num_obs)}%')

Total observations: 3186
Hatebase absolute: 2563 zeroes, 80.44569993722536%
Hatebase proportion: 2563 zeroes, 80.44569993722536%
Perspective absolute: 2676 zeroes, 83.9924670433145%
Perspective proportion: 2676 zeroes, 83.9924670433145%


In [9]:
X = reg_df[['club_coefficient', 'ethnicity_non_white', 'result_L', 'pen', 'day_of_week_Monday', 'day_of_week_Tuesday', 'day_of_week_Wednesday', 'day_of_week_Thursday', 'day_of_week_Friday', 'day_of_week_Saturday', 'day_of_week_Sunday']]
# X = reg_df[['club_coefficient', 'ethnicity_non_white', 'result_L', 'pen']]
X = sm.add_constant(X)

In [10]:
# ZINB for Hatebase absolute number of tweets (since mostly zeroes, and overdispersed)
h_abs_model=reg_models.ZeroInflatedNegativeBinomialP(reg_df['tweets_containing_slurs'], X)
h_abs_res = h_abs_model.fit_regularized()
print(h_abs_res.summary())

  return np.sum(-np.exp(XB) +  endog*XB - gammaln(endog+1))


Optimization terminated successfully.    (Exit mode 0)
            Current function value: 0.9602087874472555
            Iterations: 101
            Function evaluations: 107
            Gradient evaluations: 101


  a1 * np.log(a1) + y * np.log(mu) -


                     ZeroInflatedNegativeBinomialP Regression Results                    
Dep. Variable:           tweets_containing_slurs   No. Observations:                 3186
Model:             ZeroInflatedNegativeBinomialP   Df Residuals:                     3174
Method:                                      MLE   Df Model:                           11
Date:                           Fri, 27 Aug 2021   Pseudo R-squ.:                 0.03125
Time:                                   21:47:31   Log-Likelihood:                -3059.2
converged:                                  True   LL-Null:                       -3157.9
Covariance Type:                       nonrobust   LLR p-value:                 2.595e-36
                            coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------------
inflate_const           -15.8245    644.904     -0.025      0.980   -1279.814    1248.165
const     



In [None]:
# ZIGP for Hatebase proportion of tweets (since mostly zeroes, underdispersed)
h_prop_model=reg_models.ZeroInflatedGeneralizedPoisson(reg_df['hatebase_proportion'], X)
h_prop_res = h_prop_model.fit_regularized()
print(h_prop_res.summary())

  np.log(a1) - gammaln(endog + 1) - a2 / a1)
  np.log(a1) - gammaln(endog + 1) - a2 / a1)
  np.log(a1) - gammaln(endog + 1) - a2 / a1)
  np.log(a1) - gammaln(endog + 1) - a2 / a1)
  return np.exp(linpred)
  a2 = mu + (a1 - 1) * endog
  a2 = mu + (a1 - 1) * endog
  a1 = 1 + alpha * mu_p
  np.log(a1) - gammaln(endog + 1) - a2 / a1)


Iteration limit exceeded    (Exit mode 9)
            Current function value: nan
            Iterations: 1001
            Function evaluations: 11001
            Gradient evaluations: 1001


  np.log(a1) - gammaln(endog + 1) - a2 / a1)
  np.log(a1) - gammaln(endog + 1) - a2 / a1)


                    ZeroInflatedGeneralizedPoisson Regression Results                     
Dep. Variable:                hatebase_proportion   No. Observations:                 3186
Model:             ZeroInflatedGeneralizedPoisson   Df Residuals:                     3174
Method:                                       MLE   Df Model:                           11
Date:                            Sun, 22 Aug 2021   Pseudo R-squ.:                     nan
Time:                                    09:04:05   Log-Likelihood:                    nan
converged:                                  False   LL-Null:                           nan
Covariance Type:                        nonrobust   LLR p-value:                       nan
                            coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------------
inflate_const             1.0000        nan        nan        nan         nan         nan
co

  np.log(a1) - gammaln(endog + 1) - a2 / a1)


In [11]:
# ZINB for Perspective absolute number of tweets (since mostly zeroes, and overdispersed)
p_abs_model=reg_models.ZeroInflatedNegativeBinomialP(reg_df['tweets_flagged_perspective'], X)
p_abs_res = p_abs_model.fit_regularized()
print(p_abs_res.summary())

  return np.sum(-np.exp(XB) +  endog*XB - gammaln(endog+1))


Optimization terminated successfully.    (Exit mode 0)
            Current function value: 0.7989414044815596
            Iterations: 104
            Function evaluations: 109
            Gradient evaluations: 104


  a1 * np.log(a1) + y * np.log(mu) -


                     ZeroInflatedNegativeBinomialP Regression Results                    
Dep. Variable:        tweets_flagged_perspective   No. Observations:                 3186
Model:             ZeroInflatedNegativeBinomialP   Df Residuals:                     3174
Method:                                      MLE   Df Model:                           11
Date:                           Fri, 27 Aug 2021   Pseudo R-squ.:                 0.04976
Time:                                   21:47:46   Log-Likelihood:                -2545.4
converged:                                  True   LL-Null:                       -2678.7
Covariance Type:                       nonrobust   LLR p-value:                 9.206e-51
                            coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------------
inflate_const           -14.5381    365.494     -0.040      0.968    -730.894     701.818
const     

In [None]:
# ZIGP for Perspective proportion of tweets (since mostly zeroes, underdispersed)
p_prop_model=reg_models.ZeroInflatedGeneralizedPoisson(reg_df['perspective_proportion'], X)
p_prop_res = p_prop_model.fit_regularized()
print(p_prop_res.summary())

  np.log(a1) - gammaln(endog + 1) - a2 / a1)
  return np.exp(linpred)
  a2 = mu + (a1 - 1) * endog
  a2 = mu + (a1 - 1) * endog
  a1 = 1 + alpha * mu_p
  np.log(a1) - gammaln(endog + 1) - a2 / a1)


Iteration limit exceeded    (Exit mode 9)
            Current function value: nan
            Iterations: 1001
            Function evaluations: 11001
            Gradient evaluations: 1001


  return 1/(1+np.exp(-X))
  np.log(a1) - gammaln(endog + 1) - a2 / a1)
  np.log(a1) - gammaln(endog + 1) - a2 / a1)
  numpy.max(numpy.abs(fsim[0] - fsim[1:])) <= fatol):
