In [None]:
#The basics
import pandas as pd
import numpy as np
import json
import seaborn as sns

#Plotting
import matplotlib.pyplot as plt


#Statistical fitting of models
import statsmodels.api as sm
import statsmodels.formula.api as smf 

from matplotlib.patches import Arc
from matplotlib import cm
from matplotlib.colors import Normalize


from matplotlib.patches import Ellipse
from pandas.io.json import json_normalize

## Import Selenium for scrape
from selenium import webdriver
import time

# Import ipywidgets
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets

In [None]:
footy_df = pd.read_csv('international_matches_xg.csv')

In [None]:
# Drop the ones without any xG values
footy_df = footy_df[footy_df.team_a_xg != 0]

In [None]:
footy_df[['home_team_goal_count','away_team_goal_count','team_a_xg','team_b_xg']].corr()

In [None]:
import scipy as sp
import scipy.stats  # `import scipy` alone does not guarantee the stats submodule is loaded

# Least-squares fit of home xG (y) against home goals scored (x).
home_goals = footy_df['home_team_goal_count']
home_xg = footy_df['team_a_xg']
linreg = sp.stats.linregress(home_goals, home_xg)

plt.scatter(home_goals, home_xg)
# Draw the fitted line from the regression computed above rather than
# running a second, separate polyfit on the same data.
goal_values = np.unique(home_goals)
plt.plot(goal_values, linreg.intercept + linreg.slope * goal_values)
plt.xlabel('home_team_goal_count')
plt.ylabel('team_a_xg')
# linregress reports Pearson's r in `rvalue`; square it so the number shown
# really is the R-squared the label promises.
plt.text(6, 4, 'R-squared = %0.3f' % (linreg.rvalue ** 2))

In [None]:
# r2_score is not imported by any earlier cell, so a fresh kernel would hit
# a NameError here; import it where it is used.
from sklearn.metrics import r2_score

# Coefficient of determination with actual goals as the true values and
# xG as the "prediction".
r2_score(footy_df['home_team_goal_count'], footy_df['team_a_xg'])

In [None]:
# Heatmap of the goal and xG columns, drawn from the per-match values.
# NOTE(review): this plots the raw rows, not the .corr() matrix computed in
# an earlier cell - confirm which one was intended.
heatmap_cols = ['home_team_goal_count','away_team_goal_count','team_a_xg','team_b_xg']
sns.heatmap(data=footy_df[heatmap_cols])

In [None]:
# Average every numeric column per home team. numeric_only=True skips the
# text columns (e.g. Competition, away_team_name), which pandas >= 2.0
# refuses to average and older versions dropped silently.
footy_df.groupby(by='home_team_name').mean(numeric_only=True)

In [None]:
# The 24 team names used to restrict the analysis to Euro 2020 sides.
euro_list = [
    'Turkey', 'Switzerland', 'Italy', 'Wales',
    'Denmark', 'Finland', 'Belgium', 'Russia',
    'Netherlands', 'Ukraine', 'Austria', 'FYR Macedonia',
    'England', 'Croatia', 'Scotland', 'Czech Republic',
    'Spain', 'Sweden', 'Poland', 'Slovakia',
    'Hungary', 'Portugal', 'France', 'Germany',
]
# One-column frame (default column label 0) so the list can be merged on.
euro_teams_df = pd.DataFrame(euro_list)

In [None]:
# Give the single unnamed column (default label 0) a meaningful name.
column_labels = {0: 'euro_team'}
euro_teams_df = euro_teams_df.rename(columns=column_labels)

In [None]:
# Build one long table with a row per (match, team): the home view and the
# away view of every match are given the same column layout and stacked.
home_cols = ['Competition','home_team_name','home_team_goal_count','team_a_xg','team_b_xg']
away_cols = ['Competition','away_team_name','away_team_goal_count','team_b_xg','team_a_xg']

home_df = footy_df[home_cols]

# Relabel the away columns position-for-position with the home labels. Note
# the xG columns swap names, so after this team_a_xg is always the row's own
# xG and team_b_xg the opponent's.
away_df = footy_df[away_cols].rename(columns=dict(zip(away_cols, home_cols)))

combined_df = pd.concat([home_df, away_df]).rename(
    columns={'home_team_name':'team_name','home_team_goal_count':'team_goal_count'}
)

In [None]:
# Inner-join against the Euro team list to discard every other nation.
# validate="many_to_one" makes pandas raise if a team name is duplicated
# in euro_teams_df.
Euros_teams_df = pd.merge(
    combined_df,
    euro_teams_df,
    how='inner',
    left_on='team_name',
    right_on='euro_team',
    validate="many_to_one",
)

In [None]:
# Per-team averages of the numeric columns. numeric_only=True is required
# because the frame also carries text columns (Competition, euro_team) that
# pandas >= 2.0 will not average.
Euros_teams_df.groupby('team_name').mean(numeric_only=True)

In [None]:
# Spot-check: show every row belonging to Spain.
is_spain = Euros_teams_df['team_name'] == 'Spain'
Euros_teams_df.loc[is_spain]

In [None]:
# Per-team 2x2 correlation matrices of goals and xG, stacked under a
# (team_name, variable) row index.
per_team_corr = Euros_teams_df.groupby('team_name')[['team_goal_count','team_a_xg']].corr()
# Every second row starting at 0, last column: one goals-vs-xG coefficient
# per team.
xG_actual_corr_df = pd.DataFrame(per_team_corr.iloc[0::2, -1])

In [None]:
test = pd.DataFrame(Euros_teams_df.groupby('team_name')[['team_goal_count','team_a_xg']].corr().unstack().iloc[:,1])

In [None]:
pd.DataFrame(test['team_goal_count']['team_a_xg'])['team_a_xg']

In [None]:
plt.plot(test['team_goal_count']['team_a_xg'])

In [None]:
test.columns=test.columns.droplevel(0)

In [None]:
test.columns = test.columns.map(lambda x: x[1]) 

In [None]:
test = test.reset_index()
test = test.rename(columns={'e':'Correlation with xG'})

In [None]:
# c='' is not a valid colour specification and makes current matplotlib
# raise a ValueError. Hollow markers are requested explicitly instead.
# NOTE(review): hollow circles are assumed to be the intent - confirm.
plt.scatter(test['team_name'],test['Correlation with xG'],marker='o',facecolors='none',edgecolors='C0')

In [None]:
from matplotlib.pyplot import figure

# Wide, short canvas (25 x 6 inches at 80 dpi) for the per-team bars.
figure(figsize=(25, 6), dpi=80)

team_labels = test['team_name']
corr_values = test['Correlation with xG']
plt.bar(team_labels, corr_values)

In [None]:
# Per-team medians of the numeric columns. numeric_only=True keeps the text
# columns (Competition, euro_team) out of the aggregation, which pandas >= 2.0
# would otherwise reject.
Euros_teams_df_agg = Euros_teams_df.groupby('team_name').median(numeric_only=True)

In [None]:
# Gap between the aggregated xG and the aggregated goals for each team
# (positive means xG exceeded goals).
agg_xg = Euros_teams_df_agg['team_a_xg']
agg_goals = Euros_teams_df_agg['team_goal_count']
Euros_teams_df_agg['xG_dif'] = agg_xg.sub(agg_goals)

In [None]:
# Remove a stray 'index' column if one exists. It only appears when
# reset_index() has been run more than once; on a clean top-to-bottom run
# there is no such column, so a strict drop would raise KeyError.
test = test.drop(columns='index', errors='ignore')

In [None]:
new_df = test.join(Euros_teams_df_agg,on='team_name',how='inner')

In [None]:
new_df_final = new_df[['team_name','Correlation with xG']]

In [None]:
# Sort once, then take both the bars and the tick labels from that same
# sorted frame. Labelling from the unsorted frame would attach team names
# to the wrong bars.
corr_sorted = new_df_final.sort_values(by='Correlation with xG')
corr_sorted.plot(kind='barh',color = 'forestgreen',figsize=(8, 10), zorder=2, width=0.85)
# One tick per plotted bar, so the length comes from the plotted frame.
y_pos = np.arange(len(corr_sorted))
plt.yticks(y_pos, corr_sorted['team_name'])
plt.title('Euro 2020 teams xG vs actual goals scored correlation')
#plt.savefig('plots/Euro_predictor_corr.png',bbox_inches='tight')

In [None]:
# Inspect the correlation table joined with the per-team aggregates.
new_df

In [None]:
# Inspect the per-team correlation table.
test

In [None]:
# Inspect the merged per-team match rows.
Euros_teams_df

In [None]:
from sklearn.metrics import mean_squared_error

In [None]:
Euros_teams_df.groupby(Euros_teams_df.team_name).apply(lambda x: mean_squared_error(x.team_goal_count,x.team_a_xg)**0.5)

In [None]:
test_2_df = pd.DataFrame(Euros_teams_df.groupby(Euros_teams_df.team_name).apply(lambda x: mean_squared_error(x.team_goal_count,x.team_a_xg)))

In [None]:
test_2_df.columns = test_2_df.columns.map(lambda x: x[1]) 

In [None]:
test_2_df.columns

In [None]:
Euros_teams_df.columns

In [None]:
test_2_df.index = list(test_2_df.index)


In [None]:
test_2_df.columns

In [None]:
test_2_df=test_2_df.rename(columns={0:'RMSE'})

In [None]:
import os

# Horizontal bars of the per-team RMSE values, sorted by RMSE.
test_2_df.sort_values(by='RMSE').plot(kind='barh',color = 'forestgreen',figsize=(8, 10), zorder=2, width=0.85)
y_pos = np.arange(len(test_2_df))
plt.title('Euro 2020 teams xG Root Mean Squared Error vs Actual Goals')
# savefig does not create missing directories, so make sure the target
# folder exists before writing the image.
os.makedirs('plots', exist_ok=True)
plt.savefig('plots/Euro_2020_xG_RMSE.png',bbox_inches='tight')


In [None]:
Euros_teams_df['goals_over_xg'] = Euros_teams_df['team_goal_count'] - Euros_teams_df['team_a_xg'] 

In [None]:
goal_over_xg_df = Euros_teams_df[['team_name','goals_over_xg']].groupby('team_name').mean()

In [None]:
goal_over_xg_df

In [None]:
goal_over_xg_df.sort_values(by='Goals Over xG').plot(kind='barh',color = 'navy',figsize=(8, 10), zorder=2, width=0.85)
y_pos = np.arange(len(goal_over_xg_df))
#plt.yticks(y_pos, goal_over_xg_df['team_name'])
plt.title('Euro 2020 teams Goals over xG')
plt.ylabel('')

plt.savefig('plots/Euro_2020_goals_over_xg.png',bbox_inches='tight')

In [None]:
goal_over_xg_df

In [None]:
goal_over_xg_df.columns = goal_over_xg_df.columns.map(lambda x: x[1]) 

In [None]:
goal_over_xg_df=goal_over_xg_df.rename(columns={'o':'Goals Over xG'})

In [None]:
goal_over_xg_df.columns=goal_over_xg_df.columns.droplevel(0)