In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVR
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

In [3]:
# Load the three CSV exports from ../data: team-level stats, match-level rows,
# and goal timings. The files are read in the order listed.
team_stats, games, goal_timings = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/FS_champ_team_2021.csv',
        '../data/FS_champ_games_2021.csv',
        '../data/FS_Champ2021_Goal_timings.csv',
    )
)

In [4]:
# Binary target: 1 when home + away goals exceed 2.5 ("over"), 0 otherwise ("under").
games['over/under'] = (
    games['home_team_goal_count']
    .add(games['away_team_goal_count'])
    .gt(2.5)
    .astype(int)
)

In [5]:
# Keep only the first 552 rows, which drops the playoff matches.
# .copy() makes `games` an independent frame rather than a slice of the loaded data,
# so the column assignments in later cells do not trigger SettingWithCopyWarning.
games = games.iloc[:552].copy()

In [6]:
# Show the names of the team-level columns available for feature building.
team_stats.columns

Index(['team_name', 'common_name', 'season', 'country', 'matches_played',
       'matches_played_home', 'matches_played_away', 'suspended_matches',
       'wins', 'wins_home',
       ...
       'goals_conceded_min_61_to_70', 'goals_conceded_min_71_to_80',
       'goals_conceded_min_81_to_90', 'draw_percentage_overall',
       'draw_percentage_home', 'draw_percentage_away',
       'loss_percentage_ovearll', 'loss_percentage_home',
       'loss_percentage_away', 'over145_corners_percentage'],
      dtype='object', length=293)

In [7]:
# Placeholder columns that fill_stats populates with per-team values.
# Start from NaN rather than "" so the columns stay numeric once filled, and a
# team that never gets filled shows up as missing instead of as an empty string.
games['home_xg'] = np.nan
games['away_xg'] = np.nan
games['home_xg_against'] = np.nan
games['away_xg_against'] = np.nan
games['home_total_corners'] = np.nan
games['away_total_corners'] = np.nan

Let's write a function to fill each team's xG, xG-against, and corner stats into our `games` DataFrame.

In [8]:
# Two-column frames pairing each team's common name with one of its stats.
xg_stats = team_stats.loc[:, ['common_name', 'xg_for_avg_overall']]
corner_stats = team_stats.loc[:, ['common_name', 'corners_per_match']]
xga_stats = team_stats.loc[:, ['common_name', 'xg_against_avg_overall']]

# Preview one of them to check the layout.
xga_stats.head()

Unnamed: 0,common_name,xg_against_avg_overall
0,Stoke City,1.21
1,Middlesbrough,1.13
2,AFC Bournemouth,1.29
3,Swansea City,1.28
4,Watford,1.29


In [9]:
# Team name -> stat dictionaries, so fill_stats can look a value up by team.
# They are built straight from team_stats, so this cell can be re-run safely:
# it does not depend on xg_stats / corner_stats / xga_stats still being DataFrames.
xg_stats = dict(zip(team_stats['common_name'], team_stats['xg_for_avg_overall']))
corner_stats = dict(zip(team_stats['common_name'], team_stats['corners_per_match']))
xga_stats = dict(zip(team_stats['common_name'], team_stats['xg_against_avg_overall']))

In [10]:
def fill_stats(team):
    """Copy one team's lookup values into the module-level `games` frame.

    Rows where `team` is the home side receive the team's entries from
    `xg_stats`, `xga_stats` and `corner_stats` in the home_* columns; rows
    where it is the away side receive the same values in the away_* columns.
    Every other row keeps its current contents.

    Parameters
    ----------
    team : key looked up in `xg_stats`, `xga_stats` and `corner_stats`, and
        compared against `home_team_name` / `away_team_name`.

    Returns
    -------
    None. The six stat columns of `games` are reassigned.

    Raises
    ------
    KeyError
        If `team` is missing from one of the lookup dictionaries.
    """
    stat_sources = (
        ('xg', xg_stats),
        ('xg_against', xga_stats),
        ('total_corners', corner_stats),
    )
    for suffix, lookup in stat_sources:
        for side in ('home', 'away'):
            target = f'{side}_{suffix}'
            is_team = games[f'{side}_team_name'] == team
            # Overwrite only this team's rows; other rows keep their value.
            games[target] = np.where(is_team, lookup[team], games[target])
    
#Function to put team stats into games dataframe

In [11]:
# Fill the stat columns in `games` for each listed club, one club at a time.
for club in (
    'Norwich City',
    'Blackburn Rovers',
    'Brentford',
    'Queens Park Rangers',
    'AFC Bournemouth',
    'Middlesbrough',
    'Barnsley',
    'Millwall',
    'Watford',
    'Reading',
    'Cardiff City',
    'Nottingham Forest',
    'Rotherham United',
    'Wycombe Wanderers',
    'Luton Town',
    'Stoke City',
    'Swansea City',
    'Derby County',
    'Huddersfield Town',
    'Preston North End',
    'Coventry City',
    'Sheffield Wednesday',
    'Birmingham City',
    'Bristol City',
):
    fill_stats(club)

# Some more EDA

In [12]:
# Correlation of each numeric column with the over/under target, strongest first (top 20).
# Numeric/bool columns are selected explicitly because on pandas >= 2.0 DataFrame.corr()
# no longer drops text columns (e.g. the team names) silently and raises instead.
(
    games.select_dtypes(include=['number', 'bool'])
    .corr()['over/under']
    .sort_values(ascending=False)
    .head(20)
)

over/under                             1.000000
total_goal_count                       0.813089
total_goals_at_half_time               0.564385
home_team_goal_count                   0.563420
away_team_goal_count                   0.538042
home_team_goal_count_half_time         0.402322
home_team_shots_on_target              0.361050
away_team_goal_count_half_time         0.358810
away_team_shots_on_target              0.284876
team_a_xg                              0.221940
team_b_xg                              0.181444
home_team_shots                        0.156785
away_team_shots                        0.150139
average_goals_per_match_pre_match      0.091487
over_15_percentage_pre_match           0.084846
over_05_HT_FHG_percentage_pre_match    0.084284
over_05_2HG_percentage_pre_match       0.083594
over_25_percentage_pre_match           0.083280
Game Week                              0.078257
away_team_possession                   0.077439
Name: over/under, dtype: float64

# Now let's do some modeling

In [13]:
# Model inputs: each side's xG for and xG against; target is the over/under flag.
features = ['home_xg', 'away_xg', 'home_xg_against', 'away_xg_against']
# Cast to float so the estimators always receive a numeric matrix; this raises
# here if a non-numeric placeholder is still present in any of these columns.
X = games[features].astype(float)
y = games['over/under']

In [14]:
# Hold out 5% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.05,
    random_state=42,
)

In [15]:
# Three baseline classifiers with default hyperparameters.
logreg = LogisticRegression()
knn = KNeighborsClassifier()
# Seed the tree so its fitted structure, and therefore its score, is repeatable
# across runs.
dt = DecisionTreeClassifier(random_state=42)

In [16]:
# Fit the logistic-regression baseline on the training split.
logreg.fit(X_train, y_train)

LogisticRegression()

In [17]:
# Predicted over/under labels for the held-out rows (`preds` is overwritten by each later model).
preds = logreg.predict(X_test)

In [18]:
# Logistic-regression accuracy on the held-out rows. The test split is only 5% of
# the data, so treat the figure as a rough estimate.
accuracy_score(y_test, preds)

0.6428571428571429

In [19]:
# Fit the k-nearest-neighbours baseline on the same training split.
knn.fit(X_train, y_train)

KNeighborsClassifier()

In [20]:
# Replace `preds` with the KNN predictions for the held-out rows.
preds = knn.predict(X_test)

In [21]:
# KNN accuracy on the held-out rows (same small test split as above).
accuracy_score(y_test, preds)

0.5714285714285714

In [22]:
# Fit the decision-tree baseline on the same training split.
dt.fit(X_train, y_train)

DecisionTreeClassifier()

In [23]:
# Replace `preds` with the decision-tree predictions for the held-out rows.
preds = dt.predict(X_test)

In [24]:
# Decision-tree accuracy on the held-out rows (same small test split as above).
accuracy_score(y_test, preds)

0.5714285714285714

In [25]:
#decision trees