# Omar El Yousfi
# Graduation Project
# Master Data Science and Intelligent Systems
##  `Poly-Disciplinary Faculty of Nador`

In this notebook we present the backend code that cannot be included in the application because it takes too long to run, such as training a model and merging hundreds of dataframes.

### Importing necessary packages

In [16]:
import ast
import json
import os
import pickle
import sys
from collections import defaultdict
from tkinter import filedialog

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib.patches import Arc
from sklearn import metrics
from sklearn.model_selection import KFold, cross_val_score, cross_validate

### Task 1: Creating shots dataset of a team

The first step is to merge the event data into a shots dataset for a football team; we will use `Real Betis` as an example.

- Create an empty dataframe with the desired columns

In [3]:
# Empty frame that fixes the column layout of the shots dataset.
shots_df = pd.DataFrame(
    columns=[
        "Team",
        "Player",
        "Body part",
        "Shot type",
        "Location",
        "Shot_Outcome",
        "statsbomb_xg",
        "Season",
    ]
)

- The merge:

In [4]:
# Directories that hold the files we need: one JSON file per match in `events`,
# and one sub-directory per competition id in `matches`.
events = 'C:\\Users\\OMAR\\Desktop\\PFE\\Interface\\events\\'
matches = 'C:\\Users\\OMAR\\Desktop\\PFE\\Interface\\matches\\'

# The path variable stores the names of all the files in the events directory
path = os.listdir(events)

# The matches directory has one sub-directory named after each competition id;
# the ids available are:
competitions_id = [2,11,16,37,43,49,72]

# Team whose shots are extracted
team_name = "Real Betis"

# Event column -> column name used in the shots dataset (order = output order)
shot_columns = {
    "team_name": "Team",
    "player_name": "Player",
    "shot_body_part_name": "Body part",
    "shot_type_name": "Shot type",
    "location": "Location",
    "shot_outcome_name": "Shot_Outcome",
    "shot_statsbomb_xg": "statsbomb_xg",
}

# Read every match listing ONCE and remember the season of each match id.
# (Re-reading all listings for every event file made this cell extremely slow.)
season_by_match = {}
for competi in competitions_id:
    competition_dir = os.path.join(matches, str(competi))
    for file in os.listdir(competition_dir):
        if not file.endswith(".json"):
            continue
        with open(os.path.join(competition_dir, file), encoding="utf-8") as f:
            for match in json.load(f):
                season_by_match[str(match["match_id"])] = match["season"]["season_name"]

# Loop through each file in the events directory and collect the team's shots
shot_frames = []
for file_name in path:
    if not file_name.endswith(".json"):
        continue
    with open(os.path.join(events, file_name), encoding="utf-8") as data_file:
        data = json.load(data_file)

    # The file name without its ".json" extension is the match id
    mid = os.path.splitext(file_name)[0]

    df_t = pd.json_normalize(data, sep = "_")
    if df_t.empty:
        continue

    # Keep the events that are shots and of the selected team, restricted to the
    # columns we need. reindex() tolerates columns that are absent from a file.
    is_team_shot = (df_t["type_name"] == "Shot") & (df_t["team_name"] == team_name)
    shots = (
        df_t.loc[is_team_shot]
        .set_index("id")
        .reindex(columns=list(shot_columns))
        .rename(columns=shot_columns)
        .rename_axis(None)
        # Season is None when the match id is not found in any match listing
        .assign(Season=season_by_match.get(mid))
    )
    if not shots.empty:
        shot_frames.append(shots)

# Build the dataframe in one go (indexed by event id) instead of growing it row by row
if shot_frames:
    shots_df = pd.concat(shot_frames)
else:
    shots_df = pd.DataFrame(columns=list(shot_columns.values()) + ["Season"])

# Save the dataframe to a Csv file to be used in the application
shots_df.to_csv(f"{team_name}.csv")

##### The resulting dataframe has the locations of all Real Betis shots, each labelled with its season.

### Task 2: Number of matches of each team

In order to calculate the number of matches of each team we have to loop through all the files in matches repository.

In [35]:
# Match counter keyed by team name; a team seen for the first time starts at 0
teams = defaultdict(lambda:0)

# Walk every match listing of every competition
for competi in competitions_id:
    competition_dir = matches + str(competi)
    for file in os.listdir(competition_dir):
        with open(competition_dir + "/" + file) as f:
            temp = json.load(f)
        # Each listed match counts once for the home side and once for the away side
        for match in temp:
            teams[match['home_team']['home_team_name']] += 1
            teams[match['away_team']['away_team_name']] += 1

### Task 3: Goalscorers

To find the goalscorers of each team, we loop through the events directory and the matches directory, so that goals can be grouped by season.

In [41]:
# Columns of the goalscorers dataset
goal_columns = ['Team', 'Player name', 'Season']

# Team whose goals are extracted; also used to name the output file so the
# file always matches the team that was actually filtered.
team_name = "Real Betis"

p = 'C:\\Users\\OMAR\\Desktop\\PFE\\Interface\\events\\'
p_m = 'C:\\Users\\OMAR\\Desktop\\PFE\\Interface\\matches\\'

path = os.listdir(p)
competitions_id = [2,11,16,37,43,49,72]

# Read every match listing ONCE and remember the season of each match id.
# (Re-reading all listings for every event file made this cell extremely slow.)
season_by_match = {}
for competi in competitions_id:
    competition_dir = os.path.join(p_m, str(competi))
    for file in os.listdir(competition_dir):
        if not file.endswith(".json"):
            continue
        with open(os.path.join(competition_dir, file), encoding="utf-8") as f:
            for match in json.load(f):
                season_by_match[str(match["match_id"])] = match["season"]["season_name"]

goal_frames = []
for file_name in path:
    if not file_name.endswith(".json"):
        continue
    with open(p + file_name, encoding="utf-8") as data_file:
        data = json.load(data_file)

    # The file name without its ".json" extension is the match id
    mid = file_name[:-5]

    df_t = pd.json_normalize(data, sep = "_")
    # A file without any shot event has no "shot_outcome_name" column at all
    if "shot_outcome_name" not in df_t.columns:
        continue

    # Keep the shots of the selected team whose outcome is a goal
    is_team_goal = (df_t["shot_outcome_name"] == "Goal") & (df_t["team_name"] == team_name)
    goals = (
        df_t.loc[is_team_goal]
        .set_index("id")
        .reindex(columns=["team_name", "player_name"])
        .rename(columns={"team_name": "Team", "player_name": "Player name"})
        .rename_axis(None)
        # Season is None when the match id is not found in any match listing
        .assign(Season=season_by_match.get(mid))
    )
    if not goals.empty:
        goal_frames.append(goals)

# Build the dataframe in one go (indexed by event id) instead of growing it row by row
goalers = pd.concat(goal_frames) if goal_frames else pd.DataFrame(columns=goal_columns)

# NOTE(review): the original wrote "Barcelona.csv" although the filter is
# "Real Betis", and that name is also the shots dataset read in Task 5.
# The file is now named after the filtered team with a distinct suffix —
# confirm the file name the application expects.
goalers.to_csv(f"{team_name} goalscorers.csv")

KeyboardInterrupt: 

### Task 4: The teams and competition of each match id

In [None]:
# List, for every event file, the match id and the two teams involved.
events_dir = 'C:\\Users\\OMAR\\Desktop\\PFE\\Interface\\events\\'
files = [f for f in os.listdir(events_dir)]
for f in files:
    # The file name without its ".json" extension is the match id
    mid = f[:-5]
    # The JSON must be parsed while the file is still open
    with open(events_dir + f, encoding="utf-8") as data_file:
        d = json.load(data_file)
    d = pd.json_normalize(d, sep = "_").assign(mid = mid)
    # presumably the first two events are one per team (e.g. the line-ups) — verify against the data
    print("Match id:" + mid + "," + d["team_name"][0] + " Vs " + d["team_name"][1])

### Task 5: Test classifiers to create expected goals model

In [None]:
# Import a data set to use
shots_df = pd.read_csv("Barcelona.csv")

# Data preprocessing
# "Location" is stored in the CSV as the text of a list; parse it back into a list
shots_df["Location"] = shots_df["Location"].apply(lambda s: list(ast.literal_eval(s)))
shots_df["X"] = shots_df["Location"].str[0].astype(float)
shots_df["Y"] = shots_df["Location"].str[1].astype(float)

# Target: 1 when the shot was scored, 0 otherwise
shots_df["Goal"] = (shots_df["Shot_Outcome"] == "Goal").astype(float)

# Create Distance and Angle columns (Explained in the project report)
# X is replaced by 120 - X; the goal is taken to be at Y = 40 and 8 wide.
shots_df["X"] = 120 - shots_df["X"]
goal_dx = shots_df["X"]
goal_dy = (shots_df["Y"] - 40).abs()

shots_df["Distance"] = np.sqrt(goal_dx**2 + goal_dy**2)

# arctan returns values in (-pi/2, pi/2); negative results are shifted by pi
angle = np.arctan(8 * goal_dx / (goal_dx**2 + goal_dy**2 - (8/2)**2))
shots_df["Angle"] = np.where(angle < 0, angle + np.pi, angle)

#Two dimensional histogram
H_Shot=np.histogram2d(shots_df['X'], shots_df['Y'],bins=50,range=[[0, 120],[0, 80]])
goals_only=shots_df[shots_df['Goal']==1]
H_Goal=np.histogram2d(goals_only['X'], goals_only['Y'],bins=50,range=[[0, 120],[0, 80]])

Test three different models: Linear SVM, Logistic Regression, GradientBoostingClassifier

First, we import the classifiers we would test:

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import svm
from sklearn.linear_model import LogisticRegression

Split data into train set and test set

In [None]:
from sklearn.model_selection import train_test_split

# Two numeric geometry features plus three categorical descriptors of the shot
feature_columns = ["Distance", "Angle", "Body part", "Shot type", "Player"]
categorical_columns = ["Body part", "Shot type", "Player"]

# scikit-learn estimators only accept numeric input, so the text columns are
# one-hot encoded; "Distance" and "Angle" are passed through unchanged.
X = pd.get_dummies(shots_df[feature_columns], columns=categorical_columns, dtype=float)
y = shots_df["Goal"]

# Hold out 20% of the shots for testing; fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Then test the cross validation score of these classifiers

In [None]:
# One column of fold accuracies per classifier
KFold_Score = pd.DataFrame()
classifiers = ['Linear SVM', 'LogisticRegression','GradientBoostingClassifier']
models = [svm.SVC(kernel='linear'),
          LogisticRegression(max_iter = 1000),
          GradientBoostingClassifier(random_state=0)
         ]

# The same seeded 5-fold splitter is used for every model so that the scores
# are compared on identical folds.
cv = KFold(n_splits=5, random_state=0, shuffle=True)
for name, model in zip(classifiers, models):
    KFold_Score[name] = cross_val_score(model, X, np.ravel(y), scoring = 'accuracy', cv=cv)

In [None]:
# Append the per-classifier mean as a sixth row, label the rows, then show
# one row per classifier ordered from best to worst mean score.
fold_means = KFold_Score.mean().to_frame().T
KFold_Score = pd.concat([KFold_Score, fold_means])
KFold_Score.index = ['Fold 1', 'Fold 2', 'Fold 3', 'Fold 4', 'Fold 5', 'Mean']
KFold_Score.T.sort_values(by='Mean', ascending=False)

We can see that GradientBoostingClassifier has the best average cross validation score

### Task 6: Training and tuning the model

Initialize the model:

In [None]:
# Starting point for the tuning: every parameter except n_estimators is fixed here
model = GradientBoostingClassifier(
    learning_rate=0.1,
    max_depth=8,
    min_samples_split=500,
    min_samples_leaf=50,
    max_features='sqrt',
    subsample=0.8,
    random_state=10,
)

Now we use GridSearchCV to tune the model's parameters. We start with the number of estimators, searching integer values below 20.

In [None]:
from sklearn.model_selection import GridSearchCV

# Search the number of boosting stages. The grid starts at 1 because a model
# with 0 estimators is invalid. The `iid` argument is not passed: it has been
# removed from GridSearchCV.
CV_gbc = GridSearchCV(
    estimator=model,
    param_grid={'n_estimators': range(1, 20)},
    scoring='roc_auc',
    n_jobs=4,
    cv=5,
)
CV_gbc.fit(X_train,y_train)
CV_gbc.best_params_, CV_gbc.best_score_

number of estimators we got is 12, now we move to maximum depth of the tree and minimum samples split parameter:

In [None]:
# Search tree depth and the minimum number of samples needed to split a node.
# max_depth starts at 1 because a depth of 0 is invalid. The `iid` argument is
# not passed: it has been removed from GridSearchCV.
param_test2 = {'max_depth':range(1,9,1), 'min_samples_split':range(100,600,100)}
gbc = GridSearchCV(
    estimator=GradientBoostingClassifier(learning_rate=0.2, n_estimators=7, max_features='sqrt', subsample=0.8, random_state=10),
    param_grid=param_test2,
    scoring='roc_auc',
    n_jobs=4,
    cv=5,
)
gbc.fit(X_train,y_train)
gbc.best_params_, gbc.best_score_

We keep testing all the parameters

In [None]:
# Search the minimum samples per split and per leaf.
# min_samples_leaf starts at 2 because 0 is invalid. The `iid` argument is not
# passed: it has been removed from GridSearchCV.
param_test3 = {'min_samples_split':range(100,1400,100), 'min_samples_leaf':range(2,20,2)}
gsearch3 = GridSearchCV(
    estimator=GradientBoostingClassifier(learning_rate=0.2, n_estimators=7,max_depth=6, max_features='sqrt', subsample=0.8, random_state=10),
    param_grid=param_test3,
    scoring='roc_auc',
    n_jobs=4,
    cv=5,
)
gsearch3.fit(X_train,y_train)
gsearch3.best_params_, gsearch3.best_score_

In [None]:
# Search the number of features considered at each split.
# The grid starts at 2 (0 is invalid) and is capped at the number of available
# features, since a split cannot consider more features than exist.
# The `iid` argument is not passed: it has been removed from GridSearchCV.
param_test4 = {'max_features':range(2, min(20, X_train.shape[1] + 1), 2)}
gsearch4 = GridSearchCV(
    estimator=GradientBoostingClassifier(learning_rate=0.2, n_estimators=7,max_depth=6, min_samples_split=800, min_samples_leaf=2, subsample=0.8, random_state=10),
    param_grid=param_test4,
    scoring='roc_auc',
    n_jobs=4,
    cv=5,
)
gsearch4.fit(X_train,y_train)
gsearch4.best_params_, gsearch4.best_score_

The following function prints the accuracy score, the ROC AUC score and the cross-validation score, and draws a bar chart showing the importance of each feature (e.g. angle and distance).

In [None]:
def modelfit(alg, dtrain, pred, predictors, performCV=True, printFeatureImportance=True, cv_folds=5):
    """Fit `alg` on the training data and print a short performance report.

    Parameters
    ----------
    alg : estimator
        Classifier exposing `fit`, `predict`, `predict_proba` and, when
        `printFeatureImportance` is True, `feature_importances_`.
    dtrain : DataFrame
        Training data; only the `predictors` columns are used.
    pred : Series
        Target values aligned with `dtrain`.
    predictors : sequence of column labels
        Columns of `dtrain` used as features.
    performCV : bool, default True
        When True, also run a `cv_folds`-fold cross-validation scored with
        ROC AUC and print its mean test score.
    printFeatureImportance : bool, default True
        When True, draw a bar chart of the feature importances.
    cv_folds : int, default 5
        Number of cross-validation folds.

    Returns
    -------
    None. The report is printed and the chart is drawn on the current figure.
    Note that `alg` is fitted in place.
    """
    features = dtrain[predictors]

    #Fit the algorithm on the data
    alg.fit(features, pred)

    #Predict training set:
    dtrain_predictions = alg.predict(features)
    dtrain_predprob = alg.predict_proba(features)[:,1]

    #Perform cross-validation:
    cv_score = None
    if performCV:
        cv_score = cross_validate(alg, features, pred, cv=cv_folds, scoring='roc_auc')

    #Print model report:
    print("\nModel Report")
    print("Accuracy :",metrics.accuracy_score(pred.values, dtrain_predictions))
    print("AUC Score (Train):", metrics.roc_auc_score(pred, dtrain_predprob))
    # The CV line is only printed when the cross-validation was actually run
    if cv_score is not None:
        print("cv Score: ", np.mean(cv_score['test_score']))

    #Print Feature Importance:
    if printFeatureImportance:
        feat_imp = pd.Series(alg.feature_importances_, predictors).sort_values(ascending=False)
        feat_imp.plot(kind='bar', title='Feature Importances')
        plt.ylabel('Feature Importance Score')

In [None]:
# Report for the best estimator of the split/leaf search, using every training column
modelfit(
    alg=gsearch3.best_estimator_,
    dtrain=X_train,
    pred=y_train,
    predictors=X_train.columns,
)

In [None]:
# Report for the best estimator of the max_features search, using every training column
modelfit(
    alg=gsearch4.best_estimator_,
    dtrain=X_train,
    pred=y_train,
    predictors=X_train.columns,
)

In [None]:
# Search the fraction of samples used to fit each tree.
# The `iid` argument is not passed: it has been removed from GridSearchCV.
param_test5 = {'subsample':[0.6,0.7,0.75,0.8,0.85,0.9,1]}
gsearch5 = GridSearchCV(
    estimator=GradientBoostingClassifier(learning_rate=0.2, n_estimators=7,max_depth=6,min_samples_split=800, min_samples_leaf=2, subsample=0.8, random_state=10,max_features=2),
    param_grid=param_test5,
    scoring='roc_auc',
    n_jobs=4,
    cv=5,
)
gsearch5.fit(X_train,y_train)
gsearch5.best_params_, gsearch5.best_score_

In [None]:
# Report for the best estimator of the subsample search, using every training column
modelfit(
    alg=gsearch5.best_estimator_,
    dtrain=X_train,
    pred=y_train,
    predictors=X_train.columns,
)

In [None]:
# First candidate: learning rate 0.1 with 100 trees
gbm_tuned_1 = GradientBoostingClassifier(
    learning_rate=0.1,
    n_estimators=100,
    max_depth=6,
    min_samples_split=800,
    min_samples_leaf=2,
    max_features=2,
    subsample=0.8,
    random_state=10,
)
modelfit(alg=gbm_tuned_1, dtrain=X_train, pred=y_train, predictors=X_train.columns)

In [None]:
# Second candidate: learning rate 0.05 with 1000 trees
gbm_tuned_2 = GradientBoostingClassifier(
    learning_rate=0.05,
    n_estimators=1000,
    max_depth=6,
    min_samples_split=800,
    min_samples_leaf=2,
    max_features=2,
    subsample=0.8,
    random_state=10,
)
modelfit(alg=gbm_tuned_2, dtrain=X_train, pred=y_train, predictors=X_train.columns)

In [None]:
# Third candidate: learning rate 0.001 with 1200 trees
gbm_tuned_3 = GradientBoostingClassifier(learning_rate=0.001, n_estimators=1200,max_depth=6, min_samples_split=800,min_samples_leaf=2, subsample=0.8, random_state=10, max_features=2)
# Evaluate the model defined in this cell (the original re-evaluated gbm_tuned_2)
modelfit(gbm_tuned_3, X_train, y_train, X_train.columns)

We can see that the accuracy of gbm_tuned_3 has decreased, so we will fit gbm_tuned_2 to the data

In [None]:
# Keep the second tuned candidate as the final model and train it on the training split
model = gbm_tuned_2
model.fit(X=X_train, y=y_train)

Calculate probability(expected goals)

In [None]:
# Score every shot with the exact feature matrix layout used for training.
# The original re-selected the columns by hand in a different order
# ("Shot type" before "Body part") than the one the model was fitted on.
# X has one row per row of shots_df, in the same order.
proba = model.predict_proba(X)

Create a scatter plot of distance and angle of shots

In [None]:
# Distance vs. angle of every shot; marker size and colour both encode the
# predicted probability of scoring.
fig, ax = plt.subplots(1, figsize=(10, 8))
goal_probability = proba[:, 1]
ax = sns.scatterplot(
    data=shots_df,
    x="Distance",
    y="Angle",
    hue=goal_probability,
    size=goal_probability,
    ax=ax,
)
plt.show()

Visualize the probability of a shot being a goal on a data sample:

In [None]:
# Second column of predict_proba is stored as the model's expected goals
expected_goals = proba[:, 1]
shots_df = shots_df.assign(xg=expected_goals)

In [None]:
# Ten random shots: actual outcome next to the StatsBomb xG and the model's xG
comparison_columns = ['Shot_Outcome', 'statsbomb_xg', 'xg']
shots_df.loc[:, comparison_columns].sample(n=10)

Save the model as a pickle file:

In [None]:
# Write the fitted model to the file "model"; the context manager makes sure
# the file is flushed and closed once the dump is finished.
with open("model", 'wb') as model_file:
    pickle.dump(model, model_file)

#### By Omar El Yousfi