In [1]:
import sys
sys.path.insert(0, '/Users/david/galvanize/super_liga_xg')

In [2]:
import pprint
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
from scraping_tools.html_scraper import db
from data_preparation import ConvertData

In [4]:
pd.set_option('display.max_columns', 50)

# 1) Pulling shot data from mongo_db and preparing for modeling

In [5]:
games = db['games_update'].find()

In [6]:
cd = ConvertData()

In [7]:
df = cd.games_to_df(games)

In [8]:
df_no_pen = cd.drop_penalties(df)

In [9]:
df_no_pen.head()

Unnamed: 0,game_id,player_id,shot_coord_x1,shot_coord_x2,shot_coord_y1,shot_coord_y2,shot_coord_z1,shot_coord_z2,shot_id,shot_type,team_id,time_of_event(min),passed_from_id,pass_coord_x1,pass_coord_x2,pass_coord_y1,pass_coord_y2,pass_coord_z1,pass_coord_z2,corner_kick,shot_distance,shot_angle,assisted_shot,is_penalty_attempt,is_goal
0,448548,99779.0,6.67,0.0,1.51,-1.21,-1,0.83,22336297,11,20,1.033333,177285.0,0.89,12.44,29.36,-4.84,-1.0,-1.0,1.0,6.838786,12.75599,1.0,0.0,1
1,448548,60730.0,21.33,12.0,4.24,-0.91,-1,0.73,22336386,35,20,9.683333,,,,,,,,0.0,21.747333,11.242763,0.0,0.0,0
3,448548,185745.0,8.44,0.89,14.53,-6.66,-1,-1.0,22336723,33,20,38.883333,177285.0,22.67,6.22,3.93,19.67,-1.0,-1.0,0.0,16.803407,59.849102,1.0,0.0,0
4,448548,77892.0,22.22,0.44,13.92,-8.48,-1,-1.0,22336739,33,13,40.083333,,,,,,,,0.0,26.220122,32.065607,0.0,0.0,0
5,448548,77892.0,11.56,0.44,10.59,-1.21,-1,0.14,22336802,35,13,44.483333,,,,,,,,0.0,15.677426,42.492487,0.0,0.0,0


In [10]:
X, y, shot_df = cd.create_xy_prep(df_no_pen)

In [11]:
len(X) == len(y)

True

In [12]:
shot_df.head()

Unnamed: 0,player_id,shot_distance,shot_angle,assisted_shot,is_goal,passed_from_id
0,99779.0,6.838786,12.75599,1.0,1,177285.0
1,60730.0,21.747333,11.242763,0.0,0,
3,185745.0,16.803407,59.849102,1.0,0,177285.0
4,77892.0,26.220122,32.065607,0.0,0,
5,77892.0,15.677426,42.492487,0.0,0,


# 2) Performance w K-folds

In [13]:
from kfold_comp import KfoldComparison
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split, KFold
from sklearn.ensemble import  RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
import numpy as np

In [14]:
rf_model = RandomForestClassifier(n_estimators=300, max_depth=3, verbose=1, random_state=8)
xgb_model = XGBClassifier(learning_rate = 0.01, max_depth = 3, n_estimators = 300, random_state=8)
gb_model = GradientBoostingClassifier(learning_rate=0.01, max_depth=4, max_features='log2', min_samples_leaf=4, n_estimators=280, subsample=0.25, random_state=8)

In [15]:
kc = KfoldComparison(rf_model, xgb_model, gb_model)

In [16]:
model_performance = kc.kfold_performance(X, y)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 300 out of 300 | elapsed:    0.4s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 300 out of 300 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 300 out of 300 | elapsed:    0.3s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 300 out of 300 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 300 out of 300 | elapsed:    0.3s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 300 out of 300 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_j

In [17]:
model_performance

{'RandomForestClassifier': {'scores over k splits': [0.3275601142949458,
   0.2598850371235993,
   0.31029943270000976,
   0.29977531479770164,
   0.29027946265614446],
  'mean score': 0.29755987231448017},
 'XGBClassifier': {'scores over k splits': [0.3205625975668869,
   0.27954830782049717,
   0.28788262710303947,
   0.29842956981019064,
   0.29757355771333377],
  'mean score': 0.29679933200278963},
 'GradientBoostingClassifier': {'scores over k splits': [0.2927048439053497,
   0.2997007760634123,
   0.3217547255945939,
   0.27255568875160635,
   0.29018746923636013],
  'mean score': 0.29538070071026445}}

# 3) Tuning

In [18]:
from kfold_comp import XGBoostTuner

In [19]:
xt = XGBoostTuner()

In [20]:
xgb_scores = xt.xgb_kfolds(X, y, [2, 4, 6, 8, 10])

In [21]:
xgb_scores

{'200 trees': {'scores over k splits': [0.31678828204817633,
   0.3075737889308263,
   0.300106135438032,
   0.32910019458435913,
   0.29707677435234126],
  'mean score': 0.310129035070747},
 '400 trees': {'scores over k splits': [0.3060992603957215,
   0.28942265463445116,
   0.28049036862109517,
   0.3223998676684192,
   0.27295035071703927],
  'mean score': 0.29427250040734526},
 '600 trees': {'scores over k splits': [0.3070932225022903,
   0.2908978165839525,
   0.27925455492838996,
   0.3269659901298035,
   0.2714525934374605],
  'mean score': 0.29513283551637937},
 '800 trees': {'scores over k splits': [0.30896474396097745,
   0.29195561359143435,
   0.28003585335921827,
   0.3317013686646105,
   0.2716020154046644],
  'mean score': 0.29685191899618096},
 '1000 trees': {'scores over k splits': [0.31210567712345544,
   0.2933789899490555,
   0.2808106052908389,
   0.33556055737084106,
   0.2743885882256418],
  'mean score': 0.29924888359196655}}

In [22]:
xt.best_params()

(0.29427250040734526, '400 trees')

In [23]:
from kfold_comp import GradientBoostTuner

In [24]:
gbt = GradientBoostTuner()

In [25]:
gb_dict = gbt.gb_kfolds(X, y)

In [26]:
gb_dict

{'Depth 1': {'scores over ksplits': [0.2934454133195602,
   0.28705023498832694,
   0.29030841605540975,
   0.3028250959653611,
   0.289204671557898],
  'mean score': 0.29256676637731116,
  'number of trees': [299, 289, 299, 298, 221]},
 'Depth 2': {'scores over ksplits': [0.29256850442725096,
   0.2888257241274842,
   0.29016859898563535,
   0.3053394291733273,
   0.28908210673994195],
  'mean score': 0.29319687269072797,
  'number of trees': [299, 248, 197, 167, 138]},
 'Depth 3': {'scores over ksplits': [0.2947779852165973,
   0.2891957251953801,
   0.2915187686934574,
   0.30843221245251123,
   0.28828025693564124],
  'mean score': 0.29444098969871746,
  'number of trees': [299, 240, 277, 162, 134]},
 'Depth 4': {'scores over ksplits': [0.2928617044707311,
   0.2872799799842507,
   0.29266328666391983,
   0.3095114702082014,
   0.2891856456950433],
  'mean score': 0.29430041740442925,
  'number of trees': [164, 235, 140, 114, 140]},
 'Depth 5': {'scores over ksplits': [0.2968893447

In [27]:
gbt.best_params()

('Depth 1', 0.29256676637731116, 281.2)

# 4) Comparing Tuned Models

In [28]:
xgboost_model = XGBClassifier(learning_rate = 0.01, max_depth = 3, n_estimators = 400, random_state=8)
gradient_boost_model = GradientBoostingClassifier(learning_rate=0.01, max_depth=1, max_features='log2', min_samples_leaf=4, n_estimators=281, subsample=0.25, random_state=8)
random_forest_model = RandomForestClassifier(n_estimators=300, max_depth=3, verbose=1, random_state=8)

In [29]:
kc_tuned = KfoldComparison(random_forest_model, xgboost_model, gradient_boost_model)

In [30]:
tuned_model_performance = kc_tuned.kfold_performance(X, y)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 300 out of 300 | elapsed:    0.4s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 300 out of 300 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 300 out of 300 | elapsed:    0.5s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 300 out of 300 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 300 out of 300 | elapsed:    0.4s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 300 out of 300 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_j

In [31]:
tuned_model_performance

{'RandomForestClassifier': {'scores over k splits': [0.29546540906476165,
   0.2889701853094394,
   0.29951160736615234,
   0.29573637940391106,
   0.3036051751704289],
  'mean score': 0.2966577512629387},
 'XGBClassifier': {'scores over k splits': [0.2660052969558712,
   0.29229684397030403,
   0.2699489613118417,
   0.32000349151408847,
   0.32344481072475],
  'mean score': 0.2943398808953711},
 'GradientBoostingClassifier': {'scores over k splits': [0.31291283322009356,
   0.2931699923604007,
   0.303222326530608,
   0.2840979503124294,
   0.3058292452325093],
  'mean score': 0.2998464695312082}}

In [32]:
model_performance

{'RandomForestClassifier': {'scores over k splits': [0.3275601142949458,
   0.2598850371235993,
   0.31029943270000976,
   0.29977531479770164,
   0.29027946265614446],
  'mean score': 0.29755987231448017},
 'XGBClassifier': {'scores over k splits': [0.3205625975668869,
   0.27954830782049717,
   0.28788262710303947,
   0.29842956981019064,
   0.29757355771333377],
  'mean score': 0.29679933200278963},
 'GradientBoostingClassifier': {'scores over k splits': [0.2927048439053497,
   0.2997007760634123,
   0.3217547255945939,
   0.27255568875160635,
   0.29018746923636013],
  'mean score': 0.29538070071026445}}

2/3 tuned_model_performance perform better - xgboost is the best

# Modeling on all

In [109]:
from xg import ExpectedGoal

In [110]:
shot_df.head()

Unnamed: 0,player_id,shot_distance,shot_angle,assisted_shot,is_goal,passed_from_id
0,99779.0,6.838786,12.75599,1.0,1,177285.0
1,60730.0,21.747333,11.242763,0.0,0,
3,185745.0,16.803407,59.849102,1.0,0,177285.0
4,77892.0,26.220122,32.065607,0.0,0,
5,77892.0,15.677426,42.492487,0.0,0,


In [111]:
eg = ExpectedGoal(random_forest_model, xgboost_model, gradient_boost_model, shot_df)

In [112]:
eg.predict_prob()

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 300 out of 300 | elapsed:    0.1s finished


In [113]:
ensemble_df, assist_df = eg.xg_ensemble()

In [114]:
total_df = eg.xg_and_xa()

In [115]:
total_df.head()

Unnamed: 0,player_id,total_xG,goals,total_xA,total_xG+xA
125,24256.0,5.6,12,0.47,6.07
113,488.0,3.09,8,0.37,3.46
234,119431.0,1.86,6,1.64,3.5
83,136659.0,1.82,5,0.49,2.31
344,172801.0,2.08,5,0.25,2.33


In [116]:
total_df['total_xG'].sum()

346.93999999999994

In [117]:
total_df['goals'].sum()

342

In [119]:
total_df[total_df['total_xG+xA'] > 4]

Unnamed: 0,player_id,total_xG,goals,total_xA,total_xG+xA
125,24256.0,5.6,12,0.47,6.07
203,20195.0,4.23,4,0.19,4.42
208,81286.0,1.78,3,3.03,4.81
222,17217.0,4.05,3,0.37,4.42
2,185745.0,2.43,3,1.72,4.15
39,93853.0,3.73,2,0.39,4.12
6,36681.0,1.94,0,2.63,4.57


# Player Data

- need to wrap this into class

In [135]:
from combined_player import player_minutes_value
from dataframe_cleaner import transfer_markt_cleaner
from mongo_to_db import create_master_player_min_df

In [127]:
players = db.players.find()

In [132]:
tmc = transfer_markt_cleaner(players)

In [133]:
tmc.head()

Unnamed: 0,_id,birthday,club,foot,height,player,squad_num,transfer_value(sterlings),club_brev,age,transfer_value(USD)
0,5c0588e63d54c00a19927577,"Jan 26, 1991 (27)",Squad Club Atlético Boca Juniors,right,"1,94 m",Esteban Andrada,31.0,£4.50m,BOC,27,5.71
1,5c0588e73d54c00a19927578,"Mar 17, 1987 (31)",Squad Club Atlético Boca Juniors,right,"1,92 m",Carlos Lampe,28.0,£495k,BOC,31,0.63
2,5c0588e73d54c00a19927579,"Sep 27, 1993 (25)",Squad Club Atlético Boca Juniors,right,"1,85 m",Lisandro Magallán,6.0,£4.05m,BOC,25,5.14
3,5c0588e73d54c00a1992757a,"May 12, 1985 (33)",Squad Club Atlético Boca Juniors,right,"1,84 m",Paolo Goltz,2.0,£1.62m,BOC,33,2.06
4,5c0588e73d54c00a1992757b,"Feb 22, 1991 (27)",Squad Club Atlético Boca Juniors,left,"1,73 m",Frank Fabra,18.0,£5.40m,BOC,27,6.86


In [168]:
tmc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 657 entries, 0 to 656
Data columns (total 11 columns):
_id                          657 non-null object
birthday                     657 non-null object
club                         657 non-null object
foot                         657 non-null object
height                       657 non-null object
player                       657 non-null object
squad_num                    652 non-null float64
transfer_value(sterlings)    657 non-null object
club_brev                    657 non-null object
age                          657 non-null int64
transfer_value(USD)          607 non-null float64
dtypes: float64(2), int64(1), object(8)
memory usage: 56.5+ KB


### this is acting weird - afa is producing way too many players

In [178]:
games = db['games_update'].find()
afa = create_master_player_min_df(games)

In [179]:
afa.rename(columns={'squad_number': 'squad_num'}, inplace=True)

In [180]:
afa.head()

Unnamed: 0,game_id,name,player_id,position_id,squad_num,substitute,team_id,minutes_played,club_brev
0,448548,Alexander Domínguez,19132.0,1,22.0,False,20,90.0,VEL
1,448548,Lucas Hoyos,27987.0,1,12.0,True,20,0.0,VEL
2,448548,Gastón Díaz,30896.0,2,24.0,False,20,90.0,VEL
3,448548,Joaquín Laso,94225.0,2,6.0,False,20,90.0,VEL
4,448548,Luis Abram,99779.0,2,29.0,False,20,90.0,VEL


In [181]:
afa.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6915 entries, 0 to 6914
Data columns (total 9 columns):
game_id           6915 non-null object
name              6900 non-null object
player_id         6915 non-null float64
position_id       6915 non-null object
squad_num         6551 non-null float64
substitute        6915 non-null object
team_id           6915 non-null object
minutes_played    6915 non-null float64
club_brev         6915 non-null object
dtypes: float64(3), object(6)
memory usage: 486.3+ KB


In [148]:
player_df = pd.merge(afa, tmc, on=['club_brev', 'squad_num'])
player_df.drop(columns=['_id', 'club_brev'], inplace=True)

In [149]:
player_df.head()

Unnamed: 0,game_id,name,player_id,position_id,squad_num,substitute,team_id,minutes_played,birthday,club,foot,height,player,transfer_value(sterlings),age,transfer_value(USD)
0,448548,Alexander Domínguez,19132.0,1,22.0,False,20,90.0,"Jun 5, 1987 (31)",Squad Club Atlético Vélez Sarsfield,right,"1,95 m",Alexander Domínguez,£1.58m,31,2.01
1,448553,Alexander Domínguez,19132.0,1,22.0,False,20,90.0,"Jun 5, 1987 (31)",Squad Club Atlético Vélez Sarsfield,right,"1,95 m",Alexander Domínguez,£1.58m,31,2.01
2,448568,Alexander Domínguez,19132.0,1,22.0,False,20,90.0,"Jun 5, 1987 (31)",Squad Club Atlético Vélez Sarsfield,right,"1,95 m",Alexander Domínguez,£1.58m,31,2.01
3,448575,Alexander Domínguez,19132.0,1,22.0,False,20,90.0,"Jun 5, 1987 (31)",Squad Club Atlético Vélez Sarsfield,right,"1,95 m",Alexander Domínguez,£1.58m,31,2.01
4,448594,Alexander Domínguez,19132.0,1,22.0,False,20,90.0,"Jun 5, 1987 (31)",Squad Club Atlético Vélez Sarsfield,right,"1,95 m",Alexander Domínguez,£1.58m,31,2.01


# Merging

In [166]:
player_df.head()

Unnamed: 0,game_id,name,player_id,position_id,squad_num,substitute,team_id,minutes_played,birthday,club,foot,height,player,transfer_value(sterlings),age,transfer_value(USD)
0,448548,Alexander Domínguez,19132.0,1,22.0,False,20,90.0,"Jun 5, 1987 (31)",Squad Club Atlético Vélez Sarsfield,right,"1,95 m",Alexander Domínguez,£1.58m,31,2.01
1,448553,Alexander Domínguez,19132.0,1,22.0,False,20,90.0,"Jun 5, 1987 (31)",Squad Club Atlético Vélez Sarsfield,right,"1,95 m",Alexander Domínguez,£1.58m,31,2.01
2,448568,Alexander Domínguez,19132.0,1,22.0,False,20,90.0,"Jun 5, 1987 (31)",Squad Club Atlético Vélez Sarsfield,right,"1,95 m",Alexander Domínguez,£1.58m,31,2.01
3,448575,Alexander Domínguez,19132.0,1,22.0,False,20,90.0,"Jun 5, 1987 (31)",Squad Club Atlético Vélez Sarsfield,right,"1,95 m",Alexander Domínguez,£1.58m,31,2.01
4,448594,Alexander Domínguez,19132.0,1,22.0,False,20,90.0,"Jun 5, 1987 (31)",Squad Club Atlético Vélez Sarsfield,right,"1,95 m",Alexander Domínguez,£1.58m,31,2.01


In [161]:
total_df.head()

Unnamed: 0,player_id,total_xG,goals,total_xA,total_xG+xA
125,24256.0,5.6,12,0.47,6.07
113,488.0,3.09,8,0.37,3.46
234,119431.0,1.86,6,1.64,3.5
83,136659.0,1.82,5,0.49,2.31
344,172801.0,2.08,5,0.25,2.33


In [162]:
final_df = pd.merge(player_df, total_df, on=['player_id'])

In [163]:
final_df.head()

Unnamed: 0,game_id,name,player_id,position_id,squad_num,substitute,team_id,minutes_played,birthday,club,foot,height,player,transfer_value(sterlings),age,transfer_value(USD),total_xG,goals,total_xA,total_xG+xA
0,448548,Gastón Díaz,30896.0,2,24.0,False,20,90.0,"Mar 13, 1988 (30)",Squad Club Atlético Vélez Sarsfield,right,"1,75 m",Gastón Díaz,£450k,30,0.57,0.78,0,0.6,1.38
1,448553,Gastón Díaz,30896.0,2,24.0,False,20,90.0,"Mar 13, 1988 (30)",Squad Club Atlético Vélez Sarsfield,right,"1,75 m",Gastón Díaz,£450k,30,0.57,0.78,0,0.6,1.38
2,448568,Gastón Díaz,30896.0,2,24.0,False,20,90.0,"Mar 13, 1988 (30)",Squad Club Atlético Vélez Sarsfield,right,"1,75 m",Gastón Díaz,£450k,30,0.57,0.78,0,0.6,1.38
3,448575,Gastón Díaz,30896.0,2,24.0,False,20,90.0,"Mar 13, 1988 (30)",Squad Club Atlético Vélez Sarsfield,right,"1,75 m",Gastón Díaz,£450k,30,0.57,0.78,0,0.6,1.38
4,448594,Gastón Díaz,30896.0,2,24.0,False,20,90.0,"Mar 13, 1988 (30)",Squad Club Atlético Vélez Sarsfield,right,"1,75 m",Gastón Díaz,£450k,30,0.57,0.78,0,0.6,1.38


In [157]:
final_sub = final_df[['name', 'minutes_played', 'age', 'transfer_value(USD)', 'goals', 'total_xG', 'total_xA', 'total_xG+xA']]

Unnamed: 0,name,minutes_played,age,transfer_value(USD),goals,total_xG,total_xA,total_xG+xA
0,Gastón Díaz,90.0,30,0.57,0,0.78,0.6,1.38
1,Gastón Díaz,90.0,30,0.57,0,0.78,0.6,1.38
2,Gastón Díaz,90.0,30,0.57,0,0.78,0.6,1.38
3,Gastón Díaz,90.0,30,0.57,0,0.78,0.6,1.38
4,Gastón Díaz,90.0,30,0.57,0,0.78,0.6,1.38


# Player Querying

In [155]:
final_df.head()

Unnamed: 0,game_id,name,player_id,position_id,squad_num,substitute,team_id,minutes_played,birthday,club,foot,height,player,transfer_value(sterlings),age,transfer_value(USD),total_xG,goals,total_xA,total_xG+xA
0,448548,Gastón Díaz,30896.0,2,24.0,False,20,90.0,"Mar 13, 1988 (30)",Squad Club Atlético Vélez Sarsfield,right,"1,75 m",Gastón Díaz,£450k,30,0.57,0.78,0,0.6,1.38
1,448553,Gastón Díaz,30896.0,2,24.0,False,20,90.0,"Mar 13, 1988 (30)",Squad Club Atlético Vélez Sarsfield,right,"1,75 m",Gastón Díaz,£450k,30,0.57,0.78,0,0.6,1.38
2,448568,Gastón Díaz,30896.0,2,24.0,False,20,90.0,"Mar 13, 1988 (30)",Squad Club Atlético Vélez Sarsfield,right,"1,75 m",Gastón Díaz,£450k,30,0.57,0.78,0,0.6,1.38
3,448575,Gastón Díaz,30896.0,2,24.0,False,20,90.0,"Mar 13, 1988 (30)",Squad Club Atlético Vélez Sarsfield,right,"1,75 m",Gastón Díaz,£450k,30,0.57,0.78,0,0.6,1.38
4,448594,Gastón Díaz,30896.0,2,24.0,False,20,90.0,"Mar 13, 1988 (30)",Squad Club Atlético Vélez Sarsfield,right,"1,75 m",Gastón Díaz,£450k,30,0.57,0.78,0,0.6,1.38
