# SHAP Values

In [1]:
import numpy as np
import pandas as pd

# Tree Models
from sklearn.tree import DecisionTreeClassifier

# Data Preprocessing
# Splitting Training and Evaluation Data
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Model Interpretability
import shap

# Performance Evaluation
from sklearn import metrics

In [2]:
# Read the per-team match statistics and preview the first five rows.
data = pd.read_csv(filepath_or_buffer='statistics.csv')
data.head(n=5)

Unnamed: 0,Date,Team,Opponent,Goal Scored,Ball Possession %,Attempts,On-Target,Off-Target,Blocked,Corners,...,Yellow Card,Yellow & Red,Red,Man of the Match,1st Goal,Round,PSO,Goals in PSO,Own goals,Own goal Time
0,14-06-2018,Russia,Saudi Arabia,5,40,13,7,3,3,6,...,0,0,0,Yes,12.0,Group Stage,No,0,,
1,14-06-2018,Saudi Arabia,Russia,0,60,6,0,3,3,2,...,0,0,0,No,,Group Stage,No,0,,
2,15-06-2018,Egypt,Uruguay,0,43,8,3,3,2,0,...,2,0,0,No,,Group Stage,No,0,,
3,15-06-2018,Uruguay,Egypt,1,57,14,4,6,4,5,...,0,0,0,Yes,89.0,Group Stage,No,0,,
4,15-06-2018,Morocco,Iran,0,64,13,3,6,4,5,...,1,0,0,No,,Group Stage,No,0,1.0,90.0


In [3]:
# Size of the loaded table as (rows, columns).
data.shape

(128, 27)

In [4]:
# Per-column non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 128 entries, 0 to 127
Data columns (total 27 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Date                    128 non-null    object 
 1   Team                    128 non-null    object 
 2   Opponent                128 non-null    object 
 3   Goal Scored             128 non-null    int64  
 4   Ball Possession %       128 non-null    int64  
 5   Attempts                128 non-null    int64  
 6   On-Target               128 non-null    int64  
 7   Off-Target              128 non-null    int64  
 8   Blocked                 128 non-null    int64  
 9   Corners                 128 non-null    int64  
 10  Offsides                128 non-null    int64  
 11  Free Kicks              128 non-null    int64  
 12  Saves                   128 non-null    int64  
 13  Pass Accuracy %         128 non-null    int64  
 14  Passes                  128 non-null    in

In [5]:
# Count the missing values in each column.
data.isna().sum()

Date                        0
Team                        0
Opponent                    0
Goal Scored                 0
Ball Possession %           0
Attempts                    0
On-Target                   0
Off-Target                  0
Blocked                     0
Corners                     0
Offsides                    0
Free Kicks                  0
Saves                       0
Pass Accuracy %             0
Passes                      0
Distance Covered (Kms)      0
Fouls Committed             0
Yellow Card                 0
Yellow & Red                0
Red                         0
Man of the Match            0
1st Goal                   34
Round                       0
PSO                         0
Goals in PSO                0
Own goals                 116
Own goal Time             116
dtype: int64

In [6]:
# Remove every column that contains at least one missing value.
data = data.dropna(axis='columns', how='any')
data.shape

(128, 24)

In [7]:
# Isolate the object-dtype (text) columns and preview them.
cat_data = data.select_dtypes(include='object')
cat_data.head(n=5)

Unnamed: 0,Date,Team,Opponent,Man of the Match,Round,PSO
0,14-06-2018,Russia,Saudi Arabia,Yes,Group Stage,No
1,14-06-2018,Saudi Arabia,Russia,No,Group Stage,No
2,15-06-2018,Egypt,Uruguay,No,Group Stage,No
3,15-06-2018,Uruguay,Egypt,Yes,Group Stage,No
4,15-06-2018,Morocco,Iran,No,Group Stage,No


In [8]:
# Keep every column that is not object dtype and preview the result.
num_data = data.select_dtypes(exclude=['object'])
num_data.head(n=5)

Unnamed: 0,Goal Scored,Ball Possession %,Attempts,On-Target,Off-Target,Blocked,Corners,Offsides,Free Kicks,Saves,Pass Accuracy %,Passes,Distance Covered (Kms),Fouls Committed,Yellow Card,Yellow & Red,Red,Goals in PSO
0,5,40,13,7,3,3,6,3,11,0,78,306,118,22,0,0,0,0
1,0,60,6,0,3,3,2,1,25,2,86,511,105,10,0,0,0,0
2,0,43,8,3,3,2,0,1,7,3,78,395,112,12,2,0,0,0
3,1,57,14,4,6,4,5,1,13,3,86,589,111,6,0,0,0,0
4,0,64,13,3,6,4,5,0,14,2,86,433,101,22,1,0,0,0


In [9]:
# Names of the categorical columns to keep; 'Date' is left out.
cat_cols = cat_data.drop(columns='Date').columns
cat_cols

Index(['Team', 'Opponent', 'Man of the Match', 'Round', 'PSO'], dtype='object')

In [10]:
# Rebuild the working frame: numeric columns first, then the kept categorical
# columns, aligned on the row index.
data = num_data.join(other=data[cat_cols], how='left')
data.head(n=3)

Unnamed: 0,Goal Scored,Ball Possession %,Attempts,On-Target,Off-Target,Blocked,Corners,Offsides,Free Kicks,Saves,...,Fouls Committed,Yellow Card,Yellow & Red,Red,Goals in PSO,Team,Opponent,Man of the Match,Round,PSO
0,5,40,13,7,3,3,6,3,11,0,...,22,0,0,0,0,Russia,Saudi Arabia,Yes,Group Stage,No
1,0,60,6,0,3,3,2,1,25,2,...,10,0,0,0,0,Saudi Arabia,Russia,No,Group Stage,No
2,0,43,8,3,3,2,0,1,7,3,...,12,2,0,0,0,Egypt,Uruguay,No,Group Stage,No


In [11]:
# Encode the target as 1 ('Yes') / 0 ('No').
# The explicit cast pins the column to an integer dtype: `replace` on an object
# column only yields integers through automatic downcasting, which pandas has
# deprecated. The cast also fails loudly if any value other than 'Yes'/'No'
# is left in the column.
data = (
    data
    .replace({'Man of the Match': {'Yes': 1, 'No': 0}})
    .astype({'Man of the Match': 'int64'})
)
data.iloc[:3, -4:]

Unnamed: 0,Opponent,Man of the Match,Round,PSO
0,Saudi Arabia,1,Group Stage,No
1,Russia,0,Group Stage,No
2,Uruguay,0,Group Stage,No


In [12]:
# Separate the target column from the feature columns.
target_col = 'Man of the Match'
y = data[target_col]
X = data.drop(columns=[target_col])

In [13]:
# Hold out 20% of the rows for validation. The fixed seed makes the shuffle,
# and therefore every result derived from the split, identical on each run.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size = 0.2, random_state = 22
)

In [14]:
# Preview the first three training rows.
X_train.iloc[:3]

Unnamed: 0,Goal Scored,Ball Possession %,Attempts,On-Target,Off-Target,Blocked,Corners,Offsides,Free Kicks,Saves,...,Distance Covered (Kms),Fouls Committed,Yellow Card,Yellow & Red,Red,Goals in PSO,Team,Opponent,Round,PSO
91,1,57,4,2,1,1,3,3,18,3,...,100,15,1,0,0,0,Colombia,Senegal,Group Stage,No
81,0,70,26,6,11,9,9,1,16,3,...,115,7,0,0,0,0,Germany,Korea Republic,Group Stage,No
2,0,43,8,3,3,2,0,1,7,3,...,112,12,2,0,0,0,Egypt,Uruguay,Group Stage,No


In [15]:
# Work on copies so the encoded frames do not modify the original splits.
enc_t, enc_v = X_train.copy(), X_valid.copy()

In [16]:
# Collect the names of the object-dtype feature columns, in column order.
cats = [name for name, dtype in X_train.dtypes.items() if dtype == 'object']
cats

['Team', 'Opponent', 'Round', 'PSO']

In [17]:
# Encoder that maps each distinct label to an integer code.
encoder = LabelEncoder()

In [18]:
# Integer-encode each categorical feature column.
for col in cats:
    # Learn the label -> code mapping from the full column rather than from the
    # training split alone; otherwise a category that appears only in the
    # validation rows makes `transform` raise on unseen labels.
    encoder.fit(X[col])
    # Read from the untouched splits so re-running this cell yields the same
    # result instead of re-encoding already encoded values.
    enc_t[col] = encoder.transform(X_train[col])
    enc_v[col] = encoder.transform(X_valid[col])

enc_t.head(4)

Unnamed: 0,Goal Scored,Ball Possession %,Attempts,On-Target,Off-Target,Blocked,Corners,Offsides,Free Kicks,Saves,...,Distance Covered (Kms),Fouls Committed,Yellow Card,Yellow & Red,Red,Goals in PSO,Team,Opponent,Round,PSO
91,1,57,4,2,1,1,3,3,18,3,...,100,15,1,0,0,0,4,25,2,0
81,0,70,26,6,11,9,9,1,16,3,...,115,7,0,0,0,0,11,15,2,0
2,0,43,8,3,3,2,0,1,7,3,...,112,12,2,0,0,0,8,31,2,0
31,2,43,8,2,4,2,3,3,11,3,...,107,15,2,0,0,0,25,21,2,0


In [20]:
# Build the decision tree with a fixed seed, then fit it on the encoded
# training features.
model = DecisionTreeClassifier(random_state=22)
model = model.fit(enc_t, y_train)

In [21]:
# With the model fitted, create the object that computes SHAP values for it.
explainer = shap.TreeExplainer(model=model)

In [22]:
# Take one validation row (position 5) to use as a single prediction.
d_pred = enc_v.iloc[5, :]
d_pred

Goal Scored                 4
Ball Possession %          39
Attempts                    8
On-Target                   6
Off-Target                  1
Blocked                     1
Corners                     2
Offsides                    1
Free Kicks                 14
Saves                       1
Pass Accuracy %            75
Passes                    271
Distance Covered (Kms)     99
Fouls Committed            14
Yellow Card                 2
Yellow & Red                0
Red                         0
Goals in PSO                0
Team                       10
Opponent                    6
Round                       1
PSO                         0
Name: 126, dtype: int64

In [23]:
# Shape of the selected row.
d_pred.shape

(22,)

In [24]:
# The selected row as a plain NumPy array.
d_pred.to_numpy()

array([  4,  39,   8,   6,   1,   1,   2,   1,  14,   1,  75, 271,  99,
        14,   2,   0,   0,   0,  10,   6,   1,   0], dtype=int64)

In [32]:
# Demonstration: `d_pred` is one-dimensional here, but `predict` requires a 2D
# (n_samples, n_features) input. The error is caught and shown so that running
# all cells continues past this one.
try:
    model.predict(d_pred)
except ValueError as err:
    print(f'{type(err).__name__}: {err}')

ValueError: Expected 2D array, got 1D array instead:
array=[  4.  39.   8.   6.   1.   1.   2.   1.  14.   1.  75. 271.  99.  14.
   2.   0.   0.   0.  10.   6.   1.   0.].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [31]:
# Add a leading axis so the single row becomes a (1, n_features) array.
d_pred.to_numpy()[np.newaxis, :]

array([[  4,  39,   8,   6,   1,   1,   2,   1,  14,   1,  75, 271,  99,
         14,   2,   0,   0,   0,  10,   6,   1,   0]], dtype=int64)

In [None]:
# Store the row as a (1, n_features) array. `np.asarray` accepts both the
# original Series and an already reshaped array, so re-running this cell is
# safe (calling `.values` on the reassigned ndarray would raise).
d_pred = np.asarray(d_pred).reshape(1, -1)

In [None]:
# SHAP values for the selected row. `shap_values` needs the samples to explain;
# the row is coerced to a (1, n_features) array, which works whether `d_pred`
# is still a Series or has already been reshaped.
shape_v = explainer.shap_values(np.asarray(d_pred).reshape(1, -1))