In [None]:
"""
Objectives:
- What features in the data are the most important
- For any prediction from a model, how did each feature in the data affect that particular prediction
- How does each feature affect a model's predictions
"""

In [None]:
"""
Permutation Importance
- Calculated after a model has been fitted
- If I randomly shuffle a single column of the validation data, how would that affect the accuracy of predictions in that now-shuffled data?
- Should cause less accurate predictions

Process:
- Get trained model
- Shuffle values in a single column
- Make predictions and look at loss function
- Performance deterioration measures the importance of the feature

"""

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# Load the match statistics; the path is relative to the working directory.
data = pd.read_csv('data/FIFA 2018 Statistics.csv')

# Boolean target: True where the 'Man of the Match' column equals "Yes".
y = data['Man of the Match'] == "Yes"

# Keep only the columns whose dtype is int64 as model inputs.
features_names = [
    column
    for column, column_dtype in data.dtypes.items()
    if column_dtype == np.int64
]
X = data[features_names]

# Split into training and validation parts with a fixed random_state,
# then fit the random forest on the training part only.
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=0)
model = RandomForestClassifier(n_estimators=100, random_state=0)
model.fit(X_train, y_train)

In [9]:
from sklearn.inspection import permutation_importance

# Score the already-fitted model on the validation split; each argument is
# passed by keyword so the role of every input is explicit.
perm_importance = permutation_importance(
    estimator=model,
    X=X_val,
    y=y_val,
    n_repeats=30,
    random_state=1,
)


In [10]:
import pandas as pd

# Pair each validation column name with its mean permutation importance.
importance_df = pd.DataFrame(
    {
        'feature': X_val.columns,
        'importance': perm_importance.importances_mean,
    }
)
# Order the rows from the largest mean importance to the smallest.
importance_df = importance_df.sort_values('importance', ascending=False)

print(importance_df)


                   feature  importance
0              Goal Scored    0.083333
7                 Offsides    0.042708
6                  Corners    0.040625
5                  Blocked    0.028125
14             Yellow Card    0.027083
10         Pass Accuracy %    0.018750
2                 Attempts    0.018750
4               Off-Target    0.007292
8               Free Kicks    0.004167
13         Fouls Committed    0.002083
17            Goals in PSO    0.002083
3                On-Target    0.002083
12  Distance Covered (Kms)    0.000000
15            Yellow & Red    0.000000
16                     Red    0.000000
9                    Saves   -0.001042
1        Ball Possession %   -0.008333
11                  Passes   -0.017708
