In [1]:
### imports
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFE
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

In [2]:
# Load the raw shot-level data set from the CSV sitting next to the notebook.
df = pd.read_csv(filepath_or_buffer='data_2020-02-24.csv')

## Building a predictive model for determining goals and on-net shots

In [4]:
# Keep only shots whose outcome is recorded: `.str.contains` yields NaN for a
# missing value, which would otherwise leave NaN labels in the target.
shots_labelled = df.dropna(subset=['shot_outcome'])

# Target: True when the outcome text matches 'Goal' or 'On Net'
# (the pattern is a regex alternation).
y = shots_labelled['shot_outcome'].str.contains('Goal|On Net')

# Features: every remaining column except the target itself and the columns
# listed in `dropped_columns`. Built from the same filtered frame as `y`, so
# the two stay row-aligned.
dropped_columns = ['game_date']
X = shots_labelled.drop(columns=['shot_outcome'] + dropped_columns)

In [5]:
# One-hot encode the text-valued columns with pandas.get_dummies so that each
# distinct value becomes its own indicator column named "<column>-<value>".
dummies_columns = [
    'team',
    'passer',
    'shooter',
    'shot_type',
    'game_state',
    'home_team',
    'away_team',
]
X_dummies = pd.get_dummies(data=X, columns=dummies_columns, prefix_sep='-')

In [7]:
# Hold out 33% of the rows for testing; both seeds are fixed so the split and
# the forest are reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X_dummies,
    y,
    random_state=13,
    test_size=0.33,
)

# Baseline random forest trained on the full encoded feature set.
rfc = RandomForestClassifier(random_state=12)
rfc.fit(X=X_train, y=y_train)

RandomForestClassifier(random_state=12)

In [8]:
# Recursive feature elimination (RFE): repeatedly refit the forest and discard
# one feature per round (step=1) until 25 features remain.
rfe = RFE(
    estimator=rfc,
    n_features_to_select=25,
    step=1,
)
rfe.fit(X=X_train, y=y_train)

RFE(estimator=RandomForestClassifier(random_state=12), n_features_to_select=25)

In [9]:
# Peek at the first ten predictions made on the held-out rows.
rfe.predict(X_test)[:10]

array([ True,  True,  True, False, False, False, False, False, False,
        True])

In [10]:
# Confusion matrix of the RFE model's predictions against the held-out labels.
confusion_matrix(y_true=y_test, y_pred=rfe.predict(X_test))

array([[117,  70],
       [ 62,  87]])

In [11]:
# Precision of the RFE model on the held-out rows.
precision_score(y_true=y_test, y_pred=rfe.predict(X_test))

0.554140127388535

In [12]:
# Recall of the RFE model on the held-out rows.
recall_score(y_true=y_test, y_pred=rfe.predict(X_test))

0.5838926174496645

In [13]:
# Tabulate the importances of `rfc` (the forest fitted directly on X_train),
# one row per encoded feature column.
# NOTE(review): these are read from `rfc`, not from `rfe`; RFE is documented to
# fit a clone of its estimator, so presumably these are the full-feature
# importances rather than those of the 25 selected features -- confirm intent.
df_feature_importance = pd.DataFrame(
    data=rfc.feature_importances_,
    index=X_test.columns,
)

In [14]:
# Persist the feature-importance table to disk as CSV.
df_feature_importance.to_csv(path_or_buf='df_feature_importances.csv')