<img src="images/buildacourt.jpg" alt="Drawing" style="width: 438px;"/><img src="images/luxurycourt.jpg" alt="Drawing" style="width: 492px;"/>

## Step 4 - Building a Model - ATP Match Statistics

In [None]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib as mlp
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score


# Set standard figure size for plots
mlp.rcParams['figure.figsize'] = (10,6)

# Set Seaborn Styles
sns.set()

# Set Color Palette that can be used for plotting
# (ten purple shades, listed from lightest to darkest)
colorsP = ['#D28DDC','#CA7AD6','#C366D0','#BB53CA','#B340C3','#A337B2','#A437B3','#92319F','#802B8C','#6E2578']
# Reverse the list so the darkest shade comes first
colorsP = colorsP[::-1]

# Pandas display options: wider console output, show up to 100 columns,
# and render DataFrames as HTML tables in the notebook
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)

# Applied after sns.set() above, so these style/context choices are the ones in effect
sns.set_style("whitegrid")
sns.set_context("poster")
import sklearn.model_selection

import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the source table; the path is relative to the notebook's working directory
result = pd.read_csv('data/result.csv')

In [None]:
# Percentage statistics used as model inputs
a = [
    'ace_pct',
    'df_pct',
    'firstsrv_in_pct',
    'firstsrv_won_pct',
    'secondsrv_won_pct',
    'rtn_pts_pct',
    'brk_pts_pct',
]
# Target column
b = ['win']

# Keep only the modelling columns, then discard every row with a missing value
modelling_columns = a + b
result = result[modelling_columns].dropna()
result.head()

In [None]:
# Separate the model inputs from the target column
features = result.drop('win', axis=1)
labels = result['win']

# First split: hold out 40% of the rows; the remaining 60% becomes the training set
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.4, random_state=42)
# Second split: halve the 40% hold-out into test and validation sets (20% of all rows each).
# X_test / y_test are overwritten here, so this line must run right after the one above.
X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.5, random_state=42)

In [None]:
# Print each split's share of all labelled rows (train, validation, test order)
total_rows = len(labels)
for dataset in (y_train, y_val, y_test):
    share = len(dataset) / total_rows
    print(round(share, 2))

In [None]:
# Persist every split to the working directory, feature files first and label files second,
# without writing the pandas index
split_outputs = (
    (X_train, 'train_features.csv'),
    (X_val, 'val_features.csv'),
    (X_test, 'test_features.csv'),
    (y_train, 'train_labels.csv'),
    (y_val, 'val_labels.csv'),
    (y_test, 'test_labels.csv'),
)
for split_data, csv_name in split_outputs:
    split_data.to_csv(csv_name, index=False)

In [None]:
# Short aliases for the three splits used by the modelling cells below
tr_features, tr_labels = X_train, y_train
val_features, val_labels = X_val, y_val
te_features, te_labels = X_test, y_test

In [None]:
# Switch seaborn to the "ticks" style, then draw pairwise plots of every feature column
sns.set(color_codes=True, style="ticks")
g = sns.pairplot(data=features)
plt.show()

# Logistic Regression

In [None]:
# Baseline: 5-fold cross-validation of a default logistic regression on all training features
lr = LogisticRegression()
train_target = tr_labels.values.ravel()
scores = cross_val_score(estimator=lr, X=tr_features, y=train_target, cv=5)
scores

In [None]:
def run_lr(X_train,y_train):
    """
    Cross-validate a Logistic Regression on the given features and target variable.

    Each of the 5 folds is scored with ROC AUC. The per-fold scores, their mean and
    their range (max - min) are printed, and the scores are plotted by fold index.

    Parameters
    ----------
    X_train : feature matrix accepted by scikit-learn (e.g. a DataFrame).
    y_train : target labels (Series, array or list); flattened to 1-D before scoring.

    Returns
    -------
    LogisticRegression
        The estimator instance that was handed to cross_val_score. This function
        never calls ``fit`` on it directly.
    """

    lr = LogisticRegression(solver='lbfgs')

    # scoring='roc_auc' makes the computed metric match the "ROC AUC" labels printed
    # below; without it cross_val_score uses the estimator's default scorer instead.
    lr_scores = cross_val_score(lr, X_train, np.ravel(y_train), cv=5, scoring='roc_auc')

    print('LR Scores: ', lr_scores)
    print('Mean ROC AUC Score: {}'.format(lr_scores.mean()))
    print('LR Score Range: {}'.format(round(lr_scores.max() - lr_scores.min(),4)))
    _ = plt.plot(np.arange(len(lr_scores)),lr_scores)
    _ = plt.ylim(0.90,1)
    return lr

# Cross-validate logistic regression on the full training feature set
lr = run_lr(tr_features,tr_labels)

In [None]:
def plot_coef(X,y,features):
    """
    Plot the standardized Beta values of a Logistic Regression fitted on X and y.

    The features are scaled with StandardScaler before fitting so the coefficient
    magnitudes are comparable with each other. Bars are ordered by absolute
    coefficient, largest first, and each bar is annotated with its rounded value.

    Parameters
    ----------
    X : feature matrix to fit on (e.g. a DataFrame); one column per feature.
    y : target labels; flattened to 1-D before fitting.
        Assumes a binary target, i.e. one coefficient per feature.
    features : iterable of feature names, in the same order as the columns of X.
        Iterating a DataFrame yields its column labels, so a DataFrame also works.

    Raises
    ------
    ValueError
        If the number of names in ``features`` differs from the number of columns in X.
    """
    feature_names = [str(feature).title() for feature in features]

    # Fit on the arguments (not on notebook globals) and on the scaled matrix,
    # so the plot really shows standardized coefficients for the data passed in.
    X_std = StandardScaler().fit_transform(X)
    if len(feature_names) != X_std.shape[1]:
        raise ValueError('Got {} feature names for {} columns in X'.format(len(feature_names), X_std.shape[1]))

    lr = LogisticRegression(solver='lbfgs')
    lr.fit(X_std, np.ravel(y))

    coef_df = pd.DataFrame({'coef': lr.coef_.ravel()}, index=feature_names)
    coef_df['abs'] = coef_df['coef'].abs()
    coef_df = coef_df.sort_values(by='abs',ascending=False).drop('abs',axis=1)

    # Keyword arguments with plain arrays: recent seaborn releases no longer accept
    # x / y positionally.
    ax = sns.barplot(x=coef_df['coef'].values, y=coef_df.index.tolist())

    # Scale label padding and axis limits to the data instead of fixed numbers,
    # so large standardized coefficients are not clipped.
    span = coef_df['coef'].abs().max() or 1.0
    pad = 0.02 * span
    for i, value in enumerate(coef_df['coef']):
        if value < 0:
            ax.text(value - pad, i, round(value, 2), ha='right', va='center')
        else:
            ax.text(value + pad, i, round(value, 2), ha='left', va='center')
    _ = plt.xlabel('Coefficient')
    _ = plt.title('Standardized Coefficients')
    _ = plt.xlim(-1.3 * span, 1.3 * span)

# Plot logistic-regression coefficients; `features` supplies the bar labels
plot_coef(tr_features,tr_labels,features)

In [None]:
# Pairwise correlations between the training features (the target column is not part of this matrix)
corr_matrix = tr_features.corr()

# Annotated heatmap of the feature-to-feature correlations
plt.figure(figsize=(15,8))
sns.heatmap(corr_matrix,annot=True)
# Call show(); the bare attribute reference `plt.show` does nothing
plt.show()

In [None]:
# Reduced training feature set: remove the ace and break-point percentage columns
tr_features_new = tr_features.drop(columns=['ace_pct','brk_pts_pct'])
tr_features_new.head()

In [None]:
# 5-fold cross-validation of a default logistic regression on the reduced feature set
lr = LogisticRegression()
train_target = tr_labels.values.ravel()
scores = cross_val_score(estimator=lr, X=tr_features_new, y=train_target, cv=5)
scores

In [None]:
# Repeat the cross-validation on the reduced feature set (ace_pct and brk_pts_pct removed)
lr = run_lr(tr_features_new,tr_labels)

# Random Forest

## Fit and evaluate a basic model using 5-fold Cross-Validation

In [None]:
# Baseline: 5-fold cross-validation of a default random forest on all training features
rf = RandomForestClassifier()
train_target = tr_labels.values.ravel()
scores = cross_val_score(estimator=rf, X=tr_features, y=train_target, cv=5)
scores

## Hyperparameter tuning

In [None]:
def print_results(results):
    """
    Print a summary of a fitted parameter search.

    Reads ``results.best_params_`` and, from ``results.cv_results_``, the
    'mean_test_score', 'std_test_score' and 'params' entries. Prints the best
    parameters first, then one line per candidate showing the mean score rounded
    to 3 places and twice the standard deviation rounded to 3 places.
    """
    print('BEST PARAMS: {}\n'.format(results.best_params_))

    candidate_rows = zip(
        results.cv_results_['mean_test_score'],
        results.cv_results_['std_test_score'],
        results.cv_results_['params'],
    )
    for avg_score, score_std, candidate in candidate_rows:
        line = '{} (+/-{}) for {}'.format(round(avg_score, 3), round(score_std * 2, 3), candidate)
        print(line)

In [None]:
# Grid-search forest size and tree depth with 5-fold cross-validation.
# The n_estimators candidates include 50 and 100 because the notes after this cell
# quote results for those values and the follow-up cell refits forests with them;
# the previous grid of [100, 500, 1000] could never have produced an n_estimators=50 result.
rf = RandomForestClassifier()
parameters = {
    'n_estimators': [5, 50, 100],
    'max_depth': [2, 10, 20, None]
}

cv = GridSearchCV(rf, parameters, cv=5)
cv.fit(tr_features, tr_labels.values.ravel())

print_results(cv)

## Fit best models on full training set

From above, the top three results came from:<br/>
0.931 (+/-0.003) for {'max_depth': 20, 'n_estimators': 50}<br/>
0.931 (+/-0.002) for {'max_depth': 20, 'n_estimators': 100}<br/>
0.931 (+/-0.004) for {'max_depth': None, 'n_estimators': 100}<br/>

In [None]:
# Refit the three candidate configurations on the whole training split.
# Candidate 1: 50 trees, depth capped at 20
rf1 = RandomForestClassifier(n_estimators=50, max_depth=20)
rf1.fit(tr_features, tr_labels.values.ravel())

# Candidate 2: 100 trees, depth capped at 20
rf2 = RandomForestClassifier(n_estimators=100, max_depth=20)
rf2.fit(tr_features, tr_labels.values.ravel())

# Candidate 3: 100 trees, no depth cap
rf3 = RandomForestClassifier(n_estimators=100, max_depth=None)
rf3.fit(tr_features, tr_labels.values.ravel())

## Evaluate models on validation set

In [None]:
# Compare the three fitted forests on the validation split:
# accuracy (A), precision (P) and recall (R), each rounded to 3 places
summary = 'MAX DEPTH: {} / # OF EST: {} -- A: {} / P: {} / R: {}'
for mdl in (rf1, rf2, rf3):
    y_pred = mdl.predict(val_features)
    accuracy, precision, recall = [
        round(metric(val_labels, y_pred), 3)
        for metric in (accuracy_score, precision_score, recall_score)
    ]
    print(summary.format(mdl.max_depth, mdl.n_estimators, accuracy, precision, recall))

## Evaluate the best model on the test set

In [None]:
# Score rf3 on the held-out test split: accuracy (A), precision (P), recall (R)
y_pred = rf3.predict(te_features)
accuracy, precision, recall = [
    round(metric(te_labels, y_pred), 3)
    for metric in (accuracy_score, precision_score, recall_score)
]
summary = 'MAX DEPTH: {} / # OF EST: {} -- A: {} / P: {} / R: {}'
print(summary.format(rf3.max_depth, rf3.n_estimators, accuracy, precision, recall))

## Drop some features to reduce overfitting.

In [None]:
# Build reduced-feature versions of all three splits by removing the same two columns
tr_features_new = X_train.drop(columns=['ace_pct','brk_pts_pct'])
val_features_new = X_val.drop(columns=['ace_pct','brk_pts_pct'])
te_features_new = X_test.drop(columns=['ace_pct','brk_pts_pct'])

In [None]:
# 5-fold cross-validation of a default random forest on the reduced feature set
rf = RandomForestClassifier()
train_target = tr_labels.values.ravel()
scores = cross_val_score(estimator=rf, X=tr_features_new, y=train_target, cv=5)
scores

In [None]:
def print_results(results):
    """
    Print a summary of a fitted parameter search.

    Reads ``results.best_params_`` and, from ``results.cv_results_``, the
    'mean_test_score', 'std_test_score' and 'params' entries. Prints the best
    parameters first, then one line per candidate showing the mean score rounded
    to 3 places and twice the standard deviation rounded to 3 places.

    NOTE: this repeats the helper of the same name defined in the
    hyperparameter-tuning section earlier in the notebook.
    """
    print('BEST PARAMS: {}\n'.format(results.best_params_))

    candidate_rows = zip(
        results.cv_results_['mean_test_score'],
        results.cv_results_['std_test_score'],
        results.cv_results_['params'],
    )
    for avg_score, score_std, candidate in candidate_rows:
        line = '{} (+/-{}) for {}'.format(round(avg_score, 3), round(score_std * 2, 3), candidate)
        print(line)

# Grid-search forest size and tree depth on the reduced feature set with 5-fold cross-validation
rf = RandomForestClassifier()
parameters = {
    'n_estimators': [5, 50, 100],
    'max_depth': [2, 10, 20, None],
}

cv = GridSearchCV(estimator=rf, param_grid=parameters, cv=5)
train_target = tr_labels.values.ravel()
cv.fit(tr_features_new, train_target)

print_results(cv)

## Top three results
0.925 (+/-0.002) for {'max_depth': 10, 'n_estimators': 50}<br/>
0.925 (+/-0.002) for {'max_depth': 10, 'n_estimators': 100}<br/>
0.924 (+/-0.002) for {'max_depth': 20, 'n_estimators': 50}<br/>

In [None]:
# Refit three candidate configurations on the reduced training features.
# Note: rf1, rf2 and rf3 are rebound here, replacing the forests fitted on the full feature set.
# Candidate 1: 50 trees, depth capped at 10
rf1 = RandomForestClassifier(n_estimators=50, max_depth=10)
rf1.fit(tr_features_new, tr_labels.values.ravel())

# Candidate 2: 100 trees, depth capped at 10
rf2 = RandomForestClassifier(n_estimators=100, max_depth=10)
rf2.fit(tr_features_new, tr_labels.values.ravel())

# Candidate 3: 50 trees, depth capped at 20
rf3 = RandomForestClassifier(n_estimators=50, max_depth=20)
rf3.fit(tr_features_new, tr_labels.values.ravel())

In [None]:
# Compare the three reduced-feature forests on the validation split:
# accuracy (A), precision (P) and recall (R), each rounded to 3 places
summary = 'MAX DEPTH: {} / # OF EST: {} -- A: {} / P: {} / R: {}'
for mdl in (rf1, rf2, rf3):
    y_pred = mdl.predict(val_features_new)
    accuracy, precision, recall = [
        round(metric(val_labels, y_pred), 3)
        for metric in (accuracy_score, precision_score, recall_score)
    ]
    print(summary.format(mdl.max_depth, mdl.n_estimators, accuracy, precision, recall))

## Re-run Final Model

In [None]:
# Score rf2 on the reduced-feature test split: accuracy (A), precision (P), recall (R)
y_pred = rf2.predict(te_features_new)
accuracy, precision, recall = [
    round(metric(te_labels, y_pred), 3)
    for metric in (accuracy_score, precision_score, recall_score)
]
summary = 'MAX DEPTH: {} / # OF EST: {} -- A: {} / P: {} / R: {}'
print(summary.format(rf2.max_depth, rf2.n_estimators, accuracy, precision, recall))