In [40]:
# Pandas
import pandas as pd

# SQL Alchemy
from sqlalchemy import create_engine

# PyMySQL 
import pymysql
pymysql.install_as_MySQLdb()

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os
import config


In [43]:
# Create Engine and Pass in MySQL Connection
# PyMySQL
engine = create_engine(f'mysql+pymysql://{config.DATABASE_USERNAME}:{config.DATABASE_PASSWORD}@localhost/{config.DATABASE_NAME}')
conn = engine.connect()


In [44]:
# Query all records in the the preview2 table
preview_data = pd.read_sql("SELECT * FROM preview2", conn)


In [45]:
# Query all records in the the result table
result_data = pd.read_sql("SELECT * FROM result", conn)


In [46]:
# Rename ID columns
preview_data.rename({'preview_id': 'id'}, axis=1, inplace=True)
result_data.rename({'result_id': 'id'}, axis=1, inplace=True)


In [47]:
preview_data.head()

Unnamed: 0,id,game_no,away_pitcher_rh,away_pitcher_record,away_pitcher_era,away_pitcher_ip,home_pitcher_rh,home_pitcher_record,home_pitcher_era,home_pitcher_ip,...,away_venue_record,away_pitcher_type_record,home_record,home_last_ten,home_venue_record,home_pitcher_type_record,away_ops_vs_pitcher_type,home_ops_vs_pitcher_type,matchup_count,home_matchup_record
0,ANA201804020,4,1,0.0,0.0,121.2,1,0.0,0.0,147.1,...,0.33,0.0,0.75,0.75,0.0,0.67,0.505,0.505,0,0.5
1,ANA201804030,5,1,0.0,0.0,141.0,1,0.0,7.2,27.2,...,0.5,0.33,0.6,0.6,0.0,0.5,0.629,0.629,2,0.0
2,ANA201804060,9,1,0.0,9.0,91.1,1,0.0,0.0,121.0,...,0.0,0.6,0.71,0.71,0.67,0.67,0.721,0.721,5,0.0
3,ANA201804070,10,1,0.0,1.8,65.1,1,0.0,7.71,147.1,...,0.0,0.5,0.75,0.75,0.75,0.71,0.801,0.801,6,0.0
4,ANA201804080,11,1,0.0,8.1,105.1,1,1.0,4.5,0.0,...,0.5,0.57,0.67,0.67,0.6,0.62,0.808,0.808,7,0.0


In [48]:
result_data.head()

Unnamed: 0,id,date,gamenum,away_name,away_score,home_name,home_score,home_win
0,ANA201804020,20180402,0,ANA,0,CLE,6,0
1,ANA201804030,20180403,0,ANA,13,CLE,2,0
2,ANA201804040,20180404,0,ANA,3,CLE,2,0
3,ANA201804060,20180406,0,ANA,13,OAK,9,1
4,ANA201804070,20180407,0,ANA,3,OAK,7,0


In [49]:
# Merge the columns
combined_data = pd.merge(preview_data, result_data, on="id", how="inner")


In [50]:
combined_data.head()


Unnamed: 0,id,game_no,away_pitcher_rh,away_pitcher_record,away_pitcher_era,away_pitcher_ip,home_pitcher_rh,home_pitcher_record,home_pitcher_era,home_pitcher_ip,...,home_ops_vs_pitcher_type,matchup_count,home_matchup_record,date,gamenum,away_name,away_score,home_name,home_score,home_win
0,ANA201804020,4,1,0.0,0.0,121.2,1,0.0,0.0,147.1,...,0.505,0,0.5,20180402,0,ANA,0,CLE,6,0
1,ANA201804030,5,1,0.0,0.0,141.0,1,0.0,7.2,27.2,...,0.629,2,0.0,20180403,0,ANA,13,CLE,2,0
2,ANA201804060,9,1,0.0,9.0,91.1,1,0.0,0.0,121.0,...,0.721,5,0.0,20180406,0,ANA,13,OAK,9,1
3,ANA201804070,10,1,0.0,1.8,65.1,1,0.0,7.71,147.1,...,0.801,6,0.0,20180407,0,ANA,3,OAK,7,0
4,ANA201804080,11,1,0.0,8.1,105.1,1,1.0,4.5,0.0,...,0.808,7,0.0,20180408,0,ANA,6,OAK,1,1


In [51]:
# Clean up the data, drop non-numeric columns
combined_data = combined_data.drop(columns=["id", "away_name", "home_name"])
# Drop the null columns where all values are null
combined_data = combined_data.dropna(axis='columns', how='all')

# Drop the null rows
combined_data = combined_data.dropna()
combined_data.head()


Unnamed: 0,game_no,away_pitcher_rh,away_pitcher_record,away_pitcher_era,away_pitcher_ip,home_pitcher_rh,home_pitcher_record,home_pitcher_era,home_pitcher_ip,away_record,...,home_pitcher_type_record,away_ops_vs_pitcher_type,home_ops_vs_pitcher_type,matchup_count,home_matchup_record,date,gamenum,away_score,home_score,home_win
0,4,1,0.0,0.0,121.2,1,0.0,0.0,147.1,0.33,...,0.67,0.505,0.505,0,0.5,20180402,0,0,6,0
1,5,1,0.0,0.0,141.0,1,0.0,7.2,27.2,0.5,...,0.5,0.629,0.629,2,0.0,20180403,0,13,2,0
2,9,1,0.0,9.0,91.1,1,0.0,0.0,121.0,0.38,...,0.67,0.721,0.721,5,0.0,20180406,0,13,9,1
3,10,1,0.0,1.8,65.1,1,0.0,7.71,147.1,0.33,...,0.71,0.801,0.801,6,0.0,20180407,0,3,7,0
4,11,1,0.0,8.1,105.1,1,1.0,4.5,0.0,0.4,...,0.62,0.808,0.808,7,0.0,20180408,0,6,1,1


In [52]:
# Reformat the data for the model
# See example Stu_Voice_Recognition - Class 2
X = combined_data.drop("home_win", axis=1)
y = combined_data["home_win"].values.reshape(-1, 1)
print(X.shape, y.shape)


(84, 25) (84, 1)


In [53]:
# Create a train-test-split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)


In [54]:
# Scale the data using MinMaxScaler
from sklearn.preprocessing import MinMaxScaler
X_scaler = MinMaxScaler().fit(X_train)


  return self.partial_fit(X, y)


In [55]:
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)


In [56]:
# Create a Support Vector Machine
from sklearn.svm import SVC 
model = SVC(kernel='linear')
model.fit(X_train_scaled, y_train)


  y = column_or_1d(y, warn=True)


SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='linear', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [57]:
model.fit(X_test_scaled, y_test)

  y = column_or_1d(y, warn=True)


SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='linear', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [58]:
print(f"Training Data Score: {model.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {model.score(X_test_scaled, y_test)}")

Training Data Score: 0.5079365079365079
Testing Data Score: 0.8095238095238095


In [59]:
# Hyperparameter Tuning
# Create the GridSearchCV model
from sklearn.model_selection import GridSearchCV
param_grid = {'C': [1, 10],
              'gamma': [0.0001, 0.001]}
grid = GridSearchCV(model, param_grid, verbose=3)


In [60]:
# Train the model with GridSearch
grid.fit(X_train_scaled, y_train)


Fitting 3 folds for each of 4 candidates, totalling 12 fits
[CV] C=1, gamma=0.0001 ...............................................
[CV] ...... C=1, gamma=0.0001, score=0.5238095238095238, total=   0.0s
[CV] C=1, gamma=0.0001 ...............................................
[CV] ...... C=1, gamma=0.0001, score=0.6666666666666666, total=   0.0s
[CV] C=1, gamma=0.0001 ...............................................
[CV] ..... C=1, gamma=0.0001, score=0.38095238095238093, total=   0.0s
[CV] C=1, gamma=0.001 ................................................
[CV] ....... C=1, gamma=0.001, score=0.5238095238095238, total=   0.0s
[CV] C=1, gamma=0.001 ................................................
[CV] ....... C=1, gamma=0.001, score=0.6666666666666666, total=   0.0s
[CV] C=1, gamma=0.001 ................................................
[CV] ...... C=1, gamma=0.001, score=0.38095238095238093, total=   0.0s
[CV] C=10, gamma=0.0001 ..............................................
[CV] ..... C=10, 

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
  y = column_or_1d(y, warn=True)
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
  y = column_or_1d(y, warn=True)
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
[Parallel(n_jobs=1)]: Done  12 out of  12 | elapsed:    0.0s finished
  y = column_or_1d(y, warn=True)


GridSearchCV(cv='warn', error_score='raise-deprecating',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='linear', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'C': [1, 10], 'gamma': [0.0001, 0.001]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=3)

In [61]:
print(grid.best_params_)
print(grid.best_score_)


{'C': 10, 'gamma': 0.0001}
0.5714285714285714
