In [1]:
from autosklearn.classification import AutoSklearnClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd

# Preprocessing
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler, Normalizer
from sklearn.impute import SimpleImputer
# Shared preprocessing objects, created once at module level.
# transform() reads both names as globals and calls fit_transform on them.
imputer = SimpleImputer()  # built with the library's default arguments
MMS = MinMaxScaler()  # built with the library's default arguments

In [2]:
# Read the game/stat table; the path is relative to the working directory.
DATA_CSV = 'Current Stats and Game Features.csv'
data = pd.read_csv(DATA_CSV)

In [3]:
# Preview the first five rows of the loaded table.
data.head(n=5)

Unnamed: 0,Date,Home Odds,Vis Odds,Home,Visitor,Home PTS,Vis PTS,Home Points Dif,Home Win,H #Bat,...,V 1Run,V vRHP,V vLHP,V ≥.500,V <.500,HomeLastDif,VisitorLastDif,HomeWinStreak,VisitorWinStreak,HomeTeamWonLast
0,2006-06-18,2.0,1.83,Atlanta Braves,Boston Red Sox,7.0,10.0,-3.0,False,45,...,29-20,61-45,25-31,37-44,49-32,0,0,0,0,0
1,2006-06-21,1.63,2.29,Texas Rangers,San Diego Padres,2.0,3.0,-1.0,False,51,...,30-22,71-57,17-17,30-19,58-55,0,0,0,0,0
2,2006-06-21,1.67,2.25,Houston Astros,Minnesota Twins,5.0,3.0,2.0,True,39,...,20-11,62-40,34-26,41-38,55-28,0,0,0,0,0
3,2006-06-21,2.1,1.77,Colorado Rockies,Oakland Athletics,2.0,3.0,-1.0,False,50,...,32-22,70-51,23-18,42-37,51-32,0,0,0,0,0
4,2006-06-21,1.43,2.9,New York Mets,Cincinnati Reds,5.0,6.0,-1.0,False,49,...,27-20,56-58,24-24,27-34,53-48,0,0,0,0,0


In [4]:
# Split columns with hyphens
# Create function to use on both current and prev stats
def transform(x, y):
    """Convert raw stat columns into an imputed, min-max scaled numeric array.

    Every column in which at least one value contains ``<digit>-<digit>``
    (for example ``'29-20'``) is split on ``'-'`` into two new columns named
    ``<col>1`` and ``<col>2``, and the original column is removed. The frame
    is then cast to float and passed through the module-level ``imputer`` and
    ``MMS`` objects, in that order.

    Parameters
    ----------
    x : pandas.DataFrame
        Feature columns. The caller's frame is not modified, because the
        first step (``astype(str)``) produces a new frame.
    y : array-like
        Forwarded unchanged to ``imputer.fit_transform``.

    Returns
    -------
    Whatever ``MMS.fit_transform`` returns for the imputed data.

    Notes
    -----
    Both preprocessing steps call ``fit_transform``, so ``imputer`` and
    ``MMS`` are re-fitted on every call using the rows passed in.
    NOTE(review): fitting them on rows that later become test data leaks
    test-set statistics into training -- confirm this is intended.
    NOTE(review): a value that splits into more than two parts will not fit
    the two-column assignment below -- presumably the data never contains
    one; verify.
    """
    # Work on string copies so the .str accessor is available on every column.
    x = x.astype(str)

    # Raw string: '\d' in a normal literal is an invalid escape sequence and
    # triggers a DeprecationWarning / SyntaxWarning on current Pythons.
    hyphen_pattern = r'\d-\d'
    cols_to_delim = [
        col for col in x.columns
        if x[col].str.contains(pat=hyphen_pattern).any()
    ]

    for col in cols_to_delim:
        # The two halves are appended as new trailing columns.
        x[[col + '1', col + '2']] = x[col].str.split('-', expand=True)
        del x[col]

    # Missing pieces (the string 'nan', or None from a short split) become NaN.
    x = x.astype(float)

    # Scale and Normalise
    x = imputer.fit_transform(x, y)
    x = MMS.fit_transform(x)
    return x

In [14]:
# Split and transform 
# Fixed seed so both splits below (and every score derived from them) can be
# reproduced after a kernel restart.
RANDOM_STATE = 42

# Features are every column from 'H  #Bat' to the end of the table.
# NOTE(review): the label is written with two spaces between 'H' and '#Bat';
# confirm it matches the CSV header exactly.
X = data.loc[:,'H  #Bat':]
y = data['Home Win']
# NOTE(review): transform() fits the imputer and scaler on all rows before the
# split, so test/validation rows influence the preprocessing -- confirm this
# is acceptable.
X = transform(X, y)

# Train, test, valid: 70% train, then the remaining 30% is halved into
# test and validation.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y, test_size=0.3, random_state=RANDOM_STATE
)
X_test, X_valid, y_test, y_valid = train_test_split(
    X_holdout, y_holdout, test_size=0.5, random_state=RANDOM_STATE
)

In [19]:
# Initialize model

# Time budgets handed to auto-sklearn, in seconds.
TOTAL_SEARCH_SECONDS = 5 * 60 * 60  # whole search: five hours
SINGLE_RUN_SECONDS = 30  # limit for one candidate run

model = AutoSklearnClassifier(
    time_left_for_this_task=TOTAL_SEARCH_SECONDS,
    per_run_time_limit=SINGLE_RUN_SECONDS,
    n_jobs=-1,
)

In [20]:
%%time
# Fit the AutoSklearnClassifier on the training split; %%time reports the
# CPU and wall-clock time the cell took.
model.fit(X_train, y_train)

Perhaps you already have a cluster running?
Hosting the HTTP server on port 46703 instead
  expected, actual


CPU times: user 4h 34min 1s, sys: 35min 6s, total: 5h 9min 7s
Wall time: 5h 8s


AutoSklearnClassifier(n_jobs=-1, per_run_time_limit=30,
                      time_left_for_this_task=18000)

In [21]:
# Print auto-sklearn's text summary of the finished search.
search_summary = model.sprint_statistics()
print(search_summary)

auto-sklearn results:
  Dataset name: cfaf5266-d34d-11eb-96b0-bfd8c58d9f1b
  Metric: accuracy
  Best validation score: 0.549346
  Number of target algorithm runs: 2798
  Number of successful target algorithm runs: 1648
  Number of crashed target algorithm runs: 170
  Number of target algorithms that exceeded the time limit: 726
  Number of target algorithms that exceeded the memory limit: 254



In [22]:
# Print auto-sklearn's description of the models it selected.
selected_models = model.show_models()
print(selected_models)

[(0.200000, SimpleClassificationPipeline({'balancing:strategy': 'none', 'classifier:__choice__': 'qda', 'data_preprocessing:categorical_transformer:categorical_encoding:__choice__': 'no_encoding', 'data_preprocessing:categorical_transformer:category_coalescence:__choice__': 'no_coalescense', 'data_preprocessing:numerical_transformer:imputation:strategy': 'most_frequent', 'data_preprocessing:numerical_transformer:rescaling:__choice__': 'none', 'feature_preprocessor:__choice__': 'no_preprocessing', 'classifier:qda:reg_param': 0.06373385566204748},
dataset_properties={
  'task': 1,
  'sparse': False,
  'multilabel': False,
  'multiclass': False,
  'target_type': 'classification',
  'signed': False})),
(0.160000, SimpleClassificationPipeline({'balancing:strategy': 'weighting', 'classifier:__choice__': 'lda', 'data_preprocessing:categorical_transformer:categorical_encoding:__choice__': 'no_encoding', 'data_preprocessing:categorical_transformer:category_coalescence:__choice__': 'minority_coa

In [23]:
%%time
y_pred = model.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print(f'Accuracy: {acc}')

Accuracy: 0.5441054665930425
CPU times: user 5.83 s, sys: 14.7 s, total: 20.5 s
Wall time: 2.78 s


In [14]:
# With odds and data
# Columns left out of the "with odds" table.
COLUMNS_TO_REMOVE = ['Home', 'Visitor', 'Home PTS', 'Vis PTS', 'Home Points Dif']

# drop() returns a new frame, so `data` itself is left untouched.
data_with_odds = data.drop(columns=COLUMNS_TO_REMOVE)
data_with_odds

Unnamed: 0,Date,Home Odds,Vis Odds,Home Win,H #Bat,H BatAge,H R/G,H G_x,H PA,H AB,...,V 1Run,V vRHP,V vLHP,V ≥.500,V <.500,HomeLastDif,VisitorLastDif,HomeWinStreak,VisitorWinStreak,HomeTeamWonLast
0,2006-06-18,2.00,1.83,False,45,27.4,5.24,162,6284,5583,...,29-20,61-45,25-31,37-44,49-32,0,0,0,0,0
1,2006-06-21,1.63,2.29,False,51,28.4,5.15,162,6273,5659,...,30-22,71-57,17-17,30-19,58-55,0,0,0,0,0
2,2006-06-21,1.67,2.25,True,39,30.5,4.54,162,6326,5521,...,20-11,62-40,34-26,41-38,55-28,0,0,0,0,0
3,2006-06-21,2.10,1.77,False,50,27.8,5.02,162,6348,5562,...,32-22,70-51,23-18,42-37,51-32,0,0,0,0,0
4,2006-06-21,1.43,2.90,False,49,30.1,5.15,162,6291,5558,...,27-20,56-58,24-24,27-34,53-48,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96583,2021-04-12,1.75,2.14,True,31,28.7,4.25,20,759,669,...,3-1,4-10,6-0,6-5,4-5,2,-6,0,0,0
96584,2021-04-12,1.77,2.11,True,29,28.0,5.00,20,766,677,...,0-4,4-5,4-6,3-4,5-7,-1,3,2,5,1
96585,2021-04-12,2.85,1.45,False,32,28.1,3.81,21,775,688,...,2-3,9-9,3-2,4-8,8-3,6,2,3,0,1
96586,2021-04-12,2.66,1.50,False,33,26.3,4.10,20,737,659,...,1-3,7-7,2-4,1-5,8-6,14,4,3,0,0
