# Using pickle to serialize the sklearn model

In [1]:
import pickle
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Load the moneyline-augmented games CSV into the module-level frame that the
# later cells filter by season, then print its column/dtype summary.
nba_games = pd.read_csv(
  'data/04_improved_model_with_moneylines.csv'
)
nba_games.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6133 entries, 0 to 6132
Data columns (total 27 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   GAME_DATE_EST         6133 non-null   object 
 1   GAME_ID               6133 non-null   float64
 2   HOME_TEAM_ID          6133 non-null   float64
 3   AWAY_TEAM_ID          6133 non-null   float64
 4   SEASON                6133 non-null   float64
 5   HOME_TEAM_POINTS      6133 non-null   float64
 6   AWAY_TEAM_POINTS      6133 non-null   float64
 7   HOME_WIN_PCT          6133 non-null   float64
 8   HOME_HOME_WIN_PCT     6133 non-null   float64
 9   AWAY_WIN_PCT          6133 non-null   float64
 10  AWAY_AWAY_WIN_PCT     6133 non-null   float64
 11  HOME_TEAM_WINS        6133 non-null   bool   
 12  HOME_TEAM_B2B         6133 non-null   bool   
 13  AWAY_TEAM_B2B         6133 non-null   bool   
 14  GAME_DATETIME         6133 non-null   float64
 15  HOME_LAST_10_WIN_PCT 

In [3]:
# Columns passed to the classifier as predictors, in the order the model is
# fitted on. Keep this order stable: a pickled model must be scored with the
# same column order it was trained with.
feat_cols = [
  # when and who
  "GAME_DATETIME", "HOME_TEAM_ID", "AWAY_TEAM_ID",
  # win percentages (overall and venue-specific)
  "HOME_WIN_PCT", "HOME_HOME_WIN_PCT",
  "AWAY_WIN_PCT", "AWAY_AWAY_WIN_PCT",
  # back-to-back flags
  "HOME_TEAM_B2B", "AWAY_TEAM_B2B",
  # last-10 win percentages
  "HOME_LAST_10_WIN_PCT", "AWAY_LAST_10_WIN_PCT",
]

# Label column the classifier predicts.
target = "HOME_TEAM_WINS"

In [4]:
def calculate_season_model(s):
  """Fit a random forest on the first third of one season and print its accuracy.

  Rows of the module-level ``nba_games`` frame whose ``SEASON`` equals ``s``
  are ordered by ``GAME_DATETIME`` and cut into three consecutive chunks.
  The first chunk is the training set; the other two together are the test
  set. Features and label come from the module-level ``feat_cols`` and
  ``target``.

  Args:
    s: Value compared against the ``SEASON`` column.

  Returns:
    The fitted ``RandomForestClassifier``.

  Raises:
    ValueError: If ``nba_games`` contains no rows for season ``s``.
  """
  season = nba_games.loc[nba_games['SEASON'] == s].sort_values(by="GAME_DATETIME")
  if season.empty:
    raise ValueError(f"No games found for season {s!r}")

  # Split row positions instead of the DataFrame itself. Passing a DataFrame
  # to np.array_split appears to be what triggers the pandas FutureWarning
  # seen in this cell's output; splitting an index array yields exactly the
  # same chunk boundaries without it.
  train_pos, *test_pos = np.array_split(np.arange(len(season)), 3)
  train = season.iloc[train_pos]
  test = season.iloc[np.concatenate(test_pos)]

  X_train, y_train = train[feat_cols], train[target]
  X_test, y_test = test[feat_cols], test[target]

  clf = RandomForestClassifier(random_state=42, n_jobs=2, n_estimators=1000, max_depth=7, bootstrap=True)
  clf.fit(X_train, y_train)
  print('Train accuracy:',clf.score(X_train, y_train))
  print('Test accuracy:', clf.score(X_test, y_test))
  return clf

In [7]:
# Fit the model for the 2024 season; the returned classifier is the object
# that gets pickled in the following cells.
model_2024 = calculate_season_model(2024)

  return bound(*args, **kwds)


Train accuracy: 1.0
Test accuracy: 0.7475247524752475


In [8]:
# Write through a context manager so the file handle is flushed and closed
# before the next cell reads the pickle back.
with open('data/2024_random_forest_model.pkl', 'wb') as model_file:
  pickle.dump(model_2024, model_file)

In [9]:
# NOTE: pickle.load can execute arbitrary code while unpickling; only load
# files produced by a trusted source (here, the file written by this notebook).
# The context manager closes the file handle once the model is loaded.
with open('data/2024_random_forest_model.pkl', 'rb') as model_file:
  loaded_model_2024 = pickle.load(model_file)

In [10]:
def test_loaded_season_model(model, season=2024):
  """Print train/test accuracy of an already-fitted model on one season.

  Rows of the module-level ``nba_games`` frame whose ``SEASON`` equals
  ``season`` are ordered by ``GAME_DATETIME`` and cut into three consecutive
  chunks: the first is scored as the training set, the other two together as
  the test set. Features and label come from the module-level ``feat_cols``
  and ``target``.

  Args:
    model: Fitted estimator exposing ``score(X, y)``.
    season: Value compared against the ``SEASON`` column. Defaults to 2024.

  Raises:
    ValueError: If ``nba_games`` contains no rows for ``season``.
  """
  games = nba_games.loc[nba_games['SEASON'] == season].sort_values(by="GAME_DATETIME")
  if games.empty:
    raise ValueError(f"No games found for season {season!r}")

  # Split row positions instead of the DataFrame itself. Passing a DataFrame
  # to np.array_split appears to be what triggers the pandas FutureWarning
  # seen in this cell's output; splitting an index array yields exactly the
  # same chunk boundaries without it.
  train_pos, *test_pos = np.array_split(np.arange(len(games)), 3)
  train = games.iloc[train_pos]
  test = games.iloc[np.concatenate(test_pos)]

  X_train, y_train = train[feat_cols], train[target]
  X_test, y_test = test[feat_cols], test[target]
  print('Train accuracy:',model.score(X_train, y_train))
  print('Test accuracy:', model.score(X_test, y_test))

test_loaded_season_model(loaded_model_2024)

  return bound(*args, **kwds)


Train accuracy: 1.0
Test accuracy: 0.7475247524752475


## Results
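The model round-trips cleanly through the `pickle` module: after being dumped to disk and loaded back, the classifier reproduces exactly the same train accuracy (1.0) and test accuracy (≈0.7475) as the original in-memory model.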