# Training

In [1]:
import sys

# Add the parent directory to the module search path (presumably so the
# project package imported later in this notebook can be found — confirm the
# repository layout). The membership check keeps the cell idempotent:
# re-running it does not append the same entry again.
if '..' not in sys.path:
    sys.path.append('..')

In [2]:
# Loading useful libraries

import numpy as np
import pandas as pd

import optuna
import os
import pickle

import Demo2

In [3]:
# Task selector for the whole notebook: the value is interpolated into the input
# file names and decides whether a classification threshold is selected.
SUPPORTED_MODEL_TYPES = ("regression", "classification")

model_type = "classification"

# Fail fast on a typo before any data is loaded.
assert model_type in SUPPORTED_MODEL_TYPES

In [4]:
# Loading data
# The feature and target files are both selected by `model_type` and indexed by User_ID.
features_path = f'../tmp/X_train_{model_type}.csv'
target_path = f'../tmp/y_train_{model_type}.csv'

X_train = pd.read_csv(features_path, index_col='User_ID')

# NOTE(review): the target column is hard-coded to 'is_BigSpender' even when
# model_type == "regression" — confirm the regression target file uses this column name.
y_train_frame = pd.read_csv(target_path, index_col='User_ID')
y_train = y_train_frame['is_BigSpender']

In [5]:
# Tuning hyperparameters
# Runs the project's tuning routine on the training data and keeps the returned
# study object; its `best_params` are used afterwards to build the final model.
# NOTE(review): the positional 10 looks like the number of trials (the progress
# bar in the output counts 0/10) — confirm against Demo2.training.tune.
# NOTE(review): no random seed is set in this notebook; confirm whether tune()
# seeds the search internally, otherwise results may differ between runs.
study = Demo2.training.tune(X_train, y_train, 10)

  0%|          | 0/10 [00:00<?, ?it/s]

Best Trial:
  Value:  0.5535465924895688
  Params: 
    n_estimators: 352
    num_leaves: 34
    max_depth: 2


In [6]:
# Build the estimator from the hyperparameters found during tuning, then fit it
# on the full training set.
model = Demo2.training.create_model(study.best_params)
model.fit(X_train, y_train)

# Score on the same data the model was fitted on (no train-val splitting),
# so this is an in-sample score.
train_eval_kwargs = {}
if model_type != 'regression':
    # Classification only: choose a decision threshold and evaluate with it.
    best_threshold = Demo2.training.select_threshold(model, X_train, y_train)
    train_eval_kwargs['classification_threshold'] = best_threshold

score_train = Demo2.training.evaluate(model, X_train, y_train, **train_eval_kwargs)
score_train

0.5698518733662503

In [7]:
# Cross-validated evaluation on the training set; the mean of the returned
# scores is displayed.
cv_eval_kwargs = {}
if model_type != 'regression':
    # Classification only: the threshold is selected on the full training set
    # before the cross-validated evaluation.
    # NOTE(review): this repeats the selection done in the previous cell with the
    # same arguments — confirm whether select_threshold is deterministic.
    best_threshold = Demo2.training.select_threshold(model, X_train, y_train)
    cv_eval_kwargs['classification_threshold'] = best_threshold

scores = Demo2.training.evaluate(model, X_train, y_train, **cv_eval_kwargs, cross_val=True)
scores.mean()



0.5541327124563445

# Saving model

In [8]:
# Make sure the output directory exists before writing the artifacts.
# exist_ok=True makes this a no-op when the directory is already there, while
# still raising if "../tmp" exists but is not a directory.
os.makedirs("../tmp", exist_ok=True)

In [9]:

# Persist the fitted model.
with open('../tmp/model.pkl', 'wb') as f:
    pickle.dump(model, f)

# `best_threshold` is only assigned in the classification branches of the
# evaluation cells; in regression mode referencing it would raise NameError on a
# fresh kernel. Store None in that case so the file is always rewritten and
# never keeps a threshold left over from an earlier classification run.
threshold_to_save = best_threshold if model_type != 'regression' else None
with open('../tmp/best_threshold.pkl', 'wb') as f:
    pickle.dump(threshold_to_save, f)