In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from matplotlib import pyplot as plt

import pandas as pd
import os
import sys
import warnings
warnings.filterwarnings('ignore')

# Put the parent of the current working directory on the import path so the
# `src.*` imports below can be resolved (presumably `src` lives there — confirm).
_parent_dir = os.path.abspath(os.pardir)
if _parent_dir not in sys.path:
    sys.path.append(_parent_dir)

from src.data_reader import read_data
from src.features_processor import FeaturesProcessor
from src.ml_experiments import *
from src.config import *

### Read data and prepare features

In [None]:
# Both splits are read with the same metadata file and the same dropped columns.
metadata_fpath = DATA_PATH + METADATA_FNAME

train_df = read_data(
    DATA_PATH + TRAIN_FNAME,
    metadata_fpath,
    drop_cols=IGNORE_FEATURES,
)
test_df = read_data(
    DATA_PATH + TEST_FNAME,
    metadata_fpath,
    drop_cols=IGNORE_FEATURES,
)

# Cell output: note the order is (test shape, train shape).
test_df.shape, train_df.shape

In [None]:
# 'OHE' presumably selects one-hot encoding — confirm in FeaturesProcessor.
fprocessor = FeaturesProcessor('OHE')

encoded_train_df, encoders = fprocessor.transform_features(
    train_df,
    make_calculated=True,
)
# NOTE(review): this second call rebinds `encoders`, so the value returned for
# the train frame is discarded. Verify that transform_features does not refit
# its encoders on the test frame, and that both frames end up with the same columns.
encoded_test_df, encoders = fprocessor.transform_features(
    test_df,
    make_calculated=True,
)

# Hold out 10% of the encoded training rows as a validation split (fixed seed).
encoded_train_df, encoded_val_df = train_test_split(
    encoded_train_df,
    test_size=0.1,
    random_state=32,
)

# Cell output: shapes of the train / validation / test frames.
encoded_train_df.shape, encoded_val_df.shape, encoded_test_df.shape

### Find hyperparameters of the best-performing model (CatBoost) using the Optuna library

In [None]:
# Model inputs: every encoded column except the target.
feat_names = [name for name in encoded_train_df.columns if name != TARGET_NAME]

# Search over 100 trials; returns the best parameter set and its score.
best_params, best_score = run_optuna_study(
    encoded_train_df,
    feat_names,
    n_trials=100,
)

### Train this model on the train set, then evaluate it on the test set

In [None]:
# Model inputs: every encoded column except the target.
feat_names = [col for col in encoded_train_df.columns if col != TARGET_NAME]

# Best hyperparameters found by a longer experiment. This assignment overrides
# whatever `best_params` an earlier search cell left in the kernel.
best_params = {'learning_rate': 0.12287358017398776, 'depth': 4, 'l2_leaf_reg': 0.95198171385157, 'iterations': 2486}

# Given the previous experiment, only the best version of each model is kept.
models = [
    CatBoostClassifier(**best_params, task_type='GPU' if GPU else 'CPU', silent=True, random_state=32),
]

exp_name_base = 'hp_tune'

# Folder to store artifacts. makedirs(exist_ok=True) also creates MODELS_PATH
# when it is missing and is safe to re-run, unlike an isdir() check + mkdir().
artifacts_dir = MODELS_PATH + exp_name_base
os.makedirs(artifacts_dir, exist_ok=True)

# `model` is intentionally left bound after the loop: the following cells use it.
for model in models[-1:]:
    model.fit(encoded_train_df[feat_names], encoded_train_df[TARGET_NAME])

    # Score on the validation split held out from the training frame.
    val_predictions = model.predict(encoded_val_df[feat_names])
    metrics = evaluate_model(val_predictions, encoded_val_df[TARGET_NAME])
    model_name = get_model_string(model)

    log_model(exp_name_base, model_name, MODELS_PATH + MODELS_LOG_FNAME, metrics)

    # Only the first 100 characters of the model name go into the file name.
    save_model_pickle(model, artifacts_dir + '/' + model_name[:100] + '.pkl')
    print(model_name)
    print(metrics)
    print()



In [None]:
# classification_report expects (y_true, y_pred). Passing the predictions first
# swaps per-class precision and recall and derives support from predicted counts.
print(classification_report(encoded_test_df[TARGET_NAME], model.predict(encoded_test_df[feat_names])))

In [None]:
# Evaluate the trained model on the test frame.
# NOTE(review): evaluate_model is called as (predictions, targets) here — verify
# that this matches the parameter order in its definition.
test_predictions = model.predict(encoded_test_df[feat_names])
evaluate_model(test_predictions, encoded_test_df[TARGET_NAME])

### Check feature importances

In [None]:
PLOT_MAX = 20  # number of top-ranked features shown in the chart

raw_importances = model.feature_importances_
# Computed once up front instead of once per feature inside the comprehension.
# Falls back to 1.0 when there are no importances or all are 0, so the
# normalisation below never divides by zero.
max_importance = max(raw_importances, default=0) or 1.0

# (feature name, importance relative to the top feature), highest first.
importances = sorted(
    ((name, imp / max_importance) for name, imp in zip(feat_names, raw_importances)),
    key=lambda pair: -pair[1],
)[:PLOT_MAX]

plt.figure(figsize=(14, 8))
plt.bar([name for name, _ in importances], [score for _, score in importances], color=COLORS[0])
plt.title('Feature Importances')
plt.ylabel('Importance (relative to the top feature)')
plt.xticks(rotation=90)
plt.show()