In [1]:
import numpy as np
import pandas as pd
import math
import sys, os
#import seaborn as sns
#from scipy import stats
#from pathlib import Path
#import matplotlib.pyplot as plt
#from sklearn.model_selection import KFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
#from sklearn.metrics import mean_squared_error
#from sklearn.metrics import mean_squared_error
#from sklearn.preprocessing import MinMaxScaler
#from sklearn.datasets import fetch_california_housing
#import math
from tqdm import tqdm
from sklearn.feature_extraction.text import CountVectorizer
import optuna
from optuna.samplers import TPESampler
#import lightgbm as lbgm
#from xgboost import XGBRegressor
#import xgboost as xgb
from catboost import CatBoostRegressor
#from lightgbm.sklearn import lbgmRegressor
#from sklearn.ensemble import RandomForestRegressor
import pickle
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import StratifiedKFold

In [2]:
train_time = 1 * 60 * 15
objective = 'catregressor'

In [3]:
sys.path.append(os.path.realpath('..'))

In [4]:
train_df = pd.read_csv(r"..\data\train.csv", index_col=0)
test_df = pd.read_csv(r"..\data\test.csv", index_col=0)
sample_sub = pd.read_csv(r"..\data\sample_submission.csv", index_col=0)

In [5]:
del train_df['Over18']
del train_df['EmployeeCount']
del train_df['StandardHours']

In [6]:
columns_to_vectorize = ['BusinessTravel', 'Department', 'EducationField', 'Gender', 'JobRole', 'MaritalStatus', 'OverTime']
for vector_target in columns_to_vectorize:
    print(vector_target)
    vectorizer = CountVectorizer()
    vectorizer.fit_transform(train_df[vector_target])
    train_df[f'{vector_target}_v'] = vectorizer.transform(train_df[vector_target]).toarray().argmax(axis=1)[:,None]
    vectorizer.fit_transform(test_df[vector_target])
    test_df[f'{vector_target}_v'] = vectorizer.transform(test_df[vector_target]).toarray().argmax(axis=1)[:,None]

BusinessTravel
Department
EducationField
Gender
JobRole
MaritalStatus
OverTime


In [7]:
features = ['Age', 'DailyRate', 'DistanceFromHome', 'Education', 'EnvironmentSatisfaction','HourlyRate', 'JobInvolvement', 'JobLevel', 'JobSatisfaction', 
       'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked', 'PercentSalaryHike', 'PerformanceRating', 'RelationshipSatisfaction', 'StockOptionLevel', 'TotalWorkingYears',
       'TrainingTimesLastYear', 'WorkLifeBalance', 'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion', 'YearsWithCurrManager',
        'BusinessTravel_v', 'Department_v', 'EducationField_v','Gender_v', 'JobRole_v', 'MaritalStatus_v', 'OverTime_v']
target = ['Attrition']

In [8]:
scaler = MinMaxScaler().fit(train_df[features])
X = scaler.transform(train_df[features])
X_test = scaler.transform(test_df[features])

scaler = MinMaxScaler().fit(train_df[target])
Y = scaler.transform(train_df[target])

In [9]:
RANDOM_STATE = 12 
FOLDS = 5
param_grid_history = {}

def render_model(param_grid):
    reg = CatBoostRegressor(iterations=20000,
                            **param_grid,
                            bootstrap_type='Bernoulli',
                            grow_policy='SymmetricTree',
                            #loss_function='Logloss',
                            eval_metric='AUC',
                            task_type="GPU",
                            random_state=1,)
    return reg
    
def train_model(reg, X_train, y_train, X_valid, y_valid):
    reg.fit(X_train, y_train, eval_set=(X_valid, y_valid), early_stopping_rounds=100, verbose=1000)

    preds = reg.predict(train_df[features])
    fpr, tpr, _ = roc_curve(train_df[target], preds)
    roc_auc = auc(fpr, tpr)
    return reg, roc_auc

def objective_v2(trial):
    scores = []
    param_grid = {
        'depth': trial.suggest_int('depth', 1, 8),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1),
        #'rsm': trial.suggest_float('rsm', 0.001, 0.9),
        'subsample': trial.suggest_float('subsample', 0.1, 1),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 1, 100), 
        'l2_leaf_reg': trial.suggest_int('l2_leaf_reg', 1, 20),
        'random_strength': trial.suggest_float('random_strength', 0.001, 0.9),
    } 
        
    skf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=RANDOM_STATE)
    for fold, (train_idx, valid_idx) in enumerate(skf.split(train_df[features], train_df[target])):
        print(10*"=", f"Fold={fold+1}", 10*"=")
        X_train, X_valid = train_df.iloc[train_idx][features], train_df.iloc[valid_idx][features]
        y_train , y_valid = train_df[target].iloc[train_idx] , train_df[target].iloc[valid_idx] 
        
        reg = render_model(param_grid)
        reg, roc_auc = train_model(reg, X_train, y_train, X_valid, y_valid)

        scores.append(roc_auc)
    mean_scores = np.mean(scores)
    param_grid_history[mean_scores] = param_grid

    return mean_scores

In [10]:
study = optuna.create_study(direction='maximize', sampler=TPESampler(), study_name=objective)
study.optimize(objective_v2, timeout=train_time)

[32m[I 2023-01-17 20:43:36,710][0m A new study created in memory with name: catregressor[0m




Default metric period is 5 because AUC is/are not implemented for GPU
AUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	test: 0.7013514	best: 0.7013514 (0)	total: 10ms	remaining: 3m 20s
bestTest = 0.8365709459
bestIteration = 376
Shrink model to first 377 iterations.


Default metric period is 5 because AUC is/are not implemented for GPU
AUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	test: 0.7797297	best: 0.7797297 (0)	total: 12.2ms	remaining: 4m 4s
bestTest = 0.8321368243
bestIteration = 12
Shrink model to first 13 iterations.


Default metric period is 5 because AUC is/are not implemented for GPU
AUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	test: 0.7773729	best: 0.7773729 (0)	total: 10.6ms	remaining: 3m 31s
bestTest = 0.856440678
bestIteration = 191
Shrink model to first 192 iterations.


Default metric period is 5 because AUC is/are not implemented for GPU
AUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	test: 0.7755932	best: 0.7755932 (0)	total: 11.3ms	remaining: 3m 46s
bestTest = 0.851440678
bestIteration = 143
Shrink model to first 144 iterations.


Default metric period is 5 because AUC is/are not implemented for GPU
AUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	test: 0.6812712	best: 0.6812712 (0)	total: 11.7ms	remaining: 3m 53s


[32m[I 2023-01-17 20:43:53,916][0m Trial 0 finished with value: 0.9290890318212593 and parameters: {'depth': 6, 'learning_rate': 0.0840457046820096, 'subsample': 0.7201089549647443, 'min_data_in_leaf': 70, 'l2_leaf_reg': 20, 'random_strength': 0.19272055576768482}. Best is trial 0 with value: 0.9290890318212593.[0m


bestTest = 0.8070338983
bestIteration = 60
Shrink model to first 61 iterations.


Default metric period is 5 because AUC is/are not implemented for GPU
AUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	test: 0.7112753	best: 0.7112753 (0)	total: 8.97ms	remaining: 2m 59s
bestTest = 0.8418918919
bestIteration = 734
Shrink model to first 735 iterations.


Default metric period is 5 because AUC is/are not implemented for GPU
AUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	test: 0.8130490	best: 0.8130490 (0)	total: 10.1ms	remaining: 3m 22s
bestTest = 0.8477618243
bestIteration = 9
Shrink model to first 10 iterations.


Default metric period is 5 because AUC is/are not implemented for GPU
AUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	test: 0.7753814	best: 0.7753814 (0)	total: 8.69ms	remaining: 2m 53s
bestTest = 0.865
bestIteration = 87
Shrink model to first 88 iterations.


Default metric period is 5 because AUC is/are not implemented for GPU
AUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	test: 0.7779661	best: 0.7779661 (0)	total: 8.98ms	remaining: 2m 59s
bestTest = 0.8511864407
bestIteration = 184
Shrink model to first 185 iterations.


Default metric period is 5 because AUC is/are not implemented for GPU
AUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	test: 0.7050424	best: 0.7050424 (0)	total: 8.86ms	remaining: 2m 57s


[32m[I 2023-01-17 20:44:11,857][0m Trial 1 finished with value: 0.9245920785375761 and parameters: {'depth': 5, 'learning_rate': 0.08319497646311656, 'subsample': 0.7677548109766389, 'min_data_in_leaf': 38, 'l2_leaf_reg': 20, 'random_strength': 0.22281863864145718}. Best is trial 0 with value: 0.9290890318212593.[0m


bestTest = 0.786779661
bestIteration = 105
Shrink model to first 106 iterations.


Default metric period is 5 because AUC is/are not implemented for GPU
AUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	test: 0.7472128	best: 0.7472128 (0)	total: 7.42ms	remaining: 2m 28s
1000:	test: 0.8126689	best: 0.8130912 (993)	total: 7.4s	remaining: 2m 20s
bestTest = 0.8185810811
bestIteration = 1376
Shrink model to first 1377 iterations.


Default metric period is 5 because AUC is/are not implemented for GPU
AUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	test: 0.7530828	best: 0.7530828 (0)	total: 7.56ms	remaining: 2m 31s
bestTest = 0.8415118243
bestIteration = 21
Shrink model to first 22 iterations.


Default metric period is 5 because AUC is/are not implemented for GPU
AUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	test: 0.7758051	best: 0.7758051 (0)	total: 8.3ms	remaining: 2m 46s
bestTest = 0.8713559322
bestIteration = 720
Shrink model to first 721 iterations.


Default metric period is 5 because AUC is/are not implemented for GPU
AUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	test: 0.7698729	best: 0.7698729 (0)	total: 8.56ms	remaining: 2m 51s
1000:	test: 0.8473729	best: 0.8478814 (980)	total: 7.31s	remaining: 2m 18s
bestTest = 0.8506779661
bestIteration = 1184
Shrink model to first 1185 iterations.


Default metric period is 5 because AUC is/are not implemented for GPU
AUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	test: 0.7379237	best: 0.7379237 (0)	total: 8.4ms	remaining: 2m 47s


[32m[I 2023-01-17 20:44:44,092][0m Trial 2 finished with value: 0.8693649289099525 and parameters: {'depth': 3, 'learning_rate': 0.008414189957077232, 'subsample': 0.6640258746092165, 'min_data_in_leaf': 48, 'l2_leaf_reg': 13, 'random_strength': 0.8864996394407495}. Best is trial 0 with value: 0.9290890318212593.[0m


bestTest = 0.7801271186
bestIteration = 52
Shrink model to first 53 iterations.


In [11]:
parm_grid_bytes = pickle.dumps(param_grid_history)
with open(f"rendered_data/{objective}_bytes.hex", "wb") as binary_file:
    binary_file.write(parm_grid_bytes)

In [12]:
def train(param_grid):
    reg = render_model(param_grid)
    skf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=RANDOM_STATE)
    for fold, (train_idx, valid_idx) in enumerate(skf.split(train_df[features], train_df[target])):
        print(10*"=", f"FINAL TRAINING Fold={fold+1}", 10*"=")
        X_train, X_valid = train_df.iloc[train_idx][features], train_df.iloc[valid_idx][features]
        y_train , y_valid = train_df[target].iloc[train_idx] , train_df[target].iloc[valid_idx] 
        reg, roc_auc = train_model(reg, X_train, y_train, X_valid, y_valid)
    return reg

percent = math.ceil(len(param_grid_history.keys()) * .1)
top = sorted(list(param_grid_history.keys()))[-percent:]

train_preds = []
test_preds = []
for key in tqdm(top):
    model = train(param_grid_history[key])
    train_preds.append(model.predict(train_df[features]))
    test_preds.append(model.predict(test_df[features]))

train_final_preds = np.stack(train_preds).mean(0)
test_final_preds = np.stack(test_preds).mean(0)

  0%|                                                                                            | 0/1 [00:00<?, ?it/s]



Default metric period is 5 because AUC is/are not implemented for GPU
AUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	test: 0.7013514	best: 0.7013514 (0)	total: 12ms	remaining: 3m 59s
bestTest = 0.8365709459
bestIteration = 376
Shrink model to first 377 iterations.


Default metric period is 5 because AUC is/are not implemented for GPU
AUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	test: 0.7797297	best: 0.7797297 (0)	total: 9.55ms	remaining: 3m 11s
bestTest = 0.8321368243
bestIteration = 12
Shrink model to first 13 iterations.


Default metric period is 5 because AUC is/are not implemented for GPU
AUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	test: 0.7773729	best: 0.7773729 (0)	total: 11.1ms	remaining: 3m 41s
bestTest = 0.856440678
bestIteration = 191
Shrink model to first 192 iterations.


Default metric period is 5 because AUC is/are not implemented for GPU
AUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	test: 0.7755932	best: 0.7755932 (0)	total: 9.73ms	remaining: 3m 14s
bestTest = 0.851440678
bestIteration = 143
Shrink model to first 144 iterations.


Default metric period is 5 because AUC is/are not implemented for GPU
AUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	test: 0.6812712	best: 0.6812712 (0)	total: 9.96ms	remaining: 3m 19s


100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:16<00:00, 16.68s/it]

bestTest = 0.8070338983
bestIteration = 60
Shrink model to first 61 iterations.





In [13]:
train_submission = pd.DataFrame(data={'id': train_df.index, target[0]: train_final_preds})
train_submission.to_csv(fr'rendered_data/{objective}_train_submission.csv', index=False)

test_submission = pd.DataFrame(data={'id': test_df.index, target[0]: test_final_preds})
test_submission.to_csv(fr'rendered_data/{objective}_test_submission.csv', index=False)