In [1]:
import numpy as np
import pandas as pd
import math
import sys, os
#import seaborn as sns
#from scipy import stats
#from pathlib import Path
#import matplotlib.pyplot as plt
#from sklearn.model_selection import KFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
#from sklearn.metrics import mean_squared_error
#from sklearn.metrics import mean_squared_error
#from sklearn.preprocessing import MinMaxScaler
#from sklearn.datasets import fetch_california_housing
#import math
from tqdm import tqdm
from sklearn.feature_extraction.text import CountVectorizer
import optuna
from optuna.samplers import TPESampler
#import lightgbm as lgbm
#from xgboost import XGBRegressor
import xgboost as xgb
#from catboost import CatBoostRegressor
#from lightgbm.sklearn import LGBMRegressor
#from sklearn.ensemble import RandomForestRegressor
import pickle
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import StratifiedKFold
pd.set_option('display.max_columns', 500)

In [2]:
train_time = 1 * 60 * 10
objective = 'xgbregressor'

In [3]:
sys.path.append(os.path.realpath('..'))

In [4]:
train_df = pd.read_csv(r"..\data\train.csv", index_col=0)
test_df = pd.read_csv(r"..\data\test.csv", index_col=0)
sample_sub = pd.read_csv(r"..\data\sample_submission.csv", index_col=0)

In [5]:
total = train_df.isnull().sum().sort_values(ascending=False)
percent = (train_df.isnull().sum()/train_df.isnull().count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data.head(10)

Unnamed: 0,Total,Percent
Age,0,0.0
StockOptionLevel,0,0.0
Over18,0,0.0
OverTime,0,0.0
PercentSalaryHike,0,0.0
PerformanceRating,0,0.0
RelationshipSatisfaction,0,0.0
StandardHours,0,0.0
TotalWorkingYears,0,0.0
BusinessTravel,0,0.0


In [6]:
del train_df['Over18']
del train_df['EmployeeCount']
del train_df['StandardHours']

In [7]:
columns_to_vectorize = ['BusinessTravel', 'Department', 'EducationField', 'Gender', 'JobRole', 'MaritalStatus', 'OverTime']
for vector_target in columns_to_vectorize:
    print(vector_target)
    vectorizer = CountVectorizer()
    vectorizer.fit_transform(train_df[vector_target])
    train_df[f'{vector_target}_v'] = vectorizer.transform(train_df[vector_target]).toarray().argmax(axis=1)[:,None]
    vectorizer.fit_transform(test_df[vector_target])
    test_df[f'{vector_target}_v'] = vectorizer.transform(test_df[vector_target]).toarray().argmax(axis=1)[:,None]

BusinessTravel
Department
EducationField
Gender
JobRole
MaritalStatus
OverTime


In [8]:
train_df.columns

Index(['Age', 'BusinessTravel', 'DailyRate', 'Department', 'DistanceFromHome',
       'Education', 'EducationField', 'EnvironmentSatisfaction', 'Gender',
       'HourlyRate', 'JobInvolvement', 'JobLevel', 'JobRole',
       'JobSatisfaction', 'MaritalStatus', 'MonthlyIncome', 'MonthlyRate',
       'NumCompaniesWorked', 'OverTime', 'PercentSalaryHike',
       'PerformanceRating', 'RelationshipSatisfaction', 'StockOptionLevel',
       'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance',
       'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion',
       'YearsWithCurrManager', 'Attrition', 'BusinessTravel_v', 'Department_v',
       'EducationField_v', 'Gender_v', 'JobRole_v', 'MaritalStatus_v',
       'OverTime_v'],
      dtype='object')

In [9]:
features = ['Age', 'DailyRate', 'DistanceFromHome', 'Education', 'EnvironmentSatisfaction','HourlyRate', 'JobInvolvement', 'JobLevel', 'JobSatisfaction', 
       'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked', 'PercentSalaryHike', 'PerformanceRating', 'RelationshipSatisfaction', 'StockOptionLevel', 'TotalWorkingYears',
       'TrainingTimesLastYear', 'WorkLifeBalance', 'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion', 'YearsWithCurrManager',
        'BusinessTravel_v', 'Department_v', 'EducationField_v','Gender_v', 'JobRole_v', 'MaritalStatus_v', 'OverTime_v']
target = ['Attrition']

In [10]:
scaler = MinMaxScaler().fit(train_df[features])
X = scaler.transform(train_df[features])
X_test = scaler.transform(test_df[features])

scaler = MinMaxScaler().fit(train_df[target])
Y = scaler.transform(train_df[target])

In [11]:
y_min = Y.min()
y_max = Y.max()

print(y_min, y_max)

def my_rmse(y_true, y_hat):
    y_true[y_true < y_min] = y_min
    y_true[y_true > y_max] = y_max
    
    y_hat[y_hat < y_min] = y_min
    y_hat[y_hat > y_max] = y_max
    
    y_true_nan = np.isnan(y_true)
    y_hat_nan = np.isnan(y_hat)
    
    if y_true_nan.sum() > 0:
        print(y_true_nan.sum())
        np.where(y_true_nan, np.ma.array(y_true, mask=np.isnan(y_true)).mean(axis=0), y_true)
    if y_hat_nan.sum() > 0:
        print(y_hat_nan.sum())
        np.where(y_hat_nan, np.ma.array(y_hat, mask=np.isnan(y_hat)).mean(axis=0), y_hat)
    
    return mean_squared_error(y_true, y_hat, squared=False)

0.0 1.0


In [12]:
RANDOM_STATE = 12 
FOLDS = 5
param_grid_history = {}

def render_model(param_grid):
    reg = xgb.XGBModel(
        # These parameters should help with trial speed.
        objective='binary:logistic',
        tree_method='gpu_hist',
        booster='gbtree',
        predictor='gpu_predictor',
        n_jobs=4,
        eval_metric='auc',
        early_stopping_rounds=100,
        **param_grid
    )
    return reg
    
def train_model(reg, X_train, y_train, X_valid, y_valid):
    reg.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], verbose=False)

    preds = reg.predict(train_df[features])
    fpr, tpr, _ = roc_curve(train_df[target], preds)
    roc_auc = auc(fpr, tpr)
    return reg, roc_auc

def objective_v2(trial):
    scores = []
    param_grid = {
        'max_depth': trial.suggest_int('max_depth', 4, 20),
        'n_estimators': trial.suggest_int('n_estimators', 2, 1000, 1), 
        #'eta': trial.suggest_float('eta', 0.07, 0.113), 
        'subsample': trial.suggest_float('subsample', 0.1, 1),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 1),
        'reg_lambda': trial.suggest_float('reg_lambda', 1, 40), 
    } 
        
    skf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=RANDOM_STATE)
    for fold, (train_idx, valid_idx) in enumerate(skf.split(train_df[features], train_df[target])):
        print(10*"=", f"Fold={fold+1}", 10*"=")
        X_train, X_valid = train_df.iloc[train_idx][features], train_df.iloc[valid_idx][features]
        y_train , y_valid = train_df[target].iloc[train_idx] , train_df[target].iloc[valid_idx] 
        
        reg = render_model(param_grid)
        reg, roc_auc = train_model(reg, X_train, y_train, X_valid, y_valid)

        scores.append(roc_auc)
    mean_scores = np.mean(scores)
    param_grid_history[mean_scores] = param_grid

    return mean_scores

In [13]:
study = optuna.create_study(direction='maximize', sampler=TPESampler(), study_name=objective)
study.optimize(objective_v2, timeout=train_time)

[32m[I 2023-01-17 20:28:37,646][0m A new study created in memory with name: xgbregressor[0m




[32m[I 2023-01-17 20:28:41,509][0m Trial 0 finished with value: 0.9229729180771834 and parameters: {'max_depth': 14, 'n_estimators': 286, 'subsample': 0.28447220586720645, 'colsample_bytree': 0.8226104074433699, 'reg_lambda': 35.17105792522576}. Best is trial 0 with value: 0.9229729180771834.[0m




[32m[I 2023-01-17 20:28:46,171][0m Trial 1 finished with value: 0.9705741367637103 and parameters: {'max_depth': 14, 'n_estimators': 699, 'subsample': 0.7847108617446061, 'colsample_bytree': 0.9434876689715768, 'reg_lambda': 20.09949327872667}. Best is trial 1 with value: 0.9705741367637103.[0m




[32m[I 2023-01-17 20:28:49,915][0m Trial 2 finished with value: 0.9644888287068382 and parameters: {'max_depth': 14, 'n_estimators': 187, 'subsample': 0.7951730035701288, 'colsample_bytree': 0.3315576524314612, 'reg_lambda': 11.119038766690746}. Best is trial 1 with value: 0.9705741367637103.[0m




[32m[I 2023-01-17 20:28:55,916][0m Trial 3 finished with value: 0.936237643872715 and parameters: {'max_depth': 5, 'n_estimators': 709, 'subsample': 0.26017063432619764, 'colsample_bytree': 0.24563662299061176, 'reg_lambda': 32.775752847125304}. Best is trial 1 with value: 0.9705741367637103.[0m




[32m[I 2023-01-17 20:29:01,668][0m Trial 4 finished with value: 0.9514163845633039 and parameters: {'max_depth': 8, 'n_estimators': 855, 'subsample': 0.4699311198096737, 'colsample_bytree': 0.3465206045929381, 'reg_lambda': 38.256960389504194}. Best is trial 1 with value: 0.9705741367637103.[0m




[32m[I 2023-01-17 20:29:06,318][0m Trial 5 finished with value: 0.9337698036560598 and parameters: {'max_depth': 12, 'n_estimators': 901, 'subsample': 0.34868446885997045, 'colsample_bytree': 0.16878641865645122, 'reg_lambda': 16.811586254367768}. Best is trial 1 with value: 0.9705741367637103.[0m




[32m[I 2023-01-17 20:29:10,565][0m Trial 6 finished with value: 0.9495362220717671 and parameters: {'max_depth': 20, 'n_estimators': 417, 'subsample': 0.4466164338192702, 'colsample_bytree': 0.816828731829418, 'reg_lambda': 17.00111585334962}. Best is trial 1 with value: 0.9705741367637103.[0m




[32m[I 2023-01-17 20:29:13,501][0m Trial 7 finished with value: 0.8792125930941097 and parameters: {'max_depth': 17, 'n_estimators': 605, 'subsample': 0.10569681681854795, 'colsample_bytree': 0.8345476031606157, 'reg_lambda': 8.860610907195623}. Best is trial 1 with value: 0.9705741367637103.[0m




[32m[I 2023-01-17 20:29:17,413][0m Trial 8 finished with value: 0.9577830060934327 and parameters: {'max_depth': 15, 'n_estimators': 296, 'subsample': 0.5539891255588161, 'colsample_bytree': 0.6445079261351787, 'reg_lambda': 7.860795770332823}. Best is trial 1 with value: 0.9705741367637103.[0m




[32m[I 2023-01-17 20:29:23,483][0m Trial 9 finished with value: 0.9686073121191605 and parameters: {'max_depth': 11, 'n_estimators': 843, 'subsample': 0.779400460453332, 'colsample_bytree': 0.12888144462553205, 'reg_lambda': 7.920101326927599}. Best is trial 1 with value: 0.9705741367637103.[0m




[32m[I 2023-01-17 20:29:28,627][0m Trial 10 finished with value: 0.9726729857819905 and parameters: {'max_depth': 20, 'n_estimators': 534, 'subsample': 0.9619649775180157, 'colsample_bytree': 0.9737399908174935, 'reg_lambda': 26.381330765188178}. Best is trial 10 with value: 0.9726729857819905.[0m




[32m[I 2023-01-17 20:29:33,296][0m Trial 11 finished with value: 0.9685341909275558 and parameters: {'max_depth': 19, 'n_estimators': 542, 'subsample': 0.9917896930706351, 'colsample_bytree': 0.9797045781723351, 'reg_lambda': 26.697608333479955}. Best is trial 10 with value: 0.9726729857819905.[0m




[32m[I 2023-01-17 20:29:38,999][0m Trial 12 finished with value: 0.9669119837508463 and parameters: {'max_depth': 17, 'n_estimators': 686, 'subsample': 0.9933493811035131, 'colsample_bytree': 0.9597720953016724, 'reg_lambda': 25.597575616758707}. Best is trial 10 with value: 0.9726729857819905.[0m


In [14]:
parm_grid_bytes = pickle.dumps(param_grid_history)
with open(f"rendered_data/{objective}_bytes.hex", "wb") as binary_file:
    binary_file.write(parm_grid_bytes)

In [15]:
def train(param_grid):
    reg = render_model(param_grid)
    skf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=RANDOM_STATE)
    for fold, (train_idx, valid_idx) in enumerate(skf.split(train_df[features], train_df[target])):
        print(10*"=", f"FINAL TRAINING Fold={fold+1}", 10*"=")
        X_train, X_valid = train_df.iloc[train_idx][features], train_df.iloc[valid_idx][features]
        y_train , y_valid = train_df[target].iloc[train_idx] , train_df[target].iloc[valid_idx] 
        reg, roc_auc = train_model(reg, X_train, y_train, X_valid, y_valid)
    return reg

percent = math.ceil(len(param_grid_history.keys()) * .3)
top = sorted(list(param_grid_history.keys()))[-percent:]

train_preds = []
test_preds = []
for key in tqdm(top):
    model = train(param_grid_history[key])
    train_preds.append(model.predict(train_df[features]))
    test_preds.append(model.predict(test_df[features]))

train_final_preds = np.stack(train_preds).mean(0)
test_final_preds = np.stack(test_preds).mean(0)

  0%|                                                                                            | 0/4 [00:00<?, ?it/s]



 25%|█████████████████████                                                               | 1/4 [00:04<00:14,  4.86s/it]



 50%|██████████████████████████████████████████                                          | 2/4 [00:10<00:10,  5.47s/it]



 75%|███████████████████████████████████████████████████████████████                     | 3/4 [00:16<00:05,  5.51s/it]



100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:21<00:00,  5.26s/it]


In [16]:
train_submission = pd.DataFrame(data={'id': train_df.index, target[0]: train_final_preds})
train_submission.to_csv(fr'rendered_data/{objective}_train_submission.csv', index=False)

test_submission = pd.DataFrame(data={'id': test_df.index, target[0]: test_final_preds})
test_submission.to_csv(fr'rendered_data/{objective}_test_submission.csv', index=False)