In [2]:
%run ../config/parameters.py
%run ../config/paths.py

In [3]:
import pandas as pd
import numpy as np
import random
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from xgboost import XGBClassifier
import warnings
from collections import defaultdict
from statistics import mean
from sklearn import metrics
from catboost import CatBoostClassifier
warnings.filterwarnings('ignore')


In [4]:
train_data = pd.read_csv(P_TRAIN_DATA )
test_data = pd.read_csv(P_TEST_DATA )
sample_submission = pd.read_csv(P_SAMPLE_SUB)
hr_data = pd.read_csv(P_ADDITIONAL)

# Data Preprocessing
hr_data.loc[hr_data['Attrition'] == 'No', 'Attrition'] = 0
hr_data.loc[hr_data['Attrition'] == 'Yes', 'Attrition'] = 1
hr_data = hr_data.astype({'Attrition': 'int'})

# Constants Definition
TARGET_VAR = 'Attrition'
FEATURE_COLUMNS = hr_data.columns.to_list()[1:34]

# Removing Constant Value Features
FEATURE_COLUMNS.remove('Over18')
FEATURE_COLUMNS.remove('StandardHours')
FEATURE_COLUMNS.remove('EmployeeCount')

In [5]:
SELECTED_FEATURES = train_data.columns.to_list()[1:34]

SELECTED_FEATURES.remove('Over18')
SELECTED_FEATURES.remove('StandardHours')
SELECTED_FEATURES.remove('EmployeeCount')

In [6]:
processed_train_data = train_data.loc[:, SELECTED_FEATURES + [TARGET_COL]].copy()
processed_test_data = test_data.loc[:, SELECTED_FEATURES].copy()
processed_hr_data = hr_data.loc[:, SELECTED_FEATURES + [TARGET_COL]].copy()


In [7]:
# Processing 'processed_train_data'
processed_train_data['IncomePerAge'] = processed_train_data['MonthlyIncome'] / processed_train_data['Age']
processed_train_data["AgeRisk"] = (processed_train_data["Age"] < 34).astype(int)
processed_train_data["HourlyRateRisk"] = (processed_train_data["HourlyRate"] < 60).astype(int)
processed_train_data["DistanceRisk"] = (processed_train_data["DistanceFromHome"] >= 20).astype(int)
processed_train_data["ShortCompanyTenure"] = (processed_train_data["YearsAtCompany"] < 4).astype(int)
processed_train_data['NumCompaniesAdjusted'] = processed_train_data['NumCompaniesWorked'].replace(0, 1)
processed_train_data['AverageCompanyTenure'] = processed_train_data["TotalWorkingYears"] / processed_train_data["NumCompaniesAdjusted"]
processed_train_data['JobHopperIndicator'] = ((processed_train_data["NumCompaniesAdjusted"] > 2) & (processed_train_data["AverageCompanyTenure"] < 2.0)).astype(int)
processed_train_data["AttritionRiskScore"] = processed_train_data["AgeRisk"] + processed_train_data["HourlyRateRisk"] + processed_train_data["DistanceRisk"] + processed_train_data["ShortCompanyTenure"] + processed_train_data['JobHopperIndicator']
processed_train_data.loc[processed_train_data['Education'] == 15, 'Education'] = 5  
processed_train_data.loc[processed_train_data['JobLevel'] == 7, 'JobLevel'] = 5  

In [8]:
# Processing 'processed_test_data'
processed_test_data['IncomePerAge'] = processed_test_data['MonthlyIncome'] / processed_test_data['Age']
processed_test_data["AgeRisk"] = (processed_test_data["Age"] < 34).astype(int)
processed_test_data["HourlyRateRisk"] = (processed_test_data["HourlyRate"] < 60).astype(int)
processed_test_data["DistanceRisk"] = (processed_test_data["DistanceFromHome"] >= 20).astype(int)
processed_test_data["ShortCompanyTenure"] = (processed_test_data["YearsAtCompany"] < 4).astype(int)
processed_test_data['NumCompaniesAdjusted'] = processed_test_data['NumCompaniesWorked'].replace(0, 1)
processed_test_data['AverageCompanyTenure'] = processed_test_data["TotalWorkingYears"] / processed_test_data["NumCompaniesAdjusted"]
processed_test_data['JobHopperIndicator'] = ((processed_test_data["NumCompaniesAdjusted"] > 2) & (processed_test_data["AverageCompanyTenure"] < 2.0)).astype(int)
processed_test_data["AttritionRiskScore"] = processed_test_data["AgeRisk"] + processed_test_data["HourlyRateRisk"] + processed_test_data["DistanceRisk"] + processed_test_data["ShortCompanyTenure"] + processed_test_data['JobHopperIndicator']


In [9]:
# Processing 'processed_hr_data'
processed_hr_data['IncomePerAge'] = processed_hr_data['MonthlyIncome'] / processed_hr_data['Age']
processed_hr_data["AgeRisk"] = (processed_hr_data["Age"] < 34).astype(int)
processed_hr_data["HourlyRateRisk"] = (processed_hr_data["HourlyRate"] < 60).astype(int)
processed_hr_data["DistanceRisk"] = (processed_hr_data["DistanceFromHome"] >= 20).astype(int)
processed_hr_data["ShortCompanyTenure"] = (processed_hr_data["YearsAtCompany"] < 4).astype(int)
processed_hr_data['NumCompaniesAdjusted'] = processed_hr_data['NumCompaniesWorked'].replace(0, 1)
processed_hr_data['AverageCompanyTenure'] = processed_hr_data["TotalWorkingYears"] / processed_hr_data["NumCompaniesAdjusted"]
processed_hr_data['JobHopperIndicator'] = ((processed_hr_data["NumCompaniesAdjusted"] > 2) & (processed_hr_data["AverageCompanyTenure"] < 2.0)).astype(int)
processed_hr_data["AttritionRiskScore"] = processed_hr_data["AgeRisk"] + processed_hr_data["HourlyRateRisk"] + processed_hr_data["DistanceRisk"] + processed_hr_data["ShortCompanyTenure"] + processed_hr_data['JobHopperIndicator']

In [10]:
for idx, value in enumerate(ORDINAL_CAT_ORDER):
    processed_train_data.loc[processed_train_data['BusinessTravel'] == value, 'BusinessTravel'] = idx
    processed_test_data.loc[processed_test_data['BusinessTravel'] == value, 'BusinessTravel'] = idx
    processed_hr_data.loc[processed_hr_data['BusinessTravel'] == value, 'BusinessTravel'] = idx

processed_train_data = processed_train_data.astype({'BusinessTravel': 'int'})
processed_test_data = processed_test_data.astype({'BusinessTravel': 'int'})
processed_hr_data = processed_hr_data.astype({'BusinessTravel': 'int'})


In [11]:
encoded_features = []

for feature in CATEGORICAL_FEATURES:
    ohe = OneHotEncoder(sparse=False, drop='first')
    train_feature_df = pd.DataFrame(processed_train_data[feature])
    test_feature_df = pd.DataFrame(processed_test_data[feature])
    orig_feature_df = pd.DataFrame(processed_hr_data[feature])

    merged_feature_df = pd.concat([train_feature_df, test_feature_df, orig_feature_df], ignore_index=True)
    ohe.fit(merged_feature_df)

    new_encoded_columns = [f"{feature}_{val}_ohe" for val in ohe.categories_[0][1:]]
    encoded_features.extend(new_encoded_columns)

    train_encoded_columns = pd.DataFrame(ohe.transform(train_feature_df), columns=new_encoded_columns, dtype='int')
    test_encoded_columns = pd.DataFrame(ohe.transform(test_feature_df), columns=new_encoded_columns, dtype='int')
    orig_encoded_columns = pd.DataFrame(ohe.transform(orig_feature_df), columns=new_encoded_columns, dtype='int')

    for column in new_encoded_columns:
        processed_train_data[column] = train_encoded_columns[column]
        processed_test_data[column] = test_encoded_columns[column]
        processed_hr_data[column] = orig_encoded_columns[column]
        SELECTED_FEATURES.append(column)  

for feature in CATEGORICAL_FEATURES:
    if feature in SELECTED_FEATURES:
        SELECTED_FEATURES.remove(feature)

processed_train_data.drop(CATEGORICAL_FEATURES, axis=1, inplace=True)
processed_test_data.drop(CATEGORICAL_FEATURES, axis=1, inplace=True)
processed_hr_data.drop(CATEGORICAL_FEATURES, axis=1, inplace=True)

CATEGORICAL_FEATURES = encoded_features


In [12]:
for cont_feature in CONTINUOUS_FEATURES:
    mean_value = np.mean(processed_train_data[cont_feature])
    std_dev = np.std(processed_train_data[cont_feature])
    
    processed_train_data[cont_feature] = (processed_train_data[cont_feature] - mean_value) / std_dev
    
    processed_test_data[cont_feature] = (processed_test_data[cont_feature] - mean_value) / std_dev
    processed_hr_data[cont_feature] = (processed_hr_data[cont_feature] - mean_value) / std_dev


## Model development

In [13]:
# Define tuning parameters
n_estimators_values = [10, 25, 50, 100, 150, 200, 250, 300]
eta_values = [v / 10 for v in range(10)]
max_depth_values = [2, 4, 6, 8, 10]
subsample_values = [0.25, 0.50, 0.75, 0.90]
colsample_bytree_values = [0.25, 0.50, 0.75, 0.90]

cv_folds = 10
tuning_iterations = 10
include_orig = True
tuning_results = defaultdict(list)

# Create column names for predictions
col_names = [f'XGB_Step_{step}_Fold_{fold}' 
             for step in range(tuning_iterations) 
             for fold in range(cv_folds)]
test_predictions = pd.DataFrame(0, index = processed_test_data.index, columns = col_names)
valid_predictions = pd.DataFrame(0, index = processed_train_data.index, columns = col_names)

random.seed(2201020)

# Stratified K-Fold for Cross-Validation
skf_seed = random.randint(0, 2023)
skf = StratifiedKFold(n_splits = cv_folds, random_state = skf_seed, shuffle = True)

# Model Tuning Loop
for step in range(tuning_iterations):
    n_estimators = random.choice(n_estimators_values)
    eta = random.choice(eta_values)
    max_depth = random.choice(max_depth_values)
    subsample = random.choice(subsample_values)
    colsample_bytree = random.choice(colsample_bytree_values)
    
    aucs = []

    for i, (train_index, val_index) in enumerate(skf.split(processed_train_data[SELECTED_FEATURES], processed_train_data[TARGET_COL])):
        X_train, X_val = processed_train_data[SELECTED_FEATURES].iloc[train_index], processed_train_data[SELECTED_FEATURES].iloc[val_index]
        y_train, y_val = processed_train_data[TARGET_COL].iloc[train_index], processed_train_data[TARGET_COL].iloc[val_index]

        if include_orig:
            X_train = pd.concat([X_train, processed_hr_data[SELECTED_FEATURES]], ignore_index=True)
            y_train = pd.concat([y_train, processed_hr_data[TARGET_COL]], ignore_index=True)
        
        xgb_seed = random.randint(0, 2023)
        xgb = XGBClassifier(n_estimators=n_estimators, eta=eta, max_depth=max_depth, subsample=subsample, colsample_bytree=colsample_bytree, random_state=xgb_seed).fit(X_train.values, y_train)
        
        val_probs = [probs[1] for probs in xgb.predict_proba(X_val)]
        valid_predictions.loc[val_index, f'XGB_Step_{step}_Fold_{i}'] = val_probs
        
        fpr, tpr, thresholds = metrics.roc_curve(y_val, val_probs, pos_label=1)
        auc = metrics.auc(fpr, tpr)
        aucs.append(auc)
        
        test_predictions[f'XGB_Step_{step}_Fold_{i}'] = [probs[1] for probs in xgb.predict_proba(processed_test_data[SELECTED_FEATURES])]
    
    # Storing the tuning results
    tuning_results['step'].append(step)
    tuning_results['auc'].append(mean(aucs))
    tuning_results['n_estimators'].append(n_estimators)
    tuning_results['eta'].append(eta)
    tuning_results['max_depth'].append(max_depth)
    tuning_results['subsample'].append(subsample)
    tuning_results['colsample_bytree'].append(colsample_bytree)
    tuning_results['skf_seed'].append(skf_seed)
    tuning_results['xgb_seed'].append(xgb_seed)
    
    print(f'Step: {step}  AUC: {mean(aucs)}')

# Saving Predictions and Tuning Results
valid_predictions.to_csv('XGBoost_Valid_Predictions.csv', index=False)
test_predictions.to_csv('XGBoost_Test_Predictions.csv', index=False)

# Finalizing Tuning Results
tuning_results = pd.DataFrame(tuning_results)
tuning_results.sort_values(by='auc', ascending=False, inplace=True)
tuning_results.to_csv('XGBoost_Tuning_Results.csv', index=False)
tuning_results


Step: 0  AUC: 0.8599613899613899
Step: 1  AUC: 0.7582443923515353
Step: 2  AUC: 0.751296194153337
Step: 3  AUC: 0.8185865968008825
Step: 4  AUC: 0.8277300514800514
Step: 5  AUC: 0.8360449990807134
Step: 6  AUC: 0.5
Step: 7  AUC: 0.8532124471410185
Step: 8  AUC: 0.8264522430593859
Step: 9  AUC: 0.8320015168229454


Unnamed: 0,step,auc,n_estimators,eta,max_depth,subsample,colsample_bytree,skf_seed,xgb_seed
0,0,0.859961,100,0.2,2,0.5,0.5,1054,983
7,7,0.853212,200,0.1,10,0.9,0.25,1054,745
5,5,0.836045,200,0.4,2,0.9,0.9,1054,1406
9,9,0.832002,250,0.2,6,0.75,0.9,1054,1128
4,4,0.82773,50,0.5,8,0.9,0.5,1054,1731
8,8,0.826452,25,0.2,6,0.9,0.9,1054,1694
3,3,0.818587,200,0.5,10,0.75,0.75,1054,1705
1,1,0.758244,25,0.9,10,0.25,0.25,1054,460
2,2,0.751296,100,0.7,10,0.25,0.5,1054,1517
6,6,0.5,25,0.0,4,0.9,0.5,1054,1921


In [14]:
num_models = [2]

sub = sample_submission.copy()

for num in num_models:
    best_cols = [f'XGB_Step_{step}_Fold_{fold}' 
                   for step in tuning_results['step'][0:num]
                       for fold in range(cv_folds)]
    cv_probs = test_predictions[best_cols].mean(axis = 1).round(decimals = 4)

    sub['Attrition'] = cv_probs
    sub.to_csv(''.join(['xgb', str(num), '_model_ensemble.csv']), index = False)
   