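This notebook collects experiments that were not included in the final version of the project. Note: the cells below operate on a DataFrame named `df` that is not created in this notebook, so it must be loaded before running them.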



In [None]:
import numpy as np
import pandas as pd
import time
import gc

from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import ADASYN, SMOTE, RandomOverSampler

from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
from sklearn.model_selection import KFold, TimeSeriesSplit
from sklearn.metrics import roc_auc_score, make_scorer
from sklearn.preprocessing import minmax_scale
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.model_selection import StratifiedKFold, GridSearchCV, RandomizedSearchCV, KFold, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

import xgboost as xgb
from xgboost import plot_importance

from hyperopt import fmin, tpe, hp, space_eval
import warnings

import seaborn as sns
import matplotlib.pyplot as plt

from catboost import CatBoostClassifier, Pool

from statsmodels.stats.outliers_influence import variance_inflation_factor

**Feature distribution**

In [None]:
# One histogram (with a KDE overlay) per float64/int64 column of `df`.
numerical_features = df.select_dtypes(include=['float64', 'int64']).columns
for feature in numerical_features:
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.histplot(df[feature], kde=True, ax=ax)
    ax.set_title(f'Distribution of {feature}')
    plt.show()

**Search and removal of multicollinear features**


First method - VIF [Variance Inflation Factor] 

In [None]:
class ReduceVIF(BaseEstimator, TransformerMixin):
    """Drop columns one at a time until no variance inflation factor exceeds ``thresh``.

    Parameters
    ----------
    thresh : float, default=10
        A column is dropped while the largest VIF in the frame is above this value.
    impute : bool, default=True
        When true, missing values are filled with a ``SimpleImputer`` before the
        VIFs are computed.
    impute_strategy : str, default='median'
        Strategy handed to ``SimpleImputer``.

    Attributes
    ----------
    imputer_ : SimpleImputer or None
        Imputer fitted in ``fit`` (``None`` when ``impute`` is false).
    """

    def __init__(self, thresh=10, impute=True, impute_strategy='median'):
        # Constructor arguments are stored verbatim (and nothing else is created here)
        # so that BaseEstimator.get_params() / sklearn.base.clone() work.
        self.thresh = thresh
        self.impute = impute
        self.impute_strategy = impute_strategy

    def fit(self, X, y=None):
        """Fit the imputer (if enabled) on ``X``; ``y`` is ignored. Returns ``self``."""
        print('ReduceVIF fit')
        if self.impute:
            self.imputer_ = SimpleImputer(strategy=self.impute_strategy).fit(X)
        else:
            self.imputer_ = None
        return self

    def transform(self, X, y=None):
        """Return ``X`` (a DataFrame) without the columns removed by ``calculate_vif``."""
        print('ReduceVIF transform')
        if self.impute:
            columns = X.columns.tolist()
            # Keep the caller's row index: a fresh RangeIndex would misalign the rows
            # when the result is later concatenated with other frames.
            X = pd.DataFrame(self.imputer_.transform(X), columns=columns, index=X.index)
        return ReduceVIF.calculate_vif(X, self.thresh)

    @staticmethod
    def calculate_vif(X, thresh=10):
        """Repeatedly drop the column with the highest VIF while that VIF is above ``thresh``.

        Frames with fewer than two columns are returned unchanged, because a VIF
        needs at least one other column to regress against.
        """
        while X.shape[1] > 1:
            # Materialise the array once per pass instead of once per column.
            values = X.values
            vif = [variance_inflation_factor(values, position) for position in range(X.shape[1])]

            max_vif = max(vif)
            if not max_vif > thresh:
                break

            maxloc = vif.index(max_vif)
            print(f'Dropping {X.columns[maxloc]} with vif={max_vif}')
            X = X.drop([X.columns.tolist()[maxloc]], axis=1)
        return X

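The dataframe is too large to process in one pass, so the transformer is applied to consecutive chunks of columns and the results are concatenated. Note that collinearity is then only detected between columns that fall into the same chunk.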

In [None]:
def process_dataframe_in_chunks(df, n=10, transformer=None, y=None):
    """Apply a transformer to consecutive blocks of ``n`` columns and re-join the results.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are processed block by block.
    n : int, default=10
        Number of columns per block; must be at least 1.
    transformer : object with ``fit_transform(X, y)``, optional
        Applied to every block. Defaults to a fresh ``ReduceVIF()`` so the function
        no longer depends on a notebook-level ``transformer`` variable.
    y : optional
        Passed through to ``fit_transform`` unchanged.

    Returns
    -------
    pandas.DataFrame
        Column-wise concatenation of the transformed blocks. A frame without
        columns is returned as a copy.

    Raises
    ------
    ValueError
        If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError(f'n must be a positive integer, got {n}')
    if transformer is None:
        transformer = ReduceVIF()
    if df.shape[1] == 0:
        # pd.concat raises on an empty list, so handle the no-column case explicitly.
        return df.copy()

    chunks = (df.iloc[:, start:start + n] for start in range(0, df.shape[1], n))
    processed_chunks = [transformer.fit_transform(chunk, y) for chunk in chunks]
    return pd.concat(processed_chunks, axis=1)

In [None]:
# Example usage: `df` is overwritten with the result of processing it in chunks of 10 columns,
# so re-running this cell processes the already-processed frame again.
df = process_dataframe_in_chunks(df, 10) # example of dataframe processing

PCA [Principal Component Analysis]

In [None]:
# Remove two features, then hold out 20% of the rows as the test split.
df = df.drop(['feature642', 'feature756'], axis=1)
df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

# Tag the held-out rows so the two splits can be separated again after they are
# recombined. assign() returns a new frame instead of writing into the slice that
# train_test_split produced (which triggers SettingWithCopyWarning).
df_test = df_test.assign(dummy='test')
df = pd.concat([df_train, df_test], axis=0, sort=False)
# drop=True discards the old index directly; the previous reset_index() + drop('index')
# failed whenever the index had a name or a column called 'index' already existed.
df = df.reset_index(drop=True)

def PCA_change(df, cols, n_components, rand_seed=4):
    """Replace the columns ``cols`` of ``df`` with their principal components.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is no longer modified in place; use the returned frame.
    cols : list-like of column labels
        Columns fed to the PCA and removed from the result.
    n_components : int
        Number of components passed to ``PCA``.
    rand_seed : int, default=4
        ``random_state`` passed to ``PCA``.

    Returns
    -------
    pandas.DataFrame
        The remaining columns of ``df`` followed by the component columns, which
        are labelled with the integers ``0 .. n_components - 1``.
    """
    pca = PCA(n_components=n_components, random_state=rand_seed)
    components = pca.fit_transform(df[cols])

    # Reuse the caller's index so the column-wise concat below pairs every row with
    # its own components even when ``df`` does not carry a default RangeIndex.
    principalDf = pd.DataFrame(components, index=df.index)

    remaining = df.drop(cols, axis=1)
    return pd.concat([remaining, principalDf], axis=1)

# Identifier / label / bookkeeping columns: never scaled, never fed to the PCA and never
# used as model inputs.
non_feature_cols = ['id', 'target', 'sample_ml_new', 'isChurn', 'dummy']

# Select the feature columns by name. The former positional slice (columns[3:]) kept only
# three original columns, while four were dropped by name further down, so one of the
# columns listed above was always folded into the PCA and the later drop raised KeyError.
mas_v = [col for col in df_train.columns if col not in non_feature_cols]

# NOTE(review): the fill value, the scaling and the PCA are all computed on the train and
# test rows together, so test-set statistics leak into the features — confirm this is intended.
for col in mas_v:
    # Missing values become a sentinel just below the observed minimum, then the column
    # is rescaled to [0, 1].
    df[col] = df[col].fillna(df[col].min() - 2)
    df[col] = minmax_scale(df[col], feature_range=(0, 1))

df = PCA_change(df, mas_v, n_components=100)

# Split back into train/test and remove the marker column from BOTH parts (it used to stay
# in the training frame, giving train and test different feature sets).
is_test = df['dummy'] == 'test'
df_train = df[~is_test].drop('dummy', axis=1)
df_test = df[is_test].drop('dummy', axis=1)

# Drop exactly the same non-feature columns from both splits.
drop_cols = [col for col in non_feature_cols if col in df_train.columns]
X_train, y_train = df_train.drop(drop_cols, axis=1), df_train['target']
X_test, y_test = df_test.drop(drop_cols, axis=1), df_test['target']

**XGBoost test**

In [None]:
def objective(params):
    """Hyperopt objective: negative mean ROC AUC of an XGBoost classifier over CV folds.

    Reads the notebook-level ``X_train`` / ``y_train``.

    Parameters
    ----------
    params : dict
        One sample drawn from the search space. Only the keys read below are used;
        any other key in the sample is ignored.

    Returns
    -------
    float
        ``-(mean ROC AUC over the folds)``, because hyperopt minimises its objective.
    """
    time1 = time.time()
    # Forward numeric values (rounded as before) instead of formatted strings, and only
    # the keys that are XGBoost parameters. 'num_leaves', 'min_child_samples',
    # 'feature_fraction' and 'bagging_fraction' are not XGBoost parameters, so they
    # were never used by the model and are no longer passed on.
    params = {
        'max_depth': int(params['max_depth']),
        'gamma': round(params['gamma'], 3),
        'subsample': round(params['subsample'], 2),
        'reg_alpha': round(params['reg_alpha'], 3),
        'reg_lambda': round(params['reg_lambda'], 3),
        'learning_rate': round(params['learning_rate'], 3),
        'colsample_bytree': round(params['colsample_bytree'], 3),
    }

    print("\n############## New Run ################")
    print(f"params = {params}")
    FOLDS = 7

    # NOTE(review): TimeSeriesSplit assumes the rows of X_train are in chronological
    # order — confirm that still holds after the shuffled train/test split.
    tss = TimeSeriesSplit(n_splits=FOLDS)
    score_mean = 0
    for count, (tr_idx, val_idx) in enumerate(tss.split(X_train, y_train), start=1):
        # 'verbose' is not an XGBClassifier constructor argument, so it is not passed any more.
        clf = xgb.XGBClassifier(
            n_estimators=600, random_state=4,
            tree_method='gpu_hist',
            **params
        )

        X_tr, X_vl = X_train.iloc[tr_idx, :], X_train.iloc[val_idx, :]
        y_tr, y_vl = y_train.iloc[tr_idx], y_train.iloc[val_idx]

        clf.fit(X_tr, y_tr)

        # Same quantity the make_scorer(..., needs_proba=True) call produced, without
        # relying on the deprecated `needs_proba` argument.
        score = roc_auc_score(y_vl, clf.predict_proba(X_vl)[:, 1])
        score_mean += score
        print(f'{count} CV - score: {round(score, 4)}')

        # Release the fold data and the model before the next fold is built.
        del X_tr, X_vl, y_tr, y_vl, clf

    time2 = time.time() - time1
    print(f"Total Time Run: {round(time2 / 60,2)}")
    gc.collect()
    print(f'Mean ROC_AUC: {score_mean / FOLDS}')
    return -(score_mean / FOLDS)


# Hyperopt search space handed to fmin() together with objective().
# max_depth is sampled with quniform (step 1) and cast to int inside objective().
# NOTE(review): 'num_leaves', 'min_child_samples', 'feature_fraction' and 'bagging_fraction'
# look like LightGBM parameter names rather than XGBoost ones — confirm they have any effect
# on the XGBClassifier trained in objective().
space = {
    'max_depth': hp.quniform('max_depth', 7, 23, 1),
    'reg_alpha':  hp.uniform('reg_alpha', 0.01, 0.4),
    'reg_lambda': hp.uniform('reg_lambda', 0.01, .4),
    'learning_rate': hp.uniform('learning_rate', 0.01, 0.2),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.3, .9),
    'gamma': hp.uniform('gamma', 0.01, .7),
    'num_leaves': hp.choice('num_leaves', list(range(20, 250, 10))),
    'min_child_samples': hp.choice('min_child_samples', list(range(100, 250, 10))),
    'subsample': hp.choice('subsample', [0.2, 0.4, 0.5, 0.6, 0.7, .8, .9]),
    'feature_fraction': hp.uniform('feature_fraction', 0.4, .8),
    'bagging_fraction': hp.uniform('bagging_fraction', 0.4, .9)
}

In [None]:
# Suppresses every Python warning from this point on (the filter is global, so it also
# affects all later cells).
warnings.filterwarnings('ignore')

# Minimise objective() over `space` with the TPE algorithm, 27 evaluations in total.
# NOTE(review): no random state is passed to fmin, so the search is presumably not
# reproducible between runs — confirm and consider fixing a seed.
best = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=27)

In [None]:
# Translate the fmin() result into concrete parameter values by evaluating `space` with it.
best_params = space_eval(space, best)

In [None]:
# Hard-coded parameter set (presumably the outcome of an earlier search — confirm). It
# replaces any `best_params` computed by space_eval, so this cell can run without the search.
best_params = {'bagging_fraction': 0.7955230034197366, 'colsample_bytree': 0.46278254431111865, 'feature_fraction': 0.5443072652214971, 
               'gamma': 0.5543303427555978, 'learning_rate': 0.01167276021142852, 'max_depth': 9, 'min_child_samples': 120, 
               'num_leaves': 230, 'reg_alpha': 0.3468832526697093, 'reg_lambda': 0.17207846996353005, 'subsample': 0.4}

# NOTE(review): redundant with `import xgboost as xgb` in the import cell; kept only in
# case other code relies on the bare name.
from xgboost import XGBClassifier

# These keys are not XGBoost parameters, so they are filtered out instead of being
# forwarded to the booster. `best_params` itself is left untouched.
non_xgboost_keys = {'num_leaves', 'min_child_samples', 'feature_fraction', 'bagging_fraction'}
xgb_params = {key: value for key, value in best_params.items() if key not in non_xgboost_keys}

# random_state matches the value used during the search so the fit is repeatable.
# NOTE(review): the search trained 600 trees per model, this fit uses 300 — confirm.
clf = xgb.XGBClassifier(
    n_estimators=300,
    random_state=4,
    tree_method='gpu_hist',
    **xgb_params
)

clf.fit(X_train, y_train)

**Feature Importances**

In [None]:
# Per-feature importance scores of type "weight", as reported by the fitted booster.
feature_important = clf.get_booster().get_score(importance_type="weight")
keys = list(feature_important.keys())
values = list(feature_important.values())

# One row per feature, highest score first.
data = pd.DataFrame(data=values, index=keys, columns=["score"]).sort_values(by="score", ascending=False)

# Names of the 30 highest-scoring features.
fav = data.head(30).index.tolist()

# Top 30 features — kept as the last expression of the cell so the table is actually
# displayed (it used to sit in the middle of the cell, where its value was discarded).
data.head(30)

**Resampling Test**

In [None]:
# Module-level accumulators filled by build_measure_model(): one entry per evaluated model.
names_lst = []

# Metrics measured on the training data.
(aucs_train_lst,
 accuracy_train_lst,
 precision_train_lst,
 recall_train_lst,
 f1_train_lst) = ([] for _ in range(5))

# Metrics measured on the test data.
(aucs_test_lst,
 accuracy_test_lst,
 precision_test_lst,
 recall_test_lst,
 f1_test_lst) = ([] for _ in range(5))

def build_measure_model(models, threshold=0.1):
    """Fit every model, record its train/test metrics and plot the test ROC curves.

    For each entry the model is fitted on its training data, the second column of
    ``predict_proba`` is used as the score, and that score is turned into class labels
    with ``score >= threshold``. Accuracy, ROC AUC, precision, recall and F1 are
    appended to the module-level ``*_train_lst`` / ``*_test_lst`` lists (and the name
    to ``names_lst``), printed, and one ROC curve per model is drawn on a shared figure.

    Parameters
    ----------
    models : iterable of (name, model, X_train, y_train, X_test, y_test)
        ``model`` must provide ``fit`` and ``predict_proba``.
    threshold : float, default=0.1
        Score cut-off used to binarise the predictions.
    """
    plt.figure(figsize=(12,6))

    for name, model, X_train, y_train, X_test, y_test in models:

        names_lst.append(name)

        model.fit(X_train, y_train)

        # Score of the class in column 1 of predict_proba, and its thresholded labels.
        pred_train = model.predict_proba(X_train)[:, 1]
        pred_train_binary = (pred_train >= threshold)

        pred_test = model.predict_proba(X_test)[:, 1]
        pred_test_binary = (pred_test >= threshold)

        Accuracy_train = metrics.accuracy_score(y_train, pred_train_binary)
        Accuracy_test = metrics.accuracy_score(y_test, pred_test_binary)
        accuracy_train_lst.append(Accuracy_train)
        accuracy_test_lst.append(Accuracy_test)

        # AUC is computed on the raw scores, not on the thresholded labels.
        Aucs_train = metrics.roc_auc_score(y_train, pred_train)
        Aucs_test = metrics.roc_auc_score(y_test, pred_test)
        aucs_train_lst.append(Aucs_train)
        aucs_test_lst.append(Aucs_test)

        PrecisionScore_train = metrics.precision_score(y_train, pred_train_binary)
        PrecisionScore_test = metrics.precision_score(y_test, pred_test_binary)
        precision_train_lst.append(PrecisionScore_train)
        precision_test_lst.append(PrecisionScore_test)

        RecallScore_train = metrics.recall_score(y_train, pred_train_binary)
        RecallScore_test = metrics.recall_score(y_test, pred_test_binary)
        recall_train_lst.append(RecallScore_train)
        recall_test_lst.append(RecallScore_test)

        F1Score_train = metrics.f1_score(y_train, pred_train_binary)
        F1Score_test = metrics.f1_score(y_test, pred_test_binary)
        f1_train_lst.append(F1Score_train)
        f1_test_lst.append(F1Score_test)

        cnf_matrix = metrics.confusion_matrix(y_test, pred_test_binary)

        print("Model Name :", name)

        print('Train Accuracy :{0:0.5f}'.format(Accuracy_train)) 
        print('Test Accuracy :{0:0.5f}'.format(Accuracy_test))

        print('Train AUC : {0:0.5f}'.format(Aucs_train))
        print('Test AUC : {0:0.5f}'.format(Aucs_test))

        print('Train Precision : {0:0.5f}'.format(PrecisionScore_train))
        print('Test Precision : {0:0.5f}'.format(PrecisionScore_test))

        print('Train Recall : {0:0.5f}'.format(RecallScore_train))
        print('Test Recall : {0:0.5f}'.format(RecallScore_test))

        print('Train F1 : {0:0.5f}'.format(F1Score_train))
        print('Test F1 : {0:0.5f}'.format(F1Score_test))

        print('Confusion Matrix : \n', cnf_matrix)
        print("\n")

        # The ROC curve needs the 1-D score vector; the full two-column predict_proba
        # array was passed here before, which roc_curve cannot handle.
        fpr, tpr, _ = metrics.roc_curve(y_test, pred_test)
        plt.plot(fpr,tpr,linewidth=2, label=name + ", auc="+str(Aucs_test))

    plt.legend(loc=4)
    plt.plot([0,1], [0,1], 'k--' )
    plt.rcParams['font.size'] = 12
    plt.title('ROC curve')
    plt.xlabel('False Positive Rate (1 - Specificity)')
    plt.ylabel('True Positive Rate (Sensitivity)')
    plt.show()

In [None]:
# Resample the training data with ADASYN (fixed seed). Only X_train / y_train are passed
# in, so the test data keeps its original class distribution.
adasyn = ADASYN(random_state=42)
X_train_adasyn, y_train_adasyn = adasyn.fit_resample(X_train, y_train)

In [None]:
# Resample the training data with SMOTE (fixed seed); the test data is not touched.
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

In [None]:
# Resample the training data with RandomOverSampler (fixed seed); the test data is not touched.
ros = RandomOverSampler(random_state=42)
X_train_ros, y_train_ros = ros.fit_resample(X_train, y_train)

In [None]:
# Resample the training data with RandomUnderSampler (fixed seed); the test data is not touched.
rus = RandomUnderSampler(random_state=42)
X_train_rus, y_train_rus = rus.fit_resample(X_train, y_train)

Logistic Regression

In [None]:
# Training sets to compare: the original (imbalanced) split and its four resampled variants.
# Every variant is evaluated on the same untouched test split.
lr_training_sets = [
    ('LR imbalance', X_train, y_train),
    ('LR Undersampling', X_train_rus, y_train_rus),
    ('LR Oversampling', X_train_ros, y_train_ros),
    ('LR SMOTE', X_train_smote, y_train_smote),
    ('LR ADASYN', X_train_adasyn, y_train_adasyn),
]

# A fresh estimator per entry. `multi_class='ovr'` is dropped: it is deprecated in recent
# scikit-learn releases, and the liblinear solver fits one-vs-rest anyway.
LRmodels = [
    (name, LogisticRegression(solver='liblinear'), X_tr, y_tr, X_test, y_test)
    for name, X_tr, y_tr in lr_training_sets
]

build_measure_model(LRmodels)

CatBoost

In [None]:
# Training sets to compare: the original (imbalanced) split and its four resampled variants.
cb_training_sets = [
    ('CB imbalance', X_train, y_train),
    ('CB Undersampling', X_train_rus, y_train_rus),
    ('CB Oversampling', X_train_ros, y_train_ros),
    ('CB SMOTE', X_train_smote, y_train_smote),
    ('CB ADASYN', X_train_adasyn, y_train_adasyn),
]

# Each entry gets its own classifier. Previously all five tuples shared one instance, so
# after the run every entry pointed at the model fitted on the last training set.
# verbose=False keeps the per-iteration training log out of the cell output.
CBmodels = [
    (name, CatBoostClassifier(verbose=False), X_tr, y_tr, X_test, y_test)
    for name, X_tr, y_tr in cb_training_sets
]

build_measure_model(CBmodels)

In [None]:
print("Performance measures of various classifiers: \n")

# Test-set metrics collected by build_measure_model(), one row per evaluated model.
data = dict(
    Model=names_lst,
    Accuracy_Test=accuracy_test_lst,
    AUC_Test=aucs_test_lst,
    PrecisionScore_Test=precision_test_lst,
    RecallScore_Test=recall_test_lst,
    F1Score_Test=f1_test_lst,
)
performance_df = pd.DataFrame(data)

# Best models first: ranked by F1, then recall, then AUC.
ranking_columns = ['F1Score_Test', 'RecallScore_Test', 'AUC_Test']
performance_df.sort_values(by=ranking_columns, ascending=False)