In [34]:
import pandas as pd
import numpy as np
import os
from sklearn import metrics

from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import ADASYN
from imblearn.combine import SMOTETomek
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from imblearn.under_sampling import NeighbourhoodCleaningRule
from imblearn.under_sampling import TomekLinks

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold


# Seed shared by every stochastic resampler below so that the generated
# folds are reproducible from one run to the next.
SAMPLING_SEED = 152

# ML approaches to consider
ml_names = ['knn', 'Gnb', 'LR', 'SVM', 'SVM-rbf', 'DT', 'RF', 'XGBoost']

# Sampling methods; every name listed here must be a key of `sampling_strat`.
l_sampling = ["plain", "smote", "adasyn", "tomekLinks", "ncr", "smoteRandom", "smoteTomek", "smoteNcr"]

# Datasets
# Full list of known datasets, kept for reference.
all_dataset_names = ['Bank', 'C2C', 'DSN', 'HR', 'K2009', 'KKBox', 'Member', 'Mobile', 'SATO', 'TelC', 'TelE', 'UCI', 'news']
# Datasets actually processed by this run (currently restricted to a single one).
dataset_name = ['Bank']


# sampling strategy: maps a sampling-method name to the object whose
# `fit_resample(X, y)` is applied to each training fold.
sampling_strat = dict()
# Sentinel string (not a resampler): the training fold is used as-is.
sampling_strat["plain"] = "empty"
sampling_strat["smote"] = SMOTE(random_state=SAMPLING_SEED)
# Use ADASYN's default sampling strategy, like SMOTE above.  The previous
# `sampling_strategy='not minority'` asks an over-sampler to resample every
# class except the minority one; on a two-class target that leaves nothing to
# generate, so the "adasyn" folds were identical to the "plain" ones.
sampling_strat["adasyn"] = ADASYN(random_state=SAMPLING_SEED)
sampling_strat["tomekLinks"] = TomekLinks()
sampling_strat["ncr"] = NeighbourhoodCleaningRule()

# SMOTE over-sampling followed by random under-sampling.
over = SMOTE(random_state=SAMPLING_SEED)
under = RandomUnderSampler(random_state=SAMPLING_SEED)
steps = [('o', over), ('u', under)]
sampling_strat["smoteRandom"] = Pipeline(steps=steps)

sampling_strat["smoteTomek"] = SMOTETomek(random_state=SAMPLING_SEED)

# SMOTE over-sampling followed by Neighbourhood Cleaning Rule under-sampling.
# Fresh instances are created so the two pipelines do not share sampler objects.
over = SMOTE(random_state=SAMPLING_SEED)
under = NeighbourhoodCleaningRule()
steps = [('o', over), ('u', under)]
sampling_strat["smoteNcr"] = Pipeline(steps=steps)


In [38]:
# ------- Begin ------ #
# For every dataset and every sampling method, split the data into stratified
# folds and resample ONLY the training part of each fold; the test part is
# always left untouched.
ml_auc_list = dict()

kf = StratifiedKFold(n_splits=5)

for dataset in dataset_name:
    ml_auc_list[dataset] = dict()

    # The label column is "is_churn" for KKBox and "churn" for every other dataset.
    target = "is_churn" if dataset == "KKBox" else "churn"

    # Load the pre-processed dataset and separate the features from the label.
    telcom = pd.read_csv(f'/home/ec2-user/SageMaker/data/churn_package/churn/post_preprocessing_files/{dataset}.csv', sep=';')
    X = telcom.drop(target, axis=1)
    Y = telcom[target]

    # Column names of the feature matrix and of the label for this dataset.
    X_col_names = X.columns
    y_col_names = target

    for sampling in l_sampling:
        ml_auc_list[dataset][sampling] = dict()
        sm = sampling_strat[sampling]
        # "plain" is registered as the sentinel string "empty" instead of a resampler.
        use_raw_fold = isinstance(sm, str) and sm == "empty"

        # Folds are numbered from 1 so the number can go straight into the file names.
        for fold_nb, (train_index, test_index) in enumerate(kf.split(X, Y), start=1):

            # `split` yields positional indices, so select rows by position
            # (`iloc`); label-based selection is only equivalent when the
            # frame carries a default 0..n-1 index.
            X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
            y_train, y_test = Y.iloc[train_index], Y.iloc[test_index]

            if use_raw_fold:
                # No resampling: keep the training fold unchanged.  Using the
                # whole dataset here would put the test rows of this fold
                # into its own training set.
                X_resampled, y_resampled = X_train, y_train
            else:
                X_resampled, y_resampled = sm.fit_resample(X_train, y_train)

            path_out = f"churn_package/churn/sampled_datasets/{dataset}/stratified_folds/{dataset}_{sampling}_skf_k{fold_nb}_train.csv"
            train_df = pd.concat((X_resampled, y_resampled), axis=1)

            path_out_test = f"churn_package/churn/sampled_datasets/{dataset}/stratified_folds/{dataset}_{sampling}_skf_k{fold_nb}_test.csv"
            test_df = pd.concat([X_test, y_test], axis=1)

            # NOTE(review): `path_out` / `path_out_test` are computed but
            # `train_df` / `test_df` are never written to them in this cell --
            # confirm whether the export step was removed on purpose.

            
