In [None]:
import pandas as pd
import numpy as np
from scipy.io import arff
from sklearn.pipeline import Pipeline
from feature_engine.selection import (DropFeatures, DropConstantFeatures, 
                                      DropDuplicateFeatures)

In [None]:
# Load data sets
# Each loader below stores a frame in `data_sets_raw` whose target column is a boolean 'churn'.
# The label recoding is deliberately written as an assignment (`col = col.replace(...)`):
# the chained form `df['churn'].replace(..., inplace=True)` acts on a temporary Series once
# pandas Copy-on-Write is active, so the string labels would survive and `astype(bool)`
# would turn every non-empty string into True.
data_sets_raw = dict()

# KDD (KDD Cup 2009: Customer relationship prediction)
# https://www.openml.org/search?type=data&status=active&id=1112
kdd_data = arff.loadarff('00_data/00_raw/KDDCup09_churn.arff')
kdd_df = pd.DataFrame(kdd_data[0])
# Ensure correct encoding: decode the byte strings held in the object columns,
# then replace '?' entries with NaN
kdd_object_cols = kdd_df.select_dtypes(object).columns
kdd_df[kdd_object_cols] = (
    kdd_df[kdd_object_cols]
    .apply(lambda col: col.str.decode('utf-8'))
    .replace({'?': np.nan})
)
kdd_df = kdd_df.rename(columns={'CHURN': 'churn'})
kdd_df['churn'] = kdd_df['churn'].replace({"-1": False, "1": True}).astype(bool)
data_sets_raw['kdd'] = kdd_df

# IBM HR Analytics Employee Attrition & Performance
# https://www.kaggle.com/datasets/pavansubhasht/ibm-hr-analytics-attrition-dataset
ibm_hr = pd.read_csv('00_data/00_raw/WA_Fn-UseC_-HR-Employee-Attrition.csv')
ibm_hr = ibm_hr.rename(columns={'Attrition': 'churn'})
ibm_hr['churn'] = ibm_hr['churn'].replace({'No': False, 'Yes': True}).astype(bool)
ibm_hr = ibm_hr.drop(columns=["EmployeeNumber"])
data_sets_raw['ibm_hr'] = ibm_hr

# Customer Churn Prediction 2020
# https://www.kaggle.com/competitions/customer-churn-prediction-2020/
# Also available here: https://data.world/earino/churn
ccp_2020 = pd.read_csv("00_data/00_raw/customer-churn-prediction-2020.csv")
ccp_2020['churn'] = ccp_2020['churn'].replace({'no': False, 'yes': True}).astype(bool)
data_sets_raw['ccp'] = ccp_2020

# Portuguese Bank Marketing Data Set
# https://www.kaggle.com/datasets/yufengsui/portuguese-bank-marketing-data-set
prt_bank = pd.read_csv("00_data/00_raw/bank-full.csv", delimiter=";")
prt_bank = prt_bank.rename(columns={'y': 'churn'})
prt_bank['churn'] = prt_bank['churn'].replace({'no': False, 'yes': True}).astype(bool)
data_sets_raw['prt_bank'] = prt_bank

In [None]:
def df_pre_cleaning(df, threshold=0.2):
    """Lower-case the column names and drop columns with too many missing values.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    threshold : float, default 0.2
        Upper bound (exclusive) on the share of missing values a column may
        have to be kept, i.e. a column is kept when ``share < threshold``.

    Returns
    -------
    pandas.DataFrame
        New frame with lower-cased column names, restricted to the columns
        whose share of missing values is below ``threshold``.
    """
    # Column names to lowercase
    df = df.rename(columns=str.lower)

    # Share of missing values per column; a frame without rows yields NaN here,
    # which is treated as "nothing missing" so its columns are retained
    missing_share = df.isna().mean().fillna(0.0)

    # Keep only features whose missing share is below the threshold.
    # A positional boolean mask is used so duplicate column names cannot
    # multiply columns the way a label-based selection would.
    keep = (missing_share < threshold).to_numpy()
    return df.loc[:, keep]


# Run df_pre_cleaning on every raw data set, keeping the same keys
data_sets_pc = {name: df_pre_cleaning(raw_df) for name, raw_df in data_sets_raw.items()}

# Feature-selection pipeline: a constant-feature filter followed by a
# duplicate-feature filter
ppl = Pipeline([
    ('drop_constant_values', DropConstantFeatures(tol=1, missing_values='ignore')),
    ('drop_duplicates', DropDuplicateFeatures()),
])

# The pipeline is fitted anew on each data set, so every frame gets its own selection
data_sets_cleaned = {
    name: ppl.fit_transform(pre_cleaned_df)
    for name, pre_cleaned_df in data_sets_pc.items()
}

In [None]:
# Summarize information about the column types in each data set
def prep_stats(df, data_set_name):
    """Count the columns of ``df`` per coarse type group.

    Each column dtype is converted to its string name and mapped to one of
    the groups 'numeric', 'object', 'date' or 'bool'. A dtype without a
    mapping is reported under its own dtype name.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column dtypes are summarized.
    data_set_name : hashable
        Key under which the summary is returned.

    Returns
    -------
    dict
        ``{data_set_name: {group: number_of_columns}}`` with the groups in
        sorted order.
    """
    dtypes_map = {'int64':'numeric', 'float64':'numeric', 'object':'object', 'category':'object',
                  'datetime64':'date', 'timedelta[ns]':'date', 'bool':'bool'}

    counts = {}
    for dtype in df.dtypes:
        # Work on the dtype's string name: matching dtype objects against string
        # keys through an in-place replace on a column selection is unreliable
        dtype_name = str(dtype)
        if dtype_name.startswith(('datetime64', 'timedelta64')):
            # Actual column dtypes carry a unit (and optionally a timezone) suffix,
            # e.g. 'datetime64[ns]', so they are matched by prefix
            group = 'date'
        else:
            group = dtypes_map.get(dtype_name, dtype_name)
        counts[group] = counts.get(group, 0) + 1

    return {data_set_name: dict(sorted(counts.items()))}

In [None]:
# Summary statistics per data set, keyed by data set name; starts out empty
data_sets_stats = dict()

In [None]:
# Collect the column-type summary and the churn rate of every cleaned data set
for k, cleaned_df in data_sets_cleaned.items():

    # Count both classes once. Reindexing to [True, False] guarantees both labels
    # are present, so a data set lacking one class yields a rate of 0.0 / 1.0
    # instead of raising a KeyError.
    n_churned, n_retained = (
        cleaned_df['churn'].value_counts().reindex([True, False], fill_value=0)
    )
    churn_rate = n_churned / (n_churned + n_retained)

    data_sets_stats.update(prep_stats(cleaned_df, k))
    data_sets_stats[k].update({'churn': round(churn_rate, 2)})

In [None]:
# Show the collected statistics as a table: one column per data set,
# one row per statistic
pd.DataFrame(data=data_sets_stats)

In [None]:
# Write every cleaned data set to '<name>_cleaned.csv' without the row index
# NOTE(review): to_csv presumably requires the target directory to exist already — confirm
for name, df in data_sets_cleaned.items():
    csv_path = '00_data/00_data/{}_cleaned.csv'.format(name)
    df.to_csv(csv_path, index=False)

In [None]:
# Bundle the whole 00_data/ directory into a gzip-compressed tarball
# named bnchmrk_datasets.tar.gz (IPython shell escape; needs `tar` on the PATH)
! tar -czf bnchmrk_datasets.tar.gz 00_data/