In [229]:
import sys
import warnings
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from imblearn import over_sampling

IN_COLAB = 'google.colab' in sys.modules
warnings.filterwarnings("ignore", category=UserWarning)

if IN_COLAB:
  df = pd.read_csv('https://raw.githubusercontent.com/chakraskun/churn-modelling/main/Churn_Modelling.csv')
else:
  df = pd.read_csv('Churn_Modelling.csv')

In [230]:
df.sample(5)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
9369,9370,15795458,McMillan,718,Spain,Female,39,2,0.0,1,1,1,52138.49,0
4875,4876,15619616,Costa,571,France,Female,33,9,102017.25,2,0,0,128600.49,0
74,75,15770811,Wallace,519,France,Male,36,9,0.0,2,0,1,145562.4,0
2798,2799,15576044,Macdonald,579,Germany,Male,28,6,150329.15,1,1,0,145558.42,0
5287,5288,15803078,Bruno,635,Spain,Female,38,1,0.0,2,1,0,90605.05,0


In [231]:
numerical = [
  'CreditScore',
  'Age',
  'Balance',
  'EstimatedSalary',
  'Tenure',
  'NumOfProducts',
]
categorical = [
  'Geography',
  'Gender',
  'HasCrCard',
  'IsActiveMember',
]

# Data Preprocessing

- Dari hasil EDA didapat bahwa tidak ada value yang null
- drop CustomerId, Surname dan RowNumber
- tidak ada duplikat

In [232]:
df.duplicated().sum()

0

In [233]:
try:
  df.drop(['RowNumber', 'CustomerId', 'Surname'], axis=1, inplace=True)
except:
  pass

df.sample(5)

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
2264,582,France,Female,29,0,0.0,1,1,1,84012.81,0
3497,456,France,Female,63,1,165350.61,2,0,0,140758.07,1
7422,622,France,Male,29,7,101486.96,1,1,1,8788.35,0
7174,640,France,Female,50,9,117565.03,2,0,0,82559.77,0
3041,671,France,Male,34,7,106603.74,2,1,1,26387.71,0


# Outliers Handling

Outlier untuk sementara tidak dihandle, dikarenakan outlier bersifat statistikal saja,

# Split Dataset

In [234]:
df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

In [235]:
try:
  df_test.drop(columns=['Exited'], inplace=True)
except:
  print(df_test.columns)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [236]:
df_test.to_csv('test_raw.csv', index=False)
df_train.to_csv('train_raw.csv', index=False)

# Transformation Train Dataset

## Re-read raw dataset

In [237]:
test = pd.read_csv('test_raw.csv')
train = pd.read_csv('train_raw.csv')

In [238]:
handled_dataset = [train, test]

## Handle Age

In [239]:
for i in handled_dataset:
  i['LogAge'] = np.log(i['Age'])

## Handle Balance, EstimatedSalary, Tenure, NumOfProducts

In [240]:
for i in handled_dataset:
  i['LogAgeStd'] = StandardScaler().fit_transform(i['LogAge'].values.reshape(len(i), 1))
  i['BalanceStd'] = StandardScaler().fit_transform(i['Balance'].values.reshape(len(i), 1))
  i['EstimatedSalaryStd'] = StandardScaler().fit_transform(i['EstimatedSalary'].values.reshape(len(i), 1))
  i['TenureNorm'] = MinMaxScaler().fit_transform(i['Tenure'].values.reshape(len(i), 1))
  i['NumOfProductsNorm'] = MinMaxScaler().fit_transform(i['NumOfProducts'].values.reshape(len(i), 1))
  i['CreditScoreNorm'] = MinMaxScaler().fit_transform(i['CreditScore'].values.reshape(len(i), 1))

# Feature Encoding

In [241]:
mapping_gender = {
    'Female' : 0,
    'Male' : 1
  }
for i in handled_dataset:
  i['Gender'] = i['Gender'].map(mapping_gender)

In [242]:
for i in handled_dataset:
  try:
    one_hot_encoding = pd.get_dummies(i['Geography'], prefix='geo')
    i = i.join(one_hot_encoding)
  except:
    pass

# Drop Unused Columns

In [243]:
for i in handled_dataset:
  i.drop(columns=['Geography','Age', 'Balance', 'EstimatedSalary', 'Tenure', 'NumOfProducts', 'LogAge', 'CreditScore'], inplace=True)

In [244]:
train.sample(5)

Unnamed: 0,Gender,HasCrCard,IsActiveMember,Exited,LogAgeStd,BalanceStd,EstimatedSalaryStd,TenureNorm,NumOfProductsNorm,CreditScoreNorm
2951,0,1,0,0,-2.839627,0.283628,0.447547,0.2,0.333333,0.754
5305,1,1,1,0,2.452205,1.0639,0.383819,0.9,0.333333,0.86
4724,1,0,1,0,-0.166744,0.256784,-0.57311,0.5,0.0,0.884
1842,0,1,0,0,-1.135852,-1.218471,1.58143,0.3,0.333333,0.422
7558,1,1,1,0,-0.06109,0.978314,-1.244289,0.4,0.0,1.0


In [245]:
test.sample(5)

Unnamed: 0,Gender,HasCrCard,IsActiveMember,LogAgeStd,BalanceStd,EstimatedSalaryStd,TenureNorm,NumOfProductsNorm,CreditScoreNorm
1836,0,1,1,-0.528704,0.301812,-1.076675,0.8,0.0,0.626
896,0,0,1,-0.649584,0.198042,0.348339,0.4,0.333333,0.43
518,0,1,0,-0.079268,0.360776,-1.106672,0.2,0.0,0.916
1601,1,1,1,-1.465251,0.132811,-0.148786,0.8,0.333333,0.446
104,1,1,1,-0.528704,0.829929,0.427703,0.1,0.0,0.78


# Handle imbalance class on train df

In [246]:
X = train[[col for col in train.columns if (str(train[col].dtype) != 'object') and col != 'Exited']]
y = train['Exited'].values
X_over_SMOTE, y_over_SMOTE = over_sampling.SMOTE(sampling_strategy=0.5).fit_resample(X, y)

print('BEFORE')
print(pd.Series(y).value_counts())
print('----------------------')
print('AFTER')
print(pd.Series(y_over_SMOTE).value_counts())

BEFORE
0    6356
1    1644
dtype: int64
----------------------
AFTER
0    6356
1    3178
dtype: int64


In [247]:
X_over_SMOTE.to_csv('churn_train.csv', index=False)

In [248]:
test.to_csv('churn_test.csv', index=False)

# Kesimpulan

- Outlier pada feature `Age` dan `CreditScore` tidak dihandle, karena bersifat statistical (to be reviewed step selanjutnya)
- Pada feature `Age` dilakukan log transformation agar mendapatkan hasil mendekati distribusi normal
- Feature `Balance` dan `EstimatedSalary` dilakukan standarizarion (to be reviewed di step selanjutnya)
- Feature `Tenure` dan `NumOfProducts` dilakukan normalization (to be reviewed di step selanjutnya)
- Dataset di split menjadi 80% train dan 20% test (`random_state=42`)
- Data imbalance pada dataset train di handle dengan menggunakan oversampling SMOTE