# Handling Skewed (Imbalanced) Datasets
## Using Telco customer dataset

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the Telco customer-churn CSV into a DataFrame.
df = pd.read_csv("../datasets/Telco_Customer_Churn.csv")

In [3]:
# Show five randomly chosen rows as a quick look at the columns and their values.
df.sample(5)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
6012,5685-IIXLY,Female,0,Yes,Yes,5,Yes,No,Fiber optic,No,...,No,No,No,Yes,Month-to-month,Yes,Electronic check,83.6,404.2,Yes
3766,4201-JMNGR,Female,1,No,No,1,Yes,No,DSL,Yes,...,Yes,No,No,No,Month-to-month,Yes,Electronic check,55.8,55.8,Yes
4708,5181-OABFK,Female,0,Yes,Yes,56,Yes,No,DSL,Yes,...,Yes,No,No,No,Two year,Yes,Credit card (automatic),61.3,3346.8,No
3064,7855-DIWPO,Female,0,No,No,21,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,68.65,1493.2,No
6108,4819-HJPIW,Male,0,No,No,18,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,No,Mailed check,25.15,476.8,No


## Data Cleaning

In [4]:
# Remove the customerID column; it is not used as a model feature.
# Re-binding `df` (rather than inplace=True) together with errors='ignore'
# keeps this cell idempotent: running it a second time no longer raises KeyError.
df = df.drop(columns='customerID', errors='ignore')

In [5]:
# Show the rows whose TotalCharges cannot be parsed as a number
# (errors='coerce' turns unparseable values into NaN, which isnull() then selects).
df[pd.to_numeric(df.TotalCharges,errors='coerce').isnull()]

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
488,Female,0,Yes,Yes,0,No,No phone service,DSL,Yes,No,Yes,Yes,Yes,No,Two year,Yes,Bank transfer (automatic),52.55,,No
753,Male,0,No,Yes,0,Yes,No,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,No,Mailed check,20.25,,No
936,Female,0,Yes,Yes,0,Yes,No,DSL,Yes,Yes,Yes,No,Yes,Yes,Two year,No,Mailed check,80.85,,No
1082,Male,0,Yes,Yes,0,Yes,Yes,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,No,Mailed check,25.75,,No
1340,Female,0,Yes,Yes,0,No,No phone service,DSL,Yes,Yes,Yes,Yes,Yes,No,Two year,No,Credit card (automatic),56.05,,No
3331,Male,0,Yes,Yes,0,Yes,No,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,No,Mailed check,19.85,,No
3826,Male,0,Yes,Yes,0,Yes,Yes,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,No,Mailed check,25.35,,No
4380,Female,0,Yes,Yes,0,Yes,No,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,No,Mailed check,20.0,,No
5218,Male,0,Yes,Yes,0,Yes,No,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,One year,Yes,Mailed check,19.7,,No
6670,Female,0,Yes,Yes,0,Yes,Yes,DSL,No,Yes,Yes,Yes,Yes,No,Two year,No,Mailed check,73.35,,No


In [6]:
# Keep only the rows whose TotalCharges is not the single-space placeholder,
# then work on an independent copy so the cleaning steps never touch `df`.
df1 = df.loc[df['TotalCharges'] != ' ']
df1_cp = df1.copy()

In [7]:
# TotalCharges was read as text; convert it to a numeric column.
df1_cp['TotalCharges'] = pd.to_numeric(df1_cp['TotalCharges'])

In [8]:
# Fold the two "no service" labels into a plain 'No' so the affected columns become Yes/No.
df1_cp.replace(['No internet service', 'No phone service'], 'No', inplace=True)

In [9]:
# Columns holding only 'Yes'/'No' at this point; they are encoded as 1/0.
yes_no_columns = ['Partner','Dependents','PhoneService','MultipleLines','OnlineSecurity','OnlineBackup',
                  'DeviceProtection','TechSupport','StreamingTV','StreamingMovies','PaperlessBilling','Churn']
# Assign the encoded column back to the frame. The previous form,
# `df1_cp[col].replace(..., inplace=True)`, mutates a temporary Series obtained by
# chained indexing, which is not guaranteed to update `df1_cp` (it is a no-op under
# pandas copy-on-write). `.astype('int64')` fails loudly if any value other than
# 'Yes'/'No' is still present, instead of leaving a silently mixed column.
for col in yes_no_columns:
    df1_cp[col] = df1_cp[col].map({'Yes': 1, 'No': 0}).astype('int64')

In [10]:
# Encode gender as 1 (Female) / 0 (Male). The result is assigned back to the frame
# because an in-place replace on `df1_cp['gender']` acts on a temporary Series and
# is not guaranteed to modify `df1_cp` (no-op under pandas copy-on-write).
df1_cp['gender'] = df1_cp['gender'].map({'Female': 1, 'Male': 0}).astype('int64')

## Preprocessing

In [11]:
# One-hot encode the remaining multi-valued categorical columns.
# dtype='uint8' pins the dummy columns to numeric 0/1: newer pandas versions default to
# bool dummies, and a frame mixing bool and float columns converts to an object array,
# which the Keras model further down cannot consume.
df2 = pd.get_dummies(
    data=df1_cp,
    columns=['InternetService','Contract','PaymentMethod'],
    dtype='uint8',
)
df2.columns

Index(['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure',
       'PhoneService', 'MultipleLines', 'OnlineSecurity', 'OnlineBackup',
       'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies',
       'PaperlessBilling', 'MonthlyCharges', 'TotalCharges', 'Churn',
       'InternetService_DSL', 'InternetService_Fiber optic',
       'InternetService_No', 'Contract_Month-to-month', 'Contract_One year',
       'Contract_Two year', 'PaymentMethod_Bank transfer (automatic)',
       'PaymentMethod_Credit card (automatic)',
       'PaymentMethod_Electronic check', 'PaymentMethod_Mailed check'],
      dtype='object')

In [12]:
# Continuous columns that are rescaled with MinMaxScaler; the other features are already 0/1.
cols_to_scale = ['tenure','MonthlyCharges','TotalCharges']

from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
# NOTE(review): the scaler is fit on the whole frame, i.e. before the train/test split
# performed in the next section, so test rows contribute to the fitted min/max.
# Consider fitting on the training rows only.
df2[cols_to_scale] = scaler.fit_transform(df2[cols_to_scale])

## Train, test split

In [13]:
# Features are every column except the target; the target is cast to float32.
X = df2.drop(columns='Churn')
y = testLabels = df2['Churn'].astype(np.float32)

from sklearn.model_selection import train_test_split

# stratify=y makes the split preserve the class proportions of y: if y held 25% zeros
# and 75% ones, both the train part and the test part would also hold 25% zeros and
# 75% ones.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=15
)

In [14]:
# Rows per class in the training split.
y_train.value_counts()

0.0    4130
1.0    1495
Name: Churn, dtype: int64

In [15]:
# Rows per class in the full dataset, for comparison with the training split above.
y.value_counts()

0.0    5163
1.0    1869
Name: Churn, dtype: int64

In [16]:
# Number of feature columns fed to the network.
len(X_train.columns)

26

## Build a model (ANN) in tensorflow/keras

In [17]:
import tensorflow as tf
from tensorflow import keras
from sklearn.metrics import confusion_matrix , classification_report

In [18]:
def ANN(X_train, y_train, X_test, y_test, loss, weights):
    """Train a small dense binary classifier and report how it does on the test data.

    The network has two hidden ReLU layers (26 and 15 units) and one sigmoid output
    unit. It is compiled with the Adam optimizer and the given loss, and trained for
    100 epochs.

    Parameters
    ----------
    X_train, y_train
        Training features (2-D, one column per feature) and their 0/1 labels.
    X_test, y_test
        Features and labels used only for evaluation.
    loss
        Loss forwarded to ``model.compile`` (e.g. ``'binary_crossentropy'``).
    weights
        ``-1`` (or ``None``) trains without class weights; any other value is
        forwarded to ``model.fit`` as ``class_weight``.

    Returns
    -------
    The output of ``model.predict(X_test)`` rounded with ``np.round``, i.e. one
    0/1 prediction per test row.

    Side effects
    ------------
    Prints the result of ``model.evaluate`` and a classification report.
    """
    # Read the input width from the data instead of hard-coding 26, so the function
    # keeps working when the number of feature columns changes.
    n_features = X_train.shape[1]

    model = keras.Sequential([
        keras.layers.Dense(26, input_dim=n_features, activation='relu'),
        keras.layers.Dense(15, activation='relu'),
        keras.layers.Dense(1, activation='sigmoid')
    ])

    model.compile(optimizer='adam', loss=loss, metrics=['accuracy'])

    # -1 is the historical "no class weights" sentinel; None is accepted as well.
    if weights is None or weights == -1:
        model.fit(X_train, y_train, epochs=100)
    else:
        model.fit(X_train, y_train, epochs=100, class_weight = weights)

    print("\n", model.evaluate(X_test, y_test))

    # Sigmoid outputs are probabilities; rounding turns them into hard 0/1 labels.
    y_preds = model.predict(X_test)
    y_preds = np.round(y_preds)

    print("\nClassification Report: \n", classification_report(y_test, y_preds))

    return y_preds

In [19]:
%%timeit -n1 -r1
# Baseline: train on the original, imbalanced split without class weights
# (weights=-1 selects the unweighted fit path in ANN).
y_preds = ANN(X_train, y_train, X_test, y_test, 'binary_crossentropy', -1)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<br><br>

## Mitigating Skewness of Data

### 1. Method : Undersampling

Undersampling is a technique to balance uneven datasets by keeping all of the data in the minority class and decreasing the size of the majority class. It is one of several techniques data scientists can use to extract more accurate information from originally imbalanced datasets.

In [20]:
# Separate the rows by target value: class 0 (not churned) and class 1 (churned).
df_class_0 = df2.loc[df2['Churn'] == 0]
df_class_1 = df2.loc[df2['Churn'] == 1]

In [21]:
# Peek at two random class-0 rows.
df_class_0.sample(2)

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,OnlineSecurity,OnlineBackup,DeviceProtection,...,InternetService_DSL,InternetService_Fiber optic,InternetService_No,Contract_Month-to-month,Contract_One year,Contract_Two year,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
4377,1,0,1,0,0.295775,1,0,0,0,0,...,0,1,0,1,0,0,0,1,0,0
6603,0,0,1,0,0.183099,1,1,0,0,0,...,1,0,0,1,0,0,0,1,0,0


In [22]:
# Peek at two random class-1 rows.
df_class_1.sample(2)

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,OnlineSecurity,OnlineBackup,DeviceProtection,...,InternetService_DSL,InternetService_Fiber optic,InternetService_No,Contract_Month-to-month,Contract_One year,Contract_Two year,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
6634,1,0,1,1,0.126761,1,1,0,1,1,...,0,1,0,1,0,0,0,1,0,0
1995,1,1,0,0,0.0,1,0,0,0,0,...,0,1,0,1,0,0,0,0,1,0


In [23]:
# Get the row count of each class, looked up by label rather than by position:
# value_counts() orders its result by frequency, so tuple-unpacking it would silently
# swap the two counts whenever class 1 happened to be the larger class.
class_counts = df2['Churn'].value_counts()
count_class_0, count_class_1 = int(class_counts.loc[0]), int(class_counts.loc[1])
count_class_0, count_class_1

(5163, 1869)

In [24]:
# Randomly keep as many class-0 rows as there are class-1 rows.
# random_state (same seed as the train/test splits) makes the draw repeatable, so a
# Restart & Run All reproduces the same under-sampled frame.
df_class_0_under_smp = df_class_0.sample(count_class_1, random_state=15)

In [25]:
# Confirm the under-sampled majority frame now has count_class_1 rows.
df_class_0_under_smp.shape

(1869, 27)

_Now we can see that the majority class (class 0, 5163 rows) has been reduced to the same size as class 1, which is 1869 rows._

In [26]:
# Stack the under-sampled class-0 rows and all class-1 rows into a single frame.

df_under_smp = pd.concat((df_class_0_under_smp, df_class_1), axis='index')

In [27]:
# Report the per-class row counts of the under-sampled frame.
print('Random under-sampling:', df_under_smp['Churn'].value_counts(), sep='\n')

Random under-sampling:
0    1869
1    1869
Name: Churn, dtype: int64


_Now both classes have the same size._

In [28]:
# Separate features from the target in the under-sampled frame, then split train/test.
# NOTE(review): the test part is drawn from the under-sampled frame, so it is balanced
# as well and is not the same test set the baseline model was scored on.

X = df_under_smp.drop(columns='Churn')
y = df_under_smp['Churn']


from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=15
)

In [29]:
# Shapes of the training features and labels after the split.
X_train.shape, y_train.shape

((2990, 26), (2990,))

In [30]:
# Rows per class in the training data.
y_train.value_counts()

0    1495
1    1495
Name: Churn, dtype: int64

In [31]:
# Rows per class in the test data.
y_test.value_counts()

1    374
0    374
Name: Churn, dtype: int64

In [32]:
# Train on the under-sampled split without class weights (weights=-1).
# The classification report is printed after the last epoch, so scroll to the end of the output.

y_preds = ANN(X_train, y_train, X_test, y_test, 'binary_crossentropy', -1)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

_Check classification report above. f1-score for minority class 1 improved from 0.51 to 0.74. Score for class 0 reduced to 0.73 from 0.85 but that's ok._

<br><br>

#### Difference ----------> normal vs under sampling

##### Normal df:

Classification Report: 
               precision    recall  f1-score   support

         0.0       0.82      0.89      0.85      1033
         1.0       0.60      0.45      0.51       374

        accuracy                           0.77      1407
       macro avg       0.71      0.67      0.68      1407
    weighted avg       0.76      0.77      0.76      1407

##### Under Sampling df:

Classification Report: 
               precision    recall  f1-score   support

           0       0.74      0.73      0.73       374
           1       0.73      0.74      0.74       374

        accuracy                           0.74       748
       macro avg       0.74      0.74      0.74       748
    weighted avg       0.74      0.74      0.74       748

### 2. Method: Oversampling 

The simplest oversampling method involves randomly duplicating examples from the minority class in the training dataset, referred to as Random Oversampling.

In [35]:
# Sizes of the two class frames before over-sampling.
df_class_0.shape, df_class_1.shape

((5163, 27), (1869, 27))

In [38]:
# Now, let's oversample the minority class, i.e. class_1, by drawing rows with
# replacement until it has as many rows as the majority class.
# random_state (same seed as the train/test splits) makes the draw repeatable.
df_class_1_over_sample = df_class_1.sample(count_class_0, replace=True, random_state=15)

In [39]:
# Confirm the over-sampled minority frame now has count_class_0 rows.
df_class_1_over_sample.shape

(5163, 27)

In [40]:
# Stack the over-sampled class-1 rows on top of the untouched class-0 rows,
# then show the per-class row counts of the combined frame.
df_over_smp = pd.concat((df_class_1_over_sample, df_class_0), axis='index')
df_over_smp['Churn'].value_counts()

1    5163
0    5163
Name: Churn, dtype: int64

In [41]:
# Separate features from the target in the over-sampled frame, then split train/test.
# NOTE(review): class-1 rows were drawn with replacement before this split, so copies of
# the same original row can end up in both the train part and the test part.
X = df_over_smp.drop(columns='Churn')
y = df_over_smp['Churn']

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=15
)

In [42]:
# Rows per class in the training data.
y_train.value_counts()

1    4130
0    4130
Name: Churn, dtype: int64

In [43]:
# Rows per class in the test data.
y_test.value_counts()

1    1033
0    1033
Name: Churn, dtype: int64

In [44]:
# Train on the over-sampled split without class weights (weights=-1);
# the classification report is printed after the last epoch.

y_preds = ANN(X_train, y_train, X_test, y_test, 'binary_crossentropy', -1)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<br><br>

#### Difference ----------> Normal vs Over sampling

##### Normal df:

Classification Report: 
               precision    recall  f1-score   support

         0.0       0.82      0.89      0.85      1033
         1.0       0.60      0.45      0.51       374

        accuracy                           0.77      1407
       macro avg       0.71      0.67      0.68      1407
    weighted avg       0.76      0.77      0.76      1407

##### Over Sampling df:
    
Classification Report: 
               precision    recall  f1-score   support

           0       0.79      0.75      0.77      1033
           1       0.76      0.80      0.78      1033

          accuracy                           0.77      2066
         macro avg       0.77      0.77      0.77      2066
      weighted avg       0.77      0.77      0.77      2066

<br><br>
_Check classification report above. f1-score for minority class 1 improved from 0.51 to 0.78. Score for class 0 reduced to 0.77 from 0.85_

### 3. Method : SMOTE

SMOTE (Synthetic Minority Over-sampling Technique) is one of the most commonly used oversampling methods for the class-imbalance problem. Instead of duplicating existing minority rows, it balances the class distribution by synthesising new minority examples: each new point is interpolated between a minority sample and one of its k nearest minority-class neighbours (KNN).


In [53]:
# !pip install imbalanced-learn

In [50]:
# Start again from the full pre-processed frame: features and target before any resampling.
X = df2.drop(columns='Churn')
y = df2.loc[:, 'Churn']

In [51]:
# Rows per class before SMOTE is applied.
y.value_counts()

0    5163
1    1869
Name: Churn, dtype: int64

Above we can see that classes 0 and 1 are imbalanced.

In [55]:
# Use SMOTE to resample the minority class up to the size of the majority class.
from imblearn.over_sampling import SMOTE

# random_state (same seed as the train/test splits) makes the synthetic rows repeatable
# across runs; without it every execution produces a different resampled data set.
smote = SMOTE(sampling_strategy='minority', random_state=15)
X_sm, y_sm = smote.fit_resample(X,y)

In [58]:
# Rows per class after SMOTE resampling.
y_sm.value_counts()

0    5163
1    5163
Name: Churn, dtype: int64

Now, the classes are balanced

In [59]:
# Split the SMOTE-resampled data into train and test parts.
# NOTE(review): the split is taken from the resampled data, so the test part contains
# synthetic minority rows and differs from the test set the baseline model was scored on.

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X_sm, y_sm, test_size=0.2, stratify=y_sm, random_state=15
)

In [60]:
# Rows per class in the training data.
y_train.value_counts()

1    4130
0    4130
Name: Churn, dtype: int64

In [61]:
# Rows per class in the test data.
y_test.value_counts()

1    1033
0    1033
Name: Churn, dtype: int64

In [64]:
# Train on the SMOTE-resampled split without class weights (weights=-1).
y_preds = ANN(X_train, y_train, X_test, y_test, 'binary_crossentropy', -1)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<br><br>

#### Difference ----------> Normal sampling vs SMOTE sampling

##### Normal df:

Classification Report:

                precision    recall  f1-score   support

         0.0       0.82      0.89      0.85      1033
         1.0       0.60      0.45      0.51       374

        accuracy                           0.77      1407
       macro avg       0.71      0.67      0.68      1407
    weighted avg       0.76      0.77      0.76      1407


<br>

##### SMOTE Sampling df:
    
Classification Report: 
    
                   precision    recall  f1-score   support
        
               0       0.84      0.75      0.79      1033
               1       0.78      0.86      0.82      1033

        accuracy                           0.81      2066
       macro avg       0.81      0.81      0.81      2066
    weighted avg       0.81      0.81      0.81      2066

<br><br>
SMOTE oversampling increases the f1-score of minority class 1 from 0.51 to 0.82 (a huge improvement). Overall accuracy also improves from 0.77 to 0.81.

### 4. Method: Use of Ensemble with undersampling

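Ensemble of Undersampling (EUS) divides the whole negative (majority) set into several subsets, each of which is combined with the full positive (minority) set to train a separate model. The predictions of these models are then combined by majority vote.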

In [66]:
# Go back to the original (not resampled) features and labels.
X = df2.drop(columns='Churn')
y = df2.loc[:, 'Churn']

In [67]:
from sklearn.model_selection import train_test_split

# Stratified split of the original, imbalanced data.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=15
)

In [68]:
# Rows per class in the training split; these counts decide the batch boundaries used below.
y_train.value_counts()

0    4130
1    1495
Name: Churn, dtype: int64

As we can see, the training split has 4130 class_0 rows, so we can divide them into 3 batches of at most 1495 rows each (1495 being the size of class_1):

like...<br>
class0_a[:1495]<br>
class0_b[1495:2990]<br>
class0_c[2990:4130]

In [96]:
# Re-attach the labels to a copy of the training features so rows can be filtered by class.
X_train_dfE_cp = X_train.copy()
X_train_dfE_cp['Churn'] = y_train

# Class-0 and class-1 rows of the training split only.
dfE_class_0 = X_train_dfE_cp.loc[X_train_dfE_cp['Churn'] == 0]
dfE_class_1 = X_train_dfE_cp.loc[X_train_dfE_cp['Churn'] == 1]

In [97]:
# First rows of the training frame with the Churn column re-attached.
X_train_dfE_cp.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,OnlineSecurity,OnlineBackup,DeviceProtection,...,InternetService_Fiber optic,InternetService_No,Contract_Month-to-month,Contract_One year,Contract_Two year,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,Churn
684,1,0,0,0,0.0,1,0,0,0,0,...,1,0,1,0,0,0,0,0,1,0
2446,1,0,0,0,0.239437,1,1,0,1,0,...,1,0,1,0,0,0,1,0,0,1
1680,0,0,1,1,0.774648,1,1,0,0,0,...,0,1,0,1,0,0,0,0,1,0
2220,0,0,1,0,1.0,1,0,1,1,0,...,0,0,0,0,1,1,0,0,0,0
2842,1,0,0,0,0.042254,0,0,1,0,1,...,0,0,1,0,0,0,0,0,1,0


In [98]:
def get_train_batch(df_majority, df_minority, start, end):
    """Build one training batch from a slice of the majority rows plus all minority rows.

    Rows ``start:end`` of ``df_majority`` are stacked on top of every row of
    ``df_minority``. Both frames must contain a ``'Churn'`` column.

    Returns a ``(features, labels)`` pair: the stacked frame without the
    ``'Churn'`` column, and the ``'Churn'`` column itself.
    """
    majority_part = df_majority[start:end]
    batch = pd.concat([majority_part, df_minority], axis='index')

    labels = batch['Churn']
    features = batch.drop(columns='Churn')
    return features, labels

In [101]:
# Batch 1: class-0 training rows 0..1494 plus all class-1 training rows.
# Note: X_train / y_train are overwritten here with the batch data; X_test / y_test
# are still the test part of the original split.
X_train, y_train = get_train_batch(dfE_class_0, dfE_class_1, 0, 1495)
y_pred1 = ANN(X_train, y_train, X_test, y_test, 'binary_crossentropy', -1)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [103]:
# Batch 2: class-0 training rows 1495..2989 plus all class-1 training rows.
X_train, y_train = get_train_batch(dfE_class_0, dfE_class_1, 1495,2990)
y_pred2 = ANN(X_train, y_train, X_test, y_test, 'binary_crossentropy', -1)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [105]:
# Batch 3: the remaining class-0 training rows 2990..4129 (1140 rows, fewer than the
# 1495 class-1 rows) plus all class-1 training rows.
X_train, y_train = get_train_batch(dfE_class_0, dfE_class_1, 2990, 4130)
y_pred3 = ANN(X_train, y_train, X_test, y_test, 'binary_crossentropy', -1)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [114]:
# Majority vote over the three batch models: a test row is labelled 1 when more than
# one of the models predicted 1, and 0 otherwise.
vote_totals = y_pred1 + y_pred2 + y_pred3
y_pred_final = (vote_totals > 1).astype(int).ravel().tolist()

In [124]:
# Score the majority-vote predictions against the test labels and print the report.
cl_rep = classification_report(y_test, y_pred_final)
print(f"\nEmsemble sampling: \n\nClassification Report:\n{cl_rep}")


Emsemble sampling: 

Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.69      0.78      1033
           1       0.47      0.76      0.58       374

    accuracy                           0.71      1407
   macro avg       0.68      0.73      0.68      1407
weighted avg       0.78      0.71      0.73      1407



<br><br>

#### Difference ----------> Normal sampling vs Ensemble sampling

##### Normal df:

Classification Report:

                precision    recall  f1-score   support

         0.0       0.82      0.89      0.85      1033
         1.0       0.60      0.45      0.51       374

        accuracy                           0.77      1407
       macro avg       0.71      0.67      0.68      1407
    weighted avg       0.76      0.77      0.76      1407


<br><br>
From the above classification we can conclude that,
f1-score for minority class 1 improved to 0.58 from 0.51. The score for majority class 0 is suffering and reduced to 0.78 from 0.85 but at least there is some balance in terms of prediction accuracy across two classes

## Best sampling:
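SMOTE gave the best scores: the f1-score is 0.79 for class 0 and 0.82 for class 1, the highest of all the sampling methods tried here. Keep in mind that the under-sampling, over-sampling and SMOTE models were scored on test sets that had been resampled together with the training data (balanced support of 374/374 or 1033/1033), while the baseline and ensemble models were scored on the original imbalanced test split (1033/374), so these numbers are not strictly like-for-like.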

SMOTE Sampling df, Classification Report:

                   precision    recall  f1-score   support
    
               0       0.84      0.75      0.79      1033
               1       0.78      0.86      0.82      1033

        accuracy                           0.81      2066
       macro avg       0.81      0.81      0.81      2066
    weighted avg       0.81      0.81      0.81      2066