In [5281]:
from __future__ import print_function
import os
data_path = ['data']

In [5282]:
import pandas as pd

# Import the data using the file path
train_filepath = os.sep.join(data_path + ['Orange_Telecom_Churn_Data_train.csv'])
test_filepath = os.sep.join(data_path + ['Orange_Telecom_Churn_Data_test.csv'])

# csv to pandas DataFrame
train_data = pd.read_csv(train_filepath)
test_data = pd.read_csv(test_filepath)

In [5283]:
# only use 5 features

train_data.drop(['state', 'area_code', 'account_length', 'total_day_calls', 'phone_number', 'total_day_minutes', 'total_eve_minutes', 'total_eve_calls', 'total_night_minutes', 'total_night_calls', 'total_intl_minutes', 'total_intl_calls', 'total_intl_charge', 'number_vmail_messages', 'total_night_charge'], axis=1, inplace=True)

test_data.drop(['state', 'area_code', 'account_length', 'total_day_calls', 'phone_number', 'total_day_minutes', 'total_eve_minutes', 'total_eve_calls', 'total_night_minutes', 'total_night_calls', 'total_intl_minutes', 'total_intl_calls', 'total_intl_charge', 'number_vmail_messages', 'total_night_charge'], axis=1, inplace=True)

In [5284]:
train_data.columns

Index(['intl_plan', 'voice_mail_plan', 'total_day_charge', 'total_eve_charge',
       'number_customer_service_calls', 'churned'],
      dtype='object')

In [5285]:
# preprocessing - label feature to number

from sklearn.preprocessing import LabelBinarizer

lb = LabelBinarizer()

for col in ['intl_plan', 'voice_mail_plan', 'churned']:
    train_data[col] = lb.fit_transform(train_data[col])

for col in ['intl_plan', 'voice_mail_plan', 'churned']:
    test_data[col] = lb.fit_transform(test_data[col])

In [5286]:
# preprocessing - scale

from sklearn.preprocessing import RobustScaler

scaler = RobustScaler()

train_data = pd.DataFrame(scaler.fit_transform(train_data), columns=train_data.columns)

test_data = pd.DataFrame(scaler.fit_transform(test_data), columns=test_data.columns)

In [5287]:
# seperate stage1/stage2 data from train data

from sklearn.model_selection import train_test_split

X_train = train_data.copy()
y_train = X_train.pop('churned')

X_train_stage1, X_test_stage1, y_train_stage1, y_test_stage1 = train_test_split(X_train, y_train, test_size=0.45, stratify=y_train)

train_stage1_data = pd.concat([X_train_stage1, y_train_stage1], axis=1)

In [5288]:
# multi undersampling and create undersampled data set

import numpy as np
import math as math

non_churend_indices = train_stage1_data[train_stage1_data.churned == 0].index
non_churend_indices = np.random.permutation(non_churend_indices)    # shuffle indices

churned_indices = train_stage1_data[train_stage1_data.churned == 1].index

# calculate count of the undersampled data set
sample_count = math.floor(len(non_churend_indices) / len(churned_indices))

non_churned_indices_set = np.array_split(non_churend_indices, sample_count)

train_stage1_samples = []
# pair each non churned data with churned data
for i in range(sample_count):
    train_stage1_samples.append(
        pd.concat([
            train_stage1_data.loc[churned_indices],
            train_stage1_data.loc[non_churned_indices_set[i]]
        ])
    )

In [5289]:
X_train_stage1_set = []
y_train_stage1_set = []

for i in range(sample_count):
    X_train_stage1_set.append(train_stage1_samples[i].copy())
    y_train_stage1_set.append(X_train_stage1_set[i].pop('churned'))

In [5290]:
# create and learning knn model for each undersampled data

from sklearn.neighbors import KNeighborsClassifier

knn_set_stage1 = []
for i in range(sample_count):
    knn_set_stage1.append(KNeighborsClassifier(n_neighbors=25))
    knn_set_stage1[i].fit(X_train_stage1_set[i], y_train_stage1_set[i])

In [5291]:
# predict

y_pred_set_stage1 = []
for i in range(sample_count):
    y_pred_set_stage1.append(knn_set_stage1[i].predict(X_test_stage1))
    
total_y_pred_stage1 = np.sum(y_pred_set_stage1, axis=0)

In [5292]:
# voting

y_pred_stage1 = []
for i in range(len(total_y_pred_stage1)): 
    if (total_y_pred_stage1[i] >= 2):
        y_pred_stage1.append(1)
    else:
        y_pred_stage1.append(0)

In [5293]:
# divide data which are predicted as true

pred_true_indices = [i for i, y in enumerate(y_pred_stage1) if y == 1]
pred_true_indices = pd.Index(pred_true_indices)

X_train_stage2 = X_test_stage1.copy().reset_index()
X_train_stage2.drop(['index'], axis=1, inplace=True)
X_train_stage2 = X_train_stage2.loc[pred_true_indices]

y_train_stage2 = y_test_stage1.copy().reset_index()
y_train_stage2.drop(['index'], axis=1, inplace=True)
y_train_stage2 = y_train_stage2.loc[pred_true_indices].squeeze()

In [5294]:
# create stage2 knn model and learning using the data which are predicted as true at stage1

knn_stage2 = KNeighborsClassifier(n_neighbors=25)
knn_stage2.fit(X_train_stage2, y_train_stage2)

y_test_train_stage2 = knn_stage2.predict(X_train_stage2)

import sklearn.metrics as metrics
accuracy = metrics.accuracy_score(y_train_stage2, y_test_train_stage2)
accuracy

0.7209302325581395

In [5295]:
##### real predict #####
# ready for predicting test data

X_test = test_data.copy()
y_test = X_test.pop('churned')

In [5296]:
# predict using stage1 knn models

y_pred_set_for_test = []
for i in range(sample_count):
    y_pred_set_for_test.append(knn_set_stage1[i].predict(X_test))
    
total_y_pred_for_test = np.sum(y_pred_set_for_test, axis=0)

# voting

y_pred_for_test = []
for i in range(len(total_y_pred_for_test)): 
    if (total_y_pred_for_test[i] >= 2):
        y_pred_for_test.append(1)
    else:
        y_pred_for_test.append(0)

In [5297]:
# simple metrics

import sklearn.metrics as metrics

precision, recall, fscore, _ = metrics.precision_recall_fscore_support(y_test, y_pred_for_test, average='weighted')
accuracy = metrics.accuracy_score(y_test, y_pred_for_test)

result_metrics = list()
result_metrics.append(pd.Series({'precision':precision, 'recall':recall, 
                        'fscore':fscore, 'accuracy':accuracy}, 
                        name='scores'))

result_metrics = pd.concat(result_metrics, axis=1)

result_metrics

Unnamed: 0,scores
precision,0.886983
recall,0.804
fscore,0.828436
accuracy,0.804


In [5298]:
print(metrics.confusion_matrix(y_test, y_pred_for_test))

[[1030  258]
 [  36  176]]


In [5299]:
# divide data which are predicted as true

pred_true_indices = [i for i, y in enumerate(y_pred_for_test) if y == 1]
pred_true_indices = pd.Index(pred_true_indices)

X_test_pred_true = X_test.copy().reset_index()
X_test_pred_true.drop(['index'], axis=1, inplace=True)
X_test_pred_true = X_test_pred_true.loc[pred_true_indices]

y_test_pred_true = y_test.copy().reset_index()
y_test_pred_true.drop(['index'], axis=1, inplace=True)
y_test_pred_true = y_test_pred_true.loc[pred_true_indices].squeeze()

In [5300]:
# predict the data which are predicted as true using stage2 knn model 

y_pred_pred_true = knn_stage2.predict(X_test_pred_true)

import sklearn.metrics as metrics
accuracy = metrics.accuracy_score(y_test_pred_true, y_pred_pred_true)
accuracy

0.7442396313364056

In [5301]:
# gather the final predicted data

iter = 0
for pred_true_index in pred_true_indices:
    y_pred_for_test[pred_true_index] = y_pred_pred_true[iter]
    iter += 1

In [5302]:
# simple metrics

import sklearn.metrics as metrics

precision, recall, fscore, _ = metrics.precision_recall_fscore_support(y_test, y_pred_for_test, average='weighted')
accuracy = metrics.accuracy_score(y_test, y_pred_for_test)

result_metrics = list()
result_metrics.append(pd.Series({'precision':precision, 'recall':recall, 
                        'fscore':fscore, 'accuracy':accuracy}, 
                        name='scores'))

result_metrics = pd.concat(result_metrics, axis=1)

result_metrics

Unnamed: 0,scores
precision,0.894782
recall,0.902
fscore,0.896769
accuracy,0.902


In [5303]:
# detail metrics

print(metrics.classification_report(y_test, y_pred_for_test, target_names=['false', 'true']))

precision    recall  f1-score   support

       false       0.93      0.96      0.94      1288
        true       0.70      0.54      0.61       212

    accuracy                           0.90      1500
   macro avg       0.81      0.75      0.78      1500
weighted avg       0.89      0.90      0.90      1500



In [5304]:
# confusion matrix

print(metrics.confusion_matrix(y_test, y_pred_for_test))

[[1238   50]
 [  97  115]]
