In [1]:
from __future__ import print_function
import os
data_path = ['data']

In [2]:
import pandas as pd

# Import the data using the file path
filepath = os.sep.join(data_path + ['Orange_Telecom_Churn_Data.csv'])
data = pd.read_csv(filepath)

In [3]:
# drop uselesss features
data.drop(['state', 'area_code', 'phone_number', 'account_length', 'total_day_calls'
           ], axis=1, inplace=True)

In [4]:
data.columns

Index(['intl_plan', 'voice_mail_plan', 'number_vmail_messages',
       'total_day_minutes', 'total_day_charge', 'total_eve_minutes',
       'total_eve_calls', 'total_eve_charge', 'total_night_minutes',
       'total_night_calls', 'total_night_charge', 'total_intl_minutes',
       'total_intl_calls', 'total_intl_charge',
       'number_customer_service_calls', 'churned'],
      dtype='object')

In [5]:
# preprocessing - label to number

from sklearn.preprocessing import LabelBinarizer

lb = LabelBinarizer()

for col in ['intl_plan', 'voice_mail_plan', 'churned']:
    data[col] = lb.fit_transform(data[col])

In [6]:
# preprocessing - scale

from sklearn.preprocessing import MinMaxScaler

msc = MinMaxScaler()

data = pd.DataFrame(msc.fit_transform(data),  # this is an np.array, not a dataframe.
                    columns=data.columns)

In [7]:
# data['number_customer_service_calls'] = 3 * data['number_customer_service_calls']
data

Unnamed: 0,intl_plan,voice_mail_plan,number_vmail_messages,total_day_minutes,total_day_charge,total_eve_minutes,total_eve_calls,total_eve_charge,total_night_minutes,total_night_calls,total_night_charge,total_intl_minutes,total_intl_calls,total_intl_charge,number_customer_service_calls,churned
0,0.0,1.0,0.480769,0.754196,0.754183,0.542755,0.582353,0.542866,0.619494,0.520000,0.619584,0.500,0.15,0.500000,0.111111,0.0
1,0.0,1.0,0.500000,0.459744,0.459672,0.537531,0.605882,0.537690,0.644051,0.588571,0.644344,0.685,0.15,0.685185,0.111111,0.0
2,0.0,0.0,0.000000,0.692461,0.692436,0.333242,0.647059,0.333225,0.411646,0.594286,0.411930,0.610,0.25,0.609259,0.000000,0.0
3,1.0,0.0,0.000000,0.851778,0.851740,0.170195,0.517647,0.170171,0.498481,0.508571,0.498593,0.330,0.35,0.329630,0.222222,0.0
4,1.0,0.0,0.000000,0.474253,0.474230,0.407754,0.717647,0.407959,0.473165,0.691429,0.473270,0.505,0.15,0.505556,0.333333,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,0.0,1.0,0.769231,0.670555,0.670515,0.613143,0.741176,0.613394,0.753165,0.662857,0.753517,0.495,0.25,0.494444,0.222222,0.0
4996,0.0,0.0,0.000000,0.524040,0.523929,0.706076,0.429412,0.706244,0.540759,0.645714,0.540799,0.735,0.10,0.735185,0.333333,1.0
4997,0.0,0.0,0.000000,0.400000,0.399933,0.475117,0.752941,0.475251,0.537722,0.554286,0.537985,0.680,0.20,0.679630,0.111111,0.0
4998,0.0,0.0,0.000000,0.537127,0.537149,0.472092,0.541176,0.472016,0.568101,0.508571,0.568374,0.425,0.30,0.425926,0.000000,0.0


In [8]:
# seperate train/test data

from sklearn.model_selection import train_test_split

X = data.copy()
y = X.pop('churned')

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)

train_data = pd.concat([X_train, y_train], axis=1)
test_data = pd.concat([X_test, y_test], axis=1)

# train all data
# train_data = data

# test all data
# X_test = X
# y_test = y

In [9]:
# feature selection

# from io import StringIO
# from sklearn.tree import DecisionTreeClassifier
# from sklearn import tree
# import pydotplus
# from IPython.display import Image

# dt = DecisionTreeClassifier(random_state=42)
# dt = dt.fit(X, y)
# dt.tree_.node_count, dt.tree_.max_depth


# dot_data = StringIO()

# tree.export_graphviz(dt, out_file=dot_data, feature_names=X.columns)
# graph = pydotplus.graph_from_dot_data(dot_data.getvalue())

# # View the tree image
# filename = 'wine_tree.png'
# graph.write_png(filename)
# img = Image(filename=filename)
# display(img)

In [10]:
import numpy as np
import math as math

non_churned_indices = train_data[train_data.churned == 0].index
non_churend_indices = np.random.permutation(non_churned_indices)
churned_indices = train_data[train_data.churned == 1].index

scale_diff = math.floor(len(non_churned_indices) / len(churned_indices))

non_churned_indices_set = np.array_split(non_churned_indices, scale_diff)

train_data_samples = []
for i in range(scale_diff):
    train_data_samples.append(
        pd.concat([train_data.loc[churned_indices], train_data.loc[non_churned_indices_set[i]]])
    )

In [11]:
X_train_set = []
y_train_set = []

for i in range(scale_diff):
    X_train_set.append(train_data_samples[i].copy())
    y_train_set.append(X_train_set[i].pop('churned'))

In [12]:
#learning

from sklearn.neighbors import KNeighborsClassifier

knn_set = []
for i in range(scale_diff):
    knn_set.append(KNeighborsClassifier(n_neighbors=5))
    knn_set[i].fit(X_train_set[i], y_train_set[i])

In [13]:
# predict

y_pred_set = []
for i in range(scale_diff):
    y_pred_set.append(knn_set[i].predict(X_test))
    
total_y_pred = np.sum(y_pred_set, axis=0)

In [14]:
# with np.nditer(total_y_pred, op_flags=['readwrite']) as it:
#     for i in it:
#         if i >= scale_diff - 1:
#             i[...] = 1
#         else:
#             i[...] = 0.0

for i in range(len(total_y_pred)): 
    if (total_y_pred[i] >= scale_diff):
        total_y_pred[i] = 1
    else:
        total_y_pred[i] = 0

In [15]:
# simple metrics

import sklearn.metrics as metrics

precision, recall, fscore, _ = metrics.precision_recall_fscore_support(y_test, total_y_pred, average='weighted')
accuracy = metrics.accuracy_score(y_test, total_y_pred)

result_metrics = list()
result_metrics.append(pd.Series({'precision':precision, 'recall':recall, 
                        'fscore':fscore, 'accuracy':accuracy}, 
                        name='scores'))

result_metrics = pd.concat(result_metrics, axis=1)

result_metrics

Unnamed: 0,scores
precision,0.902769
recall,0.853
fscore,0.867976
accuracy,0.853


In [16]:
# detail metrics

print(metrics.classification_report(y_test, total_y_pred, target_names=['false', 'true']))

              precision    recall  f1-score   support

       false       0.97      0.85      0.91       859
        true       0.49      0.84      0.62       141

    accuracy                           0.85      1000
   macro avg       0.73      0.85      0.76      1000
weighted avg       0.90      0.85      0.87      1000



In [17]:
# confusion matrix

print(metrics.confusion_matrix(y_test, total_y_pred))

[[734 125]
 [ 22 119]]
