In [1]:
from __future__ import print_function
import os
# Path components of the directory that holds the input CSV files; the
# components are joined with os.sep when individual file paths are built.
data_path = ['data']

In [2]:
import pandas as pd

# Resolve both CSV locations inside the data directory, then load them.
def _csv_path(file_name):
    """Return the path of `file_name` inside the `data_path` directory."""
    return os.sep.join(data_path + [file_name])


filepath1 = _csv_path('orange-demo-train.csv')  # training set
filepath2 = _csv_path('orange-demo-test.csv')   # test set

train_data = pd.read_csv(filepath1)
test_data = pd.read_csv(filepath2)

In [3]:
# Drop useless features from both frames.
# The test frame additionally loses 'churned', leaving it with features only.
useless_features = ['state', 'area_code', 'phone_number', 'account_length', 'total_day_calls']

train_data.drop(useless_features, axis=1, inplace=True)
test_data.drop(useless_features + ['churned'], axis=1, inplace=True)

In [4]:
# Preprocessing - encode the two-valued text columns as 0/1.
#
# The encoder is fitted on the training column only and the *same* fitted
# mapping is then applied to the test column. Re-fitting on the test frame
# would learn the mapping from whatever labels happen to occur there, which
# can silently disagree with the training encoding.

from sklearn.preprocessing import LabelBinarizer

lb = LabelBinarizer()

# Feature columns present in both frames.
for col in ['intl_plan', 'voice_mail_plan']:
    # For a two-class column LabelBinarizer yields an (n, 1) array; flatten it
    # so a plain 1-D column is assigned.
    train_data[col] = lb.fit_transform(train_data[col]).ravel()
    test_data[col] = lb.transform(test_data[col]).ravel()

# The target exists only in the training frame ('churned' was dropped from
# the test frame earlier).
train_data['churned'] = lb.fit_transform(train_data['churned']).ravel()

In [5]:
# Preprocessing - min-max scale the features.
#
# The scaler is fitted on the training features only and then applied
# unchanged to the test features, so both frames share one scale. Fitting a
# second time on the test frame would map the same raw value to different
# scaled values in train and test, which distorts the distances KNN relies on.
# Test values outside the training range may therefore fall outside [0, 1].

from sklearn.preprocessing import MinMaxScaler

msc = MinMaxScaler()

original_columns = list(train_data.columns)
# 'churned' is the target: it is already encoded as 0/1 and is absent from the
# test frame, so it is kept out of the scaler.
feature_cols = [c for c in original_columns if c != 'churned']

# fit_transform / transform return np.arrays, so wrap them back into frames.
train_scaled = pd.DataFrame(msc.fit_transform(train_data[feature_cols]),
                            columns=feature_cols)
# .values avoids index alignment against the freshly built RangeIndex.
train_scaled['churned'] = train_data['churned'].values
# Restore the original column order.
train_data = train_scaled[original_columns]

# Selecting by feature_cols also guarantees the test columns are in the same
# order as the training features.
test_data = pd.DataFrame(msc.transform(test_data[feature_cols]),
                         columns=feature_cols)

In [6]:
# separate train/test data

# from sklearn.model_selection import train_test_split

# X = data.copy()
# y = X.pop('churned')

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)

# train_data = pd.concat([X_train, y_train], axis=1)
# test_data = pd.concat([X_test, y_test], axis=1)

# train all data
# train_data = data

# test all data
# X_test = X
# y_test = y

# Split the training frame into a feature matrix and the 'churned' target,
# working on copies so train_data itself keeps all of its columns.
y_train = train_data['churned'].copy()
X_train = train_data.drop('churned', axis=1)

# The test frame is copied as-is.
X_test = test_data.copy()

In [7]:
# feature selection

# from io import StringIO
# from sklearn.tree import DecisionTreeClassifier
# from sklearn import tree
# import pydotplus
# from IPython.display import Image

# dt = DecisionTreeClassifier(random_state=42)
# dt = dt.fit(X, y)
# dt.tree_.node_count, dt.tree_.max_depth


# dot_data = StringIO()

# tree.export_graphviz(dt, out_file=dot_data, feature_names=X.columns)
# graph = pydotplus.graph_from_dot_data(dot_data.getvalue())

# # View the tree image
# filename = 'wine_tree.png'
# graph.write_png(filename)
# img = Image(filename=filename)
# display(img)

In [8]:
import numpy as np
import math as math

# Build balanced training subsets: every subset contains all churned rows plus
# one distinct slice of the shuffled non-churned rows.
churned_indices = train_data[train_data.churned == 1].index
non_churned_indices = train_data[train_data.churned == 0].index

if len(churned_indices) == 0:
    raise ValueError("train_data contains no churned rows; cannot build balanced subsets")

# Shuffle before splitting so each slice is a random draw rather than a
# contiguous run of rows. The shuffled result has to be bound back to
# `non_churned_indices`, because that is the name the split below reads.
# A locally seeded generator keeps the subsets identical across kernel
# restarts without touching NumPy's global random state.
non_churned_indices = np.random.RandomState(42).permutation(non_churned_indices)

# Number of subsets = how many times the churned rows fit into the non-churned
# rows. Integer floor division keeps this an int; the lower bound of 1 keeps
# np.array_split valid when churned rows outnumber non-churned rows.
scale_diff = max(1, len(non_churned_indices) // len(churned_indices))

non_churned_indices_set = np.array_split(non_churned_indices, scale_diff)

train_data_samples = [
    pd.concat([train_data.loc[churned_indices], train_data.loc[chunk]])
    for chunk in non_churned_indices_set
]

In [9]:
# For every balanced subset, separate the 'churned' target from the features.
X_train_set, y_train_set = [], []

for idx in range(scale_diff):
    features = train_data_samples[idx].copy()  # copy: pop() below mutates
    target = features.pop('churned')
    X_train_set.append(features)
    y_train_set.append(target)

In [10]:
# Learning: fit one 5-nearest-neighbours classifier per balanced subset.

from sklearn.neighbors import KNeighborsClassifier

# fit() returns the estimator itself, so the fitted model can be collected
# directly.
knn_set = [
    KNeighborsClassifier(n_neighbors=5).fit(X_train_set[idx], y_train_set[idx])
    for idx in range(scale_diff)
]

In [11]:
# Predict: run every fitted model on the test features, then add the
# predictions element-wise to get one summed score per test row.
y_pred_set = [knn_set[idx].predict(X_test) for idx in range(scale_diff)]

total_y_pred = np.sum(y_pred_set, axis=0)

In [12]:
# with np.nditer(total_y_pred, op_flags=['readwrite']) as it:
#     for i in it:
#         if i >= scale_diff - 1:
#             i[...] = 1
#         else:
#             i[...] = 0.0

# Convert the summed predictions into text labels. A row becomes 'TRUE' only
# when its sum reaches scale_diff; assuming each model predicts 0 or 1, that
# means every model predicted 1 for the row.
total_y_pred_text = [
    'TRUE' if votes >= scale_diff else 'FALSE'
    for votes in total_y_pred
]

In [13]:
# Collect the text labels in a one-column frame and write it to disk,
# overwriting any existing file (the row index is written as well).
test_result = pd.DataFrame({'churned': total_y_pred_text})

test_result.to_csv("test-result.csv", mode='w')