In [67]:
import numpy as np
import random
from sklearn.neighbors import NearestNeighbors
import math
from random import randint

class Smote(object):
	"""Small SMOTE-style generator of synthetic minority-class samples.

	Each synthetic point is placed on the segment between a sample and one
	of its nearest neighbours (Euclidean distance), at a random position
	drawn independently for every attribute.

	Generated points accumulate in ``self.synthetic_arr`` for the lifetime
	of the instance; ``self.newindex`` counts them. Create a fresh instance
	for an independent batch.
	"""

	def __init__(self):
		super(Smote, self).__init__()
		self.synthetic_arr = []
		self.newindex = 0

	def Populate(self, N, i, indices, min_samples, k):
		"""Append ``N`` synthetic points derived from sample ``i``.

		Args:
			N: number of points to create; truncated to an int, values
				<= 0 create nothing.
			i: row index of the seed sample in ``min_samples``.
			indices: row indices of the neighbours of sample ``i``.
			min_samples: 2-D array-like of minority samples.
			k: neighbourhood size; at most ``k - 1`` entries of
				``indices`` are used.

		Raises:
			ValueError: if points are requested but no neighbour is
				available (``k < 2`` or ``indices`` is empty).
		"""
		# A counted loop instead of `while N != 0`, which never stops for
		# negative or fractional N.
		n_new = int(N)
		if n_new <= 0:
			return

		# Never index past the neighbours that actually exist: there are
		# fewer than k - 1 of them when the sample set is smaller than k.
		n_choices = min(k - 1, len(indices))
		if n_choices < 1:
			raise ValueError(
				"SMOTE needs at least one neighbour per sample "
				"(k >= 2 and at least two samples)")

		features = len(min_samples[0])
		for _ in range(n_new):
			nn = randint(0, n_choices - 1)
			neighbour = min_samples[indices[nn]]

			arr = []
			for attr in range(features):
				diff = neighbour[attr] - min_samples[i][attr]
				gap = random.uniform(0, 1)
				arr.append(min_samples[i][attr] + gap * diff)

			self.synthetic_arr.append(arr)
			self.newindex = self.newindex + 1

	def k_neighbors(self, euclid_distance, k):
		"""Return, per row, the indices of its ``k - 1`` nearest other rows.

		Args:
			euclid_distance: square (n, n) matrix of pairwise distances.
			k: neighbourhood size including the row itself.

		Returns:
			int64 array of shape (n, min(k, n) - 1), nearest first.
		"""
		dist = np.array(euclid_distance, dtype=np.float64)  # private copy
		if dist.size:
			# Force every row to rank itself first, so that the column
			# dropped below is always the row itself even when exact
			# duplicates are also at distance 0.
			np.fill_diagonal(dist, -np.inf)

		order = np.argsort(dist, axis=1, kind='stable')
		return order[:, 1:k].astype(np.int64, copy=False)

	def find_k(self, X, k):
		"""Compute pairwise Euclidean distances of ``X`` and rank neighbours.

		Args:
			X: 2-D array-like, one sample per row.
			k: neighbourhood size, forwarded to ``k_neighbors``.

		Returns:
			The neighbour index array produced by ``k_neighbors``.
		"""
		X = np.asarray(X, dtype=np.float64)
		n_rows = X.shape[0]
		euclid_distance = np.empty([n_rows, n_rows], dtype=np.float64)

		# One vectorised row at a time: avoids the Python-level inner loop
		# without materialising an (n, n, features) intermediate.
		for i in range(n_rows):
			euclid_distance[i] = np.sqrt(((X - X[i]) ** 2).sum(axis=1))

		return self.k_neighbors(euclid_distance, k)

	def generate_synthetic_points(self, min_samples, N, k):
		"""Generate synthetic points for every row of ``min_samples``.

		Args:
			min_samples: 2-D array-like of minority samples.
			N: amount of over-sampling in percent; ``int(N / 100)``
				points are created per sample (e.g. 250 -> 2).
			k: neighbourhood size passed to the neighbour search.

		Returns:
			numpy array of all points generated by this instance so far,
			including those from earlier calls.
		"""
		N = int(N/100)
		min_samples = np.asarray(min_samples)

		# Nothing to create: skip the quadratic neighbour search entirely.
		if N <= 0 or min_samples.shape[0] == 0:
			return np.asarray(self.synthetic_arr)

		indices = self.find_k(min_samples, k)

		for i in range(indices.shape[0]):
			self.Populate(N, i, indices[i], min_samples, k)

		return np.asarray(self.synthetic_arr)

In [68]:
from __future__ import print_function
import os
# Directory holding the CSV inputs, kept as a list of path components.
data_path = ['data']

In [69]:
import pandas as pd

# File names of the two input CSVs, keyed by split
csv_names = {
    'train': 'Orange_Telecom_Churn_Data_train.csv',
    'test': 'Orange_Telecom_Churn_Data_test.csv',
}

# Build each file path from the data directory components
train_filepath, test_filepath = (
    os.sep.join(data_path + [csv_names[split]]) for split in ('train', 'test')
)

# Read both CSVs into pandas DataFrames (train first, then test)
train_data, test_data = (
    pd.read_csv(path) for path in (train_filepath, test_filepath)
)

In [70]:
# only use 5 features

# Columns removed from both the train and the test frame
unused_columns = [
    'state', 'area_code', 'account_length', 'total_day_calls', 'phone_number',
    'total_day_minutes', 'total_eve_minutes', 'total_eve_calls',
    'total_night_minutes', 'total_night_calls', 'total_intl_minutes',
    'total_intl_calls', 'total_intl_charge', 'number_vmail_messages',
    'total_night_charge',
]

# Same in-place drop applied to each frame in turn
for frame in (train_data, test_data):
    frame.drop(columns=unused_columns, inplace=True)

In [71]:
# Display the columns left in the training frame
train_data.columns

Index(['intl_plan', 'voice_mail_plan', 'total_day_charge', 'total_eve_charge',
       'number_customer_service_calls', 'churned'],
      dtype='object')

In [72]:
# preprocessing - label feature to number

from sklearn.preprocessing import LabelBinarizer

# Fit one encoder per column on the training data and reuse that same fitted
# encoder for the test data. Re-fitting on the test set would derive the
# label -> number mapping from whatever labels happen to occur there, which
# is not guaranteed to match the mapping the model is trained on.
for col in ['intl_plan', 'voice_mail_plan']:
    lb = LabelBinarizer()
    train_data[col] = lb.fit_transform(train_data[col])
    test_data[col] = lb.transform(test_data[col])

# The target is only encoded in the training frame; test_data is not touched
# for this column.
lb = LabelBinarizer()
train_data['churned'] = lb.fit_transform(train_data['churned'])

In [73]:
# preprocessing - scale

from sklearn.preprocessing import RobustScaler

scaler = RobustScaler()

# Scale the predictor columns only. The 'churned' target is a class label and
# must keep its values, so it is carried over unscaled.
feature_cols = [col for col in train_data.columns if col != 'churned']

# Fit the scaling statistics on the training data ...
train_scaled = pd.DataFrame(
    scaler.fit_transform(train_data[feature_cols]),
    columns=feature_cols,
    index=train_data.index,
)
train_scaled['churned'] = train_data['churned']
train_data = train_scaled[list(train_data.columns)]  # keep the original column order

# ... and apply those same statistics to the test data. Re-fitting on the
# test set would put the two sets on different scales. Selecting by
# feature_cols also guarantees the train column order.
test_scaled = pd.DataFrame(
    scaler.transform(test_data[feature_cols]),
    columns=feature_cols,
    index=test_data.index,
)
if 'churned' in test_data.columns:
    test_scaled['churned'] = test_data['churned']
test_data = test_scaled

In [74]:
# separate features and target for train/test data

# Training features are every column except the target; the target is
# taken as an independent copy of the 'churned' column.
y_train = train_data['churned'].copy()
X_train = train_data.drop(columns='churned')

# The test frame is used as features as-is
X_test = test_data.copy()
# y_test = X_test.pop('churned')

In [75]:
# prepare learning data

import random

SMOTE_SEED = 42  # fixed seed so the synthetic samples are reproducible on re-run

is_churned = train_data.churned == 1
churned_rows = train_data[is_churned]
retained_rows = train_data[~is_churned]

if churned_rows.empty:
    raise ValueError("train_data contains no churned rows; nothing to oversample")

# Majority/minority size ratio in percent
sampling_ratio = 100 * len(retained_rows) / len(churned_rows)

churned_data_array = churned_rows.to_numpy()

# Smote draws from the global `random` generator, so seed it before generating
random.seed(SMOTE_SEED)
smote = Smote()
over_sample = smote.generate_synthetic_points(churned_data_array, sampling_ratio/2.5, 15)
# An empty result comes back 1-D; give it the frame's column count so the
# DataFrame constructor accepts it.
over_sample = over_sample.reshape(-1, train_data.shape[1])
over_sample_churned_data = pd.DataFrame(data=over_sample, columns=train_data.columns)

# Keep the real churned rows next to the synthetic ones. Previously only the
# synthetic points were used, which discarded every observed churner (and
# left no churned class at all when the ratio rounded down to zero).
# ignore_index avoids duplicate index labels in the combined frame.
sampled_data = pd.concat(
    [retained_rows, churned_rows, over_sample_churned_data],
    ignore_index=True,
)

X_train = sampled_data.copy()
y_train = X_train.pop('churned')

In [76]:
# learning

from sklearn.neighbors import KNeighborsClassifier

# Build the 10-neighbour classifier and fit it on the training set in one step
# (fit returns the fitted estimator itself)
knn = KNeighborsClassifier(n_neighbors=10).fit(X_train, y_train)

In [77]:
# predict

# Predicted 'churned' label for each row of the test features
y_pred = knn.predict(X_test)

In [78]:
# Map each numeric prediction to its text label: 1.0 -> 'TRUE', anything else -> 'FALSE'
y_pred_text = ['TRUE' if label == 1.0 else 'FALSE' for label in y_pred]

In [79]:
# Single-column frame with the text predictions
test_result = pd.DataFrame({'churned': y_pred_text})

# Write (overwrite) the result file; the row index is written as the first column
test_result.to_csv("orange-test-result.csv", mode='w')