In [1]:
import numpy as np
import random
from sklearn.neighbors import NearestNeighbors
import math
from random import randint
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA


class Smote(object):
    """SMOTE-style over-sampler for a minority class.

    Every synthetic sample is built from one minority sample and one of its
    nearest neighbours (picked at random) by interpolating between the two,
    attribute by attribute, with an independent random gap per attribute.

    Note that ``k`` counts the sample itself: only ``k - 1`` genuine
    neighbours are candidates for interpolation.

    Parameters
    ----------
    distance : str
        Neighbour search strategy: 'euclidian' (NumPy distance matrix) or
        'ball_tree' (scikit-learn ``NearestNeighbors``).
    random_state : int or None, optional
        Seed for a private random generator. With the default ``None`` the
        global ``random`` module state is used, so ``random.seed(...)``
        still controls the output.
    """

    def __init__(self, distance, random_state=None):
        super(Smote, self).__init__()
        # Synthetic samples produced by the most recent generation call.
        self.synthetic_arr = []
        # Number of samples currently stored in synthetic_arr.
        self.newindex = 0
        self.distance_measure = distance
        # Private generator, or None to fall back on the global `random` state.
        self._rng = None if random_state is None else random.Random(random_state)

    def Populate(self, N, i, indices, min_samples, k):
        """Append N synthetic samples derived from minority sample ``i``.

        Parameters
        ----------
        N : int
            Number of synthetic samples to create for this sample.
        i : int
            Row index of the base sample in ``min_samples``.
        indices : sequence of int
            Row indices of the neighbours of sample ``i`` (k - 1 entries).
        min_samples : array-like, shape = [n_minority_samples, n_features]
            The minority samples.
        k : int
            Neighbour count used for the search; entries 0 .. k-2 of
            ``indices`` are the candidates.

        Returns
        -------
        None. The samples are appended to ``self.synthetic_arr`` and
        ``self.newindex`` is advanced accordingly.
        """
        rng = getattr(self, '_rng', None)
        if rng is None:
            rng = random

        base = min_samples[i]
        n_features = len(min_samples[0])

        for _ in range(int(N)):
            # One random neighbour per synthetic sample ...
            neighbour = min_samples[indices[rng.randint(0, k - 2)]]
            sample = []
            for attr in range(n_features):
                diff = neighbour[attr] - base[attr]
                # ... and one independent gap per attribute.
                gap = rng.uniform(0, 1)
                sample.append(base[attr] + gap * diff)

            self.synthetic_arr.append(sample)
            self.newindex = self.newindex + 1

    def k_neighbors(self, euclid_distance, k):
        """Return, for every row, the indices of its k - 1 closest other rows.

        Parameters
        ----------
        euclid_distance : array-like, shape = [n_samples, n_samples]
            Pairwise distance matrix (non-negative values expected).
        k : int
            Neighbour count including the sample itself.

        Returns
        -------
        numpy.ndarray of int64, shape = [n_samples, k - 1]
        """
        dist = np.array(euclid_distance, copy=True)
        # Distances are non-negative, so -1 on the diagonal makes every sample
        # sort first in its own row even when duplicate points tie at 0;
        # column 0 (self) can then be dropped safely.
        np.fill_diagonal(dist, -1)
        # A stable sort makes the order of tied neighbours deterministic.
        order = np.argsort(dist, axis=1, kind='stable')
        return order[:, 1:k].astype(np.int64, copy=False)

    def find_k(self, X, k):
        """Find the nearest neighbours of every row of X by Euclidean distance.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
        k : int
            Neighbour count including the sample itself.

        Returns
        -------
        numpy.ndarray of int64, shape = [n_samples, k - 1]
            Neighbour row indices, closest first.
        """
        X = np.asarray(X)
        n_samples = X.shape[0]
        euclid_distance = np.empty((n_samples, n_samples), dtype=np.float32)

        # One vectorised row at a time keeps the memory use at O(n_samples**2).
        for i in range(n_samples):
            euclid_distance[i] = np.sqrt(((X - X[i]) ** 2).sum(axis=1))

        return self.k_neighbors(euclid_distance, k)

    def generate_synthetic_points(self, min_samples, N, k):
        """Create int(N/100) synthetic samples per minority sample.

        Each call starts from an empty result; samples from previous calls
        are discarded rather than accumulated.

        Parameters
        ----------
        min_samples : array-like, shape = [n_minority_samples, n_features]
            Holds the minority samples.
        N : number
            Percentage of new synthetic samples; must be at least 100.
            n_synthetic_samples = int(N/100) * n_minority_samples.
        k : int
            Number of nearest neighbours including the sample itself
            (so at least 2).

        Returns
        -------
        S : numpy.ndarray
            Synthetic samples,
            shape = [int(N/100) * n_minority_samples, n_features].

        Raises
        ------
        ValueError
            If N < 100, the distance measure is unknown, k exceeds the
            number of samples, or k < 2.
        """
        if N < 100:
            raise ValueError("Value of N cannot be less than 100%")

        if self.distance_measure not in ('euclidian', 'ball_tree'):
            raise ValueError("Invalid Distance Measure.You can use only Euclidian or ball_tree")

        min_samples = np.asarray(min_samples)

        if k > min_samples.shape[0]:
            raise ValueError("Size of k cannot exceed the number of samples.")

        if k < 2:
            # k includes the sample itself, so k < 2 leaves no neighbour
            # to interpolate towards.
            raise ValueError("Size of k must be at least 2.")

        # Start from a clean slate so repeated calls do not pile up samples.
        self.synthetic_arr = []
        self.newindex = 0

        per_sample = int(N / 100)

        if self.distance_measure == 'euclidian':
            indices = self.find_k(min_samples, k)
        else:
            nb = NearestNeighbors(n_neighbors=k, algorithm='ball_tree').fit(min_samples)
            _, indices = nb.kneighbors(min_samples)
            # Drop the first column, which is expected to be the sample itself.
            indices = indices[:, 1:]

        for i in range(indices.shape[0]):
            self.Populate(per_sample, i, indices[i], min_samples, k)

        return np.asarray(self.synthetic_arr)

    def plot_synthetic_points(self, min_samples, N, k):
        """Generate synthetic samples and scatter-plot their first two PCA components.

        Parameters and raised errors are those of
        ``generate_synthetic_points``, which performs all input validation.
        """
        synthetic_points = self.generate_synthetic_points(min_samples, N, k)

        pca = PCA(n_components=2)
        pca.fit(synthetic_points)
        pca_synthetic_points = pca.transform(synthetic_points)

        plt.scatter(pca_synthetic_points[:, 0], pca_synthetic_points[:, 1])
        plt.xlabel('PCA component 1')
        plt.ylabel('PCA component 2')
        plt.title('Synthetic minority samples')
        plt.show()

In [2]:
from __future__ import print_function
import os
# Path components of the data directory; the CSV file name is appended to
# this list and the parts are joined when the data is loaded.
data_path = ['data']

In [3]:
import pandas as pd

# Assemble the CSV location from the data directory components, then load it
filepath = os.path.join(*(data_path + ['Orange_Telecom_Churn_Data.csv']))
data = pd.read_csv(filepath)

In [4]:
# feature selection

# drop the columns that are not used as model features
data.drop(columns=['state', 'area_code', 'phone_number', 'account_length'], inplace=True)

In [5]:
# Display the column labels currently present in `data`.
data.columns

Index(['intl_plan', 'voice_mail_plan', 'number_vmail_messages',
       'total_day_minutes', 'total_day_calls', 'total_day_charge',
       'total_eve_minutes', 'total_eve_calls', 'total_eve_charge',
       'total_night_minutes', 'total_night_calls', 'total_night_charge',
       'total_intl_minutes', 'total_intl_calls', 'total_intl_charge',
       'number_customer_service_calls', 'churned'],
      dtype='object')

In [6]:
# preprocessing - convert the label-valued columns to numbers

from sklearn.preprocessing import LabelBinarizer

lb = LabelBinarizer()

# The single binarizer is re-fitted on each column in turn, so after the
# loop it stays fitted on the last column processed.
for col in ('intl_plan', 'voice_mail_plan', 'churned'):
    data[col] = lb.fit_transform(data[col])

In [7]:
# preprocessing - min-max scale every column of the frame

from sklearn.preprocessing import MinMaxScaler

msc = MinMaxScaler()

# fit_transform hands back a bare np.array, so the DataFrame is rebuilt
# around it with the original column labels.
# NOTE(review): the scaler is fitted on the complete dataset, i.e. before
# the train/test split further down, so test rows influence the scaling.
data = pd.DataFrame(data=msc.fit_transform(data), columns=data.columns)

In [8]:
# separate train/test data

from sklearn.model_selection import train_test_split

# Target column on one side, every remaining column as features on the other.
y = data['churned'].copy()
X = data.drop(columns=['churned'])

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=25, test_size=0.25)

# Training features and labels side by side, as one frame.
train_data = pd.concat([X_train, y_train], axis='columns')

In [9]:
# prepare learning data: over-sample the churned class with SMOTE

# Smote draws from the stdlib `random` module; seeding it here makes the
# synthetic samples reproducible from one run to the next.
SMOTE_SEED = 25
random.seed(SMOTE_SEED)

# Split the training frame once instead of re-filtering it for every use.
is_churned = train_data.churned == 1
churned_rows = train_data[is_churned]
retained_rows = train_data[~is_churned]

# Non-churned to churned row count, expressed as a percentage.
sampling_ratio = 100 * len(retained_rows) / len(churned_rows)

churned_data_array = churned_rows.to_numpy()

smote_test = Smote('euclidian')
# NOTE(review): the divisor 3 and k=3 are unexplained magic numbers -- confirm.
over_sample = smote_test.generate_synthetic_points(churned_data_array, sampling_ratio / 3, 3)
over_sample_churned_data = pd.DataFrame(data=over_sample, columns=train_data.columns)

# NOTE(review): only the synthetic churned rows are combined with the
# non-churned rows; the original churned rows are left out -- confirm
# this is intended.
sampled_data = pd.concat([retained_rows, over_sample_churned_data])

X_train = sampled_data.copy()
y_train = X_train.pop('churned')

In [10]:
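# prepare testing data

# X_test / y_test already come from the train_test_split cell, so the
# full-dataset variant below stays disabled: `data` also contains the rows
# used for training, and scoring on them would inflate the metrics.
# X_test = data.copy()
# y_test = X_test.pop('churned')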

In [11]:
# learning

from sklearn.neighbors import KNeighborsClassifier

# fit() hands the estimator back, so it is built and trained in one chain.
knn = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)

In [12]:
# predict

# Classify the test features produced by train_test_split with the fitted KNN.
y_pred = knn.predict(X_test)

In [13]:
# simple metrics

import sklearn.metrics as metrics

accuracy = metrics.accuracy_score(y_test, y_pred)
precision, recall, fscore, _ = metrics.precision_recall_fscore_support(y_test, y_pred, average='weighted')

# One-column table ('scores') with one row per metric.
result_metrics = pd.Series(
    {'precision': precision, 'recall': recall, 'fscore': fscore, 'accuracy': accuracy},
    name='scores',
).to_frame()

result_metrics

Unnamed: 0,scores
precision,0.889897
recall,0.8928
fscore,0.891217
accuracy,0.8928


In [14]:
# detail metrics

# Per-class precision / recall / f1 report; target_names supplies the
# display labels used for the two classes.
print(metrics.classification_report(y_test, y_pred, target_names=['false', 'true']))

              precision    recall  f1-score   support

       false       0.93      0.94      0.94      1073
        true       0.63      0.59      0.61       177

    accuracy                           0.89      1250
   macro avg       0.78      0.77      0.77      1250
weighted avg       0.89      0.89      0.89      1250



In [15]:
# confusion matrix

# NOTE(review): rows appear to be the actual classes -- in the recorded
# output each row sums to that class's support in the classification report.
print(metrics.confusion_matrix(y_test, y_pred))

[[1012   61]
 [  73  104]]
