In [50]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
from plotnine import *

from sklearn.neighbors import KNeighborsClassifier

from sklearn import metrics 
from sklearn.preprocessing import StandardScaler #Z-score variables

from sklearn.model_selection import train_test_split # simple TT split cv
from sklearn.model_selection import KFold # k-fold cv
from sklearn.model_selection import LeaveOneOut #LOO cv

from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import plot_confusion_matrix

from sklearn.model_selection import GridSearchCV

%matplotlib inline

# Together

KNN is a simple, distance-based algorithm that lets us CLASSIFY data points based on the classes of the data points around them. Birds of a feather...

Despite being distance based, KNN is a *classification* algorithm. In other words, it is supervised machine learning, as it requires truth labels (the actual class/group). However, it does share characteristics with clustering algorithms we will see later.

KNN *can* work with binary/categorical variables, but not without some tweaking which we do not cover here.

# KNN

Use the telecom_churn.csv data from GitHub (for more information see this [link](https://www.kaggle.com/ivanhrek/telecom-churn)) and the KNN algorithm to predict `churn` in this dataset. Use TTS, and only continuous/interval variables. Z score your variables, and use `GridSearchCV()` to choose `k`.

How accurate is your model? Is it just as good at predicting people who do not churn, as people who do churn?

In [51]:
# Load the telecom churn data straight from the course GitHub repository.
churn = pd.read_csv(
    "https://raw.githubusercontent.com/cmparlettpelleriti/CPSC392ParlettPelleriti/master/Data/telecom_churn.csv"
)

# List the available columns, then preview the first few rows.
print(churn.columns)
churn.head()

Index(['state', 'account length', 'area code', 'phone number',
       'international plan', 'voice mail plan', 'number vmail messages',
       'total day minutes', 'total day calls', 'total day charge',
       'total eve minutes', 'total eve calls', 'total eve charge',
       'total night minutes', 'total night calls', 'total night charge',
       'total intl minutes', 'total intl calls', 'total intl charge',
       'customer service calls', 'churn'],
      dtype='object')


Unnamed: 0,state,account length,area code,phone number,international plan,voice mail plan,number vmail messages,total day minutes,total day calls,total day charge,...,total eve calls,total eve charge,total night minutes,total night calls,total night charge,total intl minutes,total intl calls,total intl charge,customer service calls,churn
0,KS,128,415,382-4657,no,yes,25,265.1,110,45.07,...,99,16.78,244.7,91,11.01,10.0,3,2.7,1,False
1,OH,107,415,371-7191,no,yes,26,161.6,123,27.47,...,103,16.62,254.4,103,11.45,13.7,3,3.7,1,False
2,NJ,137,415,358-1921,no,no,0,243.4,114,41.38,...,110,10.3,162.6,104,7.32,12.2,5,3.29,0,False
3,OH,84,408,375-9999,yes,no,0,299.4,71,50.9,...,88,5.26,196.9,89,8.86,6.6,7,1.78,2,False
4,OK,75,415,330-6626,yes,no,0,166.7,113,28.34,...,122,12.61,186.9,121,8.41,10.1,3,2.73,3,False


In [52]:
# Continuous/interval predictors only, as the exercise asks; the remaining
# columns (state, area code, phone number, and the two plan flags) are left out.
continuous_preds = ['account length', 'number vmail messages',
                    'total day minutes', 'total day calls', 'total day charge',
                    'total eve minutes', 'total eve calls', 'total eve charge',
                    'total night minutes', 'total night calls', 'total night charge',
                    'total intl minutes', 'total intl calls', 'total intl charge',
                    'customer service calls']

X = churn[continuous_preds]
y = churn["churn"]

# Hold out 20% of the rows for testing. Fixing random_state makes the split --
# and therefore the selected k and the reported accuracy -- reproducible
# under Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size = 0.2, random_state = 42
)

# Z-score using statistics learned from the training rows only, then apply
# the same transformation to the test rows so no test information leaks in.
z = StandardScaler()

Xz_train = z.fit_transform(X_train)
Xz_test = z.transform(X_test)

# Base estimator whose n_neighbors is tuned below.
knn = KNeighborsClassifier()

# choose potential values of k
ks = {"n_neighbors": range(1,30)}

# use grid search (5-fold CV on the training set, scored by accuracy)
# to find the best k
grid = GridSearchCV(knn, ks, scoring = "accuracy", cv = 5)

# `m2` is the name used for scoring afterwards; it refers to the fitted
# grid search object.
m2 = grid.fit(Xz_train, y_train)

In [53]:
# Score the fitted grid search on the held-out, z-scored test split.
# The search was built with scoring="accuracy", so this is test-set accuracy.
m2.score(X=Xz_test, y=y_test)

0.8920539730134932

# KNN From Scratch

Write a function, `neighbors()` that takes in three arguments:

- `k`: the number of neighbors to find
- `df`: a dataframe with ONLY continuous variables (can be any # of rows or columns)
- `point`: the values of the data point you're finding neighbors for

This function should find the Euclidean distance between `point` and every other data point in `df` (hint: `np.linalg.norm()`), and return a list of the indices of the `k` nearest neighbors (by indices, I mean that if the k-nearest neighbors are in the 0th, 15th, 23rd, 32nd, and 56th rows, you should return a list `[0,15,23,32,56]`). Assume that the datapoint `point` is NOT included as a row in `df`.

You may use `np.argpartition()` to find the indices of the `k` nearest neighbors. Below is an example of how it works. [Documentation](https://numpy.org/doc/stable/reference/generated/numpy.argpartition.html) linked here.


In [54]:
# Toy data: a plain list of values to search through.
ar = [1, 7, 9, 2, 0.1, 17, 17, 1.5]

# How many of the smallest values we want to locate.
k = 3

# argpartition rearranges the *indices* so that the k smallest values occupy
# the first k slots (in no particular order); slicing keeps just those.
indices = np.argpartition(ar, kth=k)[:k]

# Show the result so it can be checked by eye against `ar`.
print(indices)

[4 0 7]


In [55]:
def neighbors(k, df, point):
    """Return the row positions of the `k` rows of `df` nearest to `point`.

    Nearness is measured by Euclidean distance across all columns of `df`.

    Parameters
    ----------
    k : int
        Number of neighbors to find. Must satisfy ``0 <= k <= len(df)``.
    df : pandas.DataFrame
        Numeric columns only; each row is a candidate neighbor.
    point : array-like
        One value per column of `df`. A pandas Series is aligned to the
        columns by label; any other sequence is matched by position.

    Returns
    -------
    numpy.ndarray
        Integer row positions (usable with ``df.iloc``) of the `k` nearest
        rows, in no particular order.

    Raises
    ------
    ValueError
        If `k` is negative or larger than the number of rows in `df`.
    """
    n_rows = df.shape[0]
    if k < 0 or k > n_rows:
        raise ValueError(
            "k must be between 0 and the number of rows in df "
            "(%d); got %r" % (n_rows, k)
        )

    # np.argpartition requires kth < n, so asking for *every* row has to be
    # handled separately: all rows are neighbors.
    if k == n_rows:
        return np.arange(n_rows)

    # One vectorized subtraction + row-wise norm instead of a Python loop
    # over df.iloc, which is slow for large frames.
    diffs = df.sub(point, axis="columns").to_numpy(dtype=float)
    distances = np.linalg.norm(diffs, axis=1)

    # The first k slots of the partition hold the k smallest distances.
    return np.argpartition(distances, k)[:k]
    

In [56]:
# Sanity check for `neighbors()` on a small two-column frame.
d = pd.DataFrame({
    "x": [1.449, -1.069, -0.855, -0.281, -0.994, -0.969, -1.107, -1.252, -0.524, -0.497],
    "y": [-1.806, -0.582, -1.109, -1.015, -0.162, 0.563, 1.648, -0.773, 1.606, -1.158],
})

# Query point and number of neighbors to look up.
p = np.array([-0.75, -1])
k = 3

# Order does not matter, so compare as sets; a working `neighbors()` makes
# this cell evaluate to `True`.
set(neighbors(k, d, p)) == {3, 2, 9}

True