In [74]:
import numpy as np
from scipy import stats
import pandas as pd

# The Plan

Making a basic KNN algorithm has three main steps:

1. Measure the distance between points
    - this is generally Euclidean distance
    
2. Sort the data by its distance from the query point
    - once the sorting happens, we can simply take the top neighbors
    - depending on the amount of neighbors the user wants
    
3. Return the number of neighbors the user wants and the prediction for its class (or numerical value, if regression)

In [66]:
# Toy classification set: every row is [feature_1, feature_2, class_label].
classification_dummy_data = [
    [22, 23, 0],
    [10, 11, 1],
    [11, 15, 1],
    [29, 30, 0],
    [9, 13, 1],
    [25, 27, 0],
    [10, 12, 1],
    [25, 27, 0],
    [24, 28, 0],
    [14, 16, 1],
]

In [25]:
#first make a function that measures Euclidean distance

def e_distance(original_arr, test_arr):
    """Return the Euclidean distance between two rows, ignoring the label slot.

    The last element of ``original_arr`` is assumed to be the target
    (class label or regression value) and is skipped, so only the first
    ``len(original_arr) - 1`` positions of both rows are compared.

    Parameters
    ----------
    original_arr : sequence of numbers
        A training row whose final element is the target.
    test_arr : sequence of numbers
        The query row; it must provide at least ``len(original_arr) - 1``
        values. Anything beyond that is never read.

    Returns
    -------
    numpy.float64
        Square root of the summed squared differences. A row with no
        feature columns yields 0.0 instead of raising.
    """
    # accumulate the squared difference of every feature column
    squared_sum = 0
    for idx in range(len(original_arr) - 1):
        squared_sum += (original_arr[idx] - test_arr[idx]) ** 2

    # take the root once, after the loop, so the result is always defined
    return np.sqrt(squared_sum)

In [30]:
# Sanity checks for e_distance; the final element of each list sits in the
# label slot and therefore does not contribute to the distance.

arr1 = [15, 25, 35, 45, 55]
arr2 = [15, 25, 35, 45, 55]
identical_distance = e_distance(arr1, arr2)

print('Distance between two identical arrays:', identical_distance)

arr3 = [15, 25, 35, 45, 55]
arr4 = [15, 25, 30, 50, 55]
similar_distance = e_distance(arr3, arr4)

print('Distance between two similar arrays:', similar_distance)

Distance between two identical arrays: 0.0
Distance between two similar arrays: 7.0710678118654755


In [62]:
#rank every training row by its distance to the test point, then let the
#k closest rows vote on the class

def knearest_neighbors_classification(original_array, test_array, k):
    """Predict the class of ``test_array`` from its ``k`` nearest neighbours.

    Parameters
    ----------
    original_array : iterable of rows
        Training rows; the last element of each row is its class label.
    test_array : sequence of numbers
        The query point, compared to every row with ``e_distance``.
    k : int
        Number of neighbours to report and to let vote; must be >= 1.

    Returns
    -------
    str
        A sentence naming the most common class among the neighbours.
        Each neighbour is also printed as ``[distance, row]``.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1 or ``original_array`` has no rows.
    """
    if k < 1:
        raise ValueError('k must be at least 1')

    #measure every row's distance from the test point
    all_distances = [[e_distance(row, test_array), row] for row in original_array]
    if not all_distances:
        raise ValueError('original_array must contain at least one row')

    #sort smallest to largest; ties fall back to the row contents exactly as
    #a plain list sort would, but list(...) keeps that comparison valid for
    #array-like rows as well
    all_distances.sort(key=lambda pair: (pair[0], list(pair[1])))

    #report the neighbours the user asked for
    nearest = all_distances[:k]
    for rank, neighbor in enumerate(nearest, start=1):
        print(f'This is the {rank} closest variable:', neighbor)

    #the label is the last item of each neighbouring row
    nearest_classes = [neighbor[-1][-1] for neighbor in nearest]

    #older SciPy returns the mode as a 1-element array, newer SciPy as a
    #scalar; normalising to 1-D keeps the message formatted as '[label]'
    #on every version
    prediction = np.atleast_1d(stats.mode(nearest_classes)[0])
    return f'Prediction is that this is in the {prediction} class.'

In [67]:
# Query point: two feature values plus a filler in the label position
# (e_distance never reads that last slot).
test_arr = [12, 13, 0]

classification_result = knearest_neighbors_classification(classification_dummy_data, test_arr, 2)
print(classification_result)

This is the 1 closest variable: [2.23606797749979, [10, 12, 1]]
This is the 2 closest variable: [2.23606797749979, [11, 15, 1]]
Prediction is that this is in the [1] class.


In [69]:
# Toy regression set: every row is [feature_1, feature_2, target_value].
regression_dummy_data = [
    [22, 23, 56],
    [10, 11, 134],
    [11, 15, 166],
    [29, 30, 87],
    [9, 13, 123],
    [25, 27, 55],
    [10, 12, 111],
    [25, 27, 98],
    [24, 28, 75],
    [14, 16, 147],
]

In [70]:
#rank every training row by its distance to the test point, then average
#the target values of the k closest rows

def knearest_neighbors_regression(original_array, test_array, k):
    """Predict a numeric value for ``test_array`` from its ``k`` nearest neighbours.

    Parameters
    ----------
    original_array : iterable of rows
        Training rows; the last element of each row is its target value.
    test_array : sequence of numbers
        The query point, compared to every row with ``e_distance``.
    k : int
        Number of neighbours to report and to average; must be >= 1.

    Returns
    -------
    str
        A sentence containing the mean target value of the neighbours.
        Each neighbour is also printed as ``[distance, row]``.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1 or ``original_array`` has no rows.
    """
    if k < 1:
        raise ValueError('k must be at least 1')

    #measure every row's distance from the test point
    all_distances = [[e_distance(row, test_array), row] for row in original_array]
    if not all_distances:
        raise ValueError('original_array must contain at least one row')

    #sort smallest to largest; ties fall back to the row contents exactly as
    #a plain list sort would, but list(...) keeps that comparison valid for
    #array-like rows as well
    all_distances.sort(key=lambda pair: (pair[0], list(pair[1])))

    #report the neighbours the user asked for
    nearest = all_distances[:k]
    for rank, neighbor in enumerate(nearest, start=1):
        print(f'This is the {rank} closest variable:', neighbor)

    #the target value is the last item of each neighbouring row
    nearest_values = [neighbor[-1][-1] for neighbor in nearest]

    prediction = np.mean(nearest_values)
    return f'Given the input value, the prediction is {prediction}.'

In [73]:
# Query point with feature values only; the target is what gets predicted.
test_arr = [12, 13]

regression_result = knearest_neighbors_regression(regression_dummy_data, test_arr, 2)
print(regression_result)

This is the 1 closest variable: [2.23606797749979, [10, 12, 111]]
This is the 2 closest variable: [2.23606797749979, [11, 15, 166]]
Given the input value, the prediction is 138.5.


# Real Data

Let's test out both the regression and classification algorithms with a real dataset.

https://archive.ics.uci.edu/ml/datasets/default+of+credit+card+clients#

In [76]:
# Read the "default of credit card clients" workbook into a DataFrame.
# NOTE(review): this is an absolute path on the author's machine -- point it
# at your own copy of the .xls file before running.
credit_data = pd.read_excel(r'C:\Users\Jrive\Desktop\CS_2\build_week1\default of credit card clients.xls')

In [79]:
# Remove the leftover 'Unnamed: 0' column. errors='ignore' keeps this cell
# safe to re-run once the column is already gone (a plain drop would raise
# KeyError the second time).
credit_data = credit_data.drop(columns='Unnamed: 0', errors='ignore')

In [81]:
# Preview the first rows; note that row 0 holds the human-readable field names.
credit_data.head()

Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,X9,X10,...,X15,X16,X17,X18,X19,X20,X21,X22,X23,Y
0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
1,20000,2,2,1,24,2,2,-1,-1,-2,...,0,0,0,0,689,0,0,0,0,1
2,120000,2,2,2,26,-1,2,0,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
3,90000,2,2,2,34,0,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
4,50000,2,2,1,37,0,0,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,0


In [82]:
# Snapshot of the current column labels (X1..X23, Y) as a plain list.
cols = list(credit_data.columns)

In [88]:
#I want the readable names to be the column names
# for readability and so I can convert them to numeric types

# Row 0 carries the readable name of every column. Build the whole
# old-name -> readable-name mapping from it and rename in one pass, rather
# than one in-place rename per column driven by chained indexing.
readable_names = {col: credit_data.loc[0, col] for col in cols}
credit_data = credit_data.rename(columns=readable_names)

print(credit_data.columns)

Index(['LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE', 'PAY_0', 'PAY_2',
       'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 'BILL_AMT1', 'BILL_AMT2',
       'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1',
       'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6',
       'default payment next month'],
      dtype='object')


In [89]:
# Refresh cols so it holds the readable names assigned by the rename above
# instead of the original X1..X23, Y labels.
cols = list(credit_data.columns)

In [93]:
# now I can change the data types

# Row 0 only held the header text, so drop it exactly once. Dropping it
# inside a per-column loop raises KeyError on the second pass because the
# label is already gone; errors='ignore' also keeps the cell re-runnable.
credit_data = credit_data.drop(index=0, errors='ignore')

# apply() returns a new frame, so the result has to be assigned back for the
# numeric dtypes to stick.
credit_data = credit_data.apply(pd.to_numeric)

credit_data

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
1,20000,2,2,1,24,2,2,-1,-1,-2,...,0,0,0,0,689,0,0,0,0,1
2,120000,2,2,2,26,-1,2,0,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
3,90000,2,2,2,34,0,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
4,50000,2,2,1,37,0,0,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
5,50000,1,2,1,57,-1,0,-1,0,0,...,20940,19146,19131,2000,36681,10000,9000,689,679,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29996,220000,1,3,1,39,0,0,0,0,0,...,88004,31237,15980,8500,20000,5003,3047,5000,1000,0
29997,150000,1,3,2,43,-1,-1,-1,-1,0,...,8979,5190,0,1837,3526,8998,129,0,0,0
29998,30000,1,2,2,37,4,3,2,-1,0,...,20878,20582,19357,0,0,22000,4200,2000,3100,1
29999,80000,1,3,1,41,1,-1,0,0,0,...,52774,11855,48944,85900,3409,1178,1926,52964,1804,1


In [96]:
# Print the max, min and mean of every column, one block per column.
for column_name in cols:
    column_values = credit_data[column_name]
    print(column_name)
    print(f'Max value in {column_name} column:', max(column_values))
    print(f'Min value in {column_name} column:', min(column_values))
    print(f'Average value in {column_name} column:', np.mean(column_values))
    print()

LIMIT_BAL
Max value in LIMIT_BAL column: 1000000
Min value in LIMIT_BAL column: 10000
Average value in LIMIT_BAL column: 167484.32266666667

SEX
Max value in SEX column: 2
Min value in SEX column: 1
Average value in SEX column: 1.6037333333333332

EDUCATION
Max value in EDUCATION column: 6
Min value in EDUCATION column: 0
Average value in EDUCATION column: 1.8531333333333333

MARRIAGE
Max value in MARRIAGE column: 3
Min value in MARRIAGE column: 0
Average value in MARRIAGE column: 1.5518666666666667

AGE
Max value in AGE column: 79
Min value in AGE column: 21
Average value in AGE column: 35.4855

PAY_0
Max value in PAY_0 column: 8
Min value in PAY_0 column: -2
Average value in PAY_0 column: -0.0167

PAY_2
Max value in PAY_2 column: 8
Min value in PAY_2 column: -2
Average value in PAY_2 column: -0.13376666666666667

PAY_3
Max value in PAY_3 column: 8
Min value in PAY_3 column: -2
Average value in PAY_3 column: -0.1662

PAY_4
Max value in PAY_4 column: 8
Min value in PAY_4 column: -2
Ave

In [97]:
# Show the current column names in order.
print(cols)

['LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE', 'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 'BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6', 'default payment next month']


In [None]:
# Hand-built rows, presumably meant as query points for the KNN functions above.
# test1 follows the column order: LIMIT_BAL, SEX (female), EDUCATION (grad school),
# MARRIAGE (married), AGE (29 y/o), then PAY_0 (Sept.), PAY_2 (Aug.), PAY_3 (July),
# PAY_4 (June), PAY_5 (May), PAY_6 (April).
# NOTE(review): only 11 of the 24 columns are filled in, and the PAY_* entries
# are fractional while the column summary shows those fields ranging from -2 to 8
# -- confirm the intended values before using this row.
test1 = [20000, 2, 1, 1, 29, -.1, -.2, -.25, -.3, -.32, -.39, ]
# test2 and test3 are still empty placeholders.
test2 = []
test3 = []