# K-Nearest Neighbor Model Implementation


<img src="../images/KNN - Model for Group Assigment.jpg" alt="Diagram of the KNN model workflow for the group assignment" />

#### IMPORTS


In [1]:
import numpy as np
import pandas as pd
import sklearn.neighbors as skl_nb
import sklearn.model_selection as skl_ms
from sklearn.model_selection import train_test_split
import sklearn.preprocessing as prep
import matplotlib.pyplot as plt

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
from sklearn.exceptions import DataConversionWarning
warnings.filterwarnings(action='ignore', category=DataConversionWarning)

import sys
sys.path.append("..")

from utils.loading_data  import load_to_df_from_csv, get_all_feature_combinations
from utils.knn_functions import find_best_k_with_misclassification_cv, model_iterator_cv, data_normalizer, generate_prediction_results

#### STEP 1: LOADING DATA


In [2]:
# Read the training set; it is the only dataset used in this notebook.
data = load_to_df_from_csv("../data/train.csv")

# Normalise column names in one pass: spaces and hyphens become underscores,
# then everything is lower-cased (e.g. "Number words female" -> "number_words_female").
data.columns = (
    data.columns
    .str.replace(' ', '_')
    .str.replace('-', '_')
    .str.lower()
)

# Features are every column except the target; the target is selected with a
# boolean column mask so that y stays a one-column DataFrame (not a Series).
X = data.loc[:, ~(data.columns == 'lead')]
y = data.loc[:, data.columns == 'lead']

#### SPLITTING DATA

In [3]:
# Hold out a test set with scikit-learn's default split ratio; the fixed
# random_state keeps the partition reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2)

# Show the shape of each partition as a quick sanity check.
[part.shape for part in (X_train, X_test, y_train, y_test)]

[(779, 13), (260, 13), (779, 1), (260, 1)]

### Exploring data


In [4]:
# (rows, columns) of the full dataset, i.e. the features plus the 'lead' target column.
data.shape

(1039, 14)

In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) of the raw, un-normalised columns.
data.describe()


Unnamed: 0,number_words_female,total_words,number_of_words_lead,difference_in_words_lead_and_co_lead,number_of_male_actors,year,number_of_female_actors,number_words_male,gross,mean_age_male,mean_age_female,age_lead,age_co_lead
count,1039.0,1039.0,1039.0,1039.0,1039.0,1039.0,1039.0,1039.0,1039.0,1039.0,1039.0,1039.0,1039.0
mean,2334.256015,11004.368624,4108.256978,2525.024062,7.767084,1999.862368,3.507218,4561.85563,111.149182,42.353766,35.929588,38.716073,35.486044
std,2157.216744,6817.397413,2981.251156,2498.747279,3.901439,10.406632,2.088526,3417.855987,151.761551,7.81711,8.957193,12.285902,12.046696
min,0.0,1351.0,318.0,1.0,1.0,1939.0,1.0,0.0,0.0,19.0,11.0,11.0,7.0
25%,904.0,6353.5,2077.0,814.5,5.0,1994.0,2.0,2139.5,22.0,37.480769,29.5,30.0,28.0
50%,1711.0,9147.0,3297.0,1834.0,7.0,2000.0,3.0,3824.0,60.0,42.6,35.0,38.0,34.0
75%,3030.5,13966.5,5227.0,3364.0,10.0,2009.0,5.0,5887.5,143.5,47.333333,41.5,46.0,41.0
max,17658.0,67548.0,28102.0,25822.0,29.0,2015.0,16.0,31146.0,1798.0,71.0,81.333333,81.0,85.0


### Normalization Check

In [6]:
# Sanity check of the project's data_normalizer helper on the feature frame:
# the recorded summary shows every column with min 0.0 and max 1.0 after normalisation.
data_normalizer(X).describe()

Unnamed: 0,number_words_female,total_words,number_of_words_lead,difference_in_words_lead_and_co_lead,number_of_male_actors,year,number_of_female_actors,number_words_male,gross,mean_age_male,mean_age_female,age_lead,age_co_lead
count,1039.0,1039.0,1039.0,1039.0,1039.0,1039.0,1039.0,1039.0,1039.0,1039.0,1039.0,1039.0,1039.0
mean,0.132193,0.145828,0.136419,0.097751,0.241682,0.800821,0.167148,0.146467,0.061818,0.449111,0.354449,0.395944,0.365206
std,0.122167,0.102987,0.107301,0.096772,0.139337,0.136929,0.139235,0.109737,0.084406,0.150329,0.127353,0.175513,0.154445
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.051195,0.07557,0.06331,0.031505,0.142857,0.723684,0.066667,0.068693,0.012236,0.355399,0.263033,0.271429,0.269231
50%,0.096897,0.11777,0.10722,0.070989,0.214286,0.802632,0.133333,0.122777,0.03337,0.453846,0.341232,0.385714,0.346154
75%,0.171622,0.190575,0.176684,0.130243,0.321429,0.921053,0.266667,0.189029,0.079811,0.544872,0.433649,0.5,0.435897
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [7]:
# Build the candidate feature subsets from the column names of X.
feature_combinations = get_all_feature_combinations(X.columns)

# 2**13 - 1 = 8191 is the number of non-empty subsets of the 13 feature columns;
# per the original note, passing 8191 runs the search over all combinations.
# NOTE(review): the exact meaning of this last argument is defined in
# utils.knn_functions.model_iterator_cv - confirm there before lowering it.
results = model_iterator_cv(
    X_train,
    y_train,
    feature_combinations,
    2 ** 13 - 1,
)


#### Generate Prediction Results

In [8]:
# Feature subset used for the final model evaluation.
selected_features = [
    'number_words_female',
    'total_words',
    'number_of_words_lead',
    'difference_in_words_lead_and_co_lead',
    'number_of_male_actors',
    'number_of_female_actors',
    'number_words_male',
    'gross',
]

# Evaluate on the train and test partitions restricted to the selected features.
# NOTE(review): the trailing 8 is presumably the number of neighbours (k) -
# confirm against utils.knn_functions.generate_prediction_results.
generate_prediction_results(
    X_train[selected_features],
    y_train,
    X_test[selected_features],
    y_test,
    8,
)

Train Misclassification Error: 32.74053664798522%
Train Accuracy: 67.25946335201478%
Test Misclassification Error: 29.69230769230769%
Test Accuracy: 70.30769230769232%
