![title](Header__0004_6.png "Header")
___
# Chapter 6 - Cluster Analysis
## Segment 3 - Instance-based learning w/ k-Nearest Neighbor
#### Setting up for classification analysis

In [None]:
# K Nearest Neighbor - KNN is a supervised classifier that memorizes observations from within a labeled training set 
# to predict classification labels for new, unlabeled observations. 
# KNN makes predictions based on how similar training observations are to the new, incoming observations. The more similar
# the observations' values, the more likely they will be classified with the same label. 
# 
# Use cases:
# - Stock price prediction
# - recommendation systems
# - Credit risk analysis
# - Predictive trip planning
#
# Assumptions:
# - Dataset has little noise
# - dataset is labeled
# - dataset only contains relevant features
# - dataset has distinguishable subgroups
# - Avoid using KNN on large datasets. It will probably take too long. 
#
#
#
#
#
#
#
#
#

In [2]:
import urllib

import numpy as np
import pandas as pd
import scipy

import matplotlib.pyplot as plt
from pylab import rcParams

import sklearn
from sklearn.neighbors import KNeighborsClassifier
from sklearn import neighbors
from sklearn import preprocessing
from sklearn import metrics

# train_test_split moved to sklearn.model_selection in scikit-learn 0.18 and the old
# sklearn.cross_validation module was removed in 0.20; keep the old import as a fallback.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

In [3]:
#just setting the plotting parameters. 
np.set_printoptions(precision=4, suppress=True) 
%matplotlib inline
rcParams['figure.figsize'] = 7, 4
plt.style.use('seaborn-whitegrid')

## Splitting your data into test and training datasets

In [4]:
# Load the mtcars dataset and build the feature matrix and target vector.
# NOTE: this is an absolute, machine-specific path; point it at your local copy of mtcars.csv.
address = 'C:/Users/Lillian Pierson/Desktop/Exercise Files/Ch06/06_03/mtcars.csv'
cars = pd.read_csv(address)
cars.columns = ['car_names','mpg','cyl','disp', 'hp', 'drat', 'wt', 'qsec', 'vs', 'am', 'gear', 'carb']
# Predictors used to predict the transmission type: columns at positions 1, 3, 4 and 6
# (mpg, disp, hp and wt). `.values` extracts the underlying NumPy array.
# `.ix` was deprecated in pandas 0.20 and removed in 1.0; `.iloc` is the positional replacement.
X_prime = cars.iloc[:, [1, 3, 4, 6]].values
# Target variable: column at position 9, 'am' (automatic/manual transmission).
y = cars.iloc[:, 9].values

In [5]:
# Scale the predictor matrix so that all features are on a comparable scale before
# distances between observations are computed.
# NOTE(review): scaling is applied to the full dataset before the train/test split below, so the
# scaling statistics also see the test rows — confirm this is acceptable for the exercise.
X = preprocessing.scale(X_prime)

In [7]:
# Split the scaled features and the target into training and test subsets with
# scikit-learn's train_test_split.
# The outputs are, in order: X_train, X_test, y_train, y_test.
# test_size=0.33 holds out 33% of the observations for testing; the rest is used for training.
# random_state is the seed, so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=17,
)

## Building and training your model with training data

In [9]:
# Create a k-nearest-neighbors classifier with default settings and train it in one step:
# X_train holds the training features and y_train the matching target labels.
# fit() returns the fitted estimator itself, so clf is the trained model.
clf = neighbors.KNeighborsClassifier().fit(X_train, y_train)

# Show the estimator and the hyperparameters it was built with.
print(clf)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')


## Evaluating your model's predictions against the test dataset

In [13]:
# Ask the trained model to predict a label for every observation in the test features.
y_pred = clf.predict(X_test)

# Alias the true test labels as y_expect so the comparison below reads naturally.
y_expect = y_test

# classification_report scores the predictions against the true labels
# (precision, recall, f1-score and support per class).
print(metrics.classification_report(y_expect, y_pred))

             precision    recall  f1-score   support

          0       0.71      1.00      0.83         5
          1       1.00      0.67      0.80         6

avg / total       0.87      0.82      0.82        11



In [None]:
# Recall is a measure of a model's completeness. These results are telling us that of all the points truly labeled 1,
# only 67% were found (predicted as 1) by the model, and that across the entire test set 82% of the points were
# assigned their correct label. Precision measures relevancy: every point predicted as 1 truly was a 1 (precision 1.00). 
# 
#
# High precision + low recall = few results returned, but many of the label predictions returned were correct. 
# High accuracy, but low completeness. 