# Lesson 9: Assessing Your Models

Today:
1. Assessing your models
    + Accuracy
    + Other ways to measure goodness of models
2. Improving your models
    + Incorporating more features
    + k-Nearest Neighbor Classifiers

## 1. Measuring "Goodness" of Classifiers

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns

In [None]:
# Load the cancer dataset from the shared datasets folder into a DataFrame
cancerdata = pd.read_csv('../../../shared/datasets/cancer.csv')
# Show the dataset's dimensions as (number of rows, number of columns)
cancerdata.shape

In [None]:
# ---------------
# This cell gathers the pieces built in earlier lessons into one code cell.

# 1. THE DATASET
#  Separate the features from the label, then hold out part of the rows for testing.

from sklearn.model_selection import train_test_split

# Features: the two measurements used by the classifier. Label: the 'Class' column.
X = cancerdata.loc[:, ['Uniformity of Cell Size', 'Clump Thickness']]
Y = cancerdata.loc[:, 'Class']

# test_size=0.5 puts half of the rows in the test set;
# the fixed random_state makes the split reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.5,
    random_state=11,
)

# 2. THE CLASSIFIER 
# Encoding a simple classifier
#   (this was an example from lesson08)

def predict_tumor_class(uniformity, clump):
    """Predict a tumor's class from two measurements using fixed cutoffs.

    uniformity -- the uniformity-of-cell-size value
    clump      -- the clump-thickness value

    Returns 1 when uniformity is greater than 4 or clump is greater than 6,
    and 0 otherwise.
    """
    # Exceeding either cutoff is enough to predict class 1
    exceeds_a_cutoff = uniformity > 4 or clump > 6
    return 1 if exceeds_a_cutoff else 0


# 3. PREDICT THE CLASS OF EACH ROW OF THE TEST DATASET, USING A FOR LOOP

num_rows_test = len(y_test)
print(num_rows_test)


# empty array (placeholder only; this cell does not fill it in)
y_predicted = np.empty( num_rows_test )

# empty data frame with one row per test observation:
#   column 'actual'    will hold the true class taken from y_test
#   column 'predicted' will hold the class returned by predict_tumor_class

predictions = pd.DataFrame( np.empty( (num_rows_test, 2) ) )
predictions.rename( columns = {0:'actual', 1:'predicted'}, inplace = True)

for row in range(num_rows_test):
    # Read the features from X_test (not X): train_test_split shuffles the rows by
    # default, so position `row` in X is generally a different observation than
    # position `row` in y_test. Using X_test keeps each prediction paired with the
    # actual label of the same observation.
    predictions.iloc[row, 1] = predict_tumor_class( X_test.iloc[row, 0], X_test.iloc[row, 1] )
    predictions.iloc[row, 0] = y_test.iloc[row]

predictions.head()


In [None]:
# 4. ASSESSMENT
# Judge the predictions by comparing each one with the actual class.

# Add an "error" column holding the squared difference between the predicted
# and the actual class: 0 where they agree, non-zero where they differ
# (exactly 1 for a miss, assuming the actual classes are coded 0/1 like the
# predictions -- TODO confirm against the dataset).
predictions['error'] = predictions['predicted'].sub(predictions['actual']).pow(2)

# Average of the error column (under the 0/1 assumption, the share of wrong predictions)
print(predictions['error'].mean())

### The k-Nearest Neighbor Classifier

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Create a 3-nearest-neighbor classifier and train it on the training split
model = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)

# Score the model on the held-out test rows and predict a class for each of them
accuracy = model.score(X_test, y_test)
y_predicted = model.predict(X_test)

# Look at the predictions alongside the actual test labels
print(type(y_predicted))
print(y_predicted)
print(np.array(y_test))

### Measure Accuracy

In [None]:
# Loop over k = 1..50 and record how each k-nearest-neighbor model scores on the test split.
# The table has one row per k, with columns 'k' and 'accuracy'.
knnscores_df = pd.DataFrame(np.empty((50, 2)), columns=['k', 'accuracy'])

row = 0
for k in np.arange(1, 51):
    # Train a fresh classifier for this k on the training split
    model = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)

    # Store k and the model's score on the test split in the current row
    knnscores_df.iat[row, 0] = k
    knnscores_df.iat[row, 1] = model.score(X_test, y_test)

    row += 1

In [None]:
from sklearn.model_selection import cross_validate

# Cross-validate a 3-nearest-neighbor classifier on the full dataset using 5 folds
model = KNeighborsClassifier(n_neighbors=3)
cv_results = cross_validate(model, X, Y, cv=5)

# Average of the per-fold test scores (displayed as the cell's output)
cv_results['test_score'].mean()

In [None]:
# Loop over k = 1..50 again, this time scoring each k with 10-fold cross-validation.
# The table has one row per k, with columns 'k' and 'accuracy'.
knnscores_df2 = pd.DataFrame(np.empty((50, 2)), columns=['k', 'accuracy'])

row = 0
for k in np.arange(1, 51):
    model = KNeighborsClassifier(n_neighbors=k)

    # Cross-validate this k on the full dataset
    cv_results = cross_validate(model, X, Y, cv=10)

    # Store k and the mean of the per-fold test scores in the current row
    knnscores_df2.iat[row, 0] = k
    knnscores_df2.iat[row, 1] = cv_results['test_score'].mean()

    row += 1