In [1]:
#Fakharyar Khan and Colin Hwang
#ECE-475: Frequentist Machine Learning
#Project 3: Model Assessment and Selection
#Professor Keene

import pandas as pd
import numpy as np
import random
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import KFold
from sklearn.feature_selection import SelectKBest

In [15]:
#returns a randomly generated feature set and classification labels
#and sorts the features by their correlation with the classification labels

def sim_gen_dat():
  #generate a matrix whose entries are samples from a Gaussian distribution of
  #mean 0 and std dev 1
  #the first 5000 columns are the "features" of our dataset

  dataset = np.random.normal(0, 1, size = (50, 5000))

  #and the last will hold our class labels
  labels = np.array(random.choices([0, 1], k = 50))

  #get a list of the correlations between the columns of dataset and the class labels

  corr_val = [abs(np.correlate(dataset[:, col], labels)[0])*-1 for col in range(0, 5000)]

  #use argsort to get the indices of the sorted correlation array
  #needed to sort it in descending order so we multiplied the array by -1
  indices = np.argsort(corr_val)


  #then we rearange the columns dataset based on these indices
  #in other words, we sort the features in the matrix based on their correlation they
  #are to the class label

  dataset[:] = dataset[:, indices]

  return dataset, labels

    

In [13]:
#returns the top 100 features that are most correlated with the label
#but doesn't use the validation set since that would give the model
#information about the testing labels

#takes in the feature matrix, the class labels, and the indices of the feature
#and label matrix corresponding to the training set
def best_feat(feat_mat, labels, train):

    #extract the training set from the feature matrix and labels
    train_x = feat_mat[train]
    train_y = labels[train]

    #then just as before, we take the top 100 features in the matrix
    #that are most correlated with the labels

    corr_val = [abs(np.correlate(train_x[:, col], labels)[0])*-1 for col in range(0, 5000)]


    indices = np.argsort(corr_val)

    feat_mat[:] = feat_mat[:, indices]

    feat_mat = feat_mat[:, 0:100]
    return feat_mat
    

#this function performs 5-fold cross validation
#and returns the average score that the model (KNN)
#achieves which gives us an idea of how well the 
#model will perform on the testing set

#takes in a boolean bad which tells it whether or not
#to perform the cross validation incorrectly (True) or correctly (False)

def run_sim(bad):
    #get the features and labels
    feat_mat, labels = sim_gen_dat()

    #if we're doing cross validation wrong
    if(bad):
        #just take the top 100 most correlated features
        #which were determined using the entire dataset
        feat_mat = feat_mat[:, 0:100]

    avg_score = 0

    #create an instance of the KNN classifier
    neigh_mod = KNeighborsClassifier(n_neighbors = 1)

    #split the data into 5 folds
    kf = KFold(n_splits = 5)
    
    #then we iterate through each split by having
    #each fold be the validation set and the rest be the training set

    for train, val in kf.split(range(50)):
        
        feat = feat_mat

        #if we're doing it the right way
        if(not bad):
            #we determine the top 100 features using the 4 folds of training data
            #that we have by calling our best_feat function
            feat = best_feat(feat_mat, labels, train)

        #get the features in the validation and training sets
        #and the labels as well using the indices given in train and val

        val_x = feat[val]
        val_y = labels[val]

        train_x = feat[train]
        train_y = labels[train]

        #fit our model to the training data
        neigh_mod.fit(train_x, train_y)

        #get the score and add it
        avg_score += neigh_mod.score(val_x, val_y)
    
    #get the average performance of the model
    return avg_score / 5
        

In [16]:
#here we will test out the two different methods of validation
#and see what they tell us what kind of error we should expect
#when we train and test a KNN model on a randomly generated dataset

avg_score_bad = 0

avg_score_good = 0

for iterations in range(50):
    avg_score_bad += run_sim(True)
    avg_score_good += run_sim(False)

print("On average, the bad method of cross validation believes that we will achieve an error")
print("rate of " , 1 - avg_score_bad/50, " when we train and test our model on a randomly generated dataset")

print("\n\n")
print("On the other hand, the good method of cross validation tells us that on average, we will achieve an error")
print("rate of " , 1 - avg_score_good/50, " when we train and test our model on a randomly generated dataset")

On average, the bad method of cross validation believes that we will achieve an error
rate of  0.13360000000000016  when we train and test our model on a randomly generated dataset



On the other hand, the good method of cross validation tells us that on average, we will achieve an error
rate of  0.51  when we train and test our model on a randomly generated dataset


SUMMARY

In this project, we learned the right and the wrong way to do cross-validation. If given N = 50 samples with two equally sized classes and p = 5000 features that are independent of the class labels, we should expect to obtain an error rate of around 50% for the classifier. However, when you screen the best predictions before building the classifier and using cross validation to estimate tuning parameters, the prediction error will be much lower than expected.

Using the method outlined by the textbook (choosing the 100 predictors with the highest correlation, then using a 1-nearest neighbor classifier based on the predictors), the textbook obtained an error rate of 3% and in our implementation, we obtained an error rate of approximately 13%, which indicates this method is flawed. This is because when the predictors were chosen, they were chosen on the basis of all samples, leaving samples out only after the predictors had been chosen. This does not mirror how a classifier should operate on an independent test set since the predictors were allowed to "see" samples that were left out. 

Therefore, in order to obtain an error rate close to the theoretical, ideal error rate of 50%, we must choose the best predictions based on only the training data - not the testing - when performing cross validation. First, we divide the samples into K cross-validation folds randomly, where we chose 5 folds to follow the textbooks implementation. Then, for each of the 5 folds, we found a subset of good predictors and used the subset of predictors to construct a multivariate classifer, using all samples except samples within fold K. We then used the classifier to predict labels for the samples in fold K. The error estimates that are calculated from predicting the labels in all of the folds are accumulated in order to obtain the cross-validation of the error rate. This method prevents selected predictors from "seeing" leftout samples. Using this method, we got an error rate of ~50%, which is the exact value we should expect for this problem. 


