This notebook trains a k-nearest-neighbors (KNN) model on 80% of the training data, searches for the optimal value of k, and reports the weighted F1 score on the development (dev) data.

In [3]:
import numpy as np
import pandas as pd
import zipfile
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics

In [36]:
# Unzip data files into the "csv" subdirectory
# NOTE(review): extractall() is called without a target path, so members are written
# relative to the current working directory; this presumably relies on data.zip
# containing a top-level "csv/" folder -- confirm against the archive.

# **IMPORTANT**  This will overwrite existing files in the "csv" folder in your local repo
# with the most recent data files from the data.zip file

# The context manager closes the archive even if extraction fails part-way through.
with zipfile.ZipFile("data.zip", "r") as unzip_files:
    unzip_files.extractall()

In [15]:
# Load csv files into pandas dataframes
# train_data = pd.read_csv('../W207_CrimeClassification/csv/train_data.csv')
# train_labels = pd.read_csv('../W207_CrimeClassification/csv/train_labels.csv')
# dev_data = pd.read_csv('../W207_CrimeClassification/csv/dev_data.csv')
# dev_labels = pd.read_csv('../W207_CrimeClassification/csv/dev_labels.csv')
# train_data_all = pd.read_csv('../W207_CrimeClassification/csv/train_data_all.csv')
# train_labels_all = pd.read_csv('../W207_CrimeClassification/csv/train_labels_all.csv')
# test_data_all = pd.read_csv('../W207_CrimeClassification/csv/test_data_all.csv')

# NOTE: the pd.read_csv approach above did not work correctly, possibly because np.savetxt was used
# to create the csv files in the Data Set up notebook; the files are loaded with np.loadtxt instead.

In [37]:
# Read each csv file into its own numpy array.

# Feature files: parsed with np.loadtxt's default dtype.
train_data, dev_data, train_data_all, test_data_all = (
    np.loadtxt(csv_path, delimiter=",")
    for csv_path in (
        'csv/train_data.csv',
        'csv/dev_data.csv',
        'csv/train_data_all.csv',
        'csv/test_data_all.csv',
    )
)

# Label files: read as strings (dtype=str) rather than numbers.
train_labels, dev_labels, train_labels_all = (
    np.loadtxt(csv_path, dtype=str, delimiter=",")
    for csv_path in (
        'csv/train_labels.csv',
        'csv/dev_labels.csv',
        'csv/train_labels_all.csv',
    )
)

In [39]:
# print shapes to compare before and after csv conversion
# Each loaded array is reported once; data/label pairs should agree on their first dimension.
print("train_data shape is", train_data.shape)
print("train_labels shape is", train_labels.shape)
print("dev_data shape is", dev_data.shape)
print("dev_labels shape is", dev_labels.shape)
print("train_data_all shape is", train_data_all.shape)
print("train_labels_all shape is", train_labels_all.shape)
print("test_data_all shape is", test_data_all.shape)

train_data shape is (702439, 6)
train_labels shape is (702439,)
dev_data shape is (175610, 6)
dev_labels shape is (175610,)
train_data_all shape is (878049, 6)
train_data_all shape is (878049, 6)
train_labels_all shape is (878049,)
test_data_all shape is (884262, 6)


In [40]:
# Set up functions for training KNN models, comparing values of k, and reporting F1 scores
def TrainKNN(data, labels, test_data, k=5):
    """Fit a k-nearest-neighbors classifier and predict labels for test_data.

    Because no labels are needed for test_data, the same function serves both
    workflows: fitting on the full training set and predicting on the Kaggle
    test data, or fitting on the 80% training split and predicting on the
    development data.

    Args:
        data: training features, passed to KNeighborsClassifier.fit.
        labels: training labels matching data.
        test_data: features to generate predictions for.
        k: number of neighbors the classifier uses (default 5).

    Returns:
        The predictions produced by KNeighborsClassifier.predict for test_data.
    """
    # n_jobs=-1 lets scikit-learn use every available processor for the neighbor search.
    classifier = KNeighborsClassifier(n_neighbors=k, n_jobs=-1)
    # fit() returns the fitted estimator, so prediction can be chained directly.
    return classifier.fit(data, labels).predict(test_data)

def find_k(data, labels, dev_data, dev_labels, k_values):
    """Find the optimal value of k by scoring a KNN model for each candidate.

    For every k in k_values, a model is trained via TrainKNN on (data, labels),
    predictions are made for dev_data, and the weighted F1 score against
    dev_labels is printed.

    This cannot be used on the test data from Kaggle because we do not have
    labels for that data; it is intended only for the development stage with
    the development data.

    Args:
        data: training features.
        labels: training labels matching data.
        dev_data: development features to predict on.
        dev_labels: true labels for dev_data.
        k_values: iterable of candidate values of k.

    Returns:
        The k with the highest weighted F1 score (the earliest one on ties),
        or None if k_values is empty.
    """
    best_k, best_f1 = None, None
    for k in k_values:
        predictions = TrainKNN(data, labels, dev_data, k)
        f1 = metrics.f1_score(dev_labels, predictions, average="weighted")

        # Print the F1 score for each value of k
        print("For k =", k, "the F1 score is", round(f1, 4))

        # Strict comparison keeps the first k seen when scores tie.
        if best_f1 is None or f1 > best_f1:
            best_k, best_f1 = k, f1
    print("\n")
    return best_k

In [41]:
# Compare candidate neighbor counts: models are fit on the 80% training data
# and scored against the development data.
k_values = [1, 3, 5, 7, 9, 11, 15]
find_k(
    data=train_data,
    labels=train_labels,
    dev_data=dev_data,
    dev_labels=dev_labels,
    k_values=k_values,
)

  'recall', 'true', average, warn_for)


For k = 1 the F1 score is 0.2006


  'precision', 'predicted', average, warn_for)


For k = 3 the F1 score is 0.1552
For k = 5 the F1 score is 0.154
For k = 7 the F1 score is 0.1533
For k = 9 the F1 score is 0.1526
For k = 11 the F1 score is 0.1515
For k = 15 the F1 score is 0.1492




In [None]:
# Create zip file of predictions for submission to Kaggle 

# Not written for this model because this was our baseline
# But this is where that zip file would be created in the workflow for other models