In [None]:
# Import Packages

import csv
import heapq
import time

from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import scale

In [None]:
# Read every row of train.csv (header row included) into memory as lists of strings.
# Expect this cell to need roughly 3 minutes.
# train.csv has to sit in the same directory as this notebook!

start_time = time.time()
print("Loading training data.")

with open("train.csv") as train_csv:
    reader = csv.reader(train_csv)
    # Materialise the whole reader at once; each element is one CSV row.
    train_data_raw = list(reader)

print("Training data loaded. Time Elapsed: %.0f seconds" % (time.time() - start_time))

In [None]:
# Format the raw training data, and split it into training and dev data sets
# This cell should take about 8 minutes to run

start_time = time.time()
print("Formatting raw training data.")

# Column layout of a raw row. Columns 1-4 are x, y, accuracy and time.
# The label index assumes place_id is the column right after time
# (row_id, x, y, accuracy, time, place_id) -- TODO confirm against train.csv.
# The label used to be read with the feature loop's leftover index (4), which
# silently used the time column as the label.
FEATURE_COLUMNS = range(1, 5)
LABEL_COLUMN = 5

# The data_size parameter can be adjusted in order to work with a subset of the training data
data_size = len(train_data_raw)

# Rows 1..dev_cutoff become dev data, every later row becomes training data
dev_cutoff = data_size // 6

train_data, train_labels = [], []
dev_data, dev_labels = [], []

# Read through rows of raw training data.
# Row 0 is skipped (presumably the CSV header row -- it is never converted to floats).
for i in range(1, data_size):
    raw_row = train_data_raw[i]

    # x, y, accuracy, and time become the numeric features
    next_data = [float(raw_row[j]) for j in FEATURE_COLUMNS]

    # place_id is kept as the string read from the CSV
    next_label = raw_row[LABEL_COLUMN]

    # Add the formatted rows to the training data/labels or dev data/labels
    if i > dev_cutoff:
        train_data.append(next_data)
        train_labels.append(next_label)
    else:
        dev_data.append(next_data)
        dev_labels.append(next_label)

# Build the distinct-label sets once; they are reused by all the statistics below
distinct_train_labels = set(train_labels)
distinct_dev_labels = set(dev_labels)
labels_intersection = distinct_train_labels.intersection(distinct_dev_labels)

print("Raw training data formatted.")
print("Time elapsed. %.0f seconds" % (time.time() - start_time))
print("Training data size: %d" % len(train_data))
print("Dev data size: %d" % len(dev_data))
print("Number of distinct training labels: %d" % len(distinct_train_labels))
print("Number of distinct dev labels: %d" % len(distinct_dev_labels))
# Count distinct dev labels that never occur in training (not dev rows)
print("Number of dev labels not found in the training data: %d"
      % (len(distinct_dev_labels) - len(labels_intersection)))

In [None]:
# Preprocess training/dev data

start_time = time.time()
print("Preprocessing training/dev data.")

# Learn each feature's mean and standard deviation from the training rows only,
# then apply that same transformation to the dev rows. Standardising the two sets
# independently would map the same raw point to different feature values in each set.
feature_scaler = StandardScaler()
train_data = feature_scaler.fit_transform(train_data)
dev_data = feature_scaler.transform(dev_data)

print("Training/dev data preprocessed. Time Elapsed: %.0f seconds" % (time.time() - start_time))

In [None]:
# Instantiate and train your model here...
# What follows is only a placeholder example: a KNeighborsClassifier with default
# settings, fitted on the first 100 training rows and their labels.
example_rows = slice(100)
k_neighbors_classifier = KNeighborsClassifier()
k_neighbors_classifier.fit(train_data[example_rows], train_labels[example_rows])

In [None]:
# Load/format/preprocess the test data .csv file
# This cell should take about 12 minutes to run
# It only needs to be run if we have a trained model that's ready to test
# Make sure the .csv file is in the same directory as this notebook!

start_time = time.time()
print("Loading/formatting/preprocessing test data.")

# Read every row of the file; the with-block closes it even if reading fails
with open("test.csv") as test_csv:
    reader = csv.reader(test_csv)
    test_data_raw = list(reader)

# Read through rows of raw test data.
# Row 0 is skipped (presumably the CSV header row -- it is never converted to floats).
test_data = []
for i in range(1, len(test_data_raw)):
    raw_row = test_data_raw[i]

    # Add x, y, accuracy, and time columns to the test data
    next_row = [float(raw_row[j]) for j in range(1, 5)]
    test_data.append(next_row)

# NOTE(review): this standardises the test set with its own mean/std rather than
# with the statistics of the data the model was trained on -- confirm that is intended.
test_data = scale(test_data)

print("Test data loaded/formatted/preprocessed. Time Elapsed: %.0f seconds"
      % (time.time() - start_time))
print("Test data size: %d" % len(test_data))

In [None]:
# Score your test data here using "predict_proba" and save the results under "predictions"
# Placeholder example: only the first 100 test rows are scored, so that the
# following cells have a "predictions" array to work with.

example_test_rows = test_data[:100]
predictions = k_neighbors_classifier.predict_proba(example_test_rows)


In [None]:
start_time = time.time()
print("\nExtracting the top 3 labels for each test data point.")

# Column j of `predictions` is the probability of the j-th class the classifier was
# fitted on, so the reference list must come from the fitted model itself. A list
# built from every training label only lines up when the model saw all of them.
# If a different model produced `predictions`, use that model's `classes_` here.
labels_sorted = list(k_neighbors_classifier.classes_)

# Find the top 3 prediction probabilities for each test data point
class_indices = range(predictions.shape[1])
top_3_predictions = []
for i in range(predictions.shape[0]):
    probabilities = predictions[i]

    # Indices of the (up to) 3 largest probabilities, highest first; on ties the
    # lower column index wins. Each index appears at most once.
    best_indices = heapq.nlargest(3, class_indices, key=probabilities.__getitem__)

    # Use those indices to look up the actual labels in our reference list
    top_3_predictions.append([labels_sorted[j] for j in best_indices])

print("Top 3 predictions made. Time elapsed: %.0f seconds" % (time.time() - start_time))

In [None]:
# Write the test predictions to the csv file
# "submission.csv" is created next to this notebook if it does not exist yet;
# any previous contents are replaced.

print("Writing predictions to CSV file.")
start_time = time.time()

# 'wb' creates the file when missing and truncates it otherwise. The earlier 'r+b'
# mode required an existing file and left stale rows behind whenever the old file
# was longer than the new output. Binary mode is what the Python 2 csv module expects.
with open('submission.csv', 'wb') as submission:
    submission_writer = csv.writer(submission, delimiter=',')
    submission_writer.writerow(['row_id', 'place_id'])

    # row_id is simply the position of the prediction -- presumably test.csv row ids
    # start at 0 and are in file order; verify against the data.
    for i in range(predictions.shape[0]):
        row_id = str(i)
        # Space-separated labels, best guess first
        place_ids = " ".join(str(label) for label in top_3_predictions[i])
        submission_writer.writerow([row_id, place_ids])

print("Writing done. Time Elapsed: %.0f seconds" % (time.time() - start_time))