In [23]:
# Import Packages
import csv
import time
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import scale
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load in the training data .csv file and convert it to a useable format
# This cell should take about 3 minutes to run
start_time = time.time()
print "Loading training data."

train_data_raw = []

with open("train.csv") as train_csv:
    reader = csv.reader(train_csv)

    for row in reader:
        train_data_raw.append(row)
        
print "Training data loaded. Time Elapsed: %.0f seconds" %(time.time() - start_time)

Loading training data.
Training data loaded. Time Elapsed: 126 seconds


In [3]:
data_size = len(train_data_raw) # This parameter can be adjusted

start_time = time.time()
print "Preprocessing training data."

train_data = []
train_labels = []
#dev_data = []
#dev_labels = []

for i in range(1, data_size):
    next_row = []
    for j in range(1, 5):
        next_row.append(float(train_data_raw[i][j]))
    
    train_data.append(next_row)
    train_labels.append(train_data_raw[i][5])
#    if (i > data_size/6):
#        train_data.append(next_row)
#        train_labels.append(train_data_raw[i][5])
#    
#    else:
#        dev_data.append(next_row)
#        dev_labels.append(train_data_raw[i][5])

num_train_labels = len(set(train_labels))
#num_dev_labels = len(set(dev_labels))
#labels_intersection = set(train_labels).intersection(dev_labels)
#dev_only_labels = num_dev_labels - len(labels_intersection)

train_data = scale(train_data)
#dev_data = scale(dev_data)

print "Training data preprocessed. Time Elapsed: %.0f seconds" %(time.time() - start_time)
print "Training data size: %d" %len(train_data)
#print "Dev data size: %d" %len(dev_data)
print "Number of distinct training labels: %d" %num_train_labels
#print "Number of distinct dev labels: %d" %num_dev_labels
#print "Number of dev labels not found in the training data: %d" %dev_only_labels

Preprocessing training data.
Training data preprocessed. Time Elapsed: 528 seconds
Training data size: 29118021
Number of distinct training labels: 108390


In [4]:
# Train a knn model
start_time = time.time()
print "Training a KNN model."
k_neighbors_classifier = KNeighborsClassifier(n_neighbors = 10)
k_neighbors_classifier.fit(train_data, train_labels)
print "Model trained. Time elapsed: %d seconds" %(time.time() - start_time)

Training a KNN model.
Model trained. Time elapsed: 2863 seconds


In [5]:
# Load in the test data .csv file and convert it to a useable format
# This cell should take about 3 minutes to run 
# It only needs to be run if we have a trained model that's ready to test
start_time = time.time()
print "Loading/formatting/preprocessing test data."

test_csv = open("test.csv")
reader = csv.reader(test_csv)

test_data_raw = []

for row in reader:
    test_data_raw.append(row)
     
test_data = []
for i in range(1, len(test_data_raw)):
    next_row = []
    for j in range(1, 5):
        next_row.append(float(test_data_raw[i][j]))
    
    test_data.append(next_row)

test_data = scale(test_data)

print "Test data loaded/formatted/preprocessed Time Elapsed: %.0f seconds" \
    %(time.time() - start_time)

Loading/formatting/preprocessing test data.
Test data loaded/formatted/preprocessed Time Elapsed: 522 seconds


In [61]:
start_time = time.time()
print "Sorting possible labels."
labels_sorted = list(set(train_labels))
labels_sorted.sort()
print "Labels sorted. Time elapsed: %.0f seconds" %(time.time() - start_time)

def top_3_predictions(probabilities):
    
    probabilities = list(probabilities)
    sorted_probabilities = list(probabilities)
    sorted_probabilities.sort()
    
    first_index = probabilities.index(sorted_probabilities[len(sorted_probabilities) - 1])
    second_index = probabilities.index(sorted_probabilities[len(sorted_probabilities) - 2])
    third_index = probabilities.index(sorted_probabilities[len(sorted_probabilities) - 3])
    
    return [labels_sorted[first_index],labels_sorted[second_index],labels_sorted[third_index]]


start_time = time.time()
print "\nPredicting the test data.\n"

predictions = []

batch_size = 100
num_of_batches = int(np.ceil(len(train_data)/float(batch_size)))

for i in range(num_of_batches):
    
    first_index = batch_size * i
    last_index = batch_size * (i+1)
    if last_index > len(test_data) - 1:
        last_index = len(test_data) - 1
        
    probabilities = k_neighbors_classifier.predict_proba(test_data[first_index:last_index])
    
    for j in range(batch_size):
        next_predictions = top_3_predictions(probabilities[j])
        predictions.append(next_predictions)
                
    if (i+1) % 1000 == 0:
        print "(%d / %d) data points predicted after %.0f seconds" \
            %(last_index, len(test_data), (time.time() - start_time))
    
print "\nTest data predicted. Time Elapsed: %.0f seconds" %(time.time() - start_time)

Sorting possible labels.
Labels sorted. Time elapsed: 18 seconds

Predicting the test data.


(100000 / 8607230) data points predicted after 981 seconds

(200000 / 8607230) data points predicted after 1943 seconds

(300000 / 8607230) data points predicted after 2907 seconds

(400000 / 8607230) data points predicted after 3873 seconds

(500000 / 8607230) data points predicted after 4834 seconds

(600000 / 8607230) data points predicted after 5809 seconds

(700000 / 8607230) data points predicted after 6771 seconds

(800000 / 8607230) data points predicted after 7737 seconds

(900000 / 8607230) data points predicted after 8700 seconds

(1000000 / 8607230) data points predicted after 9664 seconds

(1100000 / 8607230) data points predicted after 10627 seconds

(1200000 / 8607230) data points predicted after 11590 seconds

(1300000 / 8607230) data points predicted after 12561 seconds

(1400000 / 8607230) data points predicted after 13523 seconds

(1500000 / 8607230) data points predicted af

IndexError: index 30 is out of bounds for axis 0 with size 30

In [62]:
print "Writing predictions to CSV file."
start_time = time.time()

with open('submission.csv', 'r+b') as submission:
    submission_writer = csv.writer(submission, delimiter=',')
    submission_writer.writerow(['row_id', 'place_id'])
    
    for i in range (len(predictions)):
        row_id = str(i)
        place_ids = str(predictions[i][0]) + " " + str(predictions[i][1]) + " " + str(predictions[i][2])
        submission_writer.writerow([row_id, place_ids])

print "Writing done. Time Elapsed: %.0f seconds" %(time.time() - start_time)  

Writing predictions to CSV file.
Writing done. Time Elapsed: 34 seconds
