In [1]:
# This tells matplotlib not to try opening a new window for each plot.
%matplotlib inline

# General libraries.
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# SK-learn libraries for learning.
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.grid_search import GridSearchCV

# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report


In [2]:
data = pd.read_csv("/Users/cjllop/Code/MIDS/MLearning/Final/Data/train.csv")
test = pd.read_csv("/Users/cjllop/Code/MIDS/MLearning/Final/Data/test.csv")


In [3]:
# Big Picture of Data
print "Train Data:"
print data.shape
print data.columns.values
print "Test Data:"
print test.shape
print test.columns.values


Train Data:
(878049, 9)
['Dates' 'Category' 'Descript' 'DayOfWeek' 'PdDistrict' 'Resolution'
 'Address' 'X' 'Y']
Test Data:
(884262, 7)
['Id' 'Dates' 'DayOfWeek' 'PdDistrict' 'Address' 'X' 'Y']


In [4]:
# Add a column counting days since the min date in the dataset
def add_date_diff(df):
    datetime_vector = pd.to_datetime(df['Dates'])
    date_vector = datetime_vector.dt.date
    date_diff_vector = (date_vector - date_vector.min()) / np.timedelta64(1, 'D')
    df['DateDiff'] = date_diff_vector

add_date_diff(data)
add_date_diff(test)
print data.DateDiff.describe()
print test.DateDiff.describe()


count    878049.000000
mean       2260.778323
std        1325.343365
min           0.000000
25%        1101.000000
50%        2252.000000
75%        3444.000000
max        4510.000000
Name: DateDiff, dtype: float64
count    884262.000000
mean       2259.105934
std        1327.543529
min           0.000000
25%        1088.000000
50%        2250.000000
75%        3444.000000
max        4512.000000
Name: DateDiff, dtype: float64


In [5]:
# Create random dev sample so we can see how that accuracy compares to our Kaggle results
np.random.seed(100)

rows = np.random.choice(data.index, size = len(data) / 10, replace = False)

dev = data.ix[rows]
train = data.drop(rows)

print train.shape
print dev.shape
print test.shape


(790245, 10)
(87804, 10)
(884262, 8)


In [6]:
# Convert to Numpy Format
train_data = np.array(train[['DateDiff','X','Y']].values)
train_labels = np.array(train[['Category']].values.ravel())

dev_data = np.array(dev[['DateDiff','X','Y']].values)
dev_labels = np.array(dev[['Category']].values.ravel())

test_data = np.array(test[['DateDiff','X','Y']].values)

# Normalize Data to Between 0-1
# a + (x-A)*(b-a)/(B-A) 
train_normed = 0 + (np.abs(train_data) - np.abs(train_data).min(axis=0))*(1-0)/(np.abs(train_data).max(axis=0) - np.abs(train_data).min(axis=0)) 
dev_normed = 0 + (np.abs(dev_data) - np.abs(dev_data).min(axis=0))*(1-0)/(np.abs(dev_data).max(axis=0) - np.abs(dev_data).min(axis=0)) 

print train_normed.min(axis=0)
print train_normed.max(axis=0)

[ 0.  0.  0.]
[ 1.  1.  1.]


In [7]:
# Fit a basic KNN with X, and Y
KNNmodel = KNeighborsClassifier(n_neighbors=1)
KNNmodel.fit(train_data[:,1:], train_labels)
dev_predict = KNNmodel.predict(dev_data[:,1:])
print "The KNN with X and Y (n=1) scores: {:.6f}".format(metrics.f1_score(dev['Category'], dev_predict, average='weighted'))

# Fit a basic KNN with DateDiff, X, and Y
KNNmodel = KNeighborsClassifier(n_neighbors=1)
KNNmodel.fit(train_data, train_labels)
dev_predict = KNNmodel.predict(dev_data)
print "The KNN with DateDiff, X and Y (n=1) scores: {:.6f}".format(metrics.f1_score(dev['Category'], dev_predict, average='weighted'))

# Fit a basic KNN with Normalized Data
KNNmodel = KNeighborsClassifier(n_neighbors=1)
KNNmodel.fit(train_normed, train_labels)
dev_predict = KNNmodel.predict(dev_normed)
print "The KNN with normalized DateDiff, X and Y (n=1) scores: {:.6f}".format(metrics.f1_score(dev['Category'], dev_predict, average='weighted'))


The KNN with X and Y (n=1) scores: 0.170861
The KNN with DateDiff, X and Y (n=1) scores: 0.220072
The KNN with normalized DateDiff, X and Y (n=1) scores: 0.217498


  'precision', 'predicted', average, warn_for)


In [8]:
# Use GridSearchCV to find a good number of neighbors.
# IN PROGRESS
#ks = {'n_neighbors': range(1,4)}
ks = {'n_neighbors': [1,5,10,50,100,500,1000,2500,5000,10000]}
KNNGridSearch = GridSearchCV(KNeighborsClassifier(), ks, scoring='f1_weighted')
KNNGridSearch.fit(train_data, train_labels)

# Report out on the accuracies    
print "The scores for each k value was %s " % (KNNGridSearch.grid_scores_)
print "The best k value was %s with accuracy %.4f" % (KNNGridSearch.best_params_, KNNGridSearch.best_score_)


KeyboardInterrupt: 