# Initialization

In [71]:
import csv
import numpy as np
import pandas as pd
import zipfile
from sklearn.neighbors import KNeighborsClassifier
from sklearn import preprocessing

See what files are inside the zip from Kaggle:

In [3]:
z = zipfile.ZipFile('train.csv.zip')
print(z.namelist())

['train.csv']


# Data Pre-processing

Read the training data into a pandas data frame; parse the timestamp into year/week/time of day (hour)

In [None]:
kaggle_train = pd.read_csv(z.open('train.csv'), parse_dates=['Dates'])

kaggle_train['Year'] = kaggle_train['Dates'].map(lambda x: x.year)
kaggle_train['Week'] = kaggle_train['Dates'].map(lambda x: x.week)
kaggle_train['Hour'] = kaggle_train['Dates'].map(lambda x: x.hour)

In [None]:
print kaggle_train.info()

For now, I'm going to ignore Descript, Resolution, and Address. Descript and Resolution aren't available for the Kaggle test set, and address will be somewhat complicated to use. Remove these features from the kaggle_train data frame:

In [32]:
raw_kaggle_train = kaggle_train

kaggle_train = kaggle_train.drop(['Descript', 'Resolution', 'Address', 'Dates'], axis=1)

ValueError: labels ['Descript' 'Resolution' 'Address' 'Dates'] not contained in axis

Ultimately, we're going to be using scikit-learn to implement kNN. This requires that the weekday and police district be coded as dummy variables.

In [38]:
DistrictDummies = pd.get_dummies(kaggle_train['PdDistrict'])
WeekdayDummies = pd.get_dummies(kaggle_train['DayOfWeek'])

data=pd.concat([kaggle_train, DistrictDummies, WeekdayDummies], axis=1)
data=data.drop(['DayOfWeek', 'PdDistrict'], axis=1)
print data.head()

         Category           X          Y  Year  Week  Hour  BAYVIEW  CENTRAL  \
0        WARRANTS -122.425892  37.774599  2015    20    23        0        0   
1  OTHER OFFENSES -122.425892  37.774599  2015    20    23        0        0   
2  OTHER OFFENSES -122.424363  37.800414  2015    20    23        0        0   
3   LARCENY/THEFT -122.426995  37.800873  2015    20    23        0        0   
4   LARCENY/THEFT -122.438738  37.771541  2015    20    23        0        0   

   INGLESIDE  MISSION    ...      SOUTHERN  TARAVAL  TENDERLOIN  Friday  \
0          0        0    ...             0        0           0       0   
1          0        0    ...             0        0           0       0   
2          0        0    ...             0        0           0       0   
3          0        0    ...             0        0           0       0   
4          0        0    ...             0        0           0       0   

   Monday  Saturday  Sunday  Thursday  Tuesday  Wednesday  
0       

Split kaggle data into training and test sets

In [42]:
np.random.seed(seed=1)
test_idx = np.random.uniform(0, 1, len(data)) <= 0.3
train = data[test_idx==True]
test = data[test_idx==False]

print train.head()

          Category           X          Y  Year  Week  Hour  BAYVIEW  CENTRAL  \
2   OTHER OFFENSES -122.424363  37.800414  2015    20    23        0        0   
4    LARCENY/THEFT -122.438738  37.771541  2015    20    23        0        0   
5    LARCENY/THEFT -122.403252  37.713431  2015    20    23        0        0   
6    VEHICLE THEFT -122.423327  37.725138  2015    20    23        0        0   
12       VANDALISM -122.412414  37.783004  2015    20    22        0        0   

    INGLESIDE  MISSION    ...      SOUTHERN  TARAVAL  TENDERLOIN  Friday  \
2           0        0    ...             0        0           0       0   
4           0        0    ...             0        0           0       0   
5           1        0    ...             0        0           0       0   
6           1        0    ...             0        0           0       0   
12          0        0    ...             0        0           1       0   

    Monday  Saturday  Sunday  Thursday  Tuesday  Wednesd

# k-NN

In [51]:
import math

def llfun(act, pred):
    """ Logloss function for 1/0 probability
    """
    return (-(~(act == pred)).astype(int) * math.log(1e-15)).sum() / len(act)


In [77]:
basic_features = ['X', 'Y', 'Year', 'Week', 'Hour']
dayOfWeek_features = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
district_features = ['BAYVIEW', 'CENTRAL', 'INGLESIDE', 'MISSION', 'NORTHERN', 'PARK','RICHMOND','SOUTHERN', 'TARAVAL', 'TENDERLOIN']

features= basic_features + dayOfWeek_features + district_features

print features

['X', 'Y', 'Year', 'Week', 'Hour', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday', 'BAYVIEW', 'CENTRAL', 'INGLESIDE', 'MISSION', 'NORTHERN', 'PARK', 'RICHMOND', 'SOUTHERN', 'TARAVAL', 'TENDERLOIN']


In [78]:
train_X = train[features]

scaler = preprocessing.StandardScaler().fit(train_X)

normalized_train_X = scaler.transform(train_X)
train_y = train['Category'].astype('category')

test_X = test[features]
normalized_test_X=scaler.transform(test_X)
test_actual = test['Category'].astype('category')

In [None]:

# Fit
logloss = []
for i in range(40,41):
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(normalized_train_X, train_y)
    
    # Predict on test set
    outcome = knn.predict(normalized_test_X)
    
    # Logloss
    logloss.append(llfun(test_actual, outcome))

print logloss