In [68]:
# This tells matplotlib not to try opening a new window for each plot.
%matplotlib inline

# General libraries.
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# SK-learn libraries for learning.
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.grid_search import GridSearchCV

# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report


In [118]:
data = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
data = data[data.Y != 90]
#data = data[data.Resolution != 'NONE']
stk_list = ['LARCENY/THEFT','OTHER OFFENSES','NON-CRIMINAL','ASSAULT','DRUG/NARCOTIC']
data = data[data.Category.isin(stk_list)]
#test = test[data.PdDistrict == 'MISSION']
data['Dates'] = pd.to_datetime(data['Dates'])
data['Year'] = data.Dates.dt.year
data['Month'] = data.Dates.dt.month
data['Day'] = data.Dates.dt.day
data['Date'] = data.Dates.dt.date
data['Hour'] = data.Dates.dt.hour
data['DayOfYear'] = data.Dates.dt.dayofyear
data['WeekDay'] = data.Dates.dt.weekday
#
test['Dates'] = pd.to_datetime(test['Dates'])
test['Year'] = test.Dates.dt.year
test['Month'] = test.Dates.dt.month
test['Day'] = test.Dates.dt.day
test['Date'] = test.Dates.dt.date
test['Hour'] = test.Dates.dt.hour
test['DayOfYear'] = test.Dates.dt.dayofyear
test['WeekDay'] = test.Dates.dt.weekday


In [119]:
data['PdCode'] = data['PdDistrict'].replace('SOUTHERN',1)
data['PdCode'] = data['PdCode'].replace('MISSION',2)
data['PdCode'] = data['PdCode'].replace('NORTHERN',3)
data['PdCode'] = data['PdCode'].replace('BAYVIEW',4)
data['PdCode'] = data['PdCode'].replace('CENTRAL',5)
data['PdCode'] = data['PdCode'].replace('TENDERLOIN',6)
data['PdCode'] = data['PdCode'].replace('INGLESIDE',7)
data['PdCode'] = data['PdCode'].replace('TARAVAL',8)
data['PdCode'] = data['PdCode'].replace('PARK',9)
data['PdCode'] = data['PdCode'].replace('RICHMOND',10)
test['PdCode'] = test['PdDistrict'].replace('SOUTHERN',1)
test['PdCode'] = test['PdCode'].replace('MISSION',2)
test['PdCode'] = test['PdCode'].replace('NORTHERN',3)
test['PdCode'] = test['PdCode'].replace('BAYVIEW',4)
test['PdCode'] = test['PdCode'].replace('CENTRAL',5)
test['PdCode'] = test['PdCode'].replace('TENDERLOIN',6)
test['PdCode'] = test['PdCode'].replace('INGLESIDE',7)
test['PdCode'] = test['PdCode'].replace('TARAVAL',8)
test['PdCode'] = test['PdCode'].replace('PARK',9)
test['PdCode'] = test['PdCode'].replace('RICHMOND',10)

In [120]:
data['PdCode']

1      3
2      3
3      3
4      9
5      7
8     10
9      5
10     5
11     8
13     3
14     4
15     4
17     7
18     4
19     6
...
878024     9
878025     4
878028     1
878029     6
878031     4
878032     3
878035     3
878036     3
878039     3
878040     2
878041    10
878042     4
878043     4
878045     7
878046     1
Name: PdCode, Length: 524193, dtype: int64

In [110]:
# Big Picture of Data
print "Train Data:"
print data.shape
print data.columns.values
print "Test Data:"
print test.shape
print test.columns.values


Train Data:
(877982, 17)
['Dates' 'Category' 'Descript' 'DayOfWeek' 'PdDistrict' 'Resolution'
 'Address' 'X' 'Y' 'Year' 'Month' 'Day' 'Date' 'Hour' 'DayOfYear' 'WeekDay'
 'PdCode']
Test Data:
(884262, 15)
['Id' 'Dates' 'DayOfWeek' 'PdDistrict' 'Address' 'X' 'Y' 'Year' 'Month'
 'Day' 'Date' 'Hour' 'DayOfYear' 'WeekDay' 'PdCode']


In [121]:
# Add a column counting days since the min date in the dataset
def add_date_diff(df):
    datetime_vector = pd.to_datetime(df['Dates'])
    date_vector = datetime_vector.dt.date
    date_diff_vector = (date_vector - date_vector.min()) / np.timedelta64(1, 'D')
    df['DateDiff'] = date_diff_vector

add_date_diff(data)
add_date_diff(test)
print data.DateDiff.describe()
print test.DateDiff.describe()


count    524193.000000
mean       2320.083996
std        1321.972685
min           0.000000
25%        1184.000000
50%        2333.000000
75%        3504.000000
max        4510.000000
Name: DateDiff, dtype: float64
count    884262.000000
mean       2259.105934
std        1327.543529
min           0.000000
25%        1088.000000
50%        2250.000000
75%        3444.000000
max        4512.000000
Name: DateDiff, dtype: float64


In [122]:
# Create random dev sample so we can see how that accuracy compares to our Kaggle results
np.random.seed(100)

rows = np.random.choice(data.index, size = len(data) / 10, replace = False)

dev = data.ix[rows]
train = data.drop(rows)

print train.shape
print dev.shape
print test.shape


(471774, 18)
(52419, 18)
(884262, 16)


In [125]:
# Convert to Numpy Format
train_data = np.array(train[['DateDiff','X','Y']].values)
train_labels = np.array(train[['Category']].values.ravel())

dev_data = np.array(dev[['DateDiff','X','Y']].values)
dev_labels = np.array(dev[['Category']].values.ravel())

full_data = np.array(data[['DateDiff','X','Y']].values)
full_labels = np.array(data[['Category']].values.ravel())

test_data = np.array(test[['DateDiff','X','Y']].values)

# Normalize Data to Between 0-1
#a + (x-A)*(b-a)/(B-A) 
train_normed = 0 + (np.abs(train_data) - np.abs(train_data).min(axis=0))*(1-0)/(np.abs(train_data).max(axis=0) - np.abs(train_data).min(axis=0)) 
dev_normed = 0 + (np.abs(dev_data) - np.abs(dev_data).min(axis=0))*(1-0)/(np.abs(dev_data).max(axis=0) - np.abs(dev_data).min(axis=0)) 

print train_normed.min(axis=0)
print train_normed.max(axis=0)

[ 0.  0.  0.]
[ 1.  1.  1.]


In [128]:

# Fit a basic KNN with DateDiff, X, and Y
KNNmodel = KNeighborsClassifier(n_neighbors=1)
KNNmodel.fit(train_data, train_labels)
dev_predict = KNNmodel.predict(dev_data)
print "The KNN with DateDiff, X and Y (k=1) scores: {:.6f}".format(metrics.f1_score(dev_labels, dev_predict, average='weighted'))

# Fit a basic KNN with Normalized Data
KNNmodel = KNeighborsClassifier(n_neighbors=1)
KNNmodel.fit(train_normed, train_labels)
dev_predict = KNNmodel.predict(dev_normed)
print "The KNN with normalized DateDiff, X and Y (k=1) scores: {:.6f}".format(metrics.f1_score(dev_labels, dev_predict, average='weighted'))


The KNN with DateDiff, X and Y (k=1) scores: 0.367664
The KNN with normalized DateDiff, X and Y (k=1) scores: 0.394827


KMeans(copy_x=True, init='k-means++', max_iter=300, n_clusters=4, n_init=10,
    n_jobs=1, precompute_distances=True, random_state=None, tol=0.0001,
    verbose=0)

(5685,)

In [9]:
# Use GridSearchCV to find a good number of neighbors.
#ks = {'n_neighbors': range(1,4)}
ks = {'n_neighbors': [1,2,3,4,5,6,7,8,9,10]}
KNNGridSearch = GridSearchCV(KNeighborsClassifier(), ks, scoring='f1_weighted')
KNNGridSearch.fit(train_data, train_labels)

# Report out on the accuracies    
print "The scores for each k value was %s " % (KNNGridSearch.grid_scores_)
print "The best k value was %s with accuracy %.4f" % (KNNGridSearch.best_params_, KNNGridSearch.best_score_)


The scores for each k value was [mean: 0.00303, std: 0.00382, params: {'n_neighbors': 1}, mean: 0.00314, std: 0.00389, params: {'n_neighbors': 2}, mean: 0.00300, std: 0.00380, params: {'n_neighbors': 3}, mean: 0.00302, std: 0.00378, params: {'n_neighbors': 4}, mean: 0.00299, std: 0.00371, params: {'n_neighbors': 5}, mean: 0.00300, std: 0.00373, params: {'n_neighbors': 6}, mean: 0.00297, std: 0.00372, params: {'n_neighbors': 7}, mean: 0.00297, std: 0.00371, params: {'n_neighbors': 8}, mean: 0.00295, std: 0.00369, params: {'n_neighbors': 9}, mean: 0.00295, std: 0.00368, params: {'n_neighbors': 10}] 
The best k value was {'n_neighbors': 2} with accuracy 0.0031


In [127]:
# Try k = 2
KNNmodel = KNeighborsClassifier(n_neighbors=1)
KNNmodel.fit(train_data, train_labels)
dev_predict = KNNmodel.predict(dev_data)
print "The KNN with DateDiff, X and Y (k=2) scores: {:.6f}".format(metrics.f1_score(dev_labels, dev_predict, average='weighted'))


# Tried a few tests (including the above). So far k=1 seems best.

The KNN with DateDiff, X and Y (k=2) scores: 0.367664


Great, now we've sorted out a naive alg to run. Let's train on all train, then predict on test

In [129]:
def create_submission(preds):
    labels = ["Id",
                "ARSON",
                "ASSAULT",
                "BAD CHECKS",
                "BRIBERY",
                "BURGLARY",
                "DISORDERLY CONDUCT",
                "DRIVING UNDER THE INFLUENCE",
                "DRUG/NARCOTIC",
                "DRUNKENNESS",
                "EMBEZZLEMENT",
                "EXTORTION",
                "FAMILY OFFENSES",
                "FORGERY/COUNTERFEITING",
                "FRAUD",
                "GAMBLING",
                "KIDNAPPING",
                "LARCENY/THEFT",
                "LIQUOR LAWS",
                "LOITERING",
                "MISSING PERSON",
                "NON-CRIMINAL",
                "OTHER OFFENSES",
                "PORNOGRAPHY/OBSCENE MAT",
                "PROSTITUTION",
                "RECOVERED VEHICLE",
                "ROBBERY",
                "RUNAWAY",
                "SECONDARY CODES",
                "SEX OFFENSES FORCIBLE",
                "SEX OFFENSES NON FORCIBLE",
                "STOLEN PROPERTY",
                "SUICIDE",
                "SUSPICIOUS OCC",
                "TREA",
                "TRESPASS",
                "VANDALISM",
                "VEHICLE THEFT",
                "WARRANTS",
                "WEAPON LAWS"
              ]
    head_str = ','.join(labels)

    num_cats = len(labels)
    
    # Make a dummy row to append to
    ids = np.arange(preds.shape[0])[np.newaxis].transpose()
    
    results = np.column_stack((ids, preds))

    # Write results to csv
    np.savetxt('sample.csv', results, fmt='%d', delimiter=',', header=head_str, comments='')

    return results

In [130]:
# Now that we've done this, let's run the KNN on the full train, apply to the test, then format.
KNNmodel = KNeighborsClassifier(n_neighbors=1)
KNNmodel.fit(full_data, full_labels)
dev_predict = KNNmodel.predict_proba(test_data).astype(int)

In [131]:
results = create_submission(dev_predict)

In [132]:
print results.shape

(884262, 6)


In [135]:
KNNmodel

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_neighbors=1, p=2, weights='uniform')