In [9]:
# This tells matplotlib not to try opening a new window for each plot.
%matplotlib inline

# General libraries.
import os

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# SK-learn libraries for learning.
from sklearn.pipeline import Pipeline
from sklearn.neural_network import BernoulliRBM
from sklearn.grid_search import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report


In [2]:
# Directory holding train.csv and test.csv. Set the CRIME_DATA_DIR environment
# variable to point at another copy; the fallback is the path this notebook was
# originally written against.
DATA_DIR = os.environ.get("CRIME_DATA_DIR", "/Users/cjllop/Code/MIDS/MLearning/Final/Data")

data = pd.read_csv(os.path.join(DATA_DIR, "train.csv"))
test = pd.read_csv(os.path.join(DATA_DIR, "test.csv"))


In [3]:
# Big picture of the data: print the dimensions and column names of each frame.
for heading, frame in (("Train Data:", data), ("Test Data:", test)):
    print(heading)
    print(frame.shape)
    print(frame.columns.values)


Train Data:
(878049, 9)
['Dates' 'Category' 'Descript' 'DayOfWeek' 'PdDistrict' 'Resolution'
 'Address' 'X' 'Y']
Test Data:
(884262, 7)
['Id' 'Dates' 'DayOfWeek' 'PdDistrict' 'Address' 'X' 'Y']


In [4]:
# Add a column counting days since a reference date (by default the min date in the frame)
def add_date_diff(df, origin=None):
    """Add a 'DateDiff' column holding the number of days elapsed since `origin`.

    The frame is modified in place; nothing is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'Dates' column that `pd.to_datetime` can parse.
    origin : date-like, optional
        The day treated as day 0; any time-of-day component is dropped.
        When omitted, the earliest date found in `df` is used.
    """
    # normalize() truncates every timestamp to midnight while keeping the
    # datetime64 dtype, so the subtraction below stays a vectorised
    # timedelta operation instead of working on Python date objects.
    day_vector = pd.to_datetime(df['Dates']).dt.normalize()
    if origin is None:
        origin = day_vector.min()
    else:
        origin = pd.Timestamp(pd.Timestamp(origin).date())
    df['DateDiff'] = (day_vector - origin) / np.timedelta64(1, 'D')

# Measure both frames from the same day 0. Letting each frame fall back to its
# own earliest date would put train and test DateDiff values on different
# origins whenever those earliest dates differ.
date_origin = min(pd.to_datetime(data['Dates']).min(), pd.to_datetime(test['Dates']).min())
add_date_diff(data, origin=date_origin)
add_date_diff(test, origin=date_origin)
#print data.DateDiff.describe()
#print test.DateDiff.describe()


In [5]:
# Create random dev sample so we can see how that accuracy compares to our Kaggle results
np.random.seed(100)

# Pick 10% of rows for dev. Floor division keeps `size` an integer even when
# `/` is true division (Python 3, or `from __future__ import division`).
rows = np.random.choice(data.index, size=len(data) // 10, replace=False)

# `rows` holds index labels, so select with the label-based .loc; the
# deprecated .ix guesses between labels and positions.
dev = data.loc[rows]
train = data.drop(rows)

# The two pieces must partition the original frame.
assert len(train) + len(dev) == len(data)

print(train.shape)
print(dev.shape)
print(test.shape)


(790245, 10)
(87804, 10)
(884262, 8)


In [6]:
# Convert to Numpy Format
# TODO: Add more features here (extend FEATURE_COLUMNS; every array below follows it)
FEATURE_COLUMNS = ['DateDiff', 'X', 'Y']

train_data = train[FEATURE_COLUMNS].values
train_labels = train['Category'].values

dev_data = dev[FEATURE_COLUMNS].values
dev_labels = dev['Category'].values

full_data = data[FEATURE_COLUMNS].values
full_labels = data['Category'].values

test_data = test[FEATURE_COLUMNS].values


def min_max_scale(values, col_min, col_max):
    """Rescale each column of `values` so that col_min -> 0 and col_max -> 1.

    Implements a + (x-A)*(b-a)/(B-A) with a=0, b=1, A=col_min, B=col_max.
    A column whose min equals its max is mapped to 0 rather than dividing by zero.
    """
    span = col_max - col_min
    span = np.where(span == 0, 1, span)
    return (values - col_min) / span


# Normalize Data to Between 0-1
# TODO: Fix normalization to kick out "bad" x/y
# The scaling statistics come from the training rows only and are reused for
# dev, so both arrays share one scale; dev values can therefore fall slightly
# outside [0, 1]. No absolute value is taken: min-max scaling already handles
# negative inputs, whereas abs() would map -v and +v to the same point.
train_min = train_data.min(axis=0)
train_max = train_data.max(axis=0)
train_normed = min_max_scale(train_data, train_min, train_max)
dev_normed = min_max_scale(dev_data, train_min, train_max)

print(train_normed.min(axis=0))
print(train_normed.max(axis=0))

[ 0.  0.  0.]
[ 1.  1.  1.]


In [None]:
# Fit a basic RBM
# BernoulliRBM is an unsupervised feature extractor: it has no predict() and
# its fit() ignores labels. To obtain class predictions it is chained with a
# classifier, following scikit-learn's RBM + logistic regression example.
NNmodel = Pipeline([
    ('rbm', BernoulliRBM(random_state=100)),
    ('logistic', LogisticRegression()),
])

# The RBM expects inputs in [0, 1], so fit and predict on the normalized arrays.
NNmodel.fit(train_normed, train_labels)
dev_predict = NNmodel.predict(dev_normed)

print("The NN with normalized DateDiff, X and Y scores: {:.6f}".format(metrics.f1_score(dev_labels, dev_predict, average='weighted')))


The code below this point is copied from the KNN notebook; it is kept here to borrow pieces from while I work on the NN model.

In [9]:
# Use GridSearchCV to find a good number of neighbors.
ks = {'n_neighbors': list(range(1, 11))}
KNNGridSearch = GridSearchCV(KNeighborsClassifier(), ks, scoring='f1_weighted')
KNNGridSearch.fit(train_data, train_labels)

# Report the cross-validated score for each k. The metric is the weighted F1
# requested through `scoring`, not accuracy.
# The one-element tuples keep %-formatting safe whatever type the value has.
print("The scores for each k value was %s " % (KNNGridSearch.grid_scores_,))
print("The best k value was %s with weighted F1 %.4f" % (KNNGridSearch.best_params_, KNNGridSearch.best_score_))


The scores for each k value was [mean: 0.00303, std: 0.00382, params: {'n_neighbors': 1}, mean: 0.00314, std: 0.00389, params: {'n_neighbors': 2}, mean: 0.00300, std: 0.00380, params: {'n_neighbors': 3}, mean: 0.00302, std: 0.00378, params: {'n_neighbors': 4}, mean: 0.00299, std: 0.00371, params: {'n_neighbors': 5}, mean: 0.00300, std: 0.00373, params: {'n_neighbors': 6}, mean: 0.00297, std: 0.00372, params: {'n_neighbors': 7}, mean: 0.00297, std: 0.00371, params: {'n_neighbors': 8}, mean: 0.00295, std: 0.00369, params: {'n_neighbors': 9}, mean: 0.00295, std: 0.00368, params: {'n_neighbors': 10}] 
The best k value was {'n_neighbors': 2} with accuracy 0.0031


In [12]:
# Fit a single KNN with a fixed k and score it on the dev set.
knn_k = 1
KNNmodel = KNeighborsClassifier(n_neighbors=knn_k)
KNNmodel.fit(train_data, train_labels)
dev_predict = KNNmodel.predict(dev_data)
# k is formatted into the message so the printed label cannot drift away from
# the value the model was actually fitted with.
print("The KNN with DateDiff, X and Y (k={}) scores: {:.6f}".format(knn_k, metrics.f1_score(dev_labels, dev_predict, average='weighted')))


# Tried a few tests (including the above). So far k=1 seems best.

The KNN with DateDiff, X and Y (k=2) scores: 0.220072


Great, now that we've settled on a naive algorithm to run, let's train on the full training set and then predict on the test set.

In [28]:
def create_submission(preds, filename='sample.csv'):
    """Write per-category predictions to a submission CSV and return the table.

    Parameters
    ----------
    preds : array-like, shape (n_rows, 39)
        One column per crime category, in the order of the header defined
        below (every entry after "Id").
    filename : str, optional
        Destination path. Defaults to 'sample.csv' in the working directory.

    Returns
    -------
    numpy.ndarray, shape (n_rows, 40)
        Column 0 holds the row id (0 .. n_rows - 1); the remaining columns
        are `preds`.

    Raises
    ------
    ValueError
        If `preds` is not 2-D or its width differs from the number of
        category columns in the header.
    """
    labels = ["Id",
                "ARSON",
                "ASSAULT",
                "BAD CHECKS",
                "BRIBERY",
                "BURGLARY",
                "DISORDERLY CONDUCT",
                "DRIVING UNDER THE INFLUENCE",
                "DRUG/NARCOTIC",
                "DRUNKENNESS",
                "EMBEZZLEMENT",
                "EXTORTION",
                "FAMILY OFFENSES",
                "FORGERY/COUNTERFEITING",
                "FRAUD",
                "GAMBLING",
                "KIDNAPPING",
                "LARCENY/THEFT",
                "LIQUOR LAWS",
                "LOITERING",
                "MISSING PERSON",
                "NON-CRIMINAL",
                "OTHER OFFENSES",
                "PORNOGRAPHY/OBSCENE MAT",
                "PROSTITUTION",
                "RECOVERED VEHICLE",
                "ROBBERY",
                "RUNAWAY",
                "SECONDARY CODES",
                "SEX OFFENSES FORCIBLE",
                "SEX OFFENSES NON FORCIBLE",
                "STOLEN PROPERTY",
                "SUICIDE",
                "SUSPICIOUS OCC",
                "TREA",
                "TRESPASS",
                "VANDALISM",
                "VEHICLE THEFT",
                "WARRANTS",
                "WEAPON LAWS"
              ]
    head_str = ','.join(labels)

    # "Id" is a header entry but not a prediction column.
    num_cats = len(labels) - 1

    preds = np.asarray(preds)
    if preds.ndim != 2 or preds.shape[1] != num_cats:
        raise ValueError(
            "preds must have shape (n_rows, %d) to match the header; got %s"
            % (num_cats, preds.shape))

    # Row ids are the positions 0 .. n_rows - 1; column_stack accepts the 1-D
    # vector directly and places it as the first column.
    ids = np.arange(preds.shape[0])
    results = np.column_stack((ids, preds))

    # Write results to csv. The id column is written as an integer. Predictions
    # use %g, so 0/1 values still print as "0"/"1" while fractional
    # probabilities keep their value instead of being truncated by %d.
    np.savetxt(filename, results, fmt=['%d'] + ['%g'] * num_cats,
               delimiter=',', header=head_str, comments='')

    return results

In [17]:
# Now that we've done this, let's run the KNN on the full train, apply to the test, then format.
KNNmodel = KNeighborsClassifier(n_neighbors=1)
KNNmodel.fit(full_data, full_labels)
# Keep the class probabilities as floats. Casting to int would zero every
# probability below 1, which is only harmless while n_neighbors == 1.
# Despite its name, dev_predict holds TEST-set probabilities from here on; the
# name is kept because the submission cell reads it.
# NOTE(review): the columns follow KNNmodel.classes_ - confirm that order
# matches the header written by create_submission.
dev_predict = KNNmodel.predict_proba(test_data)

In [29]:
# Write the submission CSV ('sample.csv' by default) from the test-set
# predictions and keep the returned id + prediction table for inspection.
results = create_submission(dev_predict)

In [30]:
# Sanity check: one row per test record, one id column plus one per category.
print(results.shape)

(884262, 40)
