# MIDS W207 Fall 2017 Final Project
## Baseline Submission
Laura Williams, Kim Vignola, Cyprian Gascoigne  
SF Crime Classification

In [13]:
# This tells matplotlib not to try opening a new window for each plot.
%matplotlib inline

# General libraries.
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import preprocessing

# SK-learn libraries for learning.
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.grid_search import GridSearchCV

# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report

#SK-Learn Naive Bayes
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB

Read in data

In [14]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
train_features = list(train.keys())

Examine data

Description of data from Kaggle:  
Dates - timestamp of the crime incident  
Category - category of the crime incident (only in train.csv). This is the target variable you are going to predict.  
Descript - detailed description of the crime incident (only in train.csv)  
DayOfWeek - the day of the week  
PdDistrict - name of the Police Department District  
Resolution - how the crime incident was resolved (only in train.csv)  
Address - the approximate street address of the crime incident   
X - Longitude  
Y - Latitude  

In [15]:
print('First rows of train data: \n', train.head())
print('\nFirst rows of test data: \n', test.head())
print("\nThe features in the training data are: \n", train_features)
print("\nThe shape of the train data is", train.shape)
print("The shape of the test data is", test.shape)

First rows of train data: 
                  Dates        Category                      Descript  \
0  2015-05-13 23:53:00        WARRANTS                WARRANT ARREST   
1  2015-05-13 23:53:00  OTHER OFFENSES      TRAFFIC VIOLATION ARREST   
2  2015-05-13 23:33:00  OTHER OFFENSES      TRAFFIC VIOLATION ARREST   
3  2015-05-13 23:30:00   LARCENY/THEFT  GRAND THEFT FROM LOCKED AUTO   
4  2015-05-13 23:30:00   LARCENY/THEFT  GRAND THEFT FROM LOCKED AUTO   

   DayOfWeek PdDistrict      Resolution                    Address  \
0  Wednesday   NORTHERN  ARREST, BOOKED         OAK ST / LAGUNA ST   
1  Wednesday   NORTHERN  ARREST, BOOKED         OAK ST / LAGUNA ST   
2  Wednesday   NORTHERN  ARREST, BOOKED  VANNESS AV / GREENWICH ST   
3  Wednesday   NORTHERN            NONE   1500 Block of LOMBARD ST   
4  Wednesday       PARK            NONE  100 Block of BRODERICK ST   

            X          Y  
0 -122.425892  37.774599  
1 -122.425892  37.774599  
2 -122.424363  37.800414  
3 -122.426

Restructure data for modeling

In [16]:
# Encode string features into numeric features
from sklearn import preprocessing
LE = preprocessing.LabelEncoder()

train_data_all = np.column_stack((LE.fit_transform(train['Dates']),
                                 LE.fit_transform(train['DayOfWeek']),
                                 LE.fit_transform(train['PdDistrict']),
                                 LE.fit_transform(train['Address']),
                                 train['X'],
                                 train['Y']))

train_labels_all = np.array(train['Category'])

test_data_all = np.column_stack((LE.fit_transform(test['Dates']),
                                LE.fit_transform(test['DayOfWeek']),
                                LE.fit_transform(test['PdDistrict']),
                                LE.fit_transform(test['Address']),
                                test['X'],
                                test['Y']))

print("Training data shape is", train_data_all.shape)
print("First few rows of training data are", train_data_all[:3])
print()
print("Training labels shape is", train_labels_all.shape)
print("First few labels of training labels are", train_labels_all[:3])
print()
print("Test data shape is", test_data_all.shape)
print("First few rows of training data are", test_data_all[:3])


Training data shape is (878049, 6)
First few rows of training data are [[  3.89256000e+05   6.00000000e+00   4.00000000e+00   1.97900000e+04
   -1.22425892e+02   3.77745986e+01]
 [  3.89256000e+05   6.00000000e+00   4.00000000e+00   1.97900000e+04
   -1.22425892e+02   3.77745986e+01]
 [  3.89255000e+05   6.00000000e+00   4.00000000e+00   2.26970000e+04
   -1.22424363e+02   3.78004143e+01]]

Training labels shape is (878049,)
First few labels of training labels are ['WARRANTS' 'OTHER OFFENSES' 'OTHER OFFENSES']

Test data shape is (884262, 6)
First few rows of training data are [[  3.92172000e+05   3.00000000e+00   0.00000000e+00   6.40700000e+03
   -1.22399588e+02   3.77350510e+01]
 [  3.92171000e+05   3.00000000e+00   0.00000000e+00   9.74400000e+03
   -1.22391523e+02   3.77324324e+01]
 [  3.92170000e+05   3.00000000e+00   4.00000000e+00   6.33600000e+03
   -1.22426002e+02   3.77922124e+01]]


Set aside 20% of training data as development data

In [17]:
n = train_data_all.shape[0]

shuffle = np.random.permutation(np.arange(train_data_all.shape[0]))

train_data_all = train_data_all[shuffle]
train_labels_all = train_labels_all[shuffle]

n_train = int(0.8*n)

train_data = train_data_all[:n_train,:]
train_labels = train_labels_all[:n_train]
dev_data = train_data_all[n_train:,:]
dev_labels = train_labels_all[n_train:]


print("Training data shape is", train_data.shape)
print("Training labels shape is,", train_labels.shape)
print()
print("Development data shape is", dev_data.shape)
print("Development labels shape is", dev_labels.shape)


Training data shape is (702439, 6)
Training labels shape is, (702439,)

Development data shape is (175610, 6)
Development labels shape is (175610,)


In [18]:
neigh = KNeighborsClassifier(n_neighbors = 1)
neigh.fit(train_data, train_labels)
predict = neigh.predict(test_data_all)
print(predict[:5])

['OTHER OFFENSES' 'NON-CRIMINAL' 'OTHER OFFENSES' 'LARCENY/THEFT'
 'LARCENY/THEFT']


Save CSV file for Kaggle

In [19]:
# THIS IS THE CODE FROM ISABELL - NEEDS TO BE EDITED TO MATCH OUR DATA
# sample_submission = pd.read_csv('sample_submission.csv')
# print(sample_submission.head())

# imageId = range(1,X_test.shape[0]+1)
# d = {'ImageId': imageId, 'Label': preds}

# my_submission = pd.DataFrame(data = d)
# my_submission.to_csv('my_submission.csv', index = False)