## Code for the playground Kaggle SF Competition
Code by: Datageek51 (NP)
This code was written for a local Pittsburgh playground competition that uses the
Kaggle SF crime competition data set.
The data set can be found at the following location:
https://www.kaggle.com/c/sf-crime/data


In [1]:
## Comment this out if you don't have a GPU
from numba import jit, cuda 
## importing the standard stuff
import numpy as np
import pandas as pd

# getting the sklearns; added as needed
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import log_loss
from sklearn.ensemble import GradientBoostingClassifier


In [2]:
# Read the competition CSVs from the local data directory.
# "Dates" is parsed while loading so the column arrives as datetimes, not text.
path = 'D:/DataScience/Kaggle/SFData/'
train, test = (
    pd.read_csv(path + csv_name, parse_dates=["Dates"])
    for csv_name in ('train.csv', 'test.csv')
)

In [3]:
# Peek at the data set
train.head()

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y,CategoryVal
0,2015-05-13 23:53:00,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599,1
1,2015-05-13 23:53:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599,2
2,2015-05-13 23:33:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-122.424363,37.800414,2
3,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,NORTHERN,NONE,1500 Block of LOMBARD ST,-122.426995,37.800873,3
4,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,PARK,NONE,100 Block of BRODERICK ST,-122.438738,37.771541,3


In [4]:
# Build the training feature matrix from one-hot (dummy/indicator) encodings of
# three categorical signals: day of week, police district and hour of the crime.

# One indicator column per distinct value of each signal
district = pd.get_dummies(train.PdDistrict)
days = pd.get_dummies(train.DayOfWeek)
hour = train.Dates.dt.hour
hour = pd.get_dummies(hour)

# Merge days, district and hour column-wise into a single frame.
# get_dummies only emits a column for a value that actually occurs, so there are
# no all-zero columns to prune afterwards.
train_features = pd.concat([days, district, hour], axis=1)
train_features

Unnamed: 0,Friday,Monday,Saturday,Sunday,Thursday,Tuesday,Wednesday,BAYVIEW,CENTRAL,INGLESIDE,...,14,15,16,17,18,19,20,21,22,23
0,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
878044,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
878045,0,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
878046,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
878047,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
# Build the same one-hot features (day of week, district, hour) for the test data
district = pd.get_dummies(test.PdDistrict)
days = pd.get_dummies(test.DayOfWeek)
hour = test.Dates.dt.hour
hour = pd.get_dummies(hour)

# Merge days, district & hour of the test data column-wise
test_data = pd.concat([days, district, hour], axis=1)

# Align with the training columns so the model sees the same features in the same
# order: a value missing from the test set becomes an all-zero column, and a
# value never seen in training is dropped.
test_data = test_data.reindex(columns=train_features.columns, fill_value=0)
test_data

Unnamed: 0,Friday,Monday,Saturday,Sunday,Thursday,Tuesday,Wednesday,BAYVIEW,CENTRAL,INGLESIDE,...,14,15,16,17,18,19,20,21,22,23
0,0,0,0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,0,0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0,0,0,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
4,0,0,0,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
884257,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
884258,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
884259,0,0,0,0,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
884260,0,0,0,0,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# Encode the crime category (the prediction target) as integer class labels.
#
# Abandoned experiment, 2/11: combining Category with CategoryVal did not help
# improve the score (went to 5), so it stays commented off:
#   CatVal = train['Category'] + train['CategoryVal'].astype(str)
#   CatVal2 = train.CategoryVal

# The fitted encoder is kept in `catgry`; its classes_ hold the category names.
catgry = preprocessing.LabelEncoder()
crime_category = catgry.fit_transform(train["Category"])

crime_category

array([37, 21, 21, ..., 16, 35, 12])

In [7]:
# Hold out 15% of the training rows for validation; the fixed random_state keeps
# the split the same from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    train_features,
    crime_category,
    test_size=0.15,
    random_state=4,
)

X_train.head()

Unnamed: 0,Friday,Monday,Saturday,Sunday,Thursday,Tuesday,Wednesday,BAYVIEW,CENTRAL,INGLESIDE,...,14,15,16,17,18,19,20,21,22,23
670191,0,0,0,1,0,0,0,1,0,0,...,0,1,0,0,0,0,0,0,0,0
337006,0,0,1,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
137695,0,0,0,1,0,0,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0
876861,0,0,0,0,0,0,1,0,0,1,...,0,0,0,0,1,0,0,0,0,0
282959,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [None]:
# Set up and fit the gradient boosting classifier.
# Only n_estimators (500 boosting stages) and random_state (fixed seed) are set
# here; everything else is left at the library default. Constructor signature
# noted for reference when this cell was written:
#   sklearn.ensemble.GradientBoostingClassifier(loss='deviance',
#       learning_rate=0.1, n_estimators=100, subsample=1.0, min_samples_split=2,
#       min_samples_leaf=1, max_depth=3, init=None, random_state=None,
#       max_features=None, verbose=0, max_leaf_nodes=None, warm_start=False)
# Disabled GPU decorator, left in place:
##@jit(target='gpu')
data_gbc = GradientBoostingClassifier(
    random_state=0,
    n_estimators=500,
)
data_gbc.fit(X_train, y_train)

In [None]:
# Score the held-out split with multi-class log loss.
# `labels` is passed explicitly so the probability columns are matched against the
# classifier's own class order. Without it, log_loss infers the label set from
# y_test and raises when a rare class is absent from the held-out rows.
y_pred = data_gbc.predict_proba(X_test)
print(log_loss(y_test, y_pred, labels=data_gbc.classes_))

In [38]:
# Write the submission file: one row per test record, one probability column per
# crime category, with the row index exported as the 'Id' column.
final_pred = data_gbc.predict_proba(test_data)
# predict_proba columns follow data_gbc.classes_ (the encoded labels), so map
# those back to category names with the fitted LabelEncoder `catgry`.
result = pd.DataFrame(final_pred, columns=catgry.inverse_transform(data_gbc.classes_))
path='D:/DataScience/Kaggle/SFData/'
result.to_csv(path + 'submission.csv', index = True, index_label = 'Id' )
