# Model training

In [2]:
#Import and preparation of data
import pandas as pd
import zipfile
import numpy as np
import datetime
import time
from pandas.tseries.holiday import USFederalHolidayCalendar
cal = USFederalHolidayCalendar()
holidays = cal.holidays(start='2003-01-01', end='2015-05-31').to_pydatetime()


z = zipfile.ZipFile('../input/train.csv.zip')
train = pd.read_csv(z.open('train.csv'), parse_dates=['Dates'])

#On sample only : remove #
#train = train.head(100)

#Remove irrelevant coordinates
train = train[train.Y < 90]

#Calculates features 
train['Year'] = train['Dates'].map(lambda x: int(x.year))
train['Month'] = train['Dates'].map(lambda x: int(x.month))
train['Day'] = train['Dates'].map(lambda x: int(x.day))
train['DayOfYear'] = train['Dates'].map(lambda x: int(x.strftime("%m%d")))
train['Time'] = train['Dates'].map(lambda x: int(str(x.hour)))
train['DayOfWeekNb'] = train['Dates'].map(lambda x: x.strftime("%w")) #1 = Monday, 7 = Sunday
train['Holiday'] = train['Dates'].map(lambda x: datetime.datetime(x.year,x.month,x.day) in holidays) #true if holiday

#to add: binary for each district and each category
PdDistrict = np.unique(train['PdDistrict'])
Category = np.unique(train['Category'])
for p in PdDistrict:
    train[p] = (train.PdDistrict == p)
for c in Category:
    train[c] = (train.Category == c)

#Display of 10 first lines to show the structure
train.head()

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y,Year,...,SEX OFFENSES NON FORCIBLE,STOLEN PROPERTY,SUICIDE,SUSPICIOUS OCC,TREA,TRESPASS,VANDALISM,VEHICLE THEFT,WARRANTS,WEAPON LAWS
0,2015-05-13 23:53:00,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599,2015,...,False,False,False,False,False,False,False,False,True,False
1,2015-05-13 23:53:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599,2015,...,False,False,False,False,False,False,False,False,False,False
2,2015-05-13 23:33:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-122.424363,37.800414,2015,...,False,False,False,False,False,False,False,False,False,False
3,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,NORTHERN,NONE,1500 Block of LOMBARD ST,-122.426995,37.800873,2015,...,False,False,False,False,False,False,False,False,False,False
4,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,PARK,NONE,100 Block of BRODERICK ST,-122.438738,37.771541,2015,...,False,False,False,False,False,False,False,False,False,False


In [3]:
#Features selection for training
X = train[train.columns[7:26]]

X.head()

Unnamed: 0,X,Y,Year,Month,Day,DayOfYear,Time,DayOfWeekNb,Holiday,BAYVIEW,CENTRAL,INGLESIDE,MISSION,NORTHERN,PARK,RICHMOND,SOUTHERN,TARAVAL,TENDERLOIN
0,-122.425892,37.774599,2015,5,13,513,23,3,False,False,False,False,False,True,False,False,False,False,False
1,-122.425892,37.774599,2015,5,13,513,23,3,False,False,False,False,False,True,False,False,False,False,False
2,-122.424363,37.800414,2015,5,13,513,23,3,False,False,False,False,False,True,False,False,False,False,False
3,-122.426995,37.800873,2015,5,13,513,23,3,False,False,False,False,False,True,False,False,False,False,False
4,-122.438738,37.771541,2015,5,13,513,23,3,False,False,False,False,False,False,True,False,False,False,False


In [4]:
#Results selection for training
Y = train[train.columns[27:]]

Y.head()

Unnamed: 0,ASSAULT,BAD CHECKS,BRIBERY,BURGLARY,DISORDERLY CONDUCT,DRIVING UNDER THE INFLUENCE,DRUG/NARCOTIC,DRUNKENNESS,EMBEZZLEMENT,EXTORTION,...,SEX OFFENSES NON FORCIBLE,STOLEN PROPERTY,SUICIDE,SUSPICIOUS OCC,TREA,TRESPASS,VANDALISM,VEHICLE THEFT,WARRANTS,WEAPON LAWS
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


# Choice of model

In [9]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cross_validation import train_test_split
from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor
from sklearn.cross_validation import cross_val_score

#other models to try: Gradient boosting, random forest (to know if a feature is interesting: feature_importances), SVM, Adaboost
#To add : make a model per group of categories

model = RandomForestRegressor(random_state=0, n_estimators=100)


In [None]:
#to add : test features with feature_importances


In [None]:
#Manual Cross-validation
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
model.fit(X_train,Y_train)
model.score(X_test, y_test)

In [None]:
#Automatic cross-validation

#score = cross_val_score(model, X, Y).mean()
#print("Score with the entire dataset = %.2f" % score)

# Application of model to test data for Kaggle submission

In [63]:
#Import test data

z = zipfile.ZipFile('../input/test.csv.zip')
kaggle_test = pd.read_csv(z.open('test.csv'), parse_dates=['Dates'])
#Columns are : Id,Dates,DayOfWeek,PdDistrict,Address,X,Y

#On sample only : remove #
#kaggle_test = kaggle_test.head()

#Calculates features 
kaggle_test['Year'] = kaggle_test['Dates'].map(lambda x: int(x.year))
kaggle_test['Month'] = kaggle_test['Dates'].map(lambda x: int(x.month))
kaggle_test['Day'] = kaggle_test['Dates'].map(lambda x: int(x.day))
kaggle_test['DayOfYear'] = kaggle_test['Dates'].map(lambda x: int(x.strftime("%m%d")))
kaggle_test['Time'] = kaggle_test['Dates'].map(lambda x: int(str(x.hour)))
kaggle_test['DayOfWeekNb'] = kaggle_test['Dates'].map(lambda x: x.strftime("%w")) #1 = Monday, 7 = Sunday
kaggle_test['Holiday'] = kaggle_test['Dates'].map(lambda x: datetime.datetime(x.year,x.month,x.day) in holidays) #true if holiday

#to add: binary for each district and each category
for p in PdDistrict:
    kaggle_test[p] = (kaggle_test.PdDistrict == p)

#Selection of features
kaggle_X_test = kaggle_test[kaggle_test.columns[5:24]]
kaggle_X_test.head()

Unnamed: 0,X,Y,Year,Month,Day,DayOfYear,Time,DayOfWeekNb,Holiday,BAYVIEW,CENTRAL,INGLESIDE,MISSION,NORTHERN,PARK,RICHMOND,SOUTHERN,TARAVAL,TENDERLOIN
0,-122.399588,37.735051,2015,5,10,510,23,0,False,True,False,False,False,False,False,False,False,False,False
1,-122.391523,37.732432,2015,5,10,510,23,0,False,True,False,False,False,False,False,False,False,False,False
2,-122.426002,37.792212,2015,5,10,510,23,0,False,False,False,False,False,True,False,False,False,False,False
3,-122.437394,37.721412,2015,5,10,510,23,0,False,False,False,True,False,False,False,False,False,False,False
4,-122.437394,37.721412,2015,5,10,510,23,0,False,False,False,True,False,False,False,False,False,False,False


In [64]:
#kaggle_Y_test is the predictions for each test 
kaggle_Y_test = model.predict(X_test)
kaggle_Y_test.shape

(884186L, 11L)

In [15]:
#Make submission file at format:
#Id,ARSON,ASSAULT,BAD CHECKS,BRIBERY,BURGLARY,DISORDERLY CONDUCT,DRIVING UNDER THE INFLUENCE,DRUG/NARCOTIC,DRUNKENNESS,EMBEZZLEMENT,EXTORTION,FAMILY OFFENSES,FORGERY/COUNTERFEITING,FRAUD,GAMBLING,KIDNAPPING,LARCENY/THEFT,LIQUOR LAWS,LOITERING,MISSING PERSON,NON-CRIMINAL,OTHER OFFENSES,PORNOGRAPHY/OBSCENE MAT,PROSTITUTION,RECOVERED VEHICLE,ROBBERY,RUNAWAY,SECONDARY CODES,SEX OFFENSES FORCIBLE,SEX OFFENSES NON FORCIBLE,STOLEN PROPERTY,SUICIDE,SUSPICIOUS OCC,TREA,TRESPASS,VANDALISM,VEHICLE THEFT,WARRANTS,WEAPON LAWS
#0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0

import csv

#Get names of categories and sort them to match the desired output
col_names = sorted(train['Category'].unique())

labels = ['Id']
for i in col_names:
    labels.append(i)
    
ids = test['Id']

f = open('SubmissionLinReg.csv','w')
fo = csv.writer(f, lineterminator='\n')
fo.writerow(labels)
for i in range (0,len(test)):
    line = []
    line.append(ids[i])
    for j in range (1,len(labels)):
        #to update with new format of Y_test
        if (kaggle_Y_test[i] == labels[j]):
            line.append(1)
        else:
            line.append(0)
    fo.writerow(line)
f.close() 
