# Model training

In [1]:
#Import and preparation of data
import pandas as pd
import zipfile
import numpy as np
import datetime
import time

# Read the training set straight out of the zipped CSV. The 'with' block closes
# the archive handle once the frame is loaded; 'Dates' is parsed to timestamps.
with zipfile.ZipFile('../input/train.csv.zip') as z:
    train = pd.read_csv(z.open('train.csv'), parse_dates=['Dates'])

#On sample only : remove #
#train = train.head()

# Integer codes for the police districts and for the crime categories.
PdDistrictDic = {'CENTRAL':1, 'NORTHERN':2, 'INGLESIDE':3, 'PARK':4, 'MISSION':5, 'TENDERLOIN':6, 'RICHMOND':7, 'TARAVAL':8, 'BAYVIEW':9, 'SOUTHERN':10}
CategoryDic = {'KIDNAPPING':1, 'WEAPON LAWS':2, 'SECONDARY CODES':3, 'WARRANTS':4, 'PROSTITUTION':5, 'EMBEZZLEMENT':6, 'LOITERING':7, 'SUICIDE':8, 'DRIVING UNDER THE INFLUENCE':9, 'SEX OFFENSES FORCIBLE':10, 'ROBBERY':11, 'BURGLARY':12, 'SUSPICIOUS OCC':13, 'FAMILY OFFENSES':14, 'BRIBERY':15, 'FORGERY/COUNTERFEITING':16, 'BAD CHECKS':17, 'DRUNKENNESS':18, 'GAMBLING':19, 'OTHER OFFENSES':20, 'RECOVERED VEHICLE':21, 'FRAUD':22, 'ARSON':23, 'DRUG/NARCOTIC':24, 'TRESPASS':25, 'LARCENY/THEFT':26, 'VANDALISM':27, 'NON-CRIMINAL':28, 'EXTORTION':29, 'PORNOGRAPHY/OBSCENE MAT':30, 'LIQUOR LAWS':31, 'SEX OFFENSES NON FORCIBLE':32, 'TREA':33, 'VEHICLE THEFT':34, 'STOLEN PROPERTY':35, 'ASSAULT':36, 'MISSING PERSON':37, 'DISORDERLY CONDUCT':38, 'RUNAWAY':39}

#Calculates features in a numeric format
# Date parts use the vectorised .dt accessor instead of one Python lambda call per row.
train['Year'] = train['Dates'].dt.year
train['Month'] = train['Dates'].dt.month
train['Day'] = train['Dates'].dt.day
# Month and day packed as the integer MMDD (13 May -> 513); this is not an ordinal day of the year.
train['DayOfYear'] = train['Dates'].dt.month * 100 + train['Dates'].dt.day
# Hour and minute digits concatenated without zero padding: 23:53 -> 2353, but 23:05 -> 235.
# NOTE(review): the missing padding makes distinct times collide (23:05 and 2:35 both give 235).
# hour * 100 + minute would be unambiguous, but the cell that prepares the test set builds
# 'Time' the same way, so both cells have to be changed together; kept as-is here.
train['Time'] = train['Dates'].map(lambda x: int(str(x.hour)+str(x.minute)))
# Integer weekday with the same numbering as strftime("%w"): 0 = Sunday, 1 = Monday, ..., 6 = Saturday.
# (dayofweek counts Monday = 0 ... Sunday = 6, hence the shift and wrap-around.)
train['DayOfWeekNb'] = (train['Dates'].dt.dayofweek + 1) % 7
train['Xnb'] = -train['X'] #Sign flipped; the original note says X is always negative, so Xnb is positive
train['Ynb'] = train['Y']
# Plain dict indexing so that a value missing from the dictionary raises KeyError instead of producing NaN.
train['PdDistrictNb'] = train['PdDistrict'].map(lambda x: PdDistrictDic[x])
train['CategoryNb'] = train['Category'].map(lambda x: CategoryDic[x])

#Display of the first 5 lines to show the structure
train.head()

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y,Year,Month,Day,DayOfYear,Time,DayOfWeekNb,Xnb,Ynb,PdDistrictNb,CategoryNb
0,2015-05-13 23:53:00,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599,2015,5,13,513,2353,3,122.425892,37.774599,2,4
1,2015-05-13 23:53:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599,2015,5,13,513,2353,3,122.425892,37.774599,2,20
2,2015-05-13 23:33:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-122.424363,37.800414,2015,5,13,513,2333,3,122.424363,37.800414,2,20
3,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,NORTHERN,NONE,1500 Block of LOMBARD ST,-122.426995,37.800873,2015,5,13,513,2330,3,122.426995,37.800873,2,26
4,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,PARK,NONE,100 Block of BRODERICK ST,-122.438738,37.771541,2015,5,13,513,2330,3,122.438738,37.771541,4,26


In [4]:
#Feature matrix: the numeric columns derived during data preparation
X = train[[
    'PdDistrictNb',
    'Year',
    'Month',
    'DayOfYear',
    'Time',
    'DayOfWeekNb',
    'Xnb',
    'Ynb',
]]

#Target: the crime category, taken as its original name (not the CategoryNb code)
Y = train['Category']


# Choice of model

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

# Alternative classifier, left disabled:
#model = KNeighborsClassifier(n_neighbors = 5)
# Logistic regression built with no arguments (library defaults), then fitted on the
# feature frame X against the category names in Y. The fit call is the last
# expression of the cell, so the notebook displays the fitted estimator's repr.
model = LogisticRegression()
model.fit(X,Y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr',
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0)

In [7]:
#Import test data

# The 'with' block closes the archive handle once the frame is loaded.
with zipfile.ZipFile('../input/test.csv.zip') as z:
    test = pd.read_csv(z.open('test.csv'), parse_dates=['Dates'])
#Columns are : Id,Dates,DayOfWeek,PdDistrict,Address,X,Y

#On sample only : remove #
#test = test.head()

# PdDistrictDic is the district -> integer code dictionary defined in the cell that
# prepares the training data; it is reused here rather than declared a second time,
# so both frames are guaranteed to share one encoding.

#Calculates features in a numeric format
# Date parts use the vectorised .dt accessor instead of one Python lambda call per row.
test['Year'] = test['Dates'].dt.year
test['Month'] = test['Dates'].dt.month
test['Day'] = test['Dates'].dt.day
# Month and day packed as the integer MMDD (10 May -> 510); this is not an ordinal day of the year.
test['DayOfYear'] = test['Dates'].dt.month * 100 + test['Dates'].dt.day
# Hour and minute digits concatenated without zero padding: 23:59 -> 2359, but 23:05 -> 235.
# NOTE(review): the missing padding makes distinct times collide (23:05 and 2:35 both give 235).
# hour * 100 + minute would be unambiguous, but the model is fitted on a 'Time' column built
# the same way in the training cell, so both cells have to be changed together; kept as-is here.
test['Time'] = test['Dates'].map(lambda x: int(str(x.hour)+str(x.minute)))
# Integer weekday with the same numbering as strftime("%w"): 0 = Sunday, 1 = Monday, ..., 6 = Saturday.
# (dayofweek counts Monday = 0 ... Sunday = 6, hence the shift and wrap-around.)
test['DayOfWeekNb'] = (test['Dates'].dt.dayofweek + 1) % 7
test['Xnb'] = -test['X'] #Sign flipped; the original note says X is always negative, so Xnb is positive
test['Ynb'] = test['Y']
# Plain dict indexing so that a value missing from the dictionary raises KeyError instead of producing NaN.
test['PdDistrictNb'] = test['PdDistrict'].map(lambda x: PdDistrictDic[x])

#Selection of features (same columns, same order, as the training feature frame)
X_test = test[['PdDistrictNb','Year','Month','DayOfYear','Time','DayOfWeekNb','Xnb','Ynb']]
X_test.head()


Unnamed: 0,PdDistrictNb,Year,Month,DayOfYear,Time,DayOfWeekNb,Xnb,Ynb
0,9,2015,5,510,2359,0,122.399588,37.735051
1,9,2015,5,510,2351,0,122.391523,37.732432
2,2,2015,5,510,2350,0,122.426002,37.792212
3,3,2015,5,510,2345,0,122.437394,37.721412
4,3,2015,5,510,2345,0,122.437394,37.721412


In [8]:
# Y_test holds one predicted label per row of X_test. The model was fitted on
# train['Category'], so the labels are category names, not numeric codes.
Y_test = model.predict(X_test)
# Shown as the cell output to check the number of predictions.
Y_test.shape

(884262L,)

# Output file generation

In [None]:
#Make submission file at format:
#Id,ARSON,ASSAULT,BAD CHECKS,BRIBERY,BURGLARY,DISORDERLY CONDUCT,DRIVING UNDER THE INFLUENCE,DRUG/NARCOTIC,DRUNKENNESS,EMBEZZLEMENT,EXTORTION,FAMILY OFFENSES,FORGERY/COUNTERFEITING,FRAUD,GAMBLING,KIDNAPPING,LARCENY/THEFT,LIQUOR LAWS,LOITERING,MISSING PERSON,NON-CRIMINAL,OTHER OFFENSES,PORNOGRAPHY/OBSCENE MAT,PROSTITUTION,RECOVERED VEHICLE,ROBBERY,RUNAWAY,SECONDARY CODES,SEX OFFENSES FORCIBLE,SEX OFFENSES NON FORCIBLE,STOLEN PROPERTY,SUICIDE,SUSPICIOUS OCC,TREA,TRESPASS,VANDALISM,VEHICLE THEFT,WARRANTS,WEAPON LAWS
#0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0

import csv

#Get names of categories and sort them to match the desired output
col_names = sorted(train['Category'].unique())

# Header row: the Id column followed by one column per category name.
labels = ['Id'] + col_names

ids = test['Id']

# The 'with' block closes the file even if a write fails part-way through.
with open('sampleSubmission.csv','w') as f:
    fo = csv.writer(f, lineterminator='\n')
    fo.writerow(labels)
    # ids and predictions are paired by position, so this does not depend on
    # the index labels of the test frame.
    for row_id, predicted in zip(ids, Y_test):
        # One-hot row: 1 in the column of the predicted category, 0 elsewhere.
        fo.writerow([row_id] + [1 if predicted == name else 0 for name in col_names])
