In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Load the labelled training data; the path is relative to the notebook's working directory.
train = pd.read_csv("train.csv")
# Preview the first rows to check the columns that were read.
train.head()

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y
0,2015-05-13 23:53:00,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
1,2015-05-13 23:53:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
2,2015-05-13 23:33:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-122.424363,37.800414
3,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,NORTHERN,NONE,1500 Block of LOMBARD ST,-122.426995,37.800873
4,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,PARK,NONE,100 Block of BRODERICK ST,-122.438738,37.771541


In [2]:
# Load the test data (no Category column); the path is relative to the notebook's working directory.
test = pd.read_csv("test.csv")
# Preview the first rows to check the columns that were read.
test.head()

Unnamed: 0,Id,Dates,DayOfWeek,PdDistrict,Address,X,Y
0,0,2015-05-10 23:59:00,Sunday,BAYVIEW,2000 Block of THOMAS AV,-122.399588,37.735051
1,1,2015-05-10 23:51:00,Sunday,BAYVIEW,3RD ST / REVERE AV,-122.391523,37.732432
2,2,2015-05-10 23:50:00,Sunday,NORTHERN,2000 Block of GOUGH ST,-122.426002,37.792212
3,3,2015-05-10 23:45:00,Sunday,INGLESIDE,4700 Block of MISSION ST,-122.437394,37.721412
4,4,2015-05-10 23:45:00,Sunday,INGLESIDE,4700 Block of MISSION ST,-122.437394,37.721412


# Data preprocessing

In [3]:
from datetime import datetime
import time

from sklearn.preprocessing import LabelEncoder

# Build the training feature frame on a copy so the raw `train` frame is left untouched.
X = train.copy()

# Parse the date strings (unparseable values become NaT) and store the weekday
# as an integer code in place of the weekday name.
X['Dates'] = pd.to_datetime(X['Dates'], errors='coerce')
X['DayOfWeek'] = X['Dates'].dt.weekday
# Disabled feature ideas: Day / Month / Year / Hour / Minute taken from X['Dates'].dt.

# Seconds since the epoch for every incident, computed from the raw date strings.
# NOTE: time.mktime interprets the naive datetime in the machine's local
# timezone, so the absolute values depend on where the notebook is executed.
X['Timestamp'] = train['Dates'].map(
    lambda raw: time.mktime(datetime.strptime(raw, '%Y-%m-%d %H:%M:%S').timetuple())
)

# Encode the police districts as integers. `le1` stays in scope because the
# test set is transformed with the same fitted mapping further down.
le1 = LabelEncoder().fit(X['PdDistrict'])
X['PdDistrict'] = le1.transform(X['PdDistrict'])
# Disabled feature idea: a label-encoded Address column.

# Drop the raw date, the free-text address, and the two columns that do not
# exist in the test set (Descript, Resolution).
X = X.drop(columns=['Dates', 'Descript', 'Resolution', 'Address'])

X.head()


Unnamed: 0,Category,DayOfWeek,PdDistrict,X,Y,Timestamp
0,WARRANTS,2,4,-122.425892,37.774599,1431529000.0
1,OTHER OFFENSES,2,4,-122.425892,37.774599,1431529000.0
2,OTHER OFFENSES,2,4,-122.424363,37.800414,1431528000.0
3,LARCENY/THEFT,2,4,-122.426995,37.800873,1431527000.0
4,LARCENY/THEFT,2,5,-122.438738,37.771541,1431527000.0


In [4]:
# Remove rows that are exact duplicates across all remaining columns, then
# show the resulting shape.
X = X.drop_duplicates()
X.shape

(812473, 6)

In [5]:
from datetime import datetime
import time

# Build the test feature frame on a copy so the raw `test` frame (and its Id
# column, needed for the submission) is left untouched.
X_test = test.copy()

# Parse the date strings (unparseable values become NaT) and store the weekday
# as an integer code in place of the weekday name.
X_test['Dates'] = pd.to_datetime(X_test['Dates'], errors='coerce')
X_test['DayOfWeek'] = X_test['Dates'].dt.weekday
# Disabled feature ideas: Day / Month / Year / Hour / Minute taken from X_test['Dates'].dt.

# Seconds since the epoch for every incident, computed from the raw date strings.
# NOTE: time.mktime interprets the naive datetime in the machine's local
# timezone, so the absolute values depend on where the notebook is executed.
X_test['Timestamp'] = test['Dates'].map(
    lambda raw: time.mktime(datetime.strptime(raw, '%Y-%m-%d %H:%M:%S').timetuple())
)

# Reuse the encoder fitted on the training districts so both frames share the
# same integer codes.
X_test['PdDistrict'] = le1.transform(X_test['PdDistrict'])
# Disabled feature idea: a label-encoded Address column (would need a second encoder, le2).

# Drop the identifier, the raw date and the free-text address.
X_test = X_test.drop(columns=['Id', 'Dates', 'Address'])


In [6]:
from sklearn.model_selection import train_test_split

# Separate the feature columns from the label column.
train_data = X.drop(columns='Category')
target_data = X['Category']

# Hold out part of the labelled data for evaluation (default test_size: 25%).
# A fixed random_state makes the split, and every score reported afterwards,
# reproducible across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(
    train_data, target_data, random_state=42
)

print(train_data.shape, x_train.shape, x_test.shape)

(812473, 5) (609354, 5) (203119, 5)


# Estimator

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Seed the forest so that repeated runs of the search give the same result.
forest = RandomForestClassifier(random_state=42)

# Hyper-parameter grid explored with 3-fold cross-validation.
# 'auto' is intentionally not listed for max_features: it was removed in
# scikit-learn 1.3 (the fit fails there) and, for classifiers, it was only an
# alias of 'sqrt', so it doubled the work for that setting without adding a
# new candidate.
param_grid = {
    'n_estimators': [100],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [5, 10, 20, 50],
    'criterion': ['gini', 'entropy'],
}
cv_forest = GridSearchCV(estimator=forest, param_grid=param_grid, cv=3)
cv_forest.fit(x_train, y_train)

# GridSearchCV.score delegates to the best estimator refitted on x_train.
print('training set accuracy:', cv_forest.score(x_train, y_train))
print('test set accuracy:', cv_forest.score(x_test, y_test))

In [None]:
# Show the hyper-parameter combination that achieved the best cross-validated score.
cv_forest.best_params_

In [None]:
# Rebuild a forest from the best hyper-parameters found by the grid search.
# The dict has to be unpacked into keyword arguments: passed positionally it
# would be bound to n_estimators instead of configuring the model.
forest = RandomForestClassifier(**cv_forest.best_params_, random_state=42)

# The estimator must be fitted before it can score or predict.
forest.fit(x_train, y_train)

print('training set accuracy:', forest.score(x_train, y_train))
print('test set accuracy:', forest.score(x_test, y_test))

In [None]:
# Per-class probabilities for every row of the preprocessed test set
# (scikit-learn orders the columns like forest.classes_).
predictions = forest.predict_proba(X_test)

In [None]:
# Every category present in the raw training data, in sorted order; these are
# the probability columns of the submission.
answer = sorted(train['Category'].dropna().unique())

# Label the probability columns with the classes the fitted model actually
# knows (forest.classes_ matches the column order of predict_proba) instead of
# assuming that they coincide with `answer`.
proba = pd.DataFrame(predictions, columns=forest.classes_, index=test.index)

# A rare category can be missing from the fitted model when none of its rows
# ended up in the training split; give such categories probability 0 so the
# submission always has one column per category.
proba = proba.reindex(columns=answer, fill_value=0.0)

output = pd.concat([test[['Id']], proba], axis=1)
output.head()

In [None]:
# Write the submission file to the working directory; index=False keeps the
# DataFrame index out of the CSV.
output.to_csv('my_submission.csv', index=False)
print("Your submission was successfully saved!")