In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder,scale
from sklearn.model_selection import cross_val_score
from sklearn.metrics import log_loss

In [None]:
# Load the training and test CSVs. The 'Dates' column has to be parsed into
# datetimes so that its components can be extracted later.
data, test = (
    pd.read_csv(csv_path, parse_dates=['Dates'])
    for csv_path in ('train.csv', 'test.csv')
)

In [None]:
# Print a summary of the training frame: column names, dtypes and non-null counts.
data.info()

In [None]:
# Sanity check for missing values: evaluates to True if any cell of the
# training frame is null. The expected result is False (no null values).
data.isna().to_numpy().any()

In [None]:
# Encoder instance used by later cells to turn string columns into integer codes.
labelencoder = LabelEncoder()

In [None]:
# Derive calendar features from the parsed 'Dates' timestamp. The same
# extraction is applied to both frames so their feature columns stay in sync.
for frame in (data, test):
    timestamps = frame['Dates'].dt
    frame['Hour'] = timestamps.hour
    frame['Minutes'] = timestamps.minute
    frame['Year'] = timestamps.year
    frame['Month'] = timestamps.month
    frame['Day'] = timestamps.day

# Fit the weekday encoder on the training data only and reuse that mapping for
# the test set, so a given weekday name always receives the same integer code.
# (transform raises ValueError if the test set contains a label that was not
# seen in training, which is preferable to silently assigning different codes.)
data['DayOfWeekNum'] = labelencoder.fit_transform(data['DayOfWeek'])
test['DayOfWeekNum'] = labelencoder.transform(test['DayOfWeek'])

In [None]:
# Display the training frame's column index to check which columns are available.
data.columns

In [None]:
# Integer-encode the categorical predictors. Each encoder is fitted on the
# training column and then applied unchanged to the test column, so the same
# label maps to the same code in both frames. Fitting separately on the test
# set would produce different codes whenever the two label sets differ.
# NOTE(review): 'Resolution' describes how an incident was resolved; confirm
# that test.csv actually provides this column before relying on it as a feature.
for column in ('Resolution', 'PdDistrict'):
    column_encoder = LabelEncoder().fit(data[column])
    data[column + 'Num'] = column_encoder.transform(data[column])
    # Raises ValueError on labels unseen in training instead of mis-encoding them.
    test[column + 'Num'] = column_encoder.transform(test[column])

# The target only exists in the training data.
data['CategoryNum'] = labelencoder.fit_transform(data['Category'])

In [None]:
# Upper bounds for plausible coordinates: rows are considered valid only when
# X < -121 and Y < 40.
MAX_VALID_X = -121
MAX_VALID_Y = 40

# Training rows with out-of-range coordinates are dropped. .copy() makes the
# result an independent frame so the column assignments in later cells do not
# operate on a slice of the original (SettingWithCopyWarning).
data = data[(data.X < MAX_VALID_X) & (data.Y < MAX_VALID_Y)].copy()

# Test rows are NOT dropped: the submission built at the end of the notebook
# pairs one prediction with every test['Id'], so removing rows here would leave
# Ids without a prediction and gaps in the index. Out-of-range coordinates are
# replaced with the training medians instead.
test_out_of_range = ~((test.X < MAX_VALID_X) & (test.Y < MAX_VALID_Y))
test.loc[test_out_of_range, 'X'] = data.X.median()
test.loc[test_out_of_range, 'Y'] = data.Y.median()

In [None]:
# Flag addresses that contain a '/' (i.e. are written as "street / street").
for frame in (data, test):
    frame['Address_CrossRoad'] = frame['Address'].str.contains('/')

# Addresses that occur at least 100 times in the training data keep their own
# value; the frequency threshold is always computed on the training frame.
address_counts = data['Address'].value_counts()
topN_address_list = address_counts[address_counts >= 100].index

# Every other address is collapsed into a single 'Others' bucket.
for frame in (data, test):
    is_frequent = frame['Address'].isin(topN_address_list)
    frame['Address_clean'] = frame['Address'].where(is_frequent, 'Others')

# Distinct frequent training addresses that are written with a '/'.
crossload = data.loc[data['Address_clean'].str.contains('/')]
crossroad_list = crossload['Address_clean'].unique()

In [None]:
# Treat 'A / B' and 'B / A' as the same place: for every known '/' address,
# rows that spell it with the two parts swapped are rewritten to match it.
# The loop is order-dependent, so the iteration order of crossroad_list is kept.
for crossroad in crossroad_list:
    parts = crossroad.split('/')
    swapped = parts[1].strip() + ' / ' + parts[0].strip()
    for frame in (data, test):
        frame.loc[frame['Address_clean'] == swapped, 'Address_clean'] = crossroad

# Refresh the '/' subset and its distinct values now that spellings are merged.
is_crossroad = data['Address_clean'].str.contains('/')
crossload = data[is_crossroad]
crossroad_list = crossload['Address_clean'].unique()

# Integer-encode the cleaned training addresses.
le = LabelEncoder()
le.fit(data['Address_clean'])
data['Address_clean_encode'] = le.transform(data['Address_clean'])

In [None]:
# Encode the test addresses with `le`, the encoder already fitted on the
# training addresses. A fresh encoder fitted on the test set would number the
# addresses independently, so the same address could get a different code in
# train and test and the model would see meaningless values at prediction time.
# transform raises ValueError if the test set holds a value absent from training.
test['Address_clean_encode'] = le.transform(test['Address_clean'])

In [None]:
# Correlation of every numeric/boolean column with the encoded target.
# The frame still contains string and datetime columns; recent pandas versions
# no longer drop those silently in DataFrame.corr(), so select the usable
# columns explicitly (this works on old and new pandas alike).
numeric_columns = data.select_dtypes(include=['number', 'bool'])
corr = numeric_columns.corr()
corr['CategoryNum'].sort_values(ascending=False)

In [None]:
# Columns fed to the models, grouped by origin: coordinates, date parts,
# encoded categoricals, and address-derived features.
features = [
    'X', 'Y',
    'Hour', 'Minutes', 'Year', 'Month', 'Day', 'DayOfWeekNum',
    'PdDistrictNum', 'ResolutionNum',
    'Address_CrossRoad', 'Address_clean_encode',
]

In [None]:
# Hold out 30% of the (shuffled) training data for local evaluation.
# The random seed is fixed, as per the guidelines of the competition.
train_, test_ = train_test_split(data, test_size=0.3, random_state=3, shuffle=True)

In [None]:
target = 'CategoryNum'

# Feature matrix / target vector for each side of the local hold-out split.
X_train_, y_train_ = train_[features], train_[target]
X_test_, y_test_ = test_[features], test_[target]

# Feature matrix / target vector built from the complete training frame.
X_train, y_train = data[features], data[target]

In [None]:
from sklearn.linear_model import LogisticRegression

# Baseline: multinomial logistic regression.
# Fit on the training side of the hold-out split and score on the held-out
# side (X_test_ / y_test_), so the log loss is measured on rows the model has
# not seen during fitting.
clf = LogisticRegression(random_state=0, solver='lbfgs', multi_class='multinomial',max_iter=1000)
clf.fit(X_train_, y_train_)
pred = clf.predict_proba(X_test_)
# labels= tells log_loss which class each probability column belongs to, which
# is required when a rare class is missing from the held-out targets.
log_loss(y_test_, pred, labels=clf.classes_)

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Depth-limited decision tree.
# Fit on the training side of the hold-out split and evaluate on the held-out
# side (X_test_ / y_test_), so both metrics are measured on unseen rows.
clf = DecisionTreeClassifier(max_depth=8)
clf.fit(X_train_, y_train_)
print(clf.score(X_test_, y_test_))
pred = clf.predict_proba(X_test_)
# labels= maps probability columns to classes even if a rare class is missing
# from the held-out targets.
print(log_loss(y_test_, pred, labels=clf.classes_))

In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes.
# Fit on the training side of the hold-out split and score on the held-out
# side (X_test_ / y_test_), so the log loss is measured on unseen rows.
clf = GaussianNB()
clf.fit(X_train_, y_train_)
pred = clf.predict_proba(X_test_)
# labels= maps probability columns to classes even if a rare class is missing
# from the held-out targets.
log_loss(y_test_, pred, labels=clf.classes_)

In [None]:
import xgboost as xgb

# XGBoost classifier configured to output per-class probabilities
# ('multi:softprob'), with a fixed seed and trees limited to depth 8.
seed = 42
xgb_params = dict(objective='multi:softprob', seed=seed, max_depth=8)
model = xgb.XGBClassifier(**xgb_params)

In [None]:
# 3-fold cross-validation on the full training set, using all cores.
# The 'neg_log_loss' scorer returns the negated loss, so flip the sign to get
# the log loss of each fold.
neg_log_loss = cross_val_score(model, X_train, y_train, scoring='neg_log_loss', cv=3, n_jobs=-1)
score = -neg_log_loss

In [None]:
# Report the mean cross-validated log loss, rounded to five decimals.
mean_cv_log_loss = score.mean()
print("Score = {0:.5f}".format(mean_cv_log_loss))

In [None]:
# Fit the XGBoost model on X_train / y_train.
model.fit(X_train,y_train)

In [None]:
import pickle

In [None]:
# Persist the fitted model. The context manager guarantees the file is flushed
# and closed even if pickling fails part-way.
with open("xgboost.p", "wb") as model_file:
    pickle.dump(model, model_file)

In [None]:
# Reload the persisted model, closing the file handle once it has been read.
# NOTE: pickle.load executes arbitrary code from the file; only load files
# produced by this notebook, never pickles from an untrusted source.
with open("xgboost.p", "rb") as model_file:
    model = pickle.load(model_file)
model

In [None]:
# Feature matrix of the competition test set, built with the same feature
# columns (and column order) the model was trained on.
X_test = test[features]

# Per-class probabilities, one row per test record.
predictions = model.predict_proba(X_test)

In [None]:
# One probability column per category, labelled with the category names in
# sorted order.
# NOTE(review): this assumes the model's class order equals the sorted category
# names (true for LabelEncoder codes); confirm against model.classes_.
submission = pd.DataFrame(predictions, columns=sorted(data.Category.unique()))

# Attach the Ids by position, not by index label: `submission` has a fresh
# 0..n-1 index, while `test` may carry a non-contiguous index after filtering.
# Assigning the Series directly would align on labels and yield NaN or shifted
# Ids. Using .values also makes a row-count mismatch fail loudly.
submission['Id'] = test['Id'].values

In [None]:
# Write the submission to disk without the DataFrame index column.
submission.to_csv('submission.csv', index=False)

TODO
- Pick relevant/independent columns and pass to the classifier
- Normalize the data
- Perform Exploratory Data Analysis, Visualization
- Finalize the feature list
- Build the models
- Voila!