In [1]:
import numpy as np
import pandas as pd
import matplotlib as mpl
from sklearn.model_selection import train_test_split

In [2]:
# Dates have to extracted
data = pd.read_csv('train.csv', parse_dates=['Dates'])

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 867873 entries, 0 to 867872
Data columns (total 10 columns):
Dates         867873 non-null datetime64[ns]
Category      867873 non-null object
Descript      867873 non-null object
DayOfWeek     867873 non-null object
PdDistrict    867873 non-null object
Resolution    867873 non-null object
Address       867873 non-null object
X             867873 non-null float64
Y             867873 non-null float64
Id            867873 non-null int64
dtypes: datetime64[ns](1), float64(2), int64(1), object(6)
memory usage: 66.2+ MB


In [4]:
# No null values in the data-frame
data.isnull().values.any()

False

In [5]:
# Dates

data_week_dict = {
    'Monday': 1,
    'Tuesday':2,
    'Wednesday':3,
    'Thursday':4,
    'Friday':5,
    'Saturday':6,
    'Sunday':7
}

data['Hour'] = data.Dates.dt.hour
data['Minutes'] = data.Dates.dt.minute
data['Year'] = data.Dates.dt.year
data['Month'] = data.Dates.dt.month
data['Day'] = data.Dates.dt.day
data['DayOfWeekNum'] = data['DayOfWeek'].replace(data_week_dict)

In [6]:
# Address
# Think of an alternative solution - Issue when the attribute contains two streets
# Logically, 'street A / street B' == 'street B / street A'
# Such cases would fails this step

def street_from_address(address):
    street = address.split()
    return (' '.join(street[-2:]))

data['Street'] = data['Address'].apply(lambda a:street_from_address(a))

streets = data['Street'].unique()
data_street_dict = dict()
ctr = 1
for s in streets:
    data_street_dict[s] = ctr
    ctr += 1

data['StreetNum'] = data['Street'].replace(data_street_dict)

In [7]:
# Police Department District

district = data['PdDistrict'].unique()
data_district_dict = {}
ctr = 1

for d in district:
    data_district_dict[d] = ctr
    ctr += 1

data['PdDistrictNum'] = data['PdDistrict'].replace(data_district_dict)

In [8]:
# X and Y

data = data[data.X < -121]
data = data[data.Y < 40]

In [9]:
# Resolution

resolutions = data['Resolution'].unique()
data_resolution_dict = dict()
ctr = 1
for r in resolutions:
    data_resolution_dict[r] = ctr
    ctr += 1

data['ResolutionNum'] = data['Resolution'].replace(data_resolution_dict)

In [10]:
# Target Variable - Category

target = data['Category'].unique()
data_category_dict = dict()
ctr = 1
for t in target:
    data_category_dict[t] = ctr
    ctr += 1

data['CategoryNum'] = data['Category'].replace(data_category_dict)

In [11]:
corr = data.corr()
print(corr['CategoryNum'].sort_values(ascending=False))

CategoryNum      1.000000
ResolutionNum    0.025444
StreetNum        0.015281
DayOfWeekNum     0.011669
Month            0.009588
Id               0.004396
Day              0.000296
Year            -0.005110
PdDistrictNum   -0.005962
Hour            -0.013211
Y               -0.013261
X               -0.023153
Minutes         -0.046142
Name: CategoryNum, dtype: float64


In [12]:
data.drop('Dates', axis=1, inplace=True)
data.drop('Descript', axis=1, inplace=True)
data.drop('DayOfWeek', axis=1, inplace=True)
data.drop('PdDistrict', axis=1, inplace=True)
data.drop('Resolution', axis=1, inplace=True)
data.drop('Address', axis=1, inplace=True)
data.drop('Street', axis=1, inplace=True)
data.drop('Category', axis=1, inplace=True)

In [13]:
# Random seed has been set - As per the guidlines of the competition
train, validate = train_test_split(data, test_size=0.2, random_state=3, shuffle=True)

In [14]:
data.columns

Index(['X', 'Y', 'Id', 'Hour', 'Minutes', 'Year', 'Month', 'Day',
       'DayOfWeekNum', 'StreetNum', 'PdDistrictNum', 'ResolutionNum',
       'CategoryNum'],
      dtype='object')

In [15]:
y_train = data['CategoryNum']
X_train = data.loc[:, data.columns!='CategoryNum']

In [16]:
import xgboost as xgb
from sklearn.model_selection import cross_val_score

seed = 42

model = xgb.XGBClassifier(objective='multi:softprob', seed=seed)

In [None]:
score = -1 * cross_val_score(model, X_train, y_train, scoring='neg_log_loss', cv=5).mean()

In [None]:
print("Score = {0:.5f}".format(score))

TODO
- Pick relevant/independent columns and pass to the classifier
- Normalize the data
- Perform Exploratory Data Analysis, Visualization
- Finalize the feature list
- Build the models
- Voila!