In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder,scale
from sklearn.model_selection import cross_val_score
from sklearn.metrics import log_loss

In [2]:
# Read the train and test CSVs. The 'Dates' column is parsed into datetimes
# so that the .dt accessors can be used for feature extraction later on.
data, test = (
    pd.read_csv(csv_path, parse_dates=['Dates'])
    for csv_path in ('./dataset/train.csv', './dataset/test.csv')
)

In [3]:
# Summarise the training frame: per-column dtype and non-null count, plus
# the frame's memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 867873 entries, 0 to 867872
Data columns (total 10 columns):
Dates         867873 non-null datetime64[ns]
Category      867873 non-null object
Descript      867873 non-null object
DayOfWeek     867873 non-null object
PdDistrict    867873 non-null object
Resolution    867873 non-null object
Address       867873 non-null object
X             867873 non-null float64
Y             867873 non-null float64
Id            867873 non-null int64
dtypes: datetime64[ns](1), float64(2), int64(1), object(6)
memory usage: 66.2+ MB


In [4]:
# Evaluates to True if at least one cell of the training frame is missing.
# The saved output is False, i.e. the frame contains no null values.
data.isnull().any().any()

False

In [5]:
# Date/time features.
# Ordinal code for each weekday name found in the 'DayOfWeek' column
# (Monday=1 ... Sunday=7).
data_week_dict = {
    'Monday': 1,
    'Tuesday': 2,
    'Wednesday': 3,
    'Thursday': 4,
    'Friday': 5,
    'Saturday': 6,
    'Sunday': 7,
}


def _add_date_features(frame):
    """Add calendar features to ``frame`` in place.

    Creates Hour, Minutes, Year, Month and Day from the ``Dates`` datetime
    column, and DayOfWeekNum by looking ``DayOfWeek`` up in
    ``data_week_dict``.  Returns None.
    """
    when = frame['Dates'].dt
    frame['Hour'] = when.hour
    frame['Minutes'] = when.minute
    frame['Year'] = when.year
    frame['Month'] = when.month
    frame['Day'] = when.day
    # .map is a plain dictionary lookup.  A weekday name that is missing from
    # the dict becomes NaN instead of silently staying a string, which keeps
    # the column numeric and makes bad input visible.
    frame['DayOfWeekNum'] = frame['DayOfWeek'].map(data_week_dict)


# Run the identical transformation on both frames so their features cannot
# drift apart through copy-paste edits.
for frame in (data, test):
    _add_date_features(frame)

In [6]:
# Preview the first rows of the training frame, including the date-derived
# columns (Hour, Minutes, Year, Month, Day, DayOfWeekNum).
data.head()

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y,Id,Hour,Minutes,Year,Month,Day,DayOfWeekNum
0,2013-06-28 17:40:00,SEX OFFENSES FORCIBLE,"FORCIBLE RAPE, BODILY FORCE",Friday,MISSION,NONE,2100 Block of MISSION ST,-122.419331,37.762264,141546,17,40,2013,6,28,5
1,2004-02-19 02:46:00,LIQUOR LAWS,CONSUMING ALCOHOL IN PUBLIC VIEW,Thursday,SOUTHERN,"ARREST, BOOKED",1000 Block of MARKET ST,-122.41134,37.781271,794152,2,46,2004,2,19,4
2,2007-11-14 00:01:00,FRAUD,"CREDIT CARD, THEFT BY USE OF",Wednesday,SOUTHERN,NONE,800 Block of BRYANT ST,-122.403405,37.775421,531205,0,1,2007,11,14,3
3,2007-12-27 18:30:00,ROBBERY,ROBBERY OF A CHAIN STORE WITH A GUN,Thursday,BAYVIEW,DISTRICT ATTORNEY REFUSES TO PROSECUTE,2400 Block of SAN BRUNO AV,-122.404715,37.730161,523137,18,30,2007,12,27,4
4,2012-09-09 17:02:00,OTHER OFFENSES,PROBATION VIOLATION,Sunday,SOUTHERN,"ARREST, BOOKED",4TH ST / STEVENSON ST,-122.405239,37.785265,200968,17,2,2012,9,9,7


In [7]:
# Single LabelEncoder instance; later cells re-fit it for each column they
# encode, so it only ever remembers the most recently fitted column.
labelencoder = LabelEncoder()

In [8]:
def street_from_address(address):
    """Return the last two whitespace-separated tokens of ``address``.

    For example ``'2100 Block of MISSION ST'`` gives ``'MISSION ST'``.  When
    the address has fewer than two tokens, whatever tokens exist are returned
    (an empty string for an empty address).
    """
    # Splitting from the right at most twice leaves the final two tokens as
    # the last two list items; anything before them is discarded.
    tail = address.rsplit(None, 2)[-2:]
    return ' '.join(tail)

In [9]:
# Street name = last two tokens of the address, for both frames.
data['Street'] = data['Address'].apply(street_from_address)
test['Street'] = test['Address'].apply(street_from_address)

# Integer-encode the categorical columns.  The encoder is fitted ONCE per
# column on the union of train and test values and then applied to both
# frames, so a given string always maps to the same integer in train and
# test.  Fitting separately on each frame would assign codes from each
# frame's own sorted vocabulary and the two encodings would not line up.
for col in ('Street', 'Resolution', 'PdDistrict'):
    labelencoder.fit(pd.concat([data[col], test[col]], ignore_index=True))
    data[col + 'Num'] = labelencoder.transform(data[col])
    test[col + 'Num'] = labelencoder.transform(test[col])

# The target only exists in the training frame.
data['CategoryNum'] = labelencoder.fit_transform(data['Category'])

In [10]:
# Keep only the rows with X < -121 and Y < 40.
# NOTE(review): presumably X/Y are longitude/latitude and these bounds strip
# mis-geocoded points -- confirm the intended thresholds.
# NOTE(review): rows removed from `test` receive no prediction later on --
# confirm that is acceptable for the submission.
#
# Both conditions are applied with one mask, and .copy() makes each result an
# independent frame so the column assignments in later cells operate on a
# real DataFrame rather than on a slice (avoids SettingWithCopyWarning).
data = data[(data.X < -121) & (data.Y < 40)].copy()
test = test[(test.X < -121) & (test.Y < 40)].copy()

In [11]:
def getCapsAddress(i):
    """Return the upper-case tokens of ``i`` joined by single spaces.

    ``i`` is split on whitespace and only tokens for which ``str.isupper()``
    is true are kept, so words such as ``'Block'``/``'of'`` and tokens with
    no cased letters (``'2100'``, ``'/'``) are dropped.  Returns an empty
    string when no token qualifies.
    """
    return ' '.join(token for token in i.split() if token.isupper())

# Derive the upper-case-only address text for both frames.
for frame in (data, test):
    frame['newAddress'] = frame['Address'].apply(getCapsAddress)

In [12]:
# Fit on the union of train and test values so that the same address text
# receives the same integer code in both frames; fitting on each frame
# separately would produce two unrelated encodings.
labelencoder.fit(pd.concat([data.newAddress, test.newAddress], ignore_index=True))
data['newAddressNum'] = labelencoder.transform(data.newAddress)
test['newAddressNum'] = labelencoder.transform(test.newAddress)

In [28]:
# Flag intersection-style addresses (those containing a '/').
for frame in (data, test):
    frame['Address_CrossRoad'] = frame['Address'].str.contains('/')

# Addresses occurring at least 100 times in the training frame.
address_counts = data['Address'].value_counts()
topN_address_list = address_counts[address_counts >= 100].index
print(topN_address_list)

# Keep frequent addresses verbatim and bucket every other one as 'Others'.
# The frequent list comes from the training frame and is reused for test.
for frame in (data, test):
    is_frequent = frame['Address'].isin(topN_address_list)
    frame['Address_clean'] = frame['Address'].where(is_frequent, 'Others')
print(data.shape)

# Distinct frequent intersections present in the training frame.
crossload = data[data['Address_clean'].str.contains('/')]
crossroad_list = crossload['Address_clean'].unique()
print(len(crossroad_list))

Index(['800 Block of BRYANT ST', '800 Block of MARKET ST',
       '2000 Block of MISSION ST', '1000 Block of POTRERO AV',
       '900 Block of MARKET ST', '0 Block of TURK ST', '0 Block of 6TH ST',
       '300 Block of ELLIS ST', '400 Block of ELLIS ST',
       '16TH ST / MISSION ST',
       ...
       '0 Block of ZOO RD', '14TH ST / FOLSOM ST', 'POLK ST / HEMLOCK ST',
       'MARKET ST / HYDE ST', '700 Block of OAK ST',
       '800 Block of MISSOURI ST', '300 Block of 8TH ST',
       '0 Block of LOCKSLEY AV', '1000 Block of GRANT AV',
       '700 Block of GONZALEZ DR'],
      dtype='object', length=1616)
(867806, 26)
475


In [19]:
# Merge intersections that differ only in street order ('A / B' vs 'B / A').
# For every listed intersection, rows holding the reversed spelling are
# rewritten to the listed spelling.  When both spellings are in the list the
# pair is rewritten twice and ends up as whichever spelling comes later in
# crossroad_list; train and test go through the same sequence of rewrites.
for address in crossroad_list:
    first, second = (part.strip() for part in address.split('/')[:2])
    reverse_address = second + ' / ' + first
    for frame in (data, test):
        frame.loc[frame['Address_clean'] == reverse_address, 'Address_clean'] = address

# Recount the distinct intersections left after merging.
has_slash = data['Address_clean'].str.contains('/')
crossload = data[has_slash]
crossroad_list = crossload['Address_clean'].unique()
print(len(crossroad_list))

# Integer-encode the cleaned address for the training frame.
le = LabelEncoder()
le.fit(data['Address_clean'])
data['Address_clean_encode'] = le.transform(data['Address_clean'])
print(data.shape)

321
(867806, 26)


In [20]:
# Address_clean_encode is a model feature, so train and test MUST share one
# encoding.  Fit a single encoder on the union of both frames' values and
# (re)encode both frames with it; fitting an encoder on `test` alone would
# number the addresses from test's own vocabulary and the codes would not
# match the ones the model was trained on.
le = LabelEncoder()
le.fit(pd.concat([data['Address_clean'], test['Address_clean']], ignore_index=True))
data['Address_clean_encode'] = le.transform(data['Address_clean'])
test['Address_clean_encode'] = le.transform(test['Address_clean'])

In [22]:
# Pearson correlation of every numeric/boolean column with the encoded target.
# Non-numeric columns (strings, datetimes) are excluded explicitly: recent
# pandas versions raise on them instead of silently dropping them.
# NOTE: CategoryNum is a label-encoded nominal variable, so these linear
# correlations are only a rough screening signal.
numeric_cols = data.select_dtypes(include=['number', 'bool'])
corr = numeric_cols.corr()
print(corr['CategoryNum'].sort_values(ascending=False))

CategoryNum             1.000000
Address_clean_encode    0.070796
Address_CrossRoad       0.069896
ResolutionNum           0.039676
Hour                    0.023701
Id                      0.016881
DayOfWeekNum            0.000537
Day                     0.000354
Month                  -0.000360
StreetNum              -0.014526
Year                   -0.016795
newAddressNum          -0.019639
Minutes                -0.021945
X                      -0.030525
PdDistrictNum          -0.039477
Y                      -0.058963
Name: CategoryNum, dtype: float64


In [None]:
# data.drop('Dates', axis=1, inplace=True)
# data.drop('Descript', axis=1, inplace=True)
# data.drop('DayOfWeek', axis=1, inplace=True)
# data.drop('PdDistrict', axis=1, inplace=True)
# data.drop('Resolution', axis=1, inplace=True)
# data.drop('Address', axis=1, inplace=True)
# data.drop('Street', axis=1, inplace=True)
# data.drop('Category', axis=1, inplace=True)
# data.drop('Minutes', axis=1, inplace=True)
# data.drop('newAddress', axis=1, inplace=True)
# data.drop('Id',axis=1,inplace=True)

In [24]:
# Columns fed to the classifiers, in the order the models will see them.
features = [
    # coordinates
    'X', 'Y',
    # date/time parts
    'Hour', 'Minutes', 'Year', 'Month', 'Day', 'DayOfWeekNum',
    # label-encoded categoricals
    'PdDistrictNum', 'ResolutionNum',
    # address-derived
    'Address_CrossRoad', 'Address_clean_encode',
]

In [None]:
# Per-category histograms (one figure per CategoryNum value), currently disabled:
# for i in data.CategoryNum.unique():
#     print(i,labelencoder.inverse_transform(data.CategoryNum.unique())[i])
#     data[data.CategoryNum==i].hist(bins=50, figsize=(20,15))
#     plt.show()
# Histograms (50 bins each) of the training frame's columns on one 20x15 figure.
data.hist(bins=50, figsize=(20,15))
plt.show()

In [None]:
# Hold out 30% of the labelled rows for validation.  The random seed is fixed
# as per the guidelines of the competition.
# The hold-out part is bound to `valid`, not `test`: `test` is the unlabelled
# competition frame loaded at the top of the notebook, and rebinding it here
# would make the submission cells predict on training rows instead.
train, valid = train_test_split(data, test_size=0.3, random_state=3, shuffle=True)

In [29]:
# List the columns available in `test`.  Per the saved output there is no
# Category/CategoryNum column, i.e. this frame carries no labels.
test.columns

Index(['Dates', 'Descript', 'DayOfWeek', 'PdDistrict', 'Resolution', 'Address',
       'X', 'Y', 'Id', 'Hour', 'Minutes', 'Year', 'Month', 'Day',
       'DayOfWeekNum', 'Street', 'StreetNum', 'ResolutionNum', 'PdDistrictNum',
       'newAddress', 'newAddressNum', 'Address_clean', 'Address_clean_encode',
       'Address_CrossRoad'],
      dtype='object')

In [30]:
# Training matrix and target come from the full labelled frame.
X_train, y_train = data[features], data['CategoryNum']
# `test` has no CategoryNum column, so only a feature matrix is built for it
# (there is no y_test).
X_test = test[features]

In [31]:
# Preview only the first rows of the test feature matrix instead of rendering
# the whole frame.
X_test.head(10)

Unnamed: 0,X,Y,Hour,Minutes,Year,Month,Day,DayOfWeekNum,PdDistrictNum,ResolutionNum,Address_CrossRoad,Address_clean_encode
0,-122.407878,37.785968,23,0,2010,7,4,7,9,11,False,163
1,-122.443597,37.782644,22,43,2004,6,26,6,5,0,False,566
2,-122.400474,37.785029,21,26,2013,2,9,6,7,1,True,1148
3,-122.419698,37.777301,22,30,2006,12,3,7,4,1,True,1217
4,-122.451488,37.767516,8,45,2014,9,21,7,5,11,True,1217
5,-122.412597,37.783932,18,0,2010,8,29,7,9,11,True,1152
6,-122.424641,37.772789,17,0,2013,2,10,7,4,11,False,137
7,-122.481353,37.720399,16,0,2005,9,19,1,8,11,True,1217
8,-122.421973,37.775998,12,0,2011,8,9,2,4,11,False,484
9,-122.445239,37.786977,21,0,2013,6,2,7,6,11,False,1217


In [None]:
from sklearn.linear_model import LogisticRegression

# `X_test` has no labels (no y_test exists), so the model is scored on a
# held-out 30% slice of the labelled training rows instead.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.3, random_state=3, shuffle=True)

clf = LogisticRegression(random_state=0, solver='lbfgs', multi_class='multinomial',max_iter=1000)
clf.fit(X_fit, y_fit)
pred = clf.predict_proba(X_val)
# labels=clf.classes_ ties each probability column to its class, which is
# required when a rare category is absent from the validation slice.
log_loss(y_val, pred, labels=clf.classes_)

In [None]:
from sklearn.tree import DecisionTreeClassifier

# `X_test` has no labels (no y_test exists), so the model is scored on a
# held-out 30% slice of the labelled training rows instead.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.3, random_state=3, shuffle=True)

# random_state pins the tree's tie-breaking so re-runs give the same scores.
clf = DecisionTreeClassifier(max_depth=8, random_state=0)
clf.fit(X_fit, y_fit)
print(clf.score(X_val, y_val))
pred = clf.predict_proba(X_val)
# labels=clf.classes_ ties each probability column to its class, which is
# required when a rare category is absent from the validation slice.
print(log_loss(y_val, pred, labels=clf.classes_))

In [None]:
from sklearn.naive_bayes import GaussianNB

# `X_test` has no labels (no y_test exists), so the model is scored on a
# held-out 30% slice of the labelled training rows instead.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.3, random_state=3, shuffle=True)

clf = GaussianNB()
clf.fit(X_fit, y_fit)
pred = clf.predict_proba(X_val)
# labels=clf.classes_ ties each probability column to its class, which is
# required when a rare category is absent from the validation slice.
log_loss(y_val, pred, labels=clf.classes_)

In [None]:
import xgboost as xgb

# Seed handed to the booster so that training is repeatable.
seed = 42

# Multi-class classifier emitting per-class probabilities; trees are limited
# to depth 8.
xgb_params = dict(objective='multi:softprob', seed=seed, max_depth=8)
model = xgb.XGBClassifier(**xgb_params)

In [None]:
# 3-fold cross-validation on all CPU cores.  The scorer reports the NEGATIVE
# log-loss, so flip the sign to obtain the per-fold log-loss.
neg_log_loss = cross_val_score(model, X_train, y_train, scoring='neg_log_loss', cv=3, n_jobs=-1)
score = -neg_log_loss

In [None]:
# Mean of the per-fold cross-validation log-loss values, to five decimals.
print("Score = {0:.5f}".format(score.mean()))

In [None]:
# Per-fold log-loss values from the cross-validation run.
score

In [None]:
# Fit the XGBoost classifier on the full training matrix (all labelled rows).
model.fit(X_train,y_train)

In [32]:
import pickle

In [None]:
# Persist the fitted model.  The context manager guarantees the file is
# flushed and closed even if pickling fails part-way.
with open("xgboost.p", "wb") as model_file:
    pickle.dump(model, model_file)

In [33]:
# Reload the fitted model from disk; the context manager closes the file
# handle as soon as loading finishes.
# NOTE: pickle.load can execute arbitrary code contained in the file -- only
# load model files that this notebook (or another trusted source) produced.
with open("xgboost.p", "rb") as model_file:
    model = pickle.load(model_file)
model

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=8, min_child_weight=1, missing=nan, n_estimators=100,
       n_jobs=1, nthread=None, objective='multi:softprob', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=42, silent=True,
       subsample=1)

In [34]:
# Class-probability predictions for the test feature matrix.
predictions = model.predict_proba(X_test)


In [39]:
# One probability column per category.  The names are the sorted unique
# category strings, which is the order LabelEncoder used for CategoryNum.
submission = pd.DataFrame(predictions, columns=sorted(data.Category.unique()))

# Attach the ids POSITIONALLY.  `submission` has a fresh 0..n-1 index while
# `test` keeps its original row labels (rows may have been filtered out), so
# assigning the Series directly would align on index and could pair
# predictions with the wrong Id or produce NaN ids.
assert len(submission) == len(test), "predictions and test rows are out of sync"
submission['Id'] = test['Id'].values

# Preview only the first rows rather than rendering the whole table.
submission.head(10)

Unnamed: 0,ARSON,ASSAULT,BAD CHECKS,BRIBERY,BURGLARY,DISORDERLY CONDUCT,DRIVING UNDER THE INFLUENCE,DRUG/NARCOTIC,DRUNKENNESS,EMBEZZLEMENT,...,SEX OFFENSES FORCIBLE,STOLEN PROPERTY,SUICIDE,SUSPICIOUS OCC,TRESPASS,VANDALISM,VEHICLE THEFT,WARRANTS,WEAPON LAWS,Id
0,0.000956,0.078766,0.000212,0.000242,0.045514,0.001092,0.000207,0.002382,0.000528,0.000741,...,0.003909,0.000440,0.000438,0.028829,0.001466,0.038261,0.017003,0.001522,0.000608,349598
1,0.001083,0.139933,0.000326,0.000517,0.047235,0.002588,0.008872,0.072907,0.033673,0.000451,...,0.002700,0.015117,0.000187,0.001102,0.009922,0.030629,0.005442,0.183368,0.020100,766313
2,0.000657,0.009994,0.000075,0.000138,0.000140,0.000985,0.001370,0.009002,0.001624,0.000086,...,0.000370,0.000864,0.000077,0.008263,0.000257,0.005309,0.000423,0.002769,0.002722,169887
3,0.000607,0.038261,0.000135,0.000156,0.000199,0.002054,0.001413,0.028552,0.001666,0.000142,...,0.000931,0.000495,0.000137,0.004256,0.000510,0.007984,0.001268,0.003562,0.005445,594704
4,0.000970,0.085833,0.000198,0.000288,0.000319,0.002533,0.000443,0.008532,0.005583,0.000229,...,0.001665,0.000760,0.000161,0.042659,0.000280,0.040706,0.034647,0.006989,0.004317,47900
5,0.000388,0.124670,0.000270,0.000272,0.001106,0.000671,0.000211,0.005204,0.000574,0.000237,...,0.005084,0.000788,0.000208,0.029322,0.000496,0.067996,0.049184,0.001996,0.002758,339260
6,0.000975,0.064233,0.000425,0.000262,0.094735,0.001577,0.000175,0.001613,0.000340,0.000462,...,0.004367,0.001130,0.000346,0.027753,0.003828,0.103837,0.085180,0.001056,0.001742,169575
7,0.000709,0.051881,0.000274,0.000300,0.000693,0.002007,0.000204,0.003618,0.001271,0.000236,...,0.007085,0.001334,0.000197,0.023912,0.000495,0.063291,0.166824,0.000577,0.002370,681483
8,0.000766,0.078792,0.001693,0.000240,0.059472,0.002245,0.000182,0.004636,0.000225,0.005033,...,0.005985,0.001356,0.000715,0.061744,0.005340,0.050486,0.031995,0.001316,0.002199,276938
9,0.001093,0.045059,0.000264,0.000357,0.094936,0.001020,0.000188,0.000922,0.000337,0.001357,...,0.004013,0.000599,0.000719,0.036509,0.002653,0.099935,0.053769,0.000474,0.000635,146369


In [41]:
# Write the submission table to 'submission.csv' without the index column.
submission.to_csv('submission.csv', index=False)

TODO
- Pick relevant/independent columns and pass to the classifier
- Normalize the data
- Perform Exploratory Data Analysis, Visualization
- Finalize the feature list
- Build the models
- Voila!