In [71]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder,StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.metrics import log_loss

In [2]:
# Dates have to extracted
# data = pd.read_csv('train.csv', parse_dates=['Dates'])
data = pd.read_csv('./dataset/train.csv', parse_dates=['Dates'])
test = pd.read_csv('./dataset/test.csv', parse_dates=['Dates'])

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 867873 entries, 0 to 867872
Data columns (total 10 columns):
Dates         867873 non-null datetime64[ns]
Category      867873 non-null object
Descript      867873 non-null object
DayOfWeek     867873 non-null object
PdDistrict    867873 non-null object
Resolution    867873 non-null object
Address       867873 non-null object
X             867873 non-null float64
Y             867873 non-null float64
Id            867873 non-null int64
dtypes: datetime64[ns](1), float64(2), int64(1), object(6)
memory usage: 66.2+ MB


In [65]:
# No null values in the data-frame
data.isnull().values.any()

True

In [5]:
# Dates
data_week_dict = {
    'Monday': 1,
    'Tuesday':2,
    'Wednesday':3,
    'Thursday':4,
    'Friday':5,
    'Saturday':6,
    'Sunday':7
}

data['Hour'] = data.Dates.dt.hour
data['Minutes'] = data.Dates.dt.minute
data['Year'] = data.Dates.dt.year
data['Month'] = data.Dates.dt.month
data['Day'] = data.Dates.dt.day
data['DayOfWeekNum'] = data['DayOfWeek'].replace(data_week_dict)

test['Hour'] = test.Dates.dt.hour
test['Minutes'] = test.Dates.dt.minute
test['Year'] = test.Dates.dt.year
test['Month'] = test.Dates.dt.month
test['Day'] = test.Dates.dt.day
test['DayOfWeekNum'] = test['DayOfWeek'].replace(data_week_dict)

In [6]:
data.head()

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y,Id,Hour,Minutes,Year,Month,Day,DayOfWeekNum
0,2013-06-28 17:40:00,SEX OFFENSES FORCIBLE,"FORCIBLE RAPE, BODILY FORCE",Friday,MISSION,NONE,2100 Block of MISSION ST,-122.419331,37.762264,141546,17,40,2013,6,28,5
1,2004-02-19 02:46:00,LIQUOR LAWS,CONSUMING ALCOHOL IN PUBLIC VIEW,Thursday,SOUTHERN,"ARREST, BOOKED",1000 Block of MARKET ST,-122.41134,37.781271,794152,2,46,2004,2,19,4
2,2007-11-14 00:01:00,FRAUD,"CREDIT CARD, THEFT BY USE OF",Wednesday,SOUTHERN,NONE,800 Block of BRYANT ST,-122.403405,37.775421,531205,0,1,2007,11,14,3
3,2007-12-27 18:30:00,ROBBERY,ROBBERY OF A CHAIN STORE WITH A GUN,Thursday,BAYVIEW,DISTRICT ATTORNEY REFUSES TO PROSECUTE,2400 Block of SAN BRUNO AV,-122.404715,37.730161,523137,18,30,2007,12,27,4
4,2012-09-09 17:02:00,OTHER OFFENSES,PROBATION VIOLATION,Sunday,SOUTHERN,"ARREST, BOOKED",4TH ST / STEVENSON ST,-122.405239,37.785265,200968,17,2,2012,9,9,7


In [7]:
def newMin(i):
    if(i<15):
        return 0
        
    elif(i>=15 and i<30):
        return 15

    elif(i>=30 and i<45):
        return 30
    
    elif(i>=45):
        return 45

data['newMin'] = data.Minutes.apply(lambda a:newMin(a))
test['newMin'] = test.Minutes.apply(lambda a:newMin(a))

In [8]:
# https://www.studentflights.com.au/destinations/san-francisco/weather
def season(i):
    if i in [2, 3]:
        return 1
    if i in [4,5]:
        return 2
    if i in [6,7]:
        return 3
    if i in [8,9]:
        return 4
    if i in [10,11]:
        return 5
    if i in [12,1]:
        return 6
    
    
data['seasons'] = data.Month.apply(lambda a:season(a))
test['seasons'] = test.Month.apply(lambda a:season(a))

In [9]:
data.seasons.value_counts()

2    155948
5    151471
1    145447
3    139225
4    138860
6    136922
Name: seasons, dtype: int64

In [10]:
labelencoder = LabelEncoder()

In [11]:
data['ResolutionNum'] = labelencoder.fit_transform(data['Resolution'])
data['PdDistrictNum'] = labelencoder.fit_transform(data['PdDistrict'])
data['CategoryNum'] = labelencoder.fit_transform(data['Category'])

test['ResolutionNum'] = labelencoder.fit_transform(test['Resolution'])
test['PdDistrictNum'] = labelencoder.fit_transform(test['PdDistrict'])

In [12]:
data = data[data.X < -121]
data = data[data.Y < 40]

test = test[test.X < -121]
test = test[test.Y < 40]

In [13]:
def getCapsAddress(i):
    s=''
    for j in i.split():
        if(j.isupper()):
            s=s+' '+j
    return s[1:]

data['newAddress'] = data.Address.apply(lambda a:getCapsAddress(a))
test['newAddress'] = test.Address.apply(lambda a:getCapsAddress(a))

In [14]:
data['newAddressNum'] = labelencoder.fit_transform(data.newAddress)
test['newAddressNum'] = labelencoder.fit_transform(test.newAddress)

In [15]:
data['Address_CrossRoad'] = data['Address'].str.contains('/')
test['Address_CrossRoad'] = test['Address'].str.contains('/')

topN_address_list = data['Address'].value_counts()
topN_address_list = topN_address_list[topN_address_list >=100]
topN_address_list = topN_address_list.index
print(topN_address_list)

data['Address_clean'] = data['Address']
test['Address_clean'] = test['Address']
data.loc[~data['Address'].isin(topN_address_list), 'Address_clean'] = 'Others'
test.loc[~test['Address'].isin(topN_address_list), 'Address_clean'] = 'Others'
print(data.shape)

crossload = data[data['Address_clean'].str.contains('/')]
crossroad_list = crossload['Address_clean'].unique()
print(len(crossroad_list))

Index(['800 Block of BRYANT ST', '800 Block of MARKET ST',
       '2000 Block of MISSION ST', '1000 Block of POTRERO AV',
       '900 Block of MARKET ST', '0 Block of TURK ST', '0 Block of 6TH ST',
       '300 Block of ELLIS ST', '400 Block of ELLIS ST',
       '16TH ST / MISSION ST',
       ...
       '800 Block of MISSOURI ST', '500 Block of JACKSON ST',
       'CAPP ST / 19TH ST', '1000 Block of GRANT AV',
       '700 Block of GONZALEZ DR', 'MARKET ST / HYDE ST',
       '14TH ST / FOLSOM ST', '900 Block of GRANT AV', '700 Block of OAK ST',
       '300 Block of 8TH ST'],
      dtype='object', length=1616)
(867806, 25)
475


In [16]:
for address in crossroad_list:
    address_split = address.split('/')
    reverse_address = address_split[1].strip() + ' / ' + address_split[0].strip()
    data.loc[data['Address_clean'] == reverse_address, 'Address_clean'] = address
    test.loc[test['Address_clean'] == reverse_address, 'Address_clean'] = address
crossload = data[data['Address_clean'].str.contains('/')]
crossroad_list = crossload['Address_clean'].unique()
print(len(crossroad_list))

le = LabelEncoder()
data['Address_clean_encode'] = le.fit_transform(data['Address_clean'])
print(data.shape)

321
(867806, 26)


In [17]:
le = LabelEncoder()
test['Address_clean_encode'] = le.fit_transform(test['Address_clean'])

In [18]:
def is_weekend(day):
    if day in ['Friday', 'Saturday', 'Sunday']:
        return True
    else:
        return False
    
data['is_weekend'] = data.DayOfWeek.apply(lambda x : is_weekend(x))
test['is_weekend'] = test.DayOfWeek.apply(lambda x : is_weekend(x))

In [19]:
def night_time(time):
    if time >= 22 or time <= 6:
        return True
    else:
        return False

data['is_night_time'] = data.Hour.apply(lambda x : night_time(x))
test['is_night_time'] = test.Hour.apply(lambda x : night_time(x))

In [20]:
import holidays
us_holidays = holidays.US()
def is_holiday(date):
    if date in us_holidays:
        return True
    else:
        return False

data['is_holiday'] = data.Dates.dt.date.apply(lambda x: is_holiday(x))
test['is_holiday'] = test.Dates.dt.date.apply(lambda x: is_holiday(x))

In [21]:
def get_address_char(address):
    strings = address.strip().split('/')
    if(len(strings) == 1):
        return [strings[0].strip()[-2:].strip()]
    else:
        return [strings[0].strip()[-2:].strip(), strings[1][-2:].strip()]

In [22]:
def get_tags(all_address):
    all_tags = []
    for address in all_address:
        tags = get_address_char(address)
        for tag in tags:
            if(len(tag) != 0 and tag.isdigit() == False):
                all_tags.append(tag)
    return list(set(all_tags))

In [23]:
all_tags = get_tags(data.Address)

In [24]:
data['tags'] = data.Address.apply(lambda x: get_address_char(x))
test['tags'] = test.Address.apply(lambda x: get_address_char(x))

In [25]:
def makeDict(col):
    col = col[0]
    all_dict = {}
    for i in all_tags:
        all_dict[i]=0
    for i in col:
        all_dict[i]=1
    return all_dict

In [26]:
all_dicts_data = data[['tags']].apply(makeDict,axis=1)
all_dicts_test = test[['tags']].apply(makeDict,axis=1)

In [27]:
data_dicts_pd = pd.DataFrame(list(all_dicts_data),index=data.index)
test_dicts_pd = pd.DataFrame(list(all_dicts_test),index=test.index)

In [28]:
# data_dicts_pd.drop(columns=['','80'],inplace=True)

In [29]:
data = pd.concat([data,data_dicts_pd],axis=1)
test = pd.concat([test,test_dicts_pd],axis=1)

In [30]:
corr = data.corr()
print(corr['CategoryNum'].sort_values(ascending=False))

CategoryNum             1.000000
Address_clean_encode    0.070796
Address_CrossRoad       0.069896
AV                      0.041185
ResolutionNum           0.039676
Hour                    0.023701
Id                      0.016881
WY                      0.013034
is_night_time           0.011986
BL                      0.006744
TR                      0.006426
RD                      0.003983
CR                      0.003544
NO                      0.002595
DR                      0.002450
RW                      0.002233
MS                      0.001754
HY                      0.001571
AR                      0.001330
RK                      0.001163
LN                      0.000733
AY                      0.000666
DayOfWeekNum            0.000537
CT                      0.000534
is_holiday              0.000504
EX                      0.000372
Day                     0.000354
is_weekend              0.000065
seasons                -0.000069
ER                     -0.000258
Month     

In [85]:
xyscaler = StandardScaler()
xyscaler.fit(data[["X","Y"]])
data[["X","Y"]] = xyscaler.transform(data[["X","Y"]])

# data["X_reduced"] = data.X.apply(lambda x: "{0:.2f}".format(x)).astype(float)
# data["Y_reduced"] = data.Y.apply(lambda x: "{0:.2f}".format(x)).astype(float)
# data["X_reduced_cat"] = pd.Categorical.from_array(data.X_reduced).codes
# data["Y_reduced_cat"] = pd.Categorical.from_array(data.Y_reduced).codes

data["rot_45_X"] = .707*data["Y"] + .707*data["X"]
data["rot_45_Y"] = .707* data["Y"] - .707* data["X"]

data["rot_30_X"] = (1.732/2)*data["X"] + (1./2)*data["Y"]
data["rot_30_Y"] = (1.732/2)* data["Y"] - (1./2)* data["X"]

data["rot_60_X"] = (1./2)*data["X"] + (1.732/2)*data["Y"]
data["rot_60_Y"] = (1./2)* data["Y"] - (1.732/2)* data["X"]

data["radial_r"] = np.sqrt( np.power(data["Y"],2) + np.power(data["X"],2) )
data['XY'] = data.X * data.Y

In [213]:
xyscaler = StandardScaler()
xyscaler.fit(test[["X","Y"]])
test[["X","Y"]] = xyscaler.transform(test[["X","Y"]])

# test["X_reduced"] = test.X.apply(lambda x: "{0:.2f}".format(x)).astype(float)
# test["Y_reduced"] = test.Y.apply(lambda x: "{0:.2f}".format(x)).astype(float)
# test["X_reduced_cat"] = pd.Categorical.from_array(test.X_reduced).codes
# test["Y_reduced_cat"] = pd.Categorical.from_array(test.Y_reduced).codes

test["rot_45_X"] = .707*test["Y"] + .707*test["X"]
test["rot_45_Y"] = .707* test["Y"] - .707* test["X"]

test["rot_30_X"] = (1.732/2)*test["X"] + (1./2)*test["Y"]
test["rot_30_Y"] = (1.732/2)* test["Y"] - (1./2)* test["X"]

test["rot_60_X"] = (1./2)*test["X"] + (1.732/2)*test["Y"]
test["rot_60_Y"] = (1./2)* test["Y"] - (1.732/2)* test["X"]

test["radial_r"] = np.sqrt( np.power(test["Y"],2) + np.power(test["X"],2) )
test['XY'] = test.X * test.Y

In [86]:
data.loc[:,['X','Y','rot_45_X','rot_45_Y','rot_30_X','rot_30_Y','rot_60_X','rot_60_Y','radial_r','XY']].head()

Unnamed: 0,X,Y,rot_45_X,rot_45_Y,rot_30_X,rot_30_Y,rot_60_X,rot_60_Y,radial_r,XY
0,0.135687,-0.197559,-0.043743,-0.235605,0.018725,-0.238929,-0.103243,-0.216284,0.239667,-0.026806
1,0.451779,0.588986,0.73582,0.097005,0.685733,0.284172,0.735951,-0.096747,0.742299,0.266091
2,0.765623,0.346893,0.786549,-0.296042,0.836476,-0.082402,0.683221,-0.489583,0.840543,0.265589
3,0.713787,-1.526104,-0.574308,-1.583603,-0.144913,-1.6785,-0.964713,-1.381191,1.684781,-1.089313
4,0.693086,0.754293,1.023297,0.043273,0.977359,0.306675,0.999761,-0.223066,1.024366,0.52279


In [33]:
data.is_holiday.replace(False, 0, inplace=True)
data.is_holiday.replace(True, 1, inplace=True)

In [61]:
def street_addr(x):
    street=x.split(' ')
    return (''.join(street[-1]))

data['Address_Type'] = data['Address'].apply(lambda x:street_addr(x))
test['Address_Type'] = test['Address'].apply(lambda x:street_addr(x))

for x in [data,test]:
    x['is_street'] = (x['Address_Type'] == 'ST')
    x['is_avenue'] = (x['Address_Type'] == 'AV')

data['is_street'] = data['is_street'].apply(lambda x:int(x))
data['is_avenue'] = data['is_avenue'].apply(lambda x:int(x))

test['is_avenue'] = test['is_avenue'].apply(lambda x:int(x))
test['is_street'] = test['is_street'].apply(lambda x:int(x))

In [64]:
def is_block(x):
    if 'Block' in x:
        return 1
    else:
        return 0

data['is_block'] = data['Address'].apply(lambda x:is_block(x)) 
test['is_block'] = test['Address'].apply(lambda x:is_block(x))

In [216]:
data.columns

Index(['Dates', 'Category', 'Descript', 'DayOfWeek', 'PdDistrict',
       'Resolution', 'Address', 'X', 'Y', 'Id', 'Hour', 'Minutes', 'Year',
       'Month', 'Day', 'DayOfWeekNum', 'newMin', 'seasons', 'ResolutionNum',
       'PdDistrictNum', 'CategoryNum', 'newAddress', 'newAddressNum',
       'Address_CrossRoad', 'Address_clean', 'Address_clean_encode',
       'is_weekend', 'is_night_time', 'is_holiday', 'tags', '', '80', 'AL',
       'AR', 'AV', 'AY', 'BL', 'CR', 'CT', 'DR', 'ER', 'EX', 'HY', 'LN', 'MS',
       'NO', 'PL', 'PZ', 'RD', 'RK', 'RW', 'ST', 'TI', 'TR', 'WK', 'WY',
       'X_reduced', 'Y_reduced', 'rot_45_X', 'rot_45_Y', 'rot_30_X',
       'rot_30_Y', 'rot_60_X', 'rot_60_Y', 'radial_r', 'Address_Type',
       'is_street', 'is_avenue', 'is_block', 'XY'],
      dtype='object')

In [159]:
features=['X','Y','Hour','Minutes','Year','Month','Day','DayOfWeekNum', 'PdDistrictNum',
          'Address_CrossRoad', 'Address_clean_encode'] + ['rot_45_X','rot_45_Y','rot_30_X','rot_30_Y','rot_60_X','rot_60_Y','radial_r','XY']

In [160]:
# for i in data.CategoryNum.unique():
#     print(i,labelencoder.inverse_transform(data.CategoryNum.unique())[i])
#     data[data.CategoryNum==i].hist(bins=50, figsize=(20,15))
#     plt.show()

# data.hist(bins=50, figsize=(20,15))
# plt.show()

In [161]:
# Random seed has been set - As per the guidlines of the competition
train_, test_ = train_test_split(data, test_size=0.3, random_state=3, shuffle=True)

In [162]:
ytrain_ = train_['CategoryNum']
Xtrain_ = train_[features]
ytest_ = test_['CategoryNum']
Xtest_ = test_[features]

In [214]:
y_train = data['CategoryNum']
X_train = data[features]
# y_test = test['CategoryNum']
X_test = test[features]

In [117]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(
    random_state=42, 
    solver='sag', 
    multi_class='multinomial',
    max_iter=100,
    n_jobs=-1
)
# clf.fit(X_train,y_train)
# pred = clf.predict_proba(X_test)
# log_loss(y_test,pred)

In [None]:
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB()
# clf.fit(Xtrain_,ytrain_)
# print(clf.score(Xtest_,ytest_))
# pred = clf.predict_proba(Xtest_)
# print(log_loss(ytest_,pred))

In [None]:
from sklearn.tree import DecisionTreeClassifier

max_depth = 8

model = DecisionTreeClassifier(
    max_depth=max_depth
)

In [None]:
importances = dt_model.feature_importances_
indices = np.argsort(importances)

In [None]:
plt.title('Feature Importances')
plt.barh(range(len(indices)), importances[indices], color='b', align='center')
plt.yticks(range(len(indices)), [features[i] for i in indices])
plt.xlabel('Relative Importance')
plt.show()

In [204]:
from sklearn.ensemble import RandomForestClassifier

random_state = 42
max_depth = 16
min_weight_fraction_leaf = 0.02
n_estimators = 100
n_jobs = -1

model = RandomForestClassifier(
    random_state=random_state,
    max_depth=max_depth,
    n_estimators=n_estimators,
    n_jobs=n_jobs,
#     min_weight_fraction_leaf=min_weight_fraction_leaf
)

# model.fit(Xtrain_,ytrain_)
# print(model.score(Xtest_,ytest_))
# pred = model.predict_proba(Xtest_)
# print(log_loss(ytest_,pred))

In [188]:
import xgboost as xgb

seed = 42
max_depth = 8
learning_rate = 0.2
min_child_weight = 1
n_estimators = 100

model = xgb.XGBClassifier(
    objective='multi:softprob', 
    seed=seed, 
    max_depth=max_depth,
    nthread=0,
    n_jobs=-1,
#     min_child_weight=min_child_weight,
#     learning_rate=learning_rate,
    n_estimators = n_estimators
)

In [205]:
model

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=16, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

In [206]:
score = -1 * cross_val_score(model, Xtrain_, ytrain_, scoring='neg_log_loss', cv=3, n_jobs=8)

In [207]:
print("Score = {0:.6f}".format(score.mean()))
print(score)

Score = 2.281929
[2.28079949 2.28281308 2.28217386]


In [193]:
from sklearn.model_selection import GridSearchCV
param_grid = { 
    'max_depth': [6,7,8,14,15,16,17],
#     'min_weight_fraction_leaf': [0.2,0.3],
#     'min_samples_split': [2,3,4,5],
    'n_estimators': [75,100,125]
}
model_gscv = GridSearchCV(
    estimator=model,
    scoring='neg_log_loss', 
    param_grid=param_grid, 
    cv = 2,
    n_jobs = -1
)

In [194]:
model_gscv.fit(Xtrain_, ytrain_)

GridSearchCV(cv=2, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=15, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.02, n_estimators=100, n_jobs=-1,
            oob_score=False, random_state=42, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'n_estimators': [75, 100, 125], 'max_depth': [6, 7, 8, 14, 15, 16, 17]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_log_loss', verbose=0)

In [195]:
model_gscv.best_estimator_

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=14, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.02, n_estimators=100, n_jobs=-1,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

In [196]:
model_gscv.best_score_

-2.4604508184194342

In [197]:
means = model_gscv.cv_results_['mean_test_score']
stds = model_gscv.cv_results_['std_test_score']
params = model_gscv.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
	print("%f (%f) with: %r" % (mean, stdev, param))

-2.476288 (0.000517) with: {'n_estimators': 75, 'max_depth': 6}
-2.475718 (0.000380) with: {'n_estimators': 100, 'max_depth': 6}
-2.475492 (0.000198) with: {'n_estimators': 125, 'max_depth': 6}
-2.467486 (0.000750) with: {'n_estimators': 75, 'max_depth': 7}
-2.466590 (0.000516) with: {'n_estimators': 100, 'max_depth': 7}
-2.466422 (0.000542) with: {'n_estimators': 125, 'max_depth': 7}
-2.463292 (0.000304) with: {'n_estimators': 75, 'max_depth': 8}
-2.462515 (0.000370) with: {'n_estimators': 100, 'max_depth': 8}
-2.462541 (0.000674) with: {'n_estimators': 125, 'max_depth': 8}
-2.461409 (0.000200) with: {'n_estimators': 75, 'max_depth': 14}
-2.460451 (0.000164) with: {'n_estimators': 100, 'max_depth': 14}
-2.460526 (0.000373) with: {'n_estimators': 125, 'max_depth': 14}
-2.461409 (0.000200) with: {'n_estimators': 75, 'max_depth': 15}
-2.460451 (0.000164) with: {'n_estimators': 100, 'max_depth': 15}
-2.460526 (0.000373) with: {'n_estimators': 125, 'max_depth': 15}
-2.461409 (0.000200) wit

# PCA

In [44]:
from sklearn.decomposition import PCA
pca = PCA(n_components=17)
pca.fit(Xtrain_, ytrain_)

PCA(copy=True, iterated_power='auto', n_components=17, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [45]:
Xtrain_pca = pca.transform(Xtrain_)
Xtest_pca = pca.transform(Xtest_)

In [54]:
Xtrain_pca.shape, Xtrain_.shape

((607464, 17), (607464, 37))

In [59]:
score = -1 * cross_val_score(model, Xtrain_pca, ytrain_, scoring='neg_log_loss', cv=3, n_jobs=8)

In [60]:
print("Score = {0:.6f}".format(score.mean()))
print(score)

Score = 2.509333
[2.5094304  2.50973531 2.50883184]


# Stacking

In [None]:
from sklearn.model_selection import StratifiedKFold
def Stacking(model,train,y,test,n_fold):
    folds = StratifiedKFold(n_splits=n_fold,random_state=42)
    test_pred = np.empty((0,1),float)
    train_pred = np.empty((0,1),float)
    for train_indices,val_indices in folds.split(train,y.values):
        x_train,x_val = train.iloc[train_indices],train.iloc[val_indices]
        y_train,y_val = y.iloc[train_indices],y.iloc[val_indices]

        model.fit(X = x_train, y = y_train)
        train_pred = np.append(train_pred,model.predict(x_val))
    test_pred = np.append(test_pred,model.predict(test))
    return test_pred.reshape(-1,1),train_pred

In [None]:
f1=['X','Y','Hour','Minutes','Year','Month','Day','DayOfWeekNum', 'PdDistrictNum',
          'Address_CrossRoad', 'Address_clean_encode']

In [None]:
tr1, te1 = train_test_split(data, test_size=0.3, random_state=3, shuffle=True)

In [None]:
ytr1 = tr1['CategoryNum']
Xtr1 = tr1[f1]
yte1 = te1['CategoryNum']
Xte1 = te1[f1]

In [None]:
rf1 = xgb.XGBClassifier(
    seed=42,
    max_depth=8,
    learning_rate=0.2,
    nthread=8,
    n_jobs=8,
    n_estimators=100
)
test_pred1 , train_pred1=Stacking(
    model = rf1, 
    n_fold = 3, 
    train = Xtr1,
    test = Xte1,
    y = ytr1
)

train_pred1=pd.DataFrame(train_pred1)
test_pred1=pd.DataFrame(test_pred1)

In [None]:
f2=['X','Y','Hour','Minutes','Year','Month','Day','DayOfWeekNum', 'PdDistrictNum',
          'Address_CrossRoad', 'Address_clean_encode','is_weekend', 'is_night_time', 'is_holiday']

In [None]:
tr2, te2 = train_test_split(data, test_size=0.3, random_state=3, shuffle=True)

In [None]:
ytr2 = tr2['CategoryNum']
Xtr2 = tr2[f2]
yte2 = te2['CategoryNum']
Xte2 = te2[f2]

In [None]:
rf2 = RandomForestClassifier(
    random_state=42,
    max_depth=16,
    n_jobs=8,
    n_estimators=100
)
test_pred2 , train_pred2=Stacking(
    model = rf2, 
    n_fold = 3, 
    train = Xtr2,
    test = Xte2,
    y = ytr2
)

train_pred2=pd.DataFrame(train_pred2)
test_pred2=pd.DataFrame(test_pred2)

In [None]:
df = pd.concat([train_pred1, train_pred2], axis=1)
df_test = pd.concat([test_pred1, test_pred2], axis=1)

In [None]:
df.columns[df.columns.duplicated()]

In [None]:
model = xgb.XGBClassifier(
    seed=42,
    max_depth=6,
    n_jobs=-1,
    n_estimators=100
)
model.fit(df,ytr2)

In [None]:
log_loss(yte2,model.predict_proba(df_test))

In [None]:
score = -1 * cross_val_score(model, Xtr1, ytrain_, scoring='neg_log_loss', cv=3, n_jobs=8)

In [116]:
data[['Address','newAddress']]

Unnamed: 0,Address,newAddress
0,2100 Block of MISSION ST,MISSION ST
1,1000 Block of MARKET ST,MARKET ST
2,800 Block of BRYANT ST,BRYANT ST
3,2400 Block of SAN BRUNO AV,SAN BRUNO AV
4,4TH ST / STEVENSON ST,4TH ST STEVENSON ST
5,15TH ST / NATOMA ST,15TH ST NATOMA ST
6,0 Block of MARK LN,MARK LN
7,100 Block of PUTNAM ST,PUTNAM ST
8,HALE ST / SAN BRUNO AV,HALE ST SAN BRUNO AV
9,ELLIS ST / LARKIN ST,ELLIS ST LARKIN ST


In [None]:
model.fit(X_train,y_train)

In [209]:
import pickle

In [None]:
pickle.dump(model, open("xgboost_wo_res61118.p", "wb"))

In [211]:
model = pickle.load(open("xgboost_wo_res301118.p", "rb"))
model

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=6, min_child_weight=1, missing=nan, n_estimators=100,
       n_jobs=-1, nthread=None, objective='binary:logistic',
       random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=42, silent=True, subsample=1)

In [215]:
predictions = model.predict_proba(X_test)

XGBoostError: need to call fit or load_model beforehand

In [None]:
submission = pd.DataFrame(predictions)
submission.columns = sorted(data.Category.unique())
submission['Id'] = test['Id']
submission

In [None]:
submission.to_csv('submission.csv', index=False)