In [288]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder,scale
from sklearn.model_selection import cross_val_score
from sklearn.metrics import log_loss

In [289]:
# Dates have to extracted
# data = pd.read_csv('train.csv', parse_dates=['Dates'])
data = pd.read_csv('./dataset/train.csv', parse_dates=['Dates'])
test = pd.read_csv('./dataset/test.csv', parse_dates=['Dates'])

In [367]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 867806 entries, 0 to 867872
Data columns (total 65 columns):
Dates                   867806 non-null datetime64[ns]
Category                867806 non-null object
Descript                867806 non-null object
DayOfWeek               867806 non-null object
PdDistrict              867806 non-null object
Resolution              867806 non-null object
Address                 867806 non-null object
X                       867806 non-null float64
Y                       867806 non-null float64
Id                      867806 non-null int64
Hour                    867806 non-null int64
Minutes                 867806 non-null int64
Year                    867806 non-null int64
Month                   867806 non-null int64
Day                     867806 non-null int64
DayOfWeekNum            867806 non-null int64
newMin                  867806 non-null int64
seasons                 867806 non-null int64
ResolutionNum           867806 non-null in

In [291]:
# No null values in the data-frame
data.isnull().values.any()

False

In [292]:
# Dates
data_week_dict = {
    'Monday': 1,
    'Tuesday':2,
    'Wednesday':3,
    'Thursday':4,
    'Friday':5,
    'Saturday':6,
    'Sunday':7
}

data['Hour'] = data.Dates.dt.hour
data['Minutes'] = data.Dates.dt.minute
data['Year'] = data.Dates.dt.year
data['Month'] = data.Dates.dt.month
data['Day'] = data.Dates.dt.day
data['DayOfWeekNum'] = data['DayOfWeek'].replace(data_week_dict)

test['Hour'] = test.Dates.dt.hour
test['Minutes'] = test.Dates.dt.minute
test['Year'] = test.Dates.dt.year
test['Month'] = test.Dates.dt.month
test['Day'] = test.Dates.dt.day
test['DayOfWeekNum'] = test['DayOfWeek'].replace(data_week_dict)

In [293]:
data.head()

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y,Id,Hour,Minutes,Year,Month,Day,DayOfWeekNum
0,2013-06-28 17:40:00,SEX OFFENSES FORCIBLE,"FORCIBLE RAPE, BODILY FORCE",Friday,MISSION,NONE,2100 Block of MISSION ST,-122.419331,37.762264,141546,17,40,2013,6,28,5
1,2004-02-19 02:46:00,LIQUOR LAWS,CONSUMING ALCOHOL IN PUBLIC VIEW,Thursday,SOUTHERN,"ARREST, BOOKED",1000 Block of MARKET ST,-122.41134,37.781271,794152,2,46,2004,2,19,4
2,2007-11-14 00:01:00,FRAUD,"CREDIT CARD, THEFT BY USE OF",Wednesday,SOUTHERN,NONE,800 Block of BRYANT ST,-122.403405,37.775421,531205,0,1,2007,11,14,3
3,2007-12-27 18:30:00,ROBBERY,ROBBERY OF A CHAIN STORE WITH A GUN,Thursday,BAYVIEW,DISTRICT ATTORNEY REFUSES TO PROSECUTE,2400 Block of SAN BRUNO AV,-122.404715,37.730161,523137,18,30,2007,12,27,4
4,2012-09-09 17:02:00,OTHER OFFENSES,PROBATION VIOLATION,Sunday,SOUTHERN,"ARREST, BOOKED",4TH ST / STEVENSON ST,-122.405239,37.785265,200968,17,2,2012,9,9,7


In [294]:
def newMin(i):
    if(i<15):
        return 0
        
    elif(i>=15 and i<30):
        return 15

    elif(i>=30 and i<45):
        return 30
    
    elif(i>=45):
        return 45

data['newMin'] = data.Minutes.apply(lambda a:newMin(a))
test['newMin'] = test.Minutes.apply(lambda a:newMin(a))

In [382]:
# https://www.studentflights.com.au/destinations/san-francisco/weather
def season(i):
    if i in [2, 3]:
        return 1
    if i in [4,5]:
        return 2
    if i in [6,7]:
        return 3
    if i in [8,9]:
        return 4
    if i in [10,11]:
        return 5
    if i in [12,1]:
        return 6
    
    
data['seasons'] = data.Month.apply(lambda a:season(a))
test['seasons'] = test.Month.apply(lambda a:season(a))

In [383]:
data.seasons.value_counts()

2    155941
5    151451
1    145442
3    139213
4    138843
6    136916
Name: seasons, dtype: int64

In [296]:
labelencoder = LabelEncoder()

In [297]:
data['ResolutionNum'] = labelencoder.fit_transform(data['Resolution'])
data['PdDistrictNum'] = labelencoder.fit_transform(data['PdDistrict'])
data['CategoryNum'] = labelencoder.fit_transform(data['Category'])

test['ResolutionNum'] = labelencoder.fit_transform(test['Resolution'])
test['PdDistrictNum'] = labelencoder.fit_transform(test['PdDistrict'])

In [298]:
data = data[data.X < -121]
data = data[data.Y < 40]

test = test[test.X < -121]
test = test[test.Y < 40]

In [299]:
def getCapsAddress(i):
    s=''
    for j in i.split():
        if(j.isupper()):
            s=s+' '+j
    return s[1:]

data['newAddress'] = data.Address.apply(lambda a:getCapsAddress(a))
test['newAddress'] = test.Address.apply(lambda a:getCapsAddress(a))

In [300]:
data['newAddressNum'] = labelencoder.fit_transform(data.newAddress)
test['newAddressNum'] = labelencoder.fit_transform(test.newAddress)

In [301]:
data['Address_CrossRoad'] = data['Address'].str.contains('/')
test['Address_CrossRoad'] = test['Address'].str.contains('/')

topN_address_list = data['Address'].value_counts()
topN_address_list = topN_address_list[topN_address_list >=100]
topN_address_list = topN_address_list.index
print(topN_address_list)

data['Address_clean'] = data['Address']
test['Address_clean'] = test['Address']
data.loc[~data['Address'].isin(topN_address_list), 'Address_clean'] = 'Others'
test.loc[~test['Address'].isin(topN_address_list), 'Address_clean'] = 'Others'
print(data.shape)

crossload = data[data['Address_clean'].str.contains('/')]
crossroad_list = crossload['Address_clean'].unique()
print(len(crossroad_list))

Index(['800 Block of BRYANT ST', '800 Block of MARKET ST',
       '2000 Block of MISSION ST', '1000 Block of POTRERO AV',
       '900 Block of MARKET ST', '0 Block of TURK ST', '0 Block of 6TH ST',
       '300 Block of ELLIS ST', '400 Block of ELLIS ST',
       '16TH ST / MISSION ST',
       ...
       '0 Block of ZOO RD', 'MARKET ST / HYDE ST', '1800 Block of SUTTER ST',
       'CAPP ST / 19TH ST', '0 Block of LAGUNA ST', '500 Block of JACKSON ST',
       '900 Block of GRANT AV', '14TH ST / FOLSOM ST',
       '2300 Block of GOLDEN GATE AV', '0 Block of LOCKSLEY AV'],
      dtype='object', length=1616)
(867806, 25)
475


In [302]:
for address in crossroad_list:
    address_split = address.split('/')
    reverse_address = address_split[1].strip() + ' / ' + address_split[0].strip()
    data.loc[data['Address_clean'] == reverse_address, 'Address_clean'] = address
    test.loc[test['Address_clean'] == reverse_address, 'Address_clean'] = address
crossload = data[data['Address_clean'].str.contains('/')]
crossroad_list = crossload['Address_clean'].unique()
print(len(crossroad_list))

le = LabelEncoder()
data['Address_clean_encode'] = le.fit_transform(data['Address_clean'])
print(data.shape)

321
(867806, 26)


In [303]:
le = LabelEncoder()
test['Address_clean_encode'] = le.fit_transform(test['Address_clean'])

In [304]:
def is_weekend(day):
    if day in ['Friday', 'Saturday', 'Sunday']:
        return True
    else:
        return False
    
data['is_weekend'] = data.DayOfWeek.apply(lambda x : is_weekend(x))
test['is_weekend'] = test.DayOfWeek.apply(lambda x : is_weekend(x))

In [305]:
def night_time(time):
    if time >= 22 or time <= 6:
        return True
    else:
        return False

data['is_night_time'] = data.Hour.apply(lambda x : night_time(x))
test['is_night_time'] = test.Hour.apply(lambda x : night_time(x))

In [306]:
import holidays
us_holidays = holidays.US()
def is_holiday(date):
    if date in us_holidays:
        return True
    else:
        return False

data['is_holiday'] = data.Dates.dt.date.apply(lambda x: is_holiday(x))
test['is_holiday'] = test.Dates.dt.date.apply(lambda x: is_holiday(x))

In [307]:
def get_address_char(address):
    strings = address.strip().split('/')
    if(len(strings) == 1):
        return [strings[0].strip()[-2:].strip()]
    else:
        return [strings[0].strip()[-2:].strip(), strings[1][-2:].strip()]

In [308]:
def get_tags(all_address):
    all_tags = []
    for address in all_address:
        tags = get_address_char(address)
        for tag in tags:
            if(len(tag) != 0 and tag.isdigit() == False):
                all_tags.append(tag)
    return list(set(all_tags))

In [309]:
all_tags = get_tags(data.Address)

In [310]:
data['tags'] = data.Address.apply(lambda x: get_address_char(x))
test['tags'] = test.Address.apply(lambda x: get_address_char(x))

In [311]:
def makeDict(col):
    col = col[0]
    all_dict = {}
    for i in all_tags:
        all_dict[i]=0
    for i in col:
        all_dict[i]=1
    return all_dict

In [312]:
all_dicts_data = data[['tags']].apply(makeDict,axis=1)
all_dicts_test = test[['tags']].apply(makeDict,axis=1)

In [313]:
data_dicts_pd = pd.DataFrame(list(all_dicts_data),index=data.index)
test_dicts_pd = pd.DataFrame(list(all_dicts_test),index=test.index)

In [314]:
# data_dicts_pd.drop(columns=['','80'],inplace=True)

In [315]:
data = pd.concat([data,data_dicts_pd],axis=1)
test = pd.concat([test,test_dicts_pd],axis=1)

In [316]:
corr = data.corr()
print(corr['CategoryNum'].sort_values(ascending=False))

CategoryNum             1.000000
Address_clean_encode    0.070796
Address_CrossRoad       0.069896
AV                      0.041185
ResolutionNum           0.039676
Hour                    0.023701
Id                      0.016881
WY                      0.013034
is_night_time           0.011986
BL                      0.006744
TR                      0.006426
RD                      0.003983
CR                      0.003544
NO                      0.002595
DR                      0.002450
RW                      0.002233
MS                      0.001754
HY                      0.001571
AR                      0.001330
RK                      0.001163
LN                      0.000733
AY                      0.000666
DayOfWeekNum            0.000537
CT                      0.000534
is_holiday              0.000504
EX                      0.000372
Day                     0.000354
is_weekend              0.000065
ER                     -0.000258
Month                  -0.000360
seasons   

In [317]:
data["X_reduced"] = data.X.apply(lambda x: "{0:.2f}".format(x)).astype(float)
data["Y_reduced"] = data.Y.apply(lambda x: "{0:.2f}".format(x)).astype(float)
# data["X_reduced_cat"] = pd.Categorical.from_array(data.X_reduced).codes
# data["Y_reduced_cat"] = pd.Categorical.from_array(data.Y_reduced).codes

data["rot_45_X"] = .707*data["Y"] + .707*data["X"]
data["rot_45_Y"] = .707* data["Y"] - .707* data["X"]

data["rot_30_X"] = (1.732/2)*data["X"] + (1./2)*data["Y"]
data["rot_30_Y"] = (1.732/2)* data["Y"] - (1./2)* data["X"]

data["rot_60_X"] = (1./2)*data["X"] + (1.732/2)*data["Y"]
data["rot_60_Y"] = (1./2)* data["Y"] - (1.732/2)* data["X"]

data["radial_r"] = np.sqrt( np.power(data["Y"],2) + np.power(data["X"],2) )

In [None]:
data.loc[:,['X','Y','rot_45_X','rot_45_Y','rot_30_X','rot_30_Y','rot_60_X','rot_60_Y','radial_r']].head()

In [370]:
data.is_holiday.replace(False, 0, inplace=True)
data.is_holiday.replace(True, 1, inplace=True)

In [388]:
data.Month.value_counts()

10    79327
5     78748
4     77193
3     75455
1     72673
11    72124
9     71126
6     70041
2     69987
7     69172
8     67717
12    64243
Name: Month, dtype: int64

In [349]:
features=['X','Y','Hour','Minutes','Year','Month','Day','DayOfWeekNum', 'PdDistrictNum',
          'Address_CrossRoad', 'Address_clean_encode','is_weekend', 'seasons']

In [350]:
# for i in data.CategoryNum.unique():
#     print(i,labelencoder.inverse_transform(data.CategoryNum.unique())[i])
#     data[data.CategoryNum==i].hist(bins=50, figsize=(20,15))
#     plt.show()

# data.hist(bins=50, figsize=(20,15))
# plt.show()

In [351]:
# Random seed has been set - As per the guidlines of the competition
train_, test_ = train_test_split(data, test_size=0.3, random_state=3, shuffle=True)

In [352]:
ytrain_ = train_['CategoryNum']
Xtrain_ = train_[features]
ytest_ = test_['CategoryNum']
Xtest_ = test_[features]

In [None]:
y_train = data['CategoryNum']
X_train = data[features]
# y_test = test['CategoryNum']
X_test = test[features]

In [353]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(
    random_state=42, 
    solver='sag', 
    multi_class='multinomial',
    max_iter=100
)
# clf.fit(X_train,y_train)
# pred = clf.predict_proba(X_test)
# log_loss(y_test,pred)

In [358]:
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB()
# clf.fit(Xtrain_,ytrain_)
# print(clf.score(Xtest_,ytest_))
# pred = clf.predict_proba(Xtest_)
# print(log_loss(ytest_,pred))

In [None]:
from sklearn.tree import DecisionTreeClassifier

max_depth = 8

model = DecisionTreeClassifier(
    max_depth=max_depth
)

In [None]:
importances = dt_model.feature_importances_
indices = np.argsort(importances)

In [None]:
plt.title('Feature Importances')
plt.barh(range(len(indices)), importances[indices], color='b', align='center')
plt.yticks(range(len(indices)), [features[i] for i in indices])
plt.xlabel('Relative Importance')
plt.show()

In [322]:
from sklearn.ensemble import RandomForestClassifier

random_state = 42
max_depth = 13
min_weight_fraction_leaf = 0.2
n_estimators = 125
n_jobs = -1

model = RandomForestClassifier(
    random_state=random_state,
    max_depth=max_depth,
    n_estimators=n_estimators,
    n_jobs=n_jobs,
    min_weight_fraction_leaf=min_weight_fraction_leaf
)

# model.fit(Xtrain_,ytrain_)
# print(model.score(Xtest_,ytest_))
# pred = model.predict_proba(Xtest_)
# print(log_loss(ytest_,pred))

In [None]:
import xgboost as xgb

seed = 42
max_depth = 17
learning_rate = 0.2
min_child_weight = 1
n_estimators = 100

model = xgb.XGBClassifier(
    objective='multi:softprob', 
    seed=seed, 
    max_depth=max_depth,
    nthread=8,
    n_jobs=8,
#     min_child_weight=min_child_weight,
#     learning_rate=learning_rate,
    n_estimators = n_estimators
)

In [359]:
model

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [360]:
score = -1 * cross_val_score(model, Xtrain_, ytrain_, scoring='neg_log_loss', cv=3, n_jobs=8)

ValueError: Input X must be non-negative

In [None]:
print("Score = {0:.6f}".format(score.mean()))
print(score)

In [336]:
from sklearn.model_selection import GridSearchCV
param_grid = { 
    'var_smoothing': [1e-3,1e-4,1e-5]
#     'max_depth': [11,12,13,14,15,16],
#     'min_weight_fraction_leaf': [0.2,0.3],
#     'min_samples_split': [2,3,4,5],
#     'n_estimators': [100,120,140]
}
model_gscv = GridSearchCV(
    estimator=model,
    scoring='neg_log_loss', 
    param_grid=param_grid, 
    cv = 3,
    n_jobs = -1
)

In [341]:
model_gscv.fit(Xtrain_, ytrain_)

GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=GaussianNB(priors=None, var_smoothing=0.0001),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'var_smoothing': [0.001, 0.0001, 1e-05]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_log_loss', verbose=0)

In [342]:
model_gscv.best_estimator_

GaussianNB(priors=None, var_smoothing=1e-05)

In [343]:
model_gscv.best_score_

-2.5869996413165532

In [344]:
means = model_gscv.cv_results_['mean_test_score']
stds = model_gscv.cv_results_['std_test_score']
params = model_gscv.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
	print("%f (%f) with: %r" % (mean, stdev, param))

-2.634167 (0.000894) with: {'var_smoothing': 0.001}
-2.610727 (0.001324) with: {'var_smoothing': 0.0001}
-2.587000 (0.001703) with: {'var_smoothing': 1e-05}


# Stacking

In [147]:
from sklearn.model_selection import StratifiedKFold
def Stacking(model,train,y,test,n_fold):
    folds = StratifiedKFold(n_splits=n_fold,random_state=42)
    test_pred = np.empty((0,1),float)
    train_pred = np.empty((0,1),float)
    for train_indices,val_indices in folds.split(train,y.values):
        x_train,x_val = train.iloc[train_indices],train.iloc[val_indices]
        y_train,y_val = y.iloc[train_indices],y.iloc[val_indices]

        model.fit(X = x_train, y = y_train)
        train_pred = np.append(train_pred,model.predict(x_val))
    test_pred = np.append(test_pred,model.predict(test))
    return test_pred.reshape(-1,1),train_pred

In [174]:
f1=['X','Y','Hour','Minutes','Year','Month','Day','DayOfWeekNum', 'PdDistrictNum',
          'Address_CrossRoad', 'Address_clean_encode']

In [175]:
tr1, te1 = train_test_split(data, test_size=0.3, random_state=3, shuffle=True)

In [176]:
ytr1 = tr1['CategoryNum']
Xtr1 = tr1[f1]
yte1 = te1['CategoryNum']
Xte1 = te1[f1]

In [177]:
rf1 = xgb.XGBClassifier(
    seed=42,
    max_depth=8,
    learning_rate=0.2,
    nthread=8,
    n_jobs=8,
    n_estimators=100
)
test_pred1 , train_pred1=Stacking(
    model = rf1, 
    n_fold = 3, 
    train = Xtr1,
    test = Xte1,
    y = ytr1
)

train_pred1=pd.DataFrame(train_pred1)
test_pred1=pd.DataFrame(test_pred1)

In [178]:
f2=['X','Y','Hour','Minutes','Year','Month','Day','DayOfWeekNum', 'PdDistrictNum',
          'Address_CrossRoad', 'Address_clean_encode','is_weekend', 'is_night_time', 'is_holiday']

In [179]:
tr2, te2 = train_test_split(data, test_size=0.3, random_state=3, shuffle=True)

In [180]:
ytr2 = tr2['CategoryNum']
Xtr2 = tr2[f2]
yte2 = te2['CategoryNum']
Xte2 = te2[f2]

In [212]:
rf2 = RandomForestClassifier(
    random_state=42,
    max_depth=16,
    n_jobs=8,
    n_estimators=100
)
test_pred2 , train_pred2=Stacking(
    model = rf2, 
    n_fold = 3, 
    train = Xtr2,
    test = Xte2,
    y = ytr2
)

train_pred2=pd.DataFrame(train_pred2)
test_pred2=pd.DataFrame(test_pred2)

In [213]:
df = pd.concat([train_pred1, train_pred2], axis=1)
df_test = pd.concat([test_pred1, test_pred2], axis=1)

In [251]:
df.columns[df.columns.duplicated()]

Int64Index([0], dtype='int64')

In [206]:
model = xgb.XGBClassifier(
    seed=42,
    max_depth=6,
    n_jobs=-1,
    n_estimators=100
)
model.fit(df,ytr2)

ValueError: feature_names must be unique

In [None]:
log_loss(yte2,model.predict_proba(df_test))

In [None]:
score = -1 * cross_val_score(model, Xtr1, ytrain_, scoring='neg_log_loss', cv=3, n_jobs=8)

In [44]:
model.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=17, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

In [None]:
import pickle

In [None]:
pickle.dump(model, open("xgboost_wo_res61118.p", "wb"))

In [None]:
model = pickle.load(open("xgboost_wo_res61118.p", "rb"))
model

In [45]:
predictions = model.predict_proba(X_test)

In [46]:
submission = pd.DataFrame(predictions)
submission.columns = sorted(data.Category.unique())
submission['Id'] = test['Id']
submission

Unnamed: 0,ARSON,ASSAULT,BAD CHECKS,BRIBERY,BURGLARY,DISORDERLY CONDUCT,DRIVING UNDER THE INFLUENCE,DRUG/NARCOTIC,DRUNKENNESS,EMBEZZLEMENT,...,SEX OFFENSES FORCIBLE,STOLEN PROPERTY,SUICIDE,SUSPICIOUS OCC,TRESPASS,VANDALISM,VEHICLE THEFT,WARRANTS,WEAPON LAWS,Id
0,0.000036,0.476309,0.000233,0.000242,0.036013,0.000433,0.000168,0.009692,0.006666,0.000567,...,0.013575,0.000420,0.000032,0.012394,0.003479,0.007904,0.021006,0.017390,0.000372,349598
1,0.005912,0.116710,0.000439,0.000132,0.032900,0.002960,0.004389,0.020555,0.010872,0.000799,...,0.002175,0.003020,0.000620,0.038174,0.030918,0.072586,0.052766,0.054046,0.005919,766313
2,0.000076,0.052853,0.000099,0.000069,0.007950,0.001916,0.004463,0.052251,0.005985,0.000195,...,0.000820,0.005490,0.000000,0.026113,0.004160,0.030095,0.016484,0.040883,0.008977,169887
3,0.000151,0.093901,0.000000,0.000000,0.002943,0.011508,0.018034,0.035823,0.002994,0.000000,...,0.000933,0.002341,0.000000,0.006861,0.000210,0.069291,0.041847,0.054320,0.004874,594704
4,0.000167,0.048924,0.000011,0.000158,0.004773,0.003965,0.001656,0.075136,0.004829,0.000096,...,0.001220,0.008017,0.000023,0.070810,0.001791,0.024489,0.012838,0.089483,0.003714,47900
5,0.000048,0.143566,0.000000,0.000106,0.019010,0.000907,0.001562,0.118124,0.007255,0.000000,...,0.006510,0.001285,0.000338,0.013514,0.002055,0.020817,0.052111,0.020256,0.003101,339260
6,0.000828,0.085016,0.002838,0.000616,0.071502,0.001507,0.000037,0.017374,0.002233,0.001144,...,0.009071,0.004396,0.000247,0.026872,0.007014,0.072765,0.052919,0.020568,0.003413,169575
7,0.000101,0.009636,0.000013,0.000000,0.007244,0.000114,0.000000,0.003475,0.001359,0.000000,...,0.010757,0.002552,0.000000,0.008055,0.001570,0.058663,0.269546,0.012796,0.003242,681483
8,0.000680,0.089670,0.001642,0.000242,0.059607,0.003115,0.000170,0.019560,0.001424,0.003963,...,0.008908,0.004331,0.000809,0.048213,0.008066,0.043183,0.034757,0.021600,0.004981,276938
9,0.000213,0.042845,0.007373,0.000009,0.116248,0.000795,0.000440,0.007232,0.000907,0.000628,...,0.005011,0.002422,0.000259,0.026610,0.015196,0.071248,0.072966,0.013322,0.002394,146369


In [47]:
submission.to_csv('submission.csv', index=False)