In [74]:
# -*- encoding:urf-8 -*-
# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

# Handle table like and matrices
import pandas as pd
import numpy as np

# Modeling Helper
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns
import missingno as msno

# Configure visualization
%matplotlib inline
mpl.style.use('ggplot')
sns.set_style('white')
pylab.rcParams['figure.figsize'] = 8,6

poi_path = '../../UAI_data/input/poi_re.csv'
train_Aug_path = '../../UAI_data/input/train_Aug_re.csv'
train_Jul_path = '../../UAI_data/input/train_July_re.csv'
train_jul_demand_path = '../../UAI_data/input/tain_jul_demand.csv'
train_aug_demand_path = '../../UAI_data/input/tain_aug_demand.csv'
location_path = '../../UAI_data/input/location_cls.csv'
holiday_path = '../../UAI_data/input/holiday.csv'
weather_path = '../../UAI_data/input/weather.csv'
test_path = '../../UAI_data/input/test.csv'

poi = pd.read_csv(poi_path,encoding='gbk')
train_aug = pd.read_csv(train_Aug_path,encoding='gbk')
train_jul = pd.read_csv(train_Jul_path,encoding='gbk')
train_jul_demand = pd.read_csv(train_jul_demand_path,encoding='gbk')
train_aug_demand = pd.read_csv(train_aug_demand_path,encoding='gbk')
loc_cls = pd.read_csv(location_path)
holiday = pd.read_csv(holiday_path)
weather = pd.read_csv(weather_path)
test = pd.read_csv(test_path,encoding='gbk')

###  Helper Functions

In [75]:
def plot_histograms( df , variables , n_rows , n_cols ):
    fig = plt.figure( figsize = ( 16 , 12 ) )
    for i, var_name in enumerate( variables ):
        ax=fig.add_subplot( n_rows , n_cols , i+1 )
        df[ var_name ].hist( bins=10 , ax=ax )
        ax.set_title( 'Skew: ' + str( round( float( df[ var_name ].skew() ) , ) ) ) # + ' ' + var_name ) #var_name+" Distribution")
        ax.set_xticklabels( [] , visible=False )
        ax.set_yticklabels( [] , visible=False )
    fig.tight_layout()  # Improves appearance a bit.
    plt.show()

def plot_distribution( df , var , target , **kwargs ):
    row = kwargs.get( 'row' , None )
    col = kwargs.get( 'col' , None )
    facet = sns.FacetGrid( df , hue=target , aspect=4 , row = row , col = col )
    facet.map( sns.kdeplot , var , shade= True )
    facet.set( xlim=( 0 , df[ var ].max() ) )
    facet.add_legend()

def plot_categories( df , cat , target , **kwargs ):
    row = kwargs.get( 'row' , None )
    col = kwargs.get( 'col' , None )
    facet = sns.FacetGrid( df , row = row , col = col )
    facet.map( sns.barplot , cat , target )
    facet.add_legend()

def plot_correlation_map( df ):
    corr = df.corr()
    _ , ax = plt.subplots( figsize =( 12 , 10 ) )
    cmap = sns.diverging_palette( 220 , 10 , as_cmap = True )
    _ = sns.heatmap(
        corr, 
        cmap = cmap,
        square=True, 
        cbar_kws={ 'shrink' : .9 }, 
        ax=ax, 
        annot = True, 
        annot_kws = { 'fontsize' : 12 }
    )

def describe_more( df ):
    var = [] ; l = [] ; t = []
    for x in df:
        var.append( x )
        l.append( len( pd.value_counts( df[ x ] ) ) )
        t.append( df[ x ].dtypes )
    levels = pd.DataFrame( { 'Variable' : var , 'Levels' : l , 'Datatype' : t } )
    levels.sort_values( by = 'Levels' , inplace = True )
    return levels

def plot_variable_importance( X , y ):
    tree = DecisionTreeClassifier( random_state = 99 )
    tree.fit( X , y )
    plot_model_var_imp( tree , X , y )
    
def plot_model_var_imp( model , X , y ):
    imp = pd.DataFrame( 
        model.feature_importances_  , 
        columns = [ 'Importance' ] , 
        index = X.columns 
    )
    imp = imp.sort_values( [ 'Importance' ] , ascending = True )
    imp[ : 10 ].plot( kind = 'barh' )
    print (model.score( X , y ))

In [76]:
# train_jul_exc = train_jul[train_jul['driver_id'] != 2].copy()
train_jul_exc = train_jul.copy()
coord_jul = train_jul_exc.groupby(['create_date','create_hour','start_geo_id','end_geo_id'],as_index=False)['start_geo_id'].agg({'demand_count':'count'})
coord_jul.loc[:,'estimate_money_mean'] = train_jul_exc.groupby(['create_date','create_hour','start_geo_id','end_geo_id'],as_index=False)['estimate_money'].mean()['estimate_money']
# coord_jul.loc[:,'estimate_money_median'] = train_jul_exc.groupby(['create_date','create_hour','start_geo_id','end_geo_id'],as_index=False)['estimate_money'].median()['estimate_money']
# coord_jul.loc[:,'estimate_money_max'] = train_jul_exc.groupby(['create_date','create_hour','start_geo_id','end_geo_id'],as_index=False)['estimate_money'].max()['estimate_money']
# coord_jul.loc[:,'estimate_money_min'] = train_jul_exc.groupby(['create_date','create_hour','start_geo_id','end_geo_id'],as_index=False)['estimate_money'].min()['estimate_money']
coord_jul.loc[:,'estimate_distance_mean'] = train_jul_exc.groupby(['create_date','create_hour','start_geo_id','end_geo_id'],as_index=False)['estimate_distance'].mean()['estimate_distance']
# coord_jul.loc[:,'estimate_distance_median'] = train_jul_exc.groupby(['create_date','create_hour','start_geo_id','end_geo_id'],as_index=False)['estimate_distance'].median()['estimate_distance']
# coord_jul.loc[:,'estimate_distance_max'] = train_jul_exc.groupby(['create_date','create_hour','start_geo_id','end_geo_id'],as_index=False)['estimate_distance'].max()['estimate_distance']
# coord_jul.loc[:,'estimate_distance_min'] = train_jul_exc.groupby(['create_date','create_hour','start_geo_id','end_geo_id'],as_index=False)['estimate_distance'].min()['estimate_distance']
coord_jul.loc[:,'estimate_term_mean'] = train_jul_exc.groupby(['create_date','create_hour','start_geo_id','end_geo_id'],as_index=False)['estimate_term'].mean()['estimate_term']
# coord_jul.loc[:,'estimate_term_median'] = train_jul_exc.groupby(['create_date','create_hour','start_geo_id','end_geo_id'],as_index=False)['estimate_term'].median()['estimate_term']
# coord_jul.loc[:,'estimate_term_max'] = train_jul_exc.groupby(['create_date','create_hour','start_geo_id','end_geo_id'],as_index=False)['estimate_term'].max()['estimate_term']
# coord_jul.loc[:,'estimate_term_min'] = train_jul_exc.groupby(['create_date','create_hour','start_geo_id','end_geo_id'],as_index=False)['estimate_term'].min()['estimate_term']

# coord_jul.to_csv('train_jul_exc.csv',index=False)
coord_jul

Unnamed: 0,create_date,create_hour,start_geo_id,end_geo_id,demand_count,estimate_money_mean,estimate_distance_mean,estimate_term_mean
0,2017-07-01,0,0,10,1,101.000000,14431.000000,24.000000
1,2017-07-01,0,0,21,1,39.000000,4905.000000,9.000000
2,2017-07-01,0,0,64,1,78.860000,8764.000000,9.000000
3,2017-07-01,0,1,1,2,28.500000,2437.000000,4.500000
4,2017-07-01,0,1,15,2,67.000000,10955.000000,21.500000
5,2017-07-01,0,1,37,1,63.000000,10010.000000,20.000000
6,2017-07-01,0,1,39,2,67.110000,11493.000000,17.500000
7,2017-07-01,0,1,89,2,59.675000,9208.500000,18.000000
8,2017-07-01,0,1,95,1,90.000000,15677.000000,31.000000
9,2017-07-01,0,1,160,2,46.520000,6347.000000,12.000000


In [77]:
test

Unnamed: 0,test_id,create_date,create_hour,start_geo_id,end_geo_id
0,0,2017/8/1,21,24,27
1,1,2017/8/1,21,24,9
2,2,2017/8/1,21,24,22
3,3,2017/8/1,17,24,9
4,4,2017/8/1,17,55,55
5,5,2017/8/1,17,110,88
6,6,2017/8/1,9,23,21
7,7,2017/8/1,21,16,50
8,8,2017/8/1,13,6,24
9,9,2017/8/1,21,6,22


In [78]:
train_aug_demand

Unnamed: 0,create_date,create_hour,start_geo_id,end_geo_id,demand_count
0,2017-08-01,0,1,1,1
1,2017-08-01,0,1,9,1
2,2017-08-01,0,1,41,1
3,2017-08-01,0,1,73,1
4,2017-08-01,0,1,101,1
5,2017-08-01,0,1,191,1
6,2017-08-01,0,1,198,1
7,2017-08-01,0,3,56,1
8,2017-08-01,0,5,1,1
9,2017-08-01,0,5,5,3


In [79]:
test['create_hour'].value_counts()

22    443
20    411
21    383
17    346
13    334
9     313
18    261
15    254
8     246
16    242
11    240
7     234
19    225
10    211
14    201
12    180
23    147
6      76
0      70
1      61
5      47
3      30
2      25
4      20
Name: create_hour, dtype: int64

### Valid set split

In [80]:
# train_aug_exc = train_aug[train_aug['driver_id'] != 2].copy()
train_aug_exc = train_aug.copy()
coord_aug = train_aug_exc.groupby(['create_date','create_hour','start_geo_id','end_geo_id'],as_index=False)['start_geo_id'].agg({'demand_count':'count'})
# coord_aug.to_csv('train_aug_exc.csv',index=False)
coord_aug

Unnamed: 0,create_date,create_hour,start_geo_id,end_geo_id,demand_count
0,2017-08-01,0,1,1,1
1,2017-08-01,0,1,9,1
2,2017-08-01,0,1,41,1
3,2017-08-01,0,1,73,1
4,2017-08-01,0,1,101,1
5,2017-08-01,0,1,191,1
6,2017-08-01,0,1,198,1
7,2017-08-01,0,3,56,1
8,2017-08-01,0,5,1,1
9,2017-08-01,0,5,5,3


In [81]:
dict_list = {
'l_50' : [0,1,2,3,4,5],
'l_100' : [6],
'l_150' : [12,23],
'l_200' : [10,14,19],
'l_250' : [7,8,11,15,16,18],
'l_300' : [9,13],
'l_350' : [17],
'l_400' : [20,21]
# 'l_450' : [22]
}
l_22 = coord_aug[coord_aug['create_hour'] == 22].index
r = np.random.choice(l_22,450)
valid = coord_aug.iloc[r]
for key,l in dict_list.items():
    num =  int(key.split('_')[1])
    for hour in l:
        l_tmp = coord_aug[coord_aug['create_hour'] == hour].index
        r = np.random.choice(l_tmp,num)
        tmp_valid = coord_aug.iloc[r].copy()
        valid = pd.concat([valid,tmp_valid],axis=0)
valid.sort_values(['create_date','create_hour'],inplace=True)
valid

Unnamed: 0,create_date,create_hour,start_geo_id,end_geo_id,demand_count
82,2017-08-01,0,21,10,1
64,2017-08-01,0,17,22,1
26,2017-08-01,0,6,70,1
70,2017-08-01,0,18,17,1
167,2017-08-01,0,39,6,2
326,2017-08-01,2,23,47,1
304,2017-08-01,2,5,39,1
299,2017-08-01,2,1,26,1
367,2017-08-01,2,57,15,1
352,2017-08-01,2,34,86,1


In [82]:
valid['create_hour'].unique()

array([ 0,  2,  4,  6,  8, 10, 12, 14, 16, 18, 20, 22,  1,  3,  5,  7,  9,
       11, 13, 15, 17, 19, 21, 23], dtype=int64)

## Feature Extracting
### 1. order list attribution

In [83]:
train_tr = coord_jul.copy()
valid_tr = valid.copy()
train_tr
# valid_tr
# valid_tr['demand_count']

Unnamed: 0,create_date,create_hour,start_geo_id,end_geo_id,demand_count,estimate_money_mean,estimate_distance_mean,estimate_term_mean
0,2017-07-01,0,0,10,1,101.000000,14431.000000,24.000000
1,2017-07-01,0,0,21,1,39.000000,4905.000000,9.000000
2,2017-07-01,0,0,64,1,78.860000,8764.000000,9.000000
3,2017-07-01,0,1,1,2,28.500000,2437.000000,4.500000
4,2017-07-01,0,1,15,2,67.000000,10955.000000,21.500000
5,2017-07-01,0,1,37,1,63.000000,10010.000000,20.000000
6,2017-07-01,0,1,39,2,67.110000,11493.000000,17.500000
7,2017-07-01,0,1,89,2,59.675000,9208.500000,18.000000
8,2017-07-01,0,1,95,1,90.000000,15677.000000,31.000000
9,2017-07-01,0,1,160,2,46.520000,6347.000000,12.000000


In [84]:
# # time attribution
# holiday['create_date'] = pd.to_datetime(holiday['create_date'])
# train_tr['create_date'] = pd.to_datetime(train_tr['create_date'])
# valid_tr['create_date'] = pd.to_datetime(valid_tr['create_date'])
# train_tr = pd.merge(train_tr,holiday,on='create_date',how='left')
# valid_tr = pd.merge(valid_tr,holiday,on='create_date',how='left')
# train_tr['dayOfWeek'] = train_tr['create_date'].dt.dayofweek
# valid_tr['dayOfWeek'] = valid_tr['create_date'].dt.dayofweek
# train_tr

# time attribution
holiday['create_date'] = pd.to_datetime(holiday['create_date'])
train_tr['create_date'] = pd.to_datetime(train_tr['create_date'])
test['create_date'] = pd.to_datetime(test['create_date'])
train_tr = pd.merge(train_tr,holiday,on='create_date',how='left')
test = pd.merge(test,holiday,on='create_date',how='left')
train_tr['dayOfWeek'] = train_tr['create_date'].dt.dayofweek
test['dayOfWeek'] = test['create_date'].dt.dayofweek
test

Unnamed: 0,test_id,create_date,create_hour,start_geo_id,end_geo_id,holiday,dayOfWeek
0,0,2017-08-01,21,24,27,0,1
1,1,2017-08-01,21,24,9,0,1
2,2,2017-08-01,21,24,22,0,1
3,3,2017-08-01,17,24,9,0,1
4,4,2017-08-01,17,55,55,0,1
5,5,2017-08-01,17,110,88,0,1
6,6,2017-08-01,9,23,21,0,1
7,7,2017-08-01,21,16,50,0,1
8,8,2017-08-01,13,6,24,0,1
9,9,2017-08-01,21,6,22,0,1


In [85]:
# coord_money_hour = train_tr.groupby(['create_hour','start_geo_id','end_geo_id'],as_index=False)['estimate_money_mean'].agg({'money_hour_mean':'mean'})
# coord_money_day = train_tr.groupby(['dayOfWeek','start_geo_id','end_geo_id'],as_index=False)['estimate_money_mean'].agg({'money_day_mean':'mean'})
# coord_dis_hour = train_tr.groupby(['start_geo_id','end_geo_id'],as_index=False)['estimate_distance_mean'].agg({'dis_hour_mean':'mean'})
# coord_term_hour = train_tr.groupby(['start_geo_id','end_geo_id'],as_index=False)['estimate_term_mean'].agg({'term_hour_mean':'mean'})
# train_tr = pd.merge(train_tr,coord_money_hour,on=['create_hour','start_geo_id','end_geo_id'],how='left')
# train_tr = pd.merge(train_tr,coord_money_day,on=['dayOfWeek','start_geo_id','end_geo_id'],how='left')
# train_tr = pd.merge(train_tr,coord_dis_hour,on=['start_geo_id','end_geo_id'],how='left')
# train_tr = pd.merge(train_tr,coord_term_hour,on=['start_geo_id','end_geo_id'],how='left')
# valid_tr = pd.merge(valid_tr,coord_money_hour,on=['create_hour','start_geo_id','end_geo_id'],how='left')
# valid_tr = pd.merge(valid_tr,coord_money_day,on=['dayOfWeek','start_geo_id','end_geo_id'],how='left')
# valid_tr = pd.merge(valid_tr,coord_dis_hour,on=['start_geo_id','end_geo_id'],how='left')
# valid_tr = pd.merge(valid_tr,coord_term_hour,on=['start_geo_id','end_geo_id'],how='left')
# valid_tr

coord_money_hour = train_tr.groupby(['create_hour','start_geo_id','end_geo_id'],as_index=False)['estimate_money_mean'].agg({'money_hour_mean':'mean'})
coord_money_day = train_tr.groupby(['dayOfWeek','start_geo_id','end_geo_id'],as_index=False)['estimate_money_mean'].agg({'money_day_mean':'mean'})
coord_dis_hour = train_tr.groupby(['start_geo_id','end_geo_id'],as_index=False)['estimate_distance_mean'].agg({'dis_hour_mean':'mean'})
coord_term_hour = train_tr.groupby(['start_geo_id','end_geo_id'],as_index=False)['estimate_term_mean'].agg({'term_hour_mean':'mean'})
train_tr = pd.merge(train_tr,coord_money_hour,on=['create_hour','start_geo_id','end_geo_id'],how='left')
# train_tr = pd.merge(train_tr,coord_money_day,on=['dayOfWeek','start_geo_id','end_geo_id'],how='left')
train_tr = pd.merge(train_tr,coord_dis_hour,on=['start_geo_id','end_geo_id'],how='left')
train_tr = pd.merge(train_tr,coord_term_hour,on=['start_geo_id','end_geo_id'],how='left')
test = pd.merge(test,coord_money_hour,on=['create_hour','start_geo_id','end_geo_id'],how='left')
test = pd.merge(test,coord_money_day,on=['dayOfWeek','start_geo_id','end_geo_id'],how='left')
test = pd.merge(test,coord_dis_hour,on=['start_geo_id','end_geo_id'],how='left')
test = pd.merge(test,coord_term_hour,on=['start_geo_id','end_geo_id'],how='left')
test

Unnamed: 0,test_id,create_date,create_hour,start_geo_id,end_geo_id,holiday,dayOfWeek,money_hour_mean,money_day_mean,dis_hour_mean,term_hour_mean
0,0,2017-08-01,21,24,27,0,1,37.148061,35.196824,3780.303913,7.027449
1,1,2017-08-01,21,24,9,0,1,39.427175,36.230314,3876.549475,7.254807
2,2,2017-08-01,21,24,22,0,1,43.077712,39.877738,4350.430965,8.080604
3,3,2017-08-01,17,24,9,0,1,35.306929,36.230314,3876.549475,7.254807
4,4,2017-08-01,17,55,55,0,1,31.662917,28.740887,2163.154505,3.811067
5,5,2017-08-01,17,110,88,0,1,60.450000,88.596000,12946.467742,24.962366
6,6,2017-08-01,9,23,21,0,1,61.754622,56.878443,6722.896237,12.909258
7,7,2017-08-01,21,16,50,0,1,64.195916,61.556540,8558.499190,16.434430
8,8,2017-08-01,13,6,24,0,1,41.542258,42.673683,4856.039547,9.189806
9,9,2017-08-01,21,6,22,0,1,60.678244,56.977282,7705.057098,14.609754


### 2.Location
#### 2.1 Pure location

In [86]:
# loc_start = loc_cls.copy()
# loc_start.rename(columns={'location_id':'start_geo_id','cluster':'start_cluster'},inplace=True)
# train_tr = pd.merge(train_tr,loc_start,on='start_geo_id',how='left')
# loc_end = loc_cls.copy()
# loc_end.rename(columns={'location_id':'end_geo_id','cluster':'end_cluster'},inplace=True)
# train_tr = pd.merge(train_tr,loc_end,on='end_geo_id',how='left')
# loc_start = loc_cls.copy()
# loc_start.rename(columns={'location_id':'start_geo_id','cluster':'start_cluster'},inplace=True)
# valid_tr = pd.merge(valid_tr,loc_start,on='start_geo_id',how='left')
# loc_end = loc_cls.copy()
# loc_end.rename(columns={'location_id':'end_geo_id','cluster':'end_cluster'},inplace=True)
# valid_tr = pd.merge(valid_tr,loc_end,on='end_geo_id',how='left')
# # valid_tr['end_cluster'].astype('int',inplace=True)
# valid_tr

loc_start = loc_cls.copy()
loc_start.rename(columns={'location_id':'start_geo_id','cluster':'start_cluster'},inplace=True)
train_tr = pd.merge(train_tr,loc_start,on='start_geo_id',how='left')
loc_end = loc_cls.copy()
loc_end.rename(columns={'location_id':'end_geo_id','cluster':'end_cluster'},inplace=True)
train_tr = pd.merge(train_tr,loc_end,on='end_geo_id',how='left')
loc_start = loc_cls.copy()
loc_start.rename(columns={'location_id':'start_geo_id','cluster':'start_cluster'},inplace=True)
test = pd.merge(test,loc_start,on='start_geo_id',how='left')
loc_end = loc_cls.copy()
loc_end.rename(columns={'location_id':'end_geo_id','cluster':'end_cluster'},inplace=True)
test = pd.merge(test,loc_end,on='end_geo_id',how='left')
# test['end_cluster'].astype('int',inplace=True)
test

Unnamed: 0,test_id,create_date,create_hour,start_geo_id,end_geo_id,holiday,dayOfWeek,money_hour_mean,money_day_mean,dis_hour_mean,term_hour_mean,start_cluster,end_cluster
0,0,2017-08-01,21,24,27,0,1,37.148061,35.196824,3780.303913,7.027449,1.0,5.0
1,1,2017-08-01,21,24,9,0,1,39.427175,36.230314,3876.549475,7.254807,1.0,5.0
2,2,2017-08-01,21,24,22,0,1,43.077712,39.877738,4350.430965,8.080604,1.0,5.0
3,3,2017-08-01,17,24,9,0,1,35.306929,36.230314,3876.549475,7.254807,1.0,5.0
4,4,2017-08-01,17,55,55,0,1,31.662917,28.740887,2163.154505,3.811067,1.0,1.0
5,5,2017-08-01,17,110,88,0,1,60.450000,88.596000,12946.467742,24.962366,0.0,5.0
6,6,2017-08-01,9,23,21,0,1,61.754622,56.878443,6722.896237,12.909258,5.0,1.0
7,7,2017-08-01,21,16,50,0,1,64.195916,61.556540,8558.499190,16.434430,5.0,1.0
8,8,2017-08-01,13,6,24,0,1,41.542258,42.673683,4856.039547,9.189806,5.0,1.0
9,9,2017-08-01,21,6,22,0,1,60.678244,56.977282,7705.057098,14.609754,5.0,5.0


In [72]:
# coord_se_mean = train_tr.groupby(['start_geo_id','end_geo_id'],as_index=False)['demand_count'].agg({'sloc_eloc_mean':'mean'})
# coord_se_median = train_tr.groupby(['start_geo_id','end_geo_id'],as_index=False)['demand_count'].agg({'sloc_eloc_median':'median'})
# coord_se_max = train_tr.groupby(['start_geo_id','end_geo_id'],as_index=False)['demand_count'].agg({'sloc_eloc_max':'max'})
# coord_se_min = train_tr.groupby(['start_geo_id','end_geo_id'],as_index=False)['demand_count'].agg({'sloc_eloc_min':'min'})
# train_tr = pd.merge(train_tr,coord_se_mean,on=['start_geo_id','end_geo_id'],how='left')
# train_tr = pd.merge(train_tr,coord_se_median,on=['start_geo_id','end_geo_id'],how='left')
# train_tr = pd.merge(train_tr,coord_se_max,on=['start_geo_id','end_geo_id'],how='left')
# train_tr = pd.merge(train_tr,coord_se_min,on=['start_geo_id','end_geo_id'],how='left')
# valid_tr = pd.merge(valid_tr,coord_se_mean,on=['start_geo_id','end_geo_id'],how='left')
# valid_tr = pd.merge(valid_tr,coord_se_median,on=['start_geo_id','end_geo_id'],how='left')
# valid_tr = pd.merge(valid_tr,coord_se_max,on=['start_geo_id','end_geo_id'],how='left')
# valid_tr = pd.merge(valid_tr,coord_se_min,on=['start_geo_id','end_geo_id'],how='left')
# valid_tr

coord_se_mean = train_tr.groupby(['start_geo_id','end_geo_id'],as_index=False)['demand_count'].agg({'sloc_eloc_mean':'mean'})
coord_se_median = train_tr.groupby(['start_geo_id','end_geo_id'],as_index=False)['demand_count'].agg({'sloc_eloc_median':'median'})
coord_se_max = train_tr.groupby(['start_geo_id','end_geo_id'],as_index=False)['demand_count'].agg({'sloc_eloc_max':'max'})
coord_se_min = train_tr.groupby(['start_geo_id','end_geo_id'],as_index=False)['demand_count'].agg({'sloc_eloc_min':'min'})
train_tr = pd.merge(train_tr,coord_se_mean,on=['start_geo_id','end_geo_id'],how='left')
train_tr = pd.merge(train_tr,coord_se_median,on=['start_geo_id','end_geo_id'],how='left')
train_tr = pd.merge(train_tr,coord_se_max,on=['start_geo_id','end_geo_id'],how='left')
train_tr = pd.merge(train_tr,coord_se_min,on=['start_geo_id','end_geo_id'],how='left')
test = pd.merge(test,coord_se_mean,on=['start_geo_id','end_geo_id'],how='left')
test = pd.merge(test,coord_se_median,on=['start_geo_id','end_geo_id'],how='left')
test = pd.merge(test,coord_se_max,on=['start_geo_id','end_geo_id'],how='left')
test = pd.merge(test,coord_se_min,on=['start_geo_id','end_geo_id'],how='left')
test

Unnamed: 0,test_id,create_date,create_hour,start_geo_id,end_geo_id,holiday,dayOfWeek,money_hour_mean,money_day_mean,dis_hour_mean,term_hour_mean,start_cluster,end_cluster,sloc_eloc_mean,sloc_eloc_median,sloc_eloc_max,sloc_eloc_min
0,0,2017-08-01,21,24,27,0,1,37.148061,35.196824,3780.303913,7.027449,1.0,5.0,7.548701,4.0,198.0,1.0
1,1,2017-08-01,21,24,9,0,1,39.427175,36.230314,3876.549475,7.254807,1.0,5.0,6.206133,4.0,131.0,1.0
2,2,2017-08-01,21,24,22,0,1,43.077712,39.877738,4350.430965,8.080604,1.0,5.0,5.031469,3.0,58.0,1.0
3,3,2017-08-01,17,24,9,0,1,35.306929,36.230314,3876.549475,7.254807,1.0,5.0,6.206133,4.0,131.0,1.0
4,4,2017-08-01,17,55,55,0,1,31.662917,28.740887,2163.154505,3.811067,1.0,1.0,5.014523,3.0,53.0,1.0
5,5,2017-08-01,17,110,88,0,1,60.450000,88.596000,12946.467742,24.962366,0.0,5.0,1.645161,1.0,8.0,1.0
6,6,2017-08-01,9,23,21,0,1,61.754622,56.878443,6722.896237,12.909258,5.0,1.0,2.590361,2.0,17.0,1.0
7,7,2017-08-01,21,16,50,0,1,64.195916,61.556540,8558.499190,16.434430,5.0,1.0,2.273973,1.0,21.0,1.0
8,8,2017-08-01,13,6,24,0,1,41.542258,42.673683,4856.039547,9.189806,5.0,1.0,5.471483,4.0,51.0,1.0
9,9,2017-08-01,21,6,22,0,1,60.678244,56.977282,7705.057098,14.609754,5.0,5.0,2.817967,2.0,23.0,1.0


In [73]:
# coord_se_mean = train_tr.groupby(['start_cluster','end_cluster'],as_index=False)['demand_count'].agg({'sloccl_eloccl_mean':'mean'})
# coord_se_median = train_tr.groupby(['start_cluster','end_cluster'],as_index=False)['demand_count'].agg({'sloccl_eloccl_median':'median'})
# coord_se_max = train_tr.groupby(['start_cluster','end_cluster'],as_index=False)['demand_count'].agg({'sloccl_eloccl_max':'max'})
# coord_se_min = train_tr.groupby(['start_cluster','end_cluster'],as_index=False)['demand_count'].agg({'sloccl_eloccl_min':'min'})
# train_tr = pd.merge(train_tr,coord_se_mean,on=['start_cluster','end_cluster'],how='left')
# train_tr = pd.merge(train_tr,coord_se_median,on=['start_cluster','end_cluster'],how='left')
# train_tr = pd.merge(train_tr,coord_se_max,on=['start_cluster','end_cluster'],how='left')
# train_tr = pd.merge(train_tr,coord_se_min,on=['start_cluster','end_cluster'],how='left')
# valid_tr = pd.merge(valid_tr,coord_se_mean,on=['start_cluster','end_cluster'],how='left')
# valid_tr = pd.merge(valid_tr,coord_se_median,on=['start_cluster','end_cluster'],how='left')
# valid_tr = pd.merge(valid_tr,coord_se_max,on=['start_cluster','end_cluster'],how='left')
# valid_tr = pd.merge(valid_tr,coord_se_min,on=['start_cluster','end_cluster'],how='left')
# valid_tr

coord_se_mean = train_tr.groupby(['start_cluster','end_cluster'],as_index=False)['demand_count'].agg({'sloccl_eloccl_mean':'mean'})
coord_se_median = train_tr.groupby(['start_cluster','end_cluster'],as_index=False)['demand_count'].agg({'sloccl_eloccl_median':'median'})
coord_se_max = train_tr.groupby(['start_cluster','end_cluster'],as_index=False)['demand_count'].agg({'sloccl_eloccl_max':'max'})
coord_se_min = train_tr.groupby(['start_cluster','end_cluster'],as_index=False)['demand_count'].agg({'sloccl_eloccl_min':'min'})
train_tr = pd.merge(train_tr,coord_se_mean,on=['start_cluster','end_cluster'],how='left')
train_tr = pd.merge(train_tr,coord_se_median,on=['start_cluster','end_cluster'],how='left')
train_tr = pd.merge(train_tr,coord_se_max,on=['start_cluster','end_cluster'],how='left')
train_tr = pd.merge(train_tr,coord_se_min,on=['start_cluster','end_cluster'],how='left')
test = pd.merge(test,coord_se_mean,on=['start_cluster','end_cluster'],how='left')
test = pd.merge(test,coord_se_median,on=['start_cluster','end_cluster'],how='left')
test = pd.merge(test,coord_se_max,on=['start_cluster','end_cluster'],how='left')
test = pd.merge(test,coord_se_min,on=['start_cluster','end_cluster'],how='left')
test



Unnamed: 0,test_id,create_date,create_hour,start_geo_id,end_geo_id,holiday,dayOfWeek,money_hour_mean,money_day_mean,dis_hour_mean,...,start_cluster,end_cluster,sloc_eloc_mean,sloc_eloc_median,sloc_eloc_max,sloc_eloc_min,sloccl_eloccl_mean,sloccl_eloccl_median,sloccl_eloccl_max,sloccl_eloccl_min
0,0,2017-08-01,21,24,27,0,1,37.148061,35.196824,3780.303913,...,1.0,5.0,7.548701,4.0,198.0,1.0,2.068860,1.0,198.0,1.0
1,1,2017-08-01,21,24,9,0,1,39.427175,36.230314,3876.549475,...,1.0,5.0,6.206133,4.0,131.0,1.0,2.068860,1.0,198.0,1.0
2,2,2017-08-01,21,24,22,0,1,43.077712,39.877738,4350.430965,...,1.0,5.0,5.031469,3.0,58.0,1.0,2.068860,1.0,198.0,1.0
3,3,2017-08-01,17,24,9,0,1,35.306929,36.230314,3876.549475,...,1.0,5.0,6.206133,4.0,131.0,1.0,2.068860,1.0,198.0,1.0
4,4,2017-08-01,17,55,55,0,1,31.662917,28.740887,2163.154505,...,1.0,1.0,5.014523,3.0,53.0,1.0,2.311007,1.0,178.0,1.0
5,5,2017-08-01,17,110,88,0,1,60.450000,88.596000,12946.467742,...,0.0,5.0,1.645161,1.0,8.0,1.0,1.568431,1.0,36.0,1.0
6,6,2017-08-01,9,23,21,0,1,61.754622,56.878443,6722.896237,...,5.0,1.0,2.590361,2.0,17.0,1.0,1.928592,1.0,144.0,1.0
7,7,2017-08-01,21,16,50,0,1,64.195916,61.556540,8558.499190,...,5.0,1.0,2.273973,1.0,21.0,1.0,1.928592,1.0,144.0,1.0
8,8,2017-08-01,13,6,24,0,1,41.542258,42.673683,4856.039547,...,5.0,1.0,5.471483,4.0,51.0,1.0,1.928592,1.0,144.0,1.0
9,9,2017-08-01,21,6,22,0,1,60.678244,56.977282,7705.057098,...,5.0,5.0,2.817967,2.0,23.0,1.0,1.854976,1.0,73.0,1.0


In [47]:
# coord_se_sum = train_tr.groupby(['start_geo_id','end_geo_id'],as_index=False)['demand_count'].agg({'sloc_eloc_sum':'sum'})
# coord_s_sum = coord_se_sum.groupby('start_geo_id',as_index=False)['sloc_eloc_sum'].agg({'sloc_sum':'sum'})
# coord_e_sum = coord_se_sum.groupby('end_geo_id',as_index=False)['sloc_eloc_sum'].agg({'eloc_sum':'sum'})
# coord_se_sum = pd.merge(coord_se_sum,coord_s_sum,on='start_geo_id',how='left')
# coord_se_sum = pd.merge(coord_se_sum,coord_e_sum,on='end_geo_id',how='left')
# coord_se_sum.loc[:,'se_start_ratio'] = coord_se_sum['sloc_eloc_sum'] / (1.0 * coord_se_sum['sloc_sum'])
# coord_se_sum.loc[:,'se_end_ratio'] = coord_se_sum['sloc_eloc_sum'] / (1.0 * coord_se_sum['eloc_sum'])
# del coord_se_sum['sloc_eloc_sum'],coord_se_sum['sloc_sum'],coord_se_sum['eloc_sum']
# # coord_se_sum
# train_tr = pd.merge(train_tr,coord_se_sum,on=['start_geo_id','end_geo_id'],how='left')
# valid_tr = pd.merge(valid_tr,coord_se_sum,on=['start_geo_id','end_geo_id'],how='left')
# valid_tr

coord_se_sum = train_tr.groupby(['start_geo_id','end_geo_id'],as_index=False)['demand_count'].agg({'sloc_eloc_sum':'sum'})
coord_s_sum = coord_se_sum.groupby('start_geo_id',as_index=False)['sloc_eloc_sum'].agg({'sloc_sum':'sum'})
coord_e_sum = coord_se_sum.groupby('end_geo_id',as_index=False)['sloc_eloc_sum'].agg({'eloc_sum':'sum'})
coord_se_sum = pd.merge(coord_se_sum,coord_s_sum,on='start_geo_id',how='left')
coord_se_sum = pd.merge(coord_se_sum,coord_e_sum,on='end_geo_id',how='left')
coord_se_sum.loc[:,'se_start_ratio'] = coord_se_sum['sloc_eloc_sum'] / (1.0 * coord_se_sum['sloc_sum'])
coord_se_sum.loc[:,'se_end_ratio'] = coord_se_sum['sloc_eloc_sum'] / (1.0 * coord_se_sum['eloc_sum'])
del coord_se_sum['sloc_eloc_sum'],coord_se_sum['sloc_sum'],coord_se_sum['eloc_sum']
# coord_se_sum
train_tr = pd.merge(train_tr,coord_se_sum,on=['start_geo_id','end_geo_id'],how='left')
test = pd.merge(test,coord_se_sum,on=['start_geo_id','end_geo_id'],how='left')
test

Unnamed: 0,create_date,create_hour,start_geo_id,end_geo_id,demand_count,holiday,dayOfWeek,money_hour_mean,money_day_mean,dis_hour_mean,...,sloc_eloc_mean,sloc_eloc_median,sloc_eloc_max,sloc_eloc_min,sloccl_eloccl_mean,sloccl_eloccl_median,sloccl_eloccl_max,sloccl_eloccl_min,se_start_ratio,se_end_ratio
0,2017-08-01,0,61,88,1,0,1,41.672500,38.398509,4089.633020,...,2.093385,1.0,19.0,1.0,1.854976,1.0,73.0,1.0,0.038086,0.050659
1,2017-08-01,0,27,24,5,0,1,49.989116,39.540151,4374.194941,...,5.965392,4.0,125.0,1.0,1.928592,1.0,144.0,1.0,0.129360,0.067775
2,2017-08-01,0,22,33,1,0,1,69.983333,69.719722,10380.126552,...,1.482143,1.0,8.0,1.0,1.854976,1.0,73.0,1.0,0.005817,0.023341
3,2017-08-01,0,56,31,1,0,1,41.685952,34.675828,3637.115753,...,1.807927,1.0,22.0,1.0,1.854976,1.0,73.0,1.0,0.041932,0.075977
4,2017-08-01,0,6,9,1,0,1,48.440833,52.794470,6517.478098,...,3.256410,2.0,33.0,1.0,1.854976,1.0,73.0,1.0,0.036285,0.057128
5,2017-08-01,0,9,109,1,0,1,78.272500,73.178333,11969.869863,...,1.232877,1.0,3.0,1.0,1.928592,1.0,144.0,1.0,0.003748,0.034312
6,2017-08-01,0,24,105,1,0,1,,,18247.761905,...,1.714286,1.5,3.0,1.0,1.406877,1.0,23.0,1.0,0.000416,0.036364
7,2017-08-01,0,99,9,2,0,1,75.000000,74.000000,15274.145161,...,1.322581,1.0,4.0,1.0,1.854976,1.0,73.0,1.0,0.023768,0.001677
8,2017-08-01,0,9,109,1,0,1,78.272500,73.178333,11969.869863,...,1.232877,1.0,3.0,1.0,1.928592,1.0,144.0,1.0,0.003748,0.034312
9,2017-08-01,0,63,57,1,0,1,49.000000,49.871071,7104.459856,...,1.441860,1.0,7.0,1.0,1.404628,1.0,34.0,1.0,0.009805,0.034822


In [48]:
# coord_se_sum = train_tr.groupby(['start_cluster','end_cluster'],as_index=False)['demand_count'].agg({'sloc_eloc_sum':'sum'})
# coord_s_sum = coord_se_sum.groupby('start_cluster',as_index=False)['sloc_eloc_sum'].agg({'sloc_sum':'sum'})
# coord_e_sum = coord_se_sum.groupby('end_cluster',as_index=False)['sloc_eloc_sum'].agg({'eloc_sum':'sum'})
# coord_se_sum = pd.merge(coord_se_sum,coord_s_sum,on='start_cluster',how='left')
# coord_se_sum = pd.merge(coord_se_sum,coord_e_sum,on='end_cluster',how='left')
# coord_se_sum.loc[:,'se_start_cls_ratio'] = coord_se_sum['sloc_eloc_sum'] / (1.0 * coord_se_sum['sloc_sum'])
# coord_se_sum.loc[:,'se_end_cls_ratio'] = coord_se_sum['sloc_eloc_sum'] / (1.0 * coord_se_sum['eloc_sum'])
# del coord_se_sum['sloc_eloc_sum'],coord_se_sum['sloc_sum'],coord_se_sum['eloc_sum']
# # coord_se_sum
# train_tr = pd.merge(train_tr,coord_se_sum,on=['start_cluster','end_cluster'],how='left')
# valid_tr = pd.merge(valid_tr,coord_se_sum,on=['start_cluster','end_cluster'],how='left')
# valid_tr

coord_se_sum = train_tr.groupby(['start_cluster','end_cluster'],as_index=False)['demand_count'].agg({'sloc_eloc_sum':'sum'})
coord_s_sum = coord_se_sum.groupby('start_cluster',as_index=False)['sloc_eloc_sum'].agg({'sloc_sum':'sum'})
coord_e_sum = coord_se_sum.groupby('end_cluster',as_index=False)['sloc_eloc_sum'].agg({'eloc_sum':'sum'})
coord_se_sum = pd.merge(coord_se_sum,coord_s_sum,on='start_cluster',how='left')
coord_se_sum = pd.merge(coord_se_sum,coord_e_sum,on='end_cluster',how='left')
coord_se_sum.loc[:,'se_start_cls_ratio'] = coord_se_sum['sloc_eloc_sum'] / (1.0 * coord_se_sum['sloc_sum'])
coord_se_sum.loc[:,'se_end_cls_ratio'] = coord_se_sum['sloc_eloc_sum'] / (1.0 * coord_se_sum['eloc_sum'])
del coord_se_sum['sloc_eloc_sum'],coord_se_sum['sloc_sum'],coord_se_sum['eloc_sum']
# coord_se_sum
train_tr = pd.merge(train_tr,coord_se_sum,on=['start_cluster','end_cluster'],how='left')
test = pd.merge(test,coord_se_sum,on=['start_cluster','end_cluster'],how='left')
test

Unnamed: 0,create_date,create_hour,start_geo_id,end_geo_id,demand_count,holiday,dayOfWeek,money_hour_mean,money_day_mean,dis_hour_mean,...,sloc_eloc_max,sloc_eloc_min,sloccl_eloccl_mean,sloccl_eloccl_median,sloccl_eloccl_max,sloccl_eloccl_min,se_start_ratio,se_end_ratio,se_start_cls_ratio,se_end_cls_ratio
0,2017-08-01,0,61,88,1,0,1,41.672500,38.398509,4089.633020,...,19.0,1.0,1.854976,1.0,73.0,1.0,0.038086,0.050659,0.579441,0.566873
1,2017-08-01,0,27,24,5,0,1,49.989116,39.540151,4374.194941,...,125.0,1.0,1.928592,1.0,144.0,1.0,0.129360,0.067775,0.263846,0.518449
2,2017-08-01,0,22,33,1,0,1,69.983333,69.719722,10380.126552,...,8.0,1.0,1.854976,1.0,73.0,1.0,0.005817,0.023341,0.579441,0.566873
3,2017-08-01,0,56,31,1,0,1,41.685952,34.675828,3637.115753,...,22.0,1.0,1.854976,1.0,73.0,1.0,0.041932,0.075977,0.579441,0.566873
4,2017-08-01,0,6,9,1,0,1,48.440833,52.794470,6517.478098,...,33.0,1.0,1.854976,1.0,73.0,1.0,0.036285,0.057128,0.579441,0.566873
5,2017-08-01,0,9,109,1,0,1,78.272500,73.178333,11969.869863,...,3.0,1.0,1.928592,1.0,144.0,1.0,0.003748,0.034312,0.263846,0.518449
6,2017-08-01,0,24,105,1,0,1,,,18247.761905,...,3.0,1.0,1.406877,1.0,23.0,1.0,0.000416,0.036364,0.037296,0.237244
7,2017-08-01,0,99,9,2,0,1,75.000000,74.000000,15274.145161,...,4.0,1.0,1.854976,1.0,73.0,1.0,0.023768,0.001677,0.579441,0.566873
8,2017-08-01,0,9,109,1,0,1,78.272500,73.178333,11969.869863,...,3.0,1.0,1.928592,1.0,144.0,1.0,0.003748,0.034312,0.263846,0.518449
9,2017-08-01,0,63,57,1,0,1,49.000000,49.871071,7104.459856,...,7.0,1.0,1.404628,1.0,34.0,1.0,0.009805,0.034822,0.040195,0.466298


### Time 

In [49]:
# map_hour = {1:1,2:1,3:1,4:1,5:1,6:1,7:2,8:2,9:2,10:3,11:3,12:3,13:4,14:4,15:4,16:5,17:5,18:5,19:6,20:6,21:6,22:0,23:0,0:0}
# train_tr.loc[:,'hour_cls'] = train_tr['create_hour'].map(lambda x: map_hour[x])
# valid_tr.loc[:,'hour_cls'] = valid_tr['create_hour'].map(lambda x: map_hour[x])
# valid_tr

map_hour = {1:1,2:1,3:1,4:1,5:1,6:1,7:2,8:2,9:2,10:3,11:3,12:3,13:4,14:4,15:4,16:5,17:5,18:5,19:6,20:6,21:6,22:0,23:0,0:0}
train_tr.loc[:,'hour_cls'] = train_tr['create_hour'].map(lambda x: map_hour[x])
test.loc[:,'hour_cls'] = test['create_hour'].map(lambda x: map_hour[x])
test

Unnamed: 0,create_date,create_hour,start_geo_id,end_geo_id,demand_count,holiday,dayOfWeek,money_hour_mean,money_day_mean,dis_hour_mean,...,sloc_eloc_min,sloccl_eloccl_mean,sloccl_eloccl_median,sloccl_eloccl_max,sloccl_eloccl_min,se_start_ratio,se_end_ratio,se_start_cls_ratio,se_end_cls_ratio,hour_cls
0,2017-08-01,0,61,88,1,0,1,41.672500,38.398509,4089.633020,...,1.0,1.854976,1.0,73.0,1.0,0.038086,0.050659,0.579441,0.566873,0
1,2017-08-01,0,27,24,5,0,1,49.989116,39.540151,4374.194941,...,1.0,1.928592,1.0,144.0,1.0,0.129360,0.067775,0.263846,0.518449,0
2,2017-08-01,0,22,33,1,0,1,69.983333,69.719722,10380.126552,...,1.0,1.854976,1.0,73.0,1.0,0.005817,0.023341,0.579441,0.566873,0
3,2017-08-01,0,56,31,1,0,1,41.685952,34.675828,3637.115753,...,1.0,1.854976,1.0,73.0,1.0,0.041932,0.075977,0.579441,0.566873,0
4,2017-08-01,0,6,9,1,0,1,48.440833,52.794470,6517.478098,...,1.0,1.854976,1.0,73.0,1.0,0.036285,0.057128,0.579441,0.566873,0
5,2017-08-01,0,9,109,1,0,1,78.272500,73.178333,11969.869863,...,1.0,1.928592,1.0,144.0,1.0,0.003748,0.034312,0.263846,0.518449,0
6,2017-08-01,0,24,105,1,0,1,,,18247.761905,...,1.0,1.406877,1.0,23.0,1.0,0.000416,0.036364,0.037296,0.237244,0
7,2017-08-01,0,99,9,2,0,1,75.000000,74.000000,15274.145161,...,1.0,1.854976,1.0,73.0,1.0,0.023768,0.001677,0.579441,0.566873,0
8,2017-08-01,0,9,109,1,0,1,78.272500,73.178333,11969.869863,...,1.0,1.928592,1.0,144.0,1.0,0.003748,0.034312,0.263846,0.518449,0
9,2017-08-01,0,63,57,1,0,1,49.000000,49.871071,7104.459856,...,1.0,1.404628,1.0,34.0,1.0,0.009805,0.034822,0.040195,0.466298,0


In [50]:
coord_hour = train_tr.groupby('create_hour', as_index=False)['demand_count'].agg({'demand_count_h_avg':'std'})
coord_hour

Unnamed: 0,create_hour,demand_count_h_avg
0,0,1.935778
1,1,1.572084
2,2,0.739049
3,3,0.925068
4,4,0.971026
5,5,1.045984
6,6,1.257506
7,7,2.156388
8,8,3.42592
9,9,2.654506


### 3. Location + Time

In [51]:
# coord_hour = train_tr.groupby(['create_hour','start_geo_id','end_geo_id'],as_index=False)['demand_count'].agg({'demand_count_h_avg':'mean'})
# train_tr = pd.merge(train_tr,coord_hour,on=['create_hour','start_geo_id','end_geo_id'],how='left')
# valid_tr = pd.merge(valid_tr,coord_hour,on=['create_hour','start_geo_id','end_geo_id'],how='left')
# coord_hour = train_tr.groupby(['create_hour','start_geo_id','end_geo_id'],as_index=False)['demand_count'].agg({'demand_count_h_median':'median'})
# train_tr = pd.merge(train_tr,coord_hour,on=['create_hour','start_geo_id','end_geo_id'],how='left')
# valid_tr = pd.merge(valid_tr,coord_hour,on=['create_hour','start_geo_id','end_geo_id'],how='left')
# coord_hour = train_tr.groupby(['create_hour','start_geo_id','end_geo_id'],as_index=False)['demand_count'].agg({'demand_count_h_max':'max'})
# train_tr = pd.merge(train_tr,coord_hour,on=['create_hour','start_geo_id','end_geo_id'],how='left')
# valid_tr = pd.merge(valid_tr,coord_hour,on=['create_hour','start_geo_id','end_geo_id'],how='left')
# coord_hour = train_tr.groupby(['create_hour','start_geo_id','end_geo_id'],as_index=False)['demand_count'].agg({'demand_count_h_std':'std'})
# train_tr = pd.merge(train_tr,coord_hour,on=['create_hour','start_geo_id','end_geo_id'],how='left')
# valid_tr = pd.merge(valid_tr,coord_hour,on=['create_hour','start_geo_id','end_geo_id'],how='left')
# valid_tr

coord_hour = train_tr.groupby(['create_hour','start_geo_id','end_geo_id'],as_index=False)['demand_count'].agg({'demand_count_h_avg':'mean'})
train_tr = pd.merge(train_tr,coord_hour,on=['create_hour','start_geo_id','end_geo_id'],how='left')
test = pd.merge(test,coord_hour,on=['create_hour','start_geo_id','end_geo_id'],how='left')
coord_hour = train_tr.groupby(['create_hour','start_geo_id','end_geo_id'],as_index=False)['demand_count'].agg({'demand_count_h_median':'median'})
train_tr = pd.merge(train_tr,coord_hour,on=['create_hour','start_geo_id','end_geo_id'],how='left')
test = pd.merge(test,coord_hour,on=['create_hour','start_geo_id','end_geo_id'],how='left')
coord_hour = train_tr.groupby(['create_hour','start_geo_id','end_geo_id'],as_index=False)['demand_count'].agg({'demand_count_h_max':'max'})
train_tr = pd.merge(train_tr,coord_hour,on=['create_hour','start_geo_id','end_geo_id'],how='left')
test = pd.merge(test,coord_hour,on=['create_hour','start_geo_id','end_geo_id'],how='left')
coord_hour = train_tr.groupby(['create_hour','start_geo_id','end_geo_id'],as_index=False)['demand_count'].agg({'demand_count_h_std':'std'})
train_tr = pd.merge(train_tr,coord_hour,on=['create_hour','start_geo_id','end_geo_id'],how='left')
test = pd.merge(test,coord_hour,on=['create_hour','start_geo_id','end_geo_id'],how='left')
test

Unnamed: 0,create_date,create_hour,start_geo_id,end_geo_id,demand_count,holiday,dayOfWeek,money_hour_mean,money_day_mean,dis_hour_mean,...,sloccl_eloccl_min,se_start_ratio,se_end_ratio,se_start_cls_ratio,se_end_cls_ratio,hour_cls,demand_count_h_avg,demand_count_h_median,demand_count_h_max,demand_count_h_std
0,2017-08-01,0,61,88,1,0,1,41.672500,38.398509,4089.633020,...,1.0,0.038086,0.050659,0.579441,0.566873,0,1.500000,1.5,2.0,0.707107
1,2017-08-01,0,27,24,5,0,1,49.989116,39.540151,4374.194941,...,1.0,0.129360,0.067775,0.263846,0.518449,0,2.187500,2.0,7.0,1.641899
2,2017-08-01,0,22,33,1,0,1,69.983333,69.719722,10380.126552,...,1.0,0.005817,0.023341,0.579441,0.566873,0,1.333333,1.0,2.0,0.577350
3,2017-08-01,0,56,31,1,0,1,41.685952,34.675828,3637.115753,...,1.0,0.041932,0.075977,0.579441,0.566873,0,1.714286,2.0,3.0,0.755929
4,2017-08-01,0,6,9,1,0,1,48.440833,52.794470,6517.478098,...,1.0,0.036285,0.057128,0.579441,0.566873,0,1.888889,1.0,4.0,1.364225
5,2017-08-01,0,9,109,1,0,1,78.272500,73.178333,11969.869863,...,1.0,0.003748,0.034312,0.263846,0.518449,0,1.250000,1.0,2.0,0.500000
6,2017-08-01,0,24,105,1,0,1,,,18247.761905,...,1.0,0.000416,0.036364,0.037296,0.237244,0,,,,
7,2017-08-01,0,99,9,2,0,1,75.000000,74.000000,15274.145161,...,1.0,0.023768,0.001677,0.579441,0.566873,0,1.000000,1.0,1.0,
8,2017-08-01,0,9,109,1,0,1,78.272500,73.178333,11969.869863,...,1.0,0.003748,0.034312,0.263846,0.518449,0,1.250000,1.0,2.0,0.500000
9,2017-08-01,0,63,57,1,0,1,49.000000,49.871071,7104.459856,...,1.0,0.009805,0.034822,0.040195,0.466298,0,1.000000,1.0,1.0,0.000000


In [52]:
coord = train_tr.groupby(['create_hour','start_geo_id'],as_index=False)['demand_count'].agg({'demand_count_start_avg':'mean'})
coord

Unnamed: 0,create_hour,start_geo_id,demand_count_start_avg
0,0,0,1.166667
1,0,1,1.304054
2,0,2,1.451613
3,0,3,1.114286
4,0,4,1.000000
5,0,5,1.403175
6,0,6,1.858942
7,0,7,1.000000
8,0,8,1.192771
9,0,9,1.380952


In [53]:
np.setdiff1d(train_tr.columns,valid_tr.columns)
# np.setdiff1d(test.columns,train_tr.columns)

array(['estimate_distance_mean', 'estimate_money_mean',
       'estimate_term_mean'], dtype=object)

### Training

In [54]:
do_not_use_list = ['create_date','demand_count','estimate_distance_mean','estimate_money_mean','estimate_term_mean','test_id']
predictors = [f for f in train_tr.columns if f not in do_not_use_list]
print predictors

['create_hour', 'start_geo_id', 'end_geo_id', 'holiday', 'dayOfWeek', 'money_hour_mean', 'money_day_mean', 'dis_hour_mean', 'term_hour_mean', 'start_cluster', 'end_cluster', 'sloc_eloc_mean', 'sloc_eloc_median', 'sloc_eloc_max', 'sloc_eloc_min', 'sloccl_eloccl_mean', 'sloccl_eloccl_median', 'sloccl_eloccl_max', 'sloccl_eloccl_min', 'se_start_ratio', 'se_end_ratio', 'se_start_cls_ratio', 'se_end_cls_ratio', 'hour_cls', 'demand_count_h_avg', 'demand_count_h_median', 'demand_count_h_max', 'demand_count_h_std']


In [55]:
import xgboost as xgb
params = {'min_child_weight': 100, 'eta': 0.1, 'colsample_bytree': 0.3, 'max_depth': 7,
                'subsample': 0.8, 'lambda': 1, 'nthread': 4, 'booster' : 'gbtree', 'silent': 1,
                'eval_metric': 'rmse', 'objective': 'reg:linear'}
boostRound = 200

# print train_feat_1[predictors]

xgbtrain = xgb.DMatrix(train_tr[predictors], train_tr['demand_count'],missing=np.nan)
# xgbvalid = xgb.DMatrix(test_feat_1[predictors])
model = xgb.train(params, xgbtrain, num_boost_round=boostRound)
param_score = pd.Series(model.get_fscore()).sort_values(ascending=False)

In [56]:
param_score

money_day_mean           924
demand_count_h_avg       869
sloc_eloc_mean           737
money_hour_mean          662
create_hour              635
demand_count_h_std       620
dayOfWeek                491
demand_count_h_max       489
dis_hour_mean            480
sloc_eloc_max            453
se_start_ratio           436
demand_count_h_median    435
se_end_ratio             379
end_geo_id               302
start_geo_id             294
hour_cls                 277
term_hour_mean           234
sloccl_eloccl_mean       163
holiday                  121
sloccl_eloccl_max        117
se_start_cls_ratio       100
sloc_eloc_median          90
se_end_cls_ratio          88
end_cluster               43
start_cluster             41
sloc_eloc_min             16
dtype: int64

In [57]:
def score(pred,valid):
    return np.sum(np.abs(pred - valid)) / (1.0 * len(pred))

In [58]:
# xgbvalid = xgb.DMatrix(valid_tr[predictors],missing=np.nan)
# valid_tr.loc[:,'result'] = model.predict(xgbvalid)
# valid_tr['result'].fillna(1,inplace=True)
# valid_tr['result']

xgbvalid = xgb.DMatrix(test[predictors],missing=np.nan)
test.loc[:,'count'] = model.predict(xgbvalid)
test['count'].fillna(1,inplace=True)
test['count']

0       1.401881
1       3.126156
2       1.297411
3       1.815964
4       1.563119
5       1.196426
6       0.864470
7       0.972226
8       1.196426
9       1.018885
10      1.026850
11      2.013469
12      2.064049
13      1.555836
14      0.962198
15      2.470418
16      1.001604
17      2.630617
18      2.037871
19      1.060671
20      1.031324
21      1.045748
22      1.004269
23      1.103458
24      0.999608
25      1.059696
26      2.068449
27      0.989648
28      1.072222
29      1.132207
          ...   
4970    2.855361
4971    2.962372
4972    1.412968
4973    2.681015
4974    1.915522
4975    3.081016
4976    1.667643
4977    1.586754
4978    0.982547
4979    3.919162
4980    0.941707
4981    1.392402
4982    1.280351
4983    1.116122
4984    0.891575
4985    2.381450
4986    0.966437
4987    0.864019
4988    1.142321
4989    1.821763
4990    2.474492
4991    3.814942
4992    2.097934
4993    1.754267
4994    0.864019
4995    0.990872
4996    1.915522
4997    1.8324

In [59]:
# score(valid_tr['result'],valid_tr['demand_count'])

0.9408746798038483

In [53]:
test['test_id'] = test['test_id'].astype('int')
test[['test_id','count']].to_csv('result.csv',index=False)