In [1]:
# -*- coding: utf-8 -*-
# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

# Handle table-like data and matrices
import pandas as pd
import numpy as np

# Visualization
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns
import missingno as msno

# Configure visualization
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Apply matplotlib's 'ggplot' style sheet first, then seaborn's 'white' axes style.
mpl.style.use('ggplot')
sns.set_style('white')
# Default size for every figure created afterwards, as a (width, height) pair.
pylab.rcParams['figure.figsize'] = 8,6

# Locations of the input tables, relative to the notebook's working directory.
# NOTE(review): the two demand files are spelled 'tain_*' -- presumably this matches
# the file names on disk; confirm before changing the spelling.
poi_path = '../../UAI_data/input/poi_re.csv'
train_Aug_path = '../../UAI_data/input/train_Aug_re.csv'
train_Jul_path = '../../UAI_data/input/train_July_re.csv'
train_jul_demand_path = '../../UAI_data/input/tain_jul_demand.csv'
train_aug_demand_path = '../../UAI_data/input/tain_aug_demand.csv'
location_path = '../../UAI_data/input/location_cls.csv'
holiday_path = '../../UAI_data/input/holiday.csv'
weather_path = '../../UAI_data/input/weather.csv'
test_path = '../../UAI_data/input/test.csv'

# Tables that are read with an explicit GBK encoding.
(
    poi,
    train_aug,
    train_jul,
    train_jul_demand,
    train_aug_demand,
    test,
) = [
    pd.read_csv(gbk_path, encoding='gbk')
    for gbk_path in (
        poi_path,
        train_Aug_path,
        train_Jul_path,
        train_jul_demand_path,
        train_aug_demand_path,
        test_path,
    )
]

# Tables that are read with pandas' default encoding.
loc_cls, holiday, weather = [
    pd.read_csv(plain_path)
    for plain_path in (location_path, holiday_path, weather_path)
]

###  Helper Functions

In [2]:
def plot_histograms( df , variables , n_rows , n_cols ):
    """Draw one 10-bin histogram per variable on an n_rows x n_cols grid.

    Parameters
    ----------
    df : DataFrame holding the columns to plot.
    variables : iterable of column names; one subplot is created per name.
    n_rows, n_cols : grid shape passed to ``fig.add_subplot``; the grid must
        have room for every entry of ``variables``.

    Each subplot title shows the column's skewness (rounded to two decimals)
    followed by the column name. Tick labels are hidden on both axes.
    """
    fig = plt.figure( figsize = ( 16 , 12 ) )
    for i, var_name in enumerate( variables ):
        ax = fig.add_subplot( n_rows , n_cols , i+1 )
        df[ var_name ].hist( bins=10 , ax=ax )
        # round() needs an explicit ndigits: without it the skew collapses to an
        # integer. The variable name is included so each panel can be identified.
        skew = round( float( df[ var_name ].skew() ) , 2 )
        ax.set_title( 'Skew: ' + str( skew ) + ' ' + str( var_name ) )
        ax.set_xticklabels( [] , visible=False )
        ax.set_yticklabels( [] , visible=False )
    fig.tight_layout()  # Improves appearance a bit.
    plt.show()

def plot_distribution( df , var , target , **kwargs ):
    """Plot shaded KDE curves of ``var``, one curve per value of ``target``.

    Optional keyword arguments ``row`` and ``col`` name the columns used to
    facet the grid; both default to None. The x axis runs from 0 to the
    maximum of ``df[var]`` and a legend is added.
    """
    grid = sns.FacetGrid(
        df ,
        hue = target ,
        aspect = 4 ,
        row = kwargs.get( 'row' ) ,
        col = kwargs.get( 'col' )
    )
    grid.map( sns.kdeplot , var , shade = True )
    upper = df[ var ].max()
    grid.set( xlim = ( 0 , upper ) )
    grid.add_legend()

def plot_categories( df , cat , target , **kwargs ):
    """Draw a bar plot of ``target`` against the categorical column ``cat``.

    Optional keyword arguments ``row`` and ``col`` name the columns used to
    facet the grid; both default to None. A legend is added to the grid.
    """
    grid = sns.FacetGrid(
        df ,
        row = kwargs.get( 'row' ) ,
        col = kwargs.get( 'col' )
    )
    grid.map( sns.barplot , cat , target )
    grid.add_legend()

def plot_correlation_map( df ):
    """Render the pairwise correlation matrix of ``df`` as an annotated heatmap.

    The matrix comes from ``df.corr()`` and is drawn on a new 12x10 figure with
    a diverging palette, square cells and the coefficient printed in each cell.
    """
    correlations = df.corr()
    ax = plt.subplots( figsize =( 12 , 10 ) )[ 1 ]
    palette = sns.diverging_palette( 220 , 10 , as_cmap = True )
    heatmap_options = dict(
        cmap = palette ,
        square = True ,
        cbar_kws = { 'shrink' : .9 } ,
        ax = ax ,
        annot = True ,
        annot_kws = { 'fontsize' : 12 }
    )
    sns.heatmap( correlations , **heatmap_options )

def describe_more( df ):
    """Summarise each column of ``df`` by cardinality and dtype.

    Parameters
    ----------
    df : DataFrame to describe.

    Returns
    -------
    DataFrame with one row per column of ``df`` and three columns:
    'Variable' (the column label), 'Levels' (number of distinct non-null
    values) and 'Datatype' (the column dtype), sorted by 'Levels' ascending.
    """
    names = list( df )
    levels = pd.DataFrame( {
        'Variable' : names ,
        # Series.nunique() ignores NaN by default, exactly like the length of
        # value_counts(), but avoids the deprecated top-level pd.value_counts
        # and does not build the full counts table just to measure it.
        'Levels' : [ df[ name ].nunique() for name in names ] ,
        'Datatype' : [ df[ name ].dtypes for name in names ]
    } )
    return levels.sort_values( by = 'Levels' )

def plot_variable_importance( X , y ):
    """Fit a decision tree on ``(X, y)`` and plot its feature importances.

    The tree is created with ``random_state = 99`` and handed to
    ``plot_model_var_imp`` together with the same ``X`` and ``y``.

    NOTE(review): ``DecisionTreeClassifier`` is not imported in the notebook's
    setup cell -- presumably ``from sklearn.tree import DecisionTreeClassifier``
    is needed before this helper can run; confirm and add it to the imports.
    """
    classifier = DecisionTreeClassifier( random_state = 99 )
    classifier.fit( X , y )
    plot_model_var_imp( classifier , X , y )
    
def plot_model_var_imp( model , X , y ):
    """Plot the ten most important features of a fitted model and print its score.

    Parameters
    ----------
    model : fitted estimator exposing ``feature_importances_`` and ``score``.
    X : DataFrame of features; its columns label the importances.
    y : target values passed to ``model.score``.

    Draws a horizontal bar chart and prints ``model.score( X , y )``.
    """
    imp = pd.DataFrame(
        model.feature_importances_  ,
        columns = [ 'Importance' ] ,
        index = X.columns
    )
    imp = imp.sort_values( [ 'Importance' ] , ascending = True )
    # After an ascending sort the largest importances are at the END of the
    # frame, so keep the tail; slicing [:10] selected the ten least important
    # features whenever X had more than ten columns.
    imp.tail( 10 ).plot( kind = 'barh' )
    print (model.score( X , y ))

In [5]:
# Keep every July order except those with driver_id == 2.
# NOTE(review): what driver_id 2 stands for is not visible in this notebook
# (presumably a placeholder value) -- confirm against the data description.
train_jul_exc = train_jul[train_jul['driver_id'] != 2].copy()

# One group per (date, hour, origin, destination); built once and reused below
# instead of repeating the same groupby for every statistic.
jul_od_keys = ['create_date','create_hour','start_geo_id','end_geo_id']
jul_od_groups = train_jul_exc.groupby(jul_od_keys)

# Number of orders in each group. size() + reset_index(name=...) replaces the
# dict-renaming form .agg({'demand_count':'count'}), which pandas >= 1.0 rejects
# with a SpecificationError.
coord_jul = jul_od_groups.size().reset_index(name='demand_count')

# Per-group means of the estimate columns, joined back on the group keys rather
# than relying on two separate groupby results sharing the same row order.
# Median / max / min variants can be added here with .agg([...]) if needed.
jul_estimate_means = (
    jul_od_groups[['estimate_money','estimate_distance','estimate_term']]
    .mean()
    .rename(columns={
        'estimate_money':'estimate_money_mean',
        'estimate_distance':'estimate_distance_mean',
        'estimate_term':'estimate_term_mean',
    })
    .reset_index()
)
coord_jul = pd.merge(coord_jul,jul_estimate_means,on=jul_od_keys,how='left')

# Despite its name, this file stores the aggregated table, not the filtered orders.
coord_jul.to_csv('train_jul_exc.csv',index=False)
coord_jul

Unnamed: 0,create_date,create_hour,start_geo_id,end_geo_id,demand_count,estimate_money_mean,estimate_distance_mean,estimate_term_mean
0,2017-07-01,0,0,10,1,101.000,14431.0,24.0
1,2017-07-01,0,0,21,1,39.000,4905.0,9.0
2,2017-07-01,0,1,1,2,28.500,2437.0,4.5
3,2017-07-01,0,1,15,2,67.000,10955.0,21.5
4,2017-07-01,0,1,37,1,63.000,10010.0,20.0
5,2017-07-01,0,1,39,2,67.110,11493.0,17.5
6,2017-07-01,0,1,89,1,58.000,8905.0,17.0
7,2017-07-01,0,1,95,1,90.000,15677.0,31.0
8,2017-07-01,0,1,160,2,46.520,6347.0,12.0
9,2017-07-01,0,1,303,1,66.000,11493.0,17.0


In [6]:
# Display the test table that was read from test_path.
test

Unnamed: 0,test_id,create_date,create_hour,start_geo_id_new,end_geo_id_new
0,0,2017-08-01,21,24,27
1,1,2017-08-01,21,24,9
2,2,2017-08-01,21,24,22
3,3,2017-08-01,17,24,9
4,4,2017-08-01,17,55,55
5,5,2017-08-01,17,110,88
6,6,2017-08-01,9,23,21
7,7,2017-08-01,21,16,50
8,8,2017-08-01,13,6,24
9,9,2017-08-01,21,6,22


In [9]:
# Display the August demand table that was read from train_aug_demand_path.
train_aug_demand

Unnamed: 0,create_date,create_hour,start_geo_id,end_geo_id,demand_count
0,2017-08-01,0,1,1,1
1,2017-08-01,0,1,9,1
2,2017-08-01,0,1,41,1
3,2017-08-01,0,1,73,1
4,2017-08-01,0,1,101,1
5,2017-08-01,0,1,191,1
6,2017-08-01,0,1,198,1
7,2017-08-01,0,3,56,1
8,2017-08-01,0,5,1,1
9,2017-08-01,0,5,5,3


In [10]:
# Count test rows per create_hour; these counts motivate the per-hour sample
# sizes used for the validation split further down.
test['create_hour'].value_counts()

22    443
20    411
21    383
17    346
13    334
9     313
18    261
15    254
8     246
16    242
11    240
7     234
19    225
10    211
14    201
12    180
23    147
6      76
0      70
1      61
5      47
3      30
2      25
4      20
Name: create_hour, dtype: int64

### Validation set split

In [16]:
# Keep every August order except those with driver_id == 2.
# NOTE(review): the meaning of driver_id 2 is not visible here -- confirm.
train_aug_exc = train_aug[train_aug['driver_id'] != 2].copy()

# Number of orders per (date, hour, origin, destination). size() +
# reset_index(name=...) replaces the dict-renaming .agg({'demand_count':'count'}),
# which pandas >= 1.0 rejects with a SpecificationError.
coord_aug = (
    train_aug_exc
    .groupby(['create_date','create_hour','start_geo_id','end_geo_id'])
    .size()
    .reset_index(name='demand_count')
)

# Despite its name, this file stores the aggregated table, not the filtered orders.
coord_aug.to_csv('train_aug_exc.csv',index=False)
coord_aug

Unnamed: 0,create_date,create_hour,start_geo_id,end_geo_id,demand_count
0,2017-08-01,0,1,1,1
1,2017-08-01,0,1,9,1
2,2017-08-01,0,1,41,1
3,2017-08-01,0,1,73,1
4,2017-08-01,0,1,101,1
5,2017-08-01,0,1,191,1
6,2017-08-01,0,1,198,1
7,2017-08-01,0,3,56,1
8,2017-08-01,0,5,1,1
9,2017-08-01,0,5,5,3


In [30]:
# Validation rows to draw per create_hour. Each key encodes the sample size
# ('l_<n>') and maps to the hours that receive that many rows; hour 22 (450 rows)
# is added separately below.
dict_list = {
'l_50' : [0,1,2,3,4,5],
'l_100' : [6],
'l_150' : [12,23],
'l_200' : [10,14,19],
'l_250' : [7,8,11,15,16,18],
'l_300' : [9,13],
'l_350' : [17],
'l_400' : [20,21]
# 'l_450' : [22]
}

# Fixed seed so the validation split is identical on every Restart & Run All.
VALID_SEED = 2017
valid_rng = np.random.RandomState(VALID_SEED)

# Flatten the specification into {hour: number of rows to sample}.
hour_quota = {22: 450}
for key, hours in dict_list.items():
    quota = int(key.split('_')[1])
    for hour in hours:
        hour_quota[hour] = quota

# Sample each hour WITHOUT replacement (np.random.choice defaulted to
# replace=True, which put duplicated rows into the validation set), capped at
# the rows available. DataFrame.sample selects rows directly, so index labels
# are no longer passed to the positional .iloc indexer. Hours are visited in
# sorted order so the seeded draws do not depend on dict ordering.
valid_parts = []
for hour, quota in sorted(hour_quota.items()):
    hour_rows = coord_aug[coord_aug['create_hour'] == hour]
    valid_parts.append(
        hour_rows.sample(n=min(quota, len(hour_rows)), random_state=valid_rng)
    )

# Concatenate once instead of growing the frame inside the loop.
valid = pd.concat(valid_parts,axis=0).sort_values(['create_date','create_hour'])
valid

Unnamed: 0,create_date,create_hour,start_geo_id,end_geo_id,demand_count
150,2017-08-01,0,31,31,1
80,2017-08-01,0,21,21,2
35,2017-08-01,0,9,49,1
102,2017-08-01,0,22,118,1
166,2017-08-01,0,39,90,3
9,2017-08-01,0,5,5,3
43,2017-08-01,0,13,27,2
126,2017-08-01,0,24,105,1
62,2017-08-01,0,17,47,1
223,2017-08-01,0,70,25,1


In [32]:
# List the distinct create_hour values present in the sampled validation set.
valid['create_hour'].unique()

array([ 0,  2,  4,  6,  8, 10, 12, 14, 16, 18, 20, 22,  1,  3,  5,  7,  9,
       11, 13, 15, 17, 19, 21, 23], dtype=int64)

## Feature Extraction
### 1. Order attributes

In [89]:
# Work on copies so that feature engineering never touches the aggregated
# July table (coord_jul) or the sampled validation rows (valid).
train_tr, valid_tr = coord_jul.copy(), valid.copy()
train_tr

Unnamed: 0,create_date,create_hour,start_geo_id,end_geo_id,demand_count,estimate_money_mean,estimate_distance_mean,estimate_term_mean
0,2017-07-01,0,0,10,1,101.000,14431.0,24.0
1,2017-07-01,0,0,21,1,39.000,4905.0,9.0
2,2017-07-01,0,1,1,2,28.500,2437.0,4.5
3,2017-07-01,0,1,15,2,67.000,10955.0,21.5
4,2017-07-01,0,1,37,1,63.000,10010.0,20.0
5,2017-07-01,0,1,39,2,67.110,11493.0,17.5
6,2017-07-01,0,1,89,1,58.000,8905.0,17.0
7,2017-07-01,0,1,95,1,90.000,15677.0,31.0
8,2017-07-01,0,1,160,2,46.520,6347.0,12.0
9,2017-07-01,0,1,303,1,66.000,11493.0,17.0


In [90]:
# time attribution
# time attribution
# Parse the holiday calendar's date column once so it can be joined on datetimes.
holiday['create_date'] = pd.to_datetime(holiday['create_date'])

def _with_calendar_features(frame):
    """Parse ``create_date`` in place, left-join ``holiday`` and add ``dayOfWeek``.

    Returns the merged frame; ``dayOfWeek`` is taken from ``.dt.dayofweek``.
    """
    frame['create_date'] = pd.to_datetime(frame['create_date'])
    enriched = pd.merge(frame,holiday,on='create_date',how='left')
    enriched['dayOfWeek'] = enriched['create_date'].dt.dayofweek
    return enriched

train_tr = _with_calendar_features(train_tr)
valid_tr = _with_calendar_features(valid_tr)
train_tr

Unnamed: 0,create_date,create_hour,start_geo_id,end_geo_id,demand_count,estimate_money_mean,estimate_distance_mean,estimate_term_mean,holiday,dayOfWeek
0,2017-07-01,0,0,10,1,101.000,14431.0,24.0,1,5
1,2017-07-01,0,0,21,1,39.000,4905.0,9.0,1,5
2,2017-07-01,0,1,1,2,28.500,2437.0,4.5,1,5
3,2017-07-01,0,1,15,2,67.000,10955.0,21.5,1,5
4,2017-07-01,0,1,37,1,63.000,10010.0,20.0,1,5
5,2017-07-01,0,1,39,2,67.110,11493.0,17.5,1,5
6,2017-07-01,0,1,89,1,58.000,8905.0,17.0,1,5
7,2017-07-01,0,1,95,1,90.000,15677.0,31.0,1,5
8,2017-07-01,0,1,160,2,46.520,6347.0,12.0,1,5
9,2017-07-01,0,1,303,1,66.000,11493.0,17.0,1,5


In [91]:
# Average order attributes per origin/destination pair, learned from the
# training frame only and then attached to both train and validation rows.
# groupby(...)[col].mean().reset_index(name=...) replaces the dict-renaming
# .agg({'new_name':'mean'}), which pandas >= 1.0 rejects with a SpecificationError.
od_keys = ['start_geo_id','end_geo_id']
hour_od_keys = ['create_hour'] + od_keys
day_od_keys = ['dayOfWeek'] + od_keys

coord_money_hour = train_tr.groupby(hour_od_keys)['estimate_money_mean'].mean().reset_index(name='money_hour_mean')
coord_money_day = train_tr.groupby(day_od_keys)['estimate_money_mean'].mean().reset_index(name='money_day_mean')
# NOTE: despite the '_hour_' in their names, the distance and term features are
# averaged per (origin, destination) only -- create_hour is not a grouping key.
coord_dis_hour = train_tr.groupby(od_keys)['estimate_distance_mean'].mean().reset_index(name='dis_hour_mean')
coord_term_hour = train_tr.groupby(od_keys)['estimate_term_mean'].mean().reset_index(name='term_hour_mean')

# All tables are computed before any merge; each is then left-joined on its keys.
# Validation rows whose key combination never occurs in training get NaN.
order_feature_tables = [
    (hour_od_keys, coord_money_hour),
    (day_od_keys, coord_money_day),
    (od_keys, coord_dis_hour),
    (od_keys, coord_term_hour),
]
for feature_keys, feature_table in order_feature_tables:
    train_tr = pd.merge(train_tr,feature_table,on=feature_keys,how='left')
    valid_tr = pd.merge(valid_tr,feature_table,on=feature_keys,how='left')
valid_tr

Unnamed: 0,create_date,create_hour,start_geo_id,end_geo_id,demand_count,holiday,dayOfWeek,money_hour_mean,money_day_mean,dis_hour_mean,term_hour_mean
0,2017-08-01,0,31,31,1,0,1,36.315000,29.154909,2379.361605,4.188930
1,2017-08-01,0,21,21,2,0,1,29.016364,28.290117,2388.217566,4.242661
2,2017-08-01,0,9,49,1,0,1,148.000000,99.000000,21345.764706,36.500000
3,2017-08-01,0,22,118,1,0,1,,97.000000,19857.100000,31.400000
4,2017-08-01,0,39,90,3,0,1,81.384167,98.046667,14912.451977,25.911017
5,2017-08-01,0,5,5,3,0,1,30.427917,28.097436,3839.464410,6.427083
6,2017-08-01,0,13,27,2,0,1,55.440000,47.998287,6068.234640,11.502177
7,2017-08-01,0,24,105,1,0,1,,,18498.900000,33.500000
8,2017-08-01,0,17,47,1,0,1,,109.277778,22979.020609,35.728495
9,2017-08-01,0,70,25,1,0,1,46.505000,51.310000,7198.996337,12.928571


### 2. Location
#### 2.1 Pure location

In [92]:
# Two views of the cluster lookup: one keyed by the origin id and one keyed by
# the destination id, so each order receives a start_cluster and an end_cluster.
loc_start = loc_cls.rename(columns={'location_id':'start_geo_id','cluster':'start_cluster'})
loc_end = loc_cls.rename(columns={'location_id':'end_geo_id','cluster':'end_cluster'})

train_tr = (
    train_tr
    .merge(loc_start,on='start_geo_id',how='left')
    .merge(loc_end,on='end_geo_id',how='left')
)
valid_tr = (
    valid_tr
    .merge(loc_start,on='start_geo_id',how='left')
    .merge(loc_end,on='end_geo_id',how='left')
)
valid_tr

Unnamed: 0,create_date,create_hour,start_geo_id,end_geo_id,demand_count,holiday,dayOfWeek,money_hour_mean,money_day_mean,dis_hour_mean,term_hour_mean,start_cluster,end_cluster
0,2017-08-01,0,31,31,1,0,1,36.315000,29.154909,2379.361605,4.188930,5,5.0
1,2017-08-01,0,21,21,2,0,1,29.016364,28.290117,2388.217566,4.242661,1,1.0
2,2017-08-01,0,9,49,1,0,1,148.000000,99.000000,21345.764706,36.500000,5,5.0
3,2017-08-01,0,22,118,1,0,1,,97.000000,19857.100000,31.400000,5,2.0
4,2017-08-01,0,39,90,3,0,1,81.384167,98.046667,14912.451977,25.911017,1,5.0
5,2017-08-01,0,5,5,3,0,1,30.427917,28.097436,3839.464410,6.427083,4,4.0
6,2017-08-01,0,13,27,2,0,1,55.440000,47.998287,6068.234640,11.502177,1,5.0
7,2017-08-01,0,24,105,1,0,1,,,18498.900000,33.500000,1,2.0
8,2017-08-01,0,17,47,1,0,1,,109.277778,22979.020609,35.728495,5,2.0
9,2017-08-01,0,70,25,1,0,1,46.505000,51.310000,7198.996337,12.928571,5,0.0


In [93]:
# Demand statistics per (origin, destination) pair, computed on the training
# frame and attached to both train and validation rows.
# <stat>().reset_index(name=...) replaces the dict-renaming .agg({'name':'stat'}),
# which pandas >= 1.0 rejects with a SpecificationError.
od_keys = ['start_geo_id','end_geo_id']
od_demand = train_tr.groupby(od_keys)['demand_count']

coord_se_mean = od_demand.mean().reset_index(name='sloc_eloc_mean')
coord_se_median = od_demand.median().reset_index(name='sloc_eloc_median')
coord_se_max = od_demand.max().reset_index(name='sloc_eloc_max')
coord_se_min = od_demand.min().reset_index(name='sloc_eloc_min')

# Left joins: pairs that never occur in training get NaN in the validation frame.
for od_stat_table in (coord_se_mean, coord_se_median, coord_se_max, coord_se_min):
    train_tr = pd.merge(train_tr,od_stat_table,on=od_keys,how='left')
    valid_tr = pd.merge(valid_tr,od_stat_table,on=od_keys,how='left')
valid_tr

Unnamed: 0,create_date,create_hour,start_geo_id,end_geo_id,demand_count,holiday,dayOfWeek,money_hour_mean,money_day_mean,dis_hour_mean,term_hour_mean,start_cluster,end_cluster,sloc_eloc_mean,sloc_eloc_median,sloc_eloc_max,sloc_eloc_min
0,2017-08-01,0,31,31,1,0,1,36.315000,29.154909,2379.361605,4.188930,5,5.0,1.590551,1.0,7.0,1.0
1,2017-08-01,0,21,21,2,0,1,29.016364,28.290117,2388.217566,4.242661,1,1.0,2.573248,2.0,10.0,1.0
2,2017-08-01,0,9,49,1,0,1,148.000000,99.000000,21345.764706,36.500000,5,5.0,1.117647,1.0,2.0,1.0
3,2017-08-01,0,22,118,1,0,1,,97.000000,19857.100000,31.400000,5,2.0,1.000000,1.0,1.0,1.0
4,2017-08-01,0,39,90,3,0,1,81.384167,98.046667,14912.451977,25.911017,1,5.0,1.161017,1.0,3.0,1.0
5,2017-08-01,0,5,5,3,0,1,30.427917,28.097436,3839.464410,6.427083,4,4.0,1.270833,1.0,4.0,1.0
6,2017-08-01,0,13,27,2,0,1,55.440000,47.998287,6068.234640,11.502177,1,5.0,1.501946,1.0,7.0,1.0
7,2017-08-01,0,24,105,1,0,1,,,18498.900000,33.500000,1,2.0,1.100000,1.0,2.0,1.0
8,2017-08-01,0,17,47,1,0,1,,109.277778,22979.020609,35.728495,5,2.0,1.215054,1.0,4.0,1.0
9,2017-08-01,0,70,25,1,0,1,46.505000,51.310000,7198.996337,12.928571,5,0.0,1.131868,1.0,3.0,1.0


In [94]:
# Demand statistics per (start_cluster, end_cluster) pair, computed on the
# training frame and attached to both train and validation rows.
# NOTE: the coord_se_* names are also assigned by the location-pair statistics
# earlier in the notebook; after this cell they hold the cluster-level tables.
# <stat>().reset_index(name=...) replaces the dict-renaming .agg({'name':'stat'}),
# which pandas >= 1.0 rejects with a SpecificationError.
cluster_keys = ['start_cluster','end_cluster']
cluster_demand = train_tr.groupby(cluster_keys)['demand_count']

coord_se_mean = cluster_demand.mean().reset_index(name='sloccl_eloccl_mean')
coord_se_median = cluster_demand.median().reset_index(name='sloccl_eloccl_median')
coord_se_max = cluster_demand.max().reset_index(name='sloccl_eloccl_max')
coord_se_min = cluster_demand.min().reset_index(name='sloccl_eloccl_min')

for cluster_stat_table in (coord_se_mean, coord_se_median, coord_se_max, coord_se_min):
    train_tr = pd.merge(train_tr,cluster_stat_table,on=cluster_keys,how='left')
    valid_tr = pd.merge(valid_tr,cluster_stat_table,on=cluster_keys,how='left')
valid_tr

Unnamed: 0,create_date,create_hour,start_geo_id,end_geo_id,demand_count,holiday,dayOfWeek,money_hour_mean,money_day_mean,dis_hour_mean,...,start_cluster,end_cluster,sloc_eloc_mean,sloc_eloc_median,sloc_eloc_max,sloc_eloc_min,sloccl_eloccl_mean,sloccl_eloccl_median,sloccl_eloccl_max,sloccl_eloccl_min
0,2017-08-01,0,31,31,1,0,1,36.315000,29.154909,2379.361605,...,5,5.0,1.590551,1.0,7.0,1.0,1.419580,1.0,21.0,1.0
1,2017-08-01,0,21,21,2,0,1,29.016364,28.290117,2388.217566,...,1,1.0,2.573248,2.0,10.0,1.0,1.682335,1.0,24.0,1.0
2,2017-08-01,0,9,49,1,0,1,148.000000,99.000000,21345.764706,...,5,5.0,1.117647,1.0,2.0,1.0,1.419580,1.0,21.0,1.0
3,2017-08-01,0,22,118,1,0,1,,97.000000,19857.100000,...,5,2.0,1.000000,1.0,1.0,1.0,1.126785,1.0,9.0,1.0
4,2017-08-01,0,39,90,3,0,1,81.384167,98.046667,14912.451977,...,1,5.0,1.161017,1.0,3.0,1.0,1.492898,1.0,23.0,1.0
5,2017-08-01,0,5,5,3,0,1,30.427917,28.097436,3839.464410,...,4,4.0,1.270833,1.0,4.0,1.0,1.119451,1.0,5.0,1.0
6,2017-08-01,0,13,27,2,0,1,55.440000,47.998287,6068.234640,...,1,5.0,1.501946,1.0,7.0,1.0,1.492898,1.0,23.0,1.0
7,2017-08-01,0,24,105,1,0,1,,,18498.900000,...,1,2.0,1.100000,1.0,2.0,1.0,1.133389,1.0,6.0,1.0
8,2017-08-01,0,17,47,1,0,1,,109.277778,22979.020609,...,5,2.0,1.215054,1.0,4.0,1.0,1.126785,1.0,9.0,1.0
9,2017-08-01,0,70,25,1,0,1,46.505000,51.310000,7198.996337,...,5,0.0,1.131868,1.0,3.0,1.0,1.187149,1.0,12.0,1.0


In [103]:
# Share of each (origin, destination) pair in the total training demand of its
# origin (se_start_ratio) and of its destination (se_end_ratio).
# sum().reset_index(name=...) replaces the dict-renaming .agg({'name':'sum'}),
# which pandas >= 1.0 rejects with a SpecificationError.
od_keys = ['start_geo_id','end_geo_id']
coord_se_sum = train_tr.groupby(od_keys)['demand_count'].sum().reset_index(name='sloc_eloc_sum')
coord_s_sum = coord_se_sum.groupby('start_geo_id')['sloc_eloc_sum'].sum().reset_index(name='sloc_sum')
coord_e_sum = coord_se_sum.groupby('end_geo_id')['sloc_eloc_sum'].sum().reset_index(name='eloc_sum')
coord_se_sum = pd.merge(coord_se_sum,coord_s_sum,on='start_geo_id',how='left')
coord_se_sum = pd.merge(coord_se_sum,coord_e_sum,on='end_geo_id',how='left')

# The 1.0 factor forces float division even for integer counts.
coord_se_sum['se_start_ratio'] = coord_se_sum['sloc_eloc_sum'] / (1.0 * coord_se_sum['sloc_sum'])
coord_se_sum['se_end_ratio'] = coord_se_sum['sloc_eloc_sum'] / (1.0 * coord_se_sum['eloc_sum'])

# Only the two ratios (plus the join keys) are carried into the feature frames.
coord_se_sum = coord_se_sum.drop(['sloc_eloc_sum','sloc_sum','eloc_sum'],axis=1)
train_tr = pd.merge(train_tr,coord_se_sum,on=od_keys,how='left')
valid_tr = pd.merge(valid_tr,coord_se_sum,on=od_keys,how='left')
valid_tr

Unnamed: 0,create_date,create_hour,start_geo_id,end_geo_id,demand_count,holiday,dayOfWeek,money_hour_mean,money_day_mean,dis_hour_mean,...,sloc_eloc_mean,sloc_eloc_median,sloc_eloc_max,sloc_eloc_min,sloccl_eloccl_mean,sloccl_eloccl_median,sloccl_eloccl_max,sloccl_eloccl_min,se_start_ratio,se_end_ratio
0,2017-08-01,0,31,31,1,0,1,36.315000,29.154909,2379.361605,...,1.590551,1.0,7.0,1.0,1.419580,1.0,21.0,1.0,0.047418,0.073056
1,2017-08-01,0,21,21,2,0,1,29.016364,28.290117,2388.217566,...,2.573248,2.0,10.0,1.0,1.682335,1.0,24.0,1.0,0.108098,0.108166
2,2017-08-01,0,9,49,1,0,1,148.000000,99.000000,21345.764706,...,1.117647,1.0,2.0,1.0,1.419580,1.0,21.0,1.0,0.001107,0.008482
3,2017-08-01,0,22,118,1,0,1,,97.000000,19857.100000,...,1.000000,1.0,1.0,1.0,1.126785,1.0,9.0,1.0,0.000518,0.006050
4,2017-08-01,0,39,90,3,0,1,81.384167,98.046667,14912.451977,...,1.161017,1.0,3.0,1.0,1.492898,1.0,23.0,1.0,0.008194,0.017852
5,2017-08-01,0,5,5,3,0,1,30.427917,28.097436,3839.464410,...,1.270833,1.0,4.0,1.0,1.119451,1.0,5.0,1.0,0.030194,0.040687
6,2017-08-01,0,13,27,2,0,1,55.440000,47.998287,6068.234640,...,1.501946,1.0,7.0,1.0,1.492898,1.0,23.0,1.0,0.045497,0.021254
7,2017-08-01,0,24,105,1,0,1,,,18498.900000,...,1.100000,1.0,2.0,1.0,1.133389,1.0,6.0,1.0,0.000296,0.024123
8,2017-08-01,0,17,47,1,0,1,,109.277778,22979.020609,...,1.215054,1.0,4.0,1.0,1.126785,1.0,9.0,1.0,0.012011,0.019004
9,2017-08-01,0,70,25,1,0,1,46.505000,51.310000,7198.996337,...,1.131868,1.0,3.0,1.0,1.187149,1.0,12.0,1.0,0.012846,0.019660


In [105]:
# Share of each (start_cluster, end_cluster) pair in the total training demand
# of its start cluster (se_start_cls_ratio) and end cluster (se_end_cls_ratio).
# NOTE: coord_se_sum / coord_s_sum / coord_e_sum are also assigned by the
# location-level ratio cell; after this cell they hold the cluster-level tables.
# sum().reset_index(name=...) replaces the dict-renaming .agg({'name':'sum'}),
# which pandas >= 1.0 rejects with a SpecificationError.
cluster_keys = ['start_cluster','end_cluster']
coord_se_sum = train_tr.groupby(cluster_keys)['demand_count'].sum().reset_index(name='sloc_eloc_sum')
coord_s_sum = coord_se_sum.groupby('start_cluster')['sloc_eloc_sum'].sum().reset_index(name='sloc_sum')
coord_e_sum = coord_se_sum.groupby('end_cluster')['sloc_eloc_sum'].sum().reset_index(name='eloc_sum')
coord_se_sum = pd.merge(coord_se_sum,coord_s_sum,on='start_cluster',how='left')
coord_se_sum = pd.merge(coord_se_sum,coord_e_sum,on='end_cluster',how='left')

# The 1.0 factor forces float division even for integer counts.
coord_se_sum['se_start_cls_ratio'] = coord_se_sum['sloc_eloc_sum'] / (1.0 * coord_se_sum['sloc_sum'])
coord_se_sum['se_end_cls_ratio'] = coord_se_sum['sloc_eloc_sum'] / (1.0 * coord_se_sum['eloc_sum'])

# Only the two ratios (plus the join keys) are carried into the feature frames.
coord_se_sum = coord_se_sum.drop(['sloc_eloc_sum','sloc_sum','eloc_sum'],axis=1)
train_tr = pd.merge(train_tr,coord_se_sum,on=cluster_keys,how='left')
valid_tr = pd.merge(valid_tr,coord_se_sum,on=cluster_keys,how='left')
valid_tr

Unnamed: 0,create_date,create_hour,start_geo_id,end_geo_id,demand_count,holiday,dayOfWeek,money_hour_mean,money_day_mean,dis_hour_mean,...,sloc_eloc_max,sloc_eloc_min,sloccl_eloccl_mean,sloccl_eloccl_median,sloccl_eloccl_max,sloccl_eloccl_min,se_start_ratio,se_end_ratio,se_start_cls_ratio,se_end_cls_ratio
0,2017-08-01,0,31,31,1,0,1,36.315000,29.154909,2379.361605,...,7.0,1.0,1.419580,1.0,21.0,1.0,0.047418,0.073056,0.578574,0.583226
1,2017-08-01,0,21,21,2,0,1,29.016364,28.290117,2388.217566,...,10.0,1.0,1.682335,1.0,24.0,1.0,0.108098,0.108166,0.317937,0.332440
2,2017-08-01,0,9,49,1,0,1,148.000000,99.000000,21345.764706,...,2.0,1.0,1.419580,1.0,21.0,1.0,0.001107,0.008482,0.578574,0.583226
3,2017-08-01,0,22,118,1,0,1,,97.000000,19857.100000,...,1.0,1.0,1.126785,1.0,9.0,1.0,0.000518,0.006050,0.040975,0.480834
4,2017-08-01,0,39,90,3,0,1,81.384167,98.046667,14912.451977,...,3.0,1.0,1.492898,1.0,23.0,1.0,0.008194,0.017852,0.533584,0.276288
5,2017-08-01,0,5,5,3,0,1,30.427917,28.097436,3839.464410,...,4.0,1.0,1.119451,1.0,5.0,1.0,0.030194,0.040687,0.154930,0.121025
6,2017-08-01,0,13,27,2,0,1,55.440000,47.998287,6068.234640,...,7.0,1.0,1.492898,1.0,23.0,1.0,0.045497,0.021254,0.533584,0.276288
7,2017-08-01,0,24,105,1,0,1,,,18498.900000,...,2.0,1.0,1.133389,1.0,6.0,1.0,0.000296,0.024123,0.039409,0.237545
8,2017-08-01,0,17,47,1,0,1,,109.277778,22979.020609,...,4.0,1.0,1.126785,1.0,9.0,1.0,0.012011,0.019004,0.040975,0.480834
9,2017-08-01,0,70,25,1,0,1,46.505000,51.310000,7198.996337,...,3.0,1.0,1.187149,1.0,12.0,1.0,0.012846,0.019660,0.054596,0.521747


### Time 

In [107]:
# Hours that share a bucket; the position of each tuple is the bucket id
# stored in 'hour_cls' (bucket 0 wraps around midnight: 22, 23, 0).
HOUR_BUCKETS = (
    (22, 23, 0),
    (1, 2, 3, 4, 5, 6),
    (7, 8, 9),
    (10, 11, 12),
    (13, 14, 15),
    (16, 17, 18),
    (19, 20, 21),
)
map_hour = {
    hour: bucket
    for bucket, bucket_hours in enumerate(HOUR_BUCKETS)
    for hour in bucket_hours
}

# Lookup through __getitem__ so an hour missing from map_hour raises KeyError.
train_tr['hour_cls'] = train_tr['create_hour'].map(map_hour.__getitem__)
valid_tr['hour_cls'] = valid_tr['create_hour'].map(map_hour.__getitem__)
valid_tr

Unnamed: 0,create_date,create_hour,start_geo_id,end_geo_id,demand_count,holiday,dayOfWeek,money_hour_mean,money_day_mean,dis_hour_mean,...,sloc_eloc_min,sloccl_eloccl_mean,sloccl_eloccl_median,sloccl_eloccl_max,sloccl_eloccl_min,se_start_ratio,se_end_ratio,se_start_cls_ratio,se_end_cls_ratio,hour_cls
0,2017-08-01,0,31,31,1,0,1,36.315000,29.154909,2379.361605,...,1.0,1.419580,1.0,21.0,1.0,0.047418,0.073056,0.578574,0.583226,0
1,2017-08-01,0,21,21,2,0,1,29.016364,28.290117,2388.217566,...,1.0,1.682335,1.0,24.0,1.0,0.108098,0.108166,0.317937,0.332440,0
2,2017-08-01,0,9,49,1,0,1,148.000000,99.000000,21345.764706,...,1.0,1.419580,1.0,21.0,1.0,0.001107,0.008482,0.578574,0.583226,0
3,2017-08-01,0,22,118,1,0,1,,97.000000,19857.100000,...,1.0,1.126785,1.0,9.0,1.0,0.000518,0.006050,0.040975,0.480834,0
4,2017-08-01,0,39,90,3,0,1,81.384167,98.046667,14912.451977,...,1.0,1.492898,1.0,23.0,1.0,0.008194,0.017852,0.533584,0.276288,0
5,2017-08-01,0,5,5,3,0,1,30.427917,28.097436,3839.464410,...,1.0,1.119451,1.0,5.0,1.0,0.030194,0.040687,0.154930,0.121025,0
6,2017-08-01,0,13,27,2,0,1,55.440000,47.998287,6068.234640,...,1.0,1.492898,1.0,23.0,1.0,0.045497,0.021254,0.533584,0.276288,0
7,2017-08-01,0,24,105,1,0,1,,,18498.900000,...,1.0,1.133389,1.0,6.0,1.0,0.000296,0.024123,0.039409,0.237545,0
8,2017-08-01,0,17,47,1,0,1,,109.277778,22979.020609,...,1.0,1.126785,1.0,9.0,1.0,0.012011,0.019004,0.040975,0.480834,0
9,2017-08-01,0,70,25,1,0,1,46.505000,51.310000,7198.996337,...,1.0,1.187149,1.0,12.0,1.0,0.012846,0.019660,0.054596,0.521747,0


In [113]:
# Spread of demand_count within each create_hour of the training frame.
# std().reset_index(name=...) replaces the dict-renaming .agg({'name':'std'}),
# which pandas >= 1.0 rejects with a SpecificationError.
# NOTE(review): the column is called 'demand_count_h_avg' but holds the standard
# deviation -- either the name or the statistic looks unintended; confirm which
# one downstream code expects before changing either.
coord_hour = train_tr.groupby('create_hour')['demand_count'].std().reset_index(name='demand_count_h_avg')
coord_hour

Unnamed: 0,create_hour,demand_count_h_avg
0,0,0.715362
1,1,0.542483
2,2,0.445038
3,3,0.479545
4,4,0.409011
5,5,0.455702
6,6,0.4859
7,7,0.857049
8,8,1.22335
9,9,1.12164
