In [1]:
import os
from dateutil.parser import parse as dateutil_parse
from six.moves import cPickle as pickle
import time
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler as skStandardScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.cross_validation import train_test_split,cross_val_score

# Root of the data directory. The trailing slash matters: the paths below are
# built by plain string concatenation.
PATH = '../season_1/'
# Directory the cleaned inputs (csv / pickle files) are read from.
CLEAN_PATH = PATH+'clean/'
# Directory the per-district test predictions are pickled to and read back from.
SEARCH_PATH = 'rf/'

from mylib import myStandardScaler,process_order,process_traffic,get_order_group,get_traffic_group,XY_order_traffic, prediction2submit, Search_Model, DISTRICTS

%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:
# Load the cleaned inputs from CLEAN_PATH.
# CSV tables: the first column of each file is used as the index.
cluster_map = pd.read_csv(os.path.join(CLEAN_PATH, 'cluster_map.csv'), index_col=0)
poi = pd.read_csv(os.path.join(CLEAN_PATH, 'poi.csv'), index_col=0)

# Pickled order / traffic groups for the train and test periods.
train_order_group = pd.read_pickle(os.path.join(CLEAN_PATH, 'train_order_group.pickle'))
test_order_group = pd.read_pickle(os.path.join(CLEAN_PATH, 'test_order_group.pickle'))
train_traffic_group = pd.read_pickle(os.path.join(CLEAN_PATH, 'train_traffic_group.pickle'))
test_traffic_group = pd.read_pickle(os.path.join(CLEAN_PATH, 'test_traffic_group.pickle'))

# Slots to predict; the index column is parsed as dates.
test_target = pd.read_csv(
    os.path.join(CLEAN_PATH, 'test_target.csv'),
    parse_dates=True,
    index_col=0,
)

In [3]:
# Collect the distinct slot ids from the index of the first order frame in the
# group, in ascending order.
first_order_frame = train_order_group.values()[0]
sorted_slot_ids = pd.Index(sorted(first_order_frame.index.unique()))

# Keep only ids whose last three digits are greater than 4.
# NOTE(review): presumably the last three digits are the slot-of-day number and
# the first slots are dropped because they lack enough history - confirm.
train_slot = pd.Index(list(slot_id for slot_id in sorted_slot_ids if slot_id % 1000 > 4))

# Slots to predict come straight from the target table.
test_slot = test_target['datetimeslot']

In [4]:
# Build the per-district data with XY_order_traffic for the train and test
# slots, then standardize element [0] of each result per district.
now = time.time()

train_XY_group = {
    district: XY_order_traffic(district, train_order_group, train_traffic_group, train_slot)
    for district in DISTRICTS
}
test_XY_group = {
    district: XY_order_traffic(district, test_order_group, test_traffic_group, test_slot)
    for district in DISTRICTS
}

for district in DISTRICTS:
    # One scaler per district: fitted on the train data only, then reused
    # unchanged on the test data.
    scaler = myStandardScaler()
    train_XY_group[district][0] = scaler.fit_transform(train_XY_group[district][0])
    test_XY_group[district][0] = scaler.transform(test_XY_group[district][0])

stop = time.time()
# divmod keeps the minutes field within 0-59; the previous (stop-now)/60 printed
# the *total* minutes, which is wrong for any run longer than one hour.
elapsed_hours, elapsed_rest = divmod(stop - now, 3600)
elapsed_minutes, elapsed_seconds = divmod(elapsed_rest, 60)
print('Take %02d:%02d:%02d' % (elapsed_hours, elapsed_minutes, elapsed_seconds))

Take 00:00:16


In [5]:
# Hyper-parameter search: one Search_Model(RandomForestRegressor) per district.
# Each district's test prediction is pickled to SEARCH_PATH as soon as it is
# ready. Districts whose pickle already exists are skipped, so an interrupted
# run resumes where it stopped; delete a district's pickle to force a new search.
all_now = time.time()

# Parameter grid handed to Search_Model.fit for every district.
grid_params = {'n_estimators': [150] ,'max_depth': np.arange(8, 21), 'min_samples_leaf': [2,4,6,8,10], 
                     'min_samples_split': [2, 4, 6, 8,10], 'max_features': ['log2', 'sqrt',None]}


# Much smaller alternative grid, kept for quick trial runs:
# grid_params = {'n_estimators': [1] ,'max_depth': np.arange(1,5), 'min_samples_leaf': np.arange(10,20), 
#                      'min_samples_split': np.arange(10,20), 'max_features': ['log2', 'sqrt',None]}

search_models = {district: Search_Model(RandomForestRegressor) for district in DISTRICTS}
test_prediction = dict()

# Make sure the output directory exists before it is inspected or written to.
if not os.path.isdir(SEARCH_PATH):
    os.makedirs(SEARCH_PATH)

# Decide per district from its own output file. The previous guard compared the
# raw number of directory entries with a hard-coded 66 inside the loop, so any
# stray entry could end the loop early and skip districts.
pending = [(district, model) for district, model in search_models.items()
           if not os.path.exists(SEARCH_PATH+'test_prediction_%s.pickle'%(district))]
if not pending:
    print('We can together! :)')

for district, model in pending:
    now = time.time()
    print('Searching %s...'%district)
    model.fit(grid_params,*train_XY_group[district])
    # NOTE(review): element [1] of the test pair (NaN replaced by 0) is subtracted
    # from the model output - presumably the target is stored relative to that
    # series; confirm against XY_order_traffic in mylib.
    test_prediction[district] = model.predict(test_XY_group[district][0]) - test_XY_group[district][1].fillna(0)
    with open(SEARCH_PATH+'test_prediction_%s.pickle'%(district),'wb') as f:
        pickle.dump(test_prediction[district],f)
    stop = time.time()
    # divmod keeps minutes within 0-59 (the old (stop-now)/60 was total minutes).
    elapsed_hours, elapsed_rest = divmod(stop - now, 3600)
    elapsed_minutes, elapsed_seconds = divmod(elapsed_rest, 60)
    print('Take %02d:%02d:%02d' % (elapsed_hours, elapsed_minutes, elapsed_seconds))

all_stop = time.time()
total_hours, total_rest = divmod(all_stop - all_now, 3600)
total_minutes, total_seconds = divmod(total_rest, 60)
print('Totally take %02d:%02d:%02d' % (total_hours, total_minutes, total_seconds))

Searching 38d5ad2d22b61109fd8e7b43cd0e8901...
Best Params: {'n_estimators': 150, 'min_samples_split': 10, 'max_depth': 11, 'max_features': None, 'min_samples_leaf': 6}
CV score: 0.827183252496
Fit (R2) score: 0.940936710761
The metrics: 0.566260754787
Take 00:15:00
Searching 08f5b445ec6b29deba62e6fd8b0325a6...
Best Params: {'n_estimators': 150, 'min_samples_split': 2, 'max_depth': 20, 'max_features': 'log2', 'min_samples_leaf': 8}
CV score: 0.140749113321
Fit (R2) score: 0.349412566368
The metrics: 0.135748144719
Take 00:08:58
Searching 364bf755f9b270f0f9141d1a61de43ee...
Best Params: {'n_estimators': 150, 'min_samples_split': 2, 'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 2}
CV score: 0.538026375951
Fit (R2) score: 0.730906719042
The metrics: 0.443226428503
Take 00:13:11
Searching 49ac89aa860c27e26c0836cb8dab2df2...
Best Params: {'n_estimators': 150, 'min_samples_split': 2, 'max_depth': 17, 'max_features': 'sqrt', 'min_samples_leaf': 6}
CV score: 0.136665764405
Fit (R

In [6]:
# Read back every per-district prediction pickle written by the search cell,
# keyed by district id.
pickle_path = SEARCH_PATH
prediction_prefix = 'test_prediction_'
prediction_suffix = '.pickle'

pickle_all = dict()
for pkl in os.listdir(pickle_path):
    # Only files named test_prediction_<district>.pickle are predictions; any
    # other entry (editor backups, checkpoint folders, ...) would make
    # pd.read_pickle fail or add a bogus key.
    if not (pkl.startswith(prediction_prefix) and pkl.endswith(prediction_suffix)):
        continue
    # Strip the fixed prefix/suffix so the whole district id is kept, even if it
    # contains '_' or '.'.
    district_id = pkl[len(prediction_prefix):-len(prediction_suffix)]
    pickle_all[district_id] = pd.read_pickle(pickle_path+pkl)
print('Have already pickled %d'%len(pickle_all))

Have already pickled 66


In [7]:
# Pass the per-district predictions and the cluster map to prediction2submit
# (imported from mylib). As the last expression of the cell, its return value is
# what gets displayed below.
# NOTE(review): whether the helper also writes a submission file cannot be seen
# from this notebook - check mylib.
prediction2submit(pickle_all,cluster_map)

Unnamed: 0_level_0,district,dts,prediction
datetimeslot,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
20160122046,1,2016-01-22-46,7.498345
20160122046,2,2016-01-22-46,0.443821
20160122046,3,2016-01-22-46,0.743585
20160122046,4,2016-01-22-46,16.385130
20160122046,5,2016-01-22-46,0.908673
20160122046,6,2016-01-22-46,10.244347
20160122046,7,2016-01-22-46,39.033285
20160122046,8,2016-01-22-46,270.705463
20160122046,9,2016-01-22-46,2.863952
20160122046,10,2016-01-22-46,2.310059
