In [2]:
import os
from dateutil.parser import parse as dateutil_parse
import time
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler as skStandardScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.cross_validation import train_test_split,cross_val_score
from six.moves import cPickle as pickle

# Base directory for this notebook's data files, and the sub-directory that
# holds the cleaned/cached inputs. Both end with '/' because later cells build
# file paths by plain string concatenation (e.g. CLEAN_PATH+'poi.csv').
PATH = 'season_1/'
CLEAN_PATH = PATH+'clean/'

from mylib import myStandardScaler,process_order,process_traffic,get_order_group,get_traffic_group,XY_order_traffic, prediction2submit, Search_Model, DISTRICTS

%matplotlib inline
%load_ext autoreload
%autoreload 2

In [3]:
# Load the cached inputs from CLEAN_PATH.

# CSV tables; the first column of each file becomes the index.
cluster_map, poi = (
    pd.read_csv(CLEAN_PATH + csv_name, index_col=0)
    for csv_name in ('cluster_map.csv', 'poi.csv')
)

# Grouped order/traffic objects. These are the same file names that the
# pickle.dump cell later in this notebook writes.
(train_order_group, test_order_group,
 train_traffic_group, test_traffic_group) = (
    pd.read_pickle(CLEAN_PATH + pickle_name)
    for pickle_name in ('train_order_group.pickle',
                        'test_order_group.pickle',
                        'train_traffic_group.pickle',
                        'test_traffic_group.pickle')
)

# Test target table; parse_dates=True asks pandas to parse the index as dates.
test_target = pd.read_csv(CLEAN_PATH + 'test_target.csv', index_col=0, parse_dates=True)

In [3]:
# Clean the raw order/traffic frames, build their grouped forms and the
# time-slot indexes used for feature construction, then report elapsed time.
#
# NOTE(review): train_order, test_order, train_traffic and test_traffic are not
# defined in any cell visible here -- this cell presumably relies on raw frames
# loaded elsewhere; confirm before running on a fresh kernel.
# NOTE(review): each raw frame is overwritten by its processed version, so
# re-running this cell feeds already-processed data back into process_*.
now = time.time()

train_order = process_order(train_order)
test_order = process_order(test_order)
train_traffic = process_traffic(train_traffic)
test_traffic = process_traffic(test_traffic)

train_order_group = get_order_group(train_order)
test_order_group = get_order_group(test_order)
train_traffic_group = get_traffic_group(train_traffic)
test_traffic_group = get_traffic_group(test_traffic)

# Sorted unique training slots, keeping only those whose last three digits
# exceed 4 (presumably the slot-of-day number -- TODO confirm why 4).
# A list comprehension is used instead of filter() so pd.Index always receives
# a concrete list (filter() is a lazy iterator on Python 3).
train_slot = pd.Index([
    slot for slot in sorted(train_order['datetimeslot'].unique())
    if slot % 1000 > 4
])
test_slot = test_target['datetimeslot']

stop = time.time()
# Split elapsed seconds into h:m:s. The minutes field must wrap at 60; the
# previous (stop-now)/60 printed total minutes once a run exceeded an hour.
hours, remainder = divmod(int(stop - now), 3600)
minutes, seconds = divmod(remainder, 60)
print('Take %02d:%02d:%02d' % (hours, minutes, seconds))

Take 00:12:57


In [4]:
# Build the per-district train/test entries with XY_order_traffic, then
# standardise element 0 of each entry with a scaler fitted on the training
# data only (the same fitted scaler is applied to the test data).
# Element 0 is what is later passed to model.predict, so it is presumably the
# feature matrix and element 1 the target -- TODO confirm against mylib.
now = time.time()

train_XY_group = {
    district: XY_order_traffic(district, train_order_group, train_traffic_group, train_slot)
    for district in DISTRICTS
}
test_XY_group = {
    district: XY_order_traffic(district, test_order_group, test_traffic_group, test_slot)
    for district in DISTRICTS
}

for district in DISTRICTS:
    # A fresh scaler per district, so statistics are not shared across districts.
    scaler = myStandardScaler()
    train_XY_group[district][0] = scaler.fit_transform(train_XY_group[district][0])
    test_XY_group[district][0] = scaler.transform(test_XY_group[district][0])

stop = time.time()
# Split elapsed seconds into h:m:s. The minutes field must wrap at 60; the
# previous (stop-now)/60 printed total minutes once a run exceeded an hour.
hours, remainder = divmod(int(stop - now), 3600)
minutes, seconds = divmod(remainder, 60)
print('Take %02d:%02d:%02d' % (hours, minutes, seconds))

Take 00:00:15


In [5]:
# Persist the four grouped datasets under CLEAN_PATH so later sessions can
# reload them instead of recomputing. Files are written in the order listed.
pickle_targets = (
    ('train_order_group.pickle', train_order_group),
    ('test_order_group.pickle', test_order_group),
    ('train_traffic_group.pickle', train_traffic_group),
    ('test_traffic_group.pickle', test_traffic_group),
)
for pickle_name, grouped in pickle_targets:
    with open(CLEAN_PATH + pickle_name, 'wb') as f:
        pickle.dump(grouped, f)

In [9]:
# Sanity check: read back the test traffic pickle written by the dump cell
# above and display it.
# NOTE(review): pickle.load can execute arbitrary code -- only load files that
# this notebook produced itself.
with open(CLEAN_PATH+'test_traffic_group.pickle','rb') as f:
    tmp = pickle.load(f)
# Bare last expression so the notebook renders the loaded object as cell output.
tmp

{'08232402614a9b48895cc3d0aeb0e9f2':               level_1  level_2  level_3  level_4
 datetimeslot                                    
 20160122043        69        9        0        4
 20160122044        52       14        7        8
 20160122045        55       14        6        2
 20160122055       163       14        8        5
 20160122056       168       19       14        2
 20160122057       122       39        7        1
 20160122067       121       10        5        3
 20160122068        87       23        1        4
 20160122069        81        2        1        0
 20160122079       111       12        1        5
 20160122080        98       27       11        4
 20160122081        93       27        7        2
 20160122091       125       19       14        1
 20160122092       107       28        2        5
 20160122093       126       22        8        5
 20160122103       118       28        5        0
 20160122104       153       57        4       10
 20160122105  

In [None]:
# Grid-search one RandomForestRegressor per district, predict on the test
# entries, and pickle each district's prediction as soon as it is available.
grid_params = {
    'n_estimators': [80],
    'max_depth': np.arange(10, 18),
    'min_samples_leaf': [2, 6],
    'min_samples_split': [2, 6],
    'max_features': ['log2', 'sqrt', None],
}
search_models = {district: Search_Model(RandomForestRegressor) for district in DISTRICTS}
test_prediction = dict()

# Create the output directory up front so a multi-minute search is not lost to
# a missing-directory error when the first prediction is written.
prediction_dir = CLEAN_PATH + 'prediction/'
if not os.path.isdir(prediction_dir):
    os.makedirs(prediction_dir)

for district, model in search_models.items():
    now = time.time()
    print('Searching %s...' % district)
    model.fit(grid_params, *train_XY_group[district])
    # NOTE(review): element 1 of the test entry (NaNs filled with 0) is
    # subtracted from the model output; confirm this offset matches how
    # XY_order_traffic constructs the training target.
    test_prediction[district] = model.predict(test_XY_group[district][0]) - test_XY_group[district][1].fillna(0)
    with open(prediction_dir + 'test_prediction_%s.pickle' % district, 'wb') as f:
        pickle.dump(test_prediction[district], f)
    stop = time.time()
    # Split elapsed seconds into h:m:s. The minutes field must wrap at 60; the
    # previous (stop-now)/60 printed total minutes once a run exceeded an hour.
    hours, remainder = divmod(int(stop - now), 3600)
    minutes, seconds = divmod(remainder, 60)
    print('Take %02d:%02d:%02d' % (hours, minutes, seconds))

Searching 38d5ad2d22b61109fd8e7b43cd0e8901...
Best Params: {'warm_start': False, 'oob_score': False, 'n_jobs': 1, 'verbose': 0, 'max_leaf_nodes': None, 'bootstrap': True, 'min_samples_leaf': 6, 'n_estimators': 80, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'criterion': 'mse', 'random_state': None, 'max_features': None, 'max_depth': 14}
Fit score: 0.941839950793
The metrics: 0.526459662404
Take 00:05:25
Searching 08f5b445ec6b29deba62e6fd8b0325a6...
Best Params: {'warm_start': False, 'oob_score': False, 'n_jobs': 1, 'verbose': 0, 'max_leaf_nodes': None, 'bootstrap': True, 'min_samples_leaf': 6, 'n_estimators': 80, 'min_samples_split': 6, 'min_weight_fraction_leaf': 0.0, 'criterion': 'mse', 'random_state': None, 'max_features': 'log2', 'max_depth': 16}
Fit score: 0.367889432829
The metrics: 0.13541965458
Take 00:03:08
Searching 364bf755f9b270f0f9141d1a61de43ee...
Best Params: {'warm_start': False, 'oob_score': False, 'n_jobs': 1, 'verbose': 0, 'max_leaf_nodes': None, 'bootst

In [None]:
# Convert the per-district predictions into the submission table and display
# it. (The displayed name was previously misspelled `sublit`, which raised
# NameError.)
submit = prediction2submit(test_prediction)
submit

In [None]:
# Write the submission table to disk. header=False suppresses the header row;
# index=None is passed unchanged (falsy, so presumably no index column is
# written -- NOTE(review): index=False is the documented spelling).
submit_path = PATH + 'submit/searchrf_order_traffic.csv'
submit.to_csv(submit_path, index=None, header=False)