In [1]:
# Import library
import pandas as pd
import numpy as np
import os, random, warnings, gc, psutil, datetime
from tqdm import tqdm_notebook, tqdm

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GroupKFold, StratifiedKFold
from sklearn.metrics import mean_squared_error
from math import sqrt

import lightgbm as lgbm
from catboost import CatBoostRegressor

from glob import glob
from IPython.display import display

import seaborn as sns
import matplotlib.pyplot as plt

# Set options
pd.set_option('max_columns',500)
pd.set_option('max_rows',500)
pd.options.display.max_colwidth = 300

warnings.filterwarnings('ignore')

%matplotlib inline
sns.set_palette('bright')

In [2]:
def seed_everything(seed=0):
    """Seed the global RNGs of `random` and NumPy and export PYTHONHASHSEED.

    seed: value passed to `random.seed` and `np.random.seed`; its string form
          is written to the PYTHONHASHSEED environment variable.
    """
    for apply_seed in (random.seed, np.random.seed):
        apply_seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)


# Seed once with the default value before any data is loaded.
seed_everything()

In [3]:
# Load the preprocessed train / test frames written by the feature-creation step.
train, test = (
    pd.read_pickle(pickle_path)
    for pickle_path in (
        '../Create_Features/preprocessed_train.pickle',
        '../Create_Features/preprocessed_test.pickle',
    )
)

In [4]:
# target_col names the column that is split off as the regression label.
# n_trn is presumably the number of training rows (not referenced by the
# cells visible in this notebook) -- TODO confirm.
n_trn, target_col = 415423, '18~20_ride'

In [5]:
# Preview the first rows of the preprocessed training frame (target + features).
train.head()

Unnamed: 0,18~20_ride,bus_route_id,in_out,latitude,longitude,station_code,dayofweek,weekend,ride_total,takeoff_total,ride_go_to_work,takeoff_go_to_work,dis_jejusi,dis_seoquipo,bus_route_id_station_code,bus_route_id_station_code_weekend,date_fq_enc,station_code_fq_enc,bus_route_id_fq_enc,bus_route_id_station_code_fq_enc,date_bus_route_id_fq_enc,date_station_code_fq_enc,date_bus_route_id_station_code_fq_enc,7~8_ride_date_mean,7~8_ride_date_bus_route_id_mean,8~9_ride_date_mean,8~9_ride_date_bus_route_id_mean,9~10_ride_date_mean,9~10_ride_date_bus_route_id_mean,station_sequence,station_reverse_sequence,weekday,is_national_holiday,getin_total,morning_getin,morning_takeoff,noon_getin,noon_takeoff,station_morning_getin_sum,station_morning_takeoff_sum,bus_route_getin_sum,bus_route_takeoff_sum,station_morning_getin_mean,station_morning_takeoff_mean,bus_route_getin_mean,bus_route_takeoff_mean,kmeans1,kmeans2,regular_commuter_count,afternoon_takeoff,next_bus_time_diff,getin_user_count1_morning,getin_user_count2_morning,takeoff_user_count1_noon,takeoff_user_count2_noon,hourly_rain,prev_daily_rain,hourly_cloud,latlong_second,total_population,man_population,woman_population,avg_time_diff,passengers_in,passengers_out,latitude_rank,longitude_rank
0,0.0,0,1,33.4899,126.49373,322,6,1,16.0,0.0,8.0,0.0,2.95492,26.256744,31053,53745,11538,46,1189,46,25,1,1,0.391576,1.12,0.49246,0.72,0.543855,1.52,1,25,6,0,16.0,3.0,0.0,13.0,0.0,3.0,0.0,50.0,8.0,3.0,0.0,2.0,0.32,15,9,9.0,0.0,3650.490741,7.0,1.0,,,0.2,0.0,88,2411,43217.0,21189.0,22028.0,3113.015748,15.23913,1.909091,404772.5,214964.5
1,5.0,0,1,33.48944,126.48508,335,6,1,22.0,0.0,10.0,0.0,3.720275,26.403025,31054,53747,11538,2303,1189,46,25,45,1,0.391576,1.12,0.49246,0.72,0.543855,1.52,2,24,6,0,22.0,9.0,0.0,13.0,0.0,183.0,116.0,50.0,8.0,4.066667,2.577778,2.0,0.32,15,9,466.0,0.0,3650.490741,197.0,,95.0,,0.2,0.0,88,2403,43217.0,21189.0,22028.0,1837.940774,594.0,428.0,396531.5,189710.0
2,2.0,0,1,33.48181,126.47352,408,6,1,4.0,0.0,3.0,0.0,5.036124,25.893305,31057,53753,11538,1154,1189,46,25,21,1,0.391576,1.12,0.49246,0.72,0.543855,1.52,3,23,6,0,4.0,2.0,0.0,2.0,0.0,71.0,11.0,50.0,8.0,3.380952,0.52381,2.0,0.32,15,45,164.0,0.0,3650.490741,76.0,,11.0,2.0,0.2,0.0,88,2347,56223.0,27761.0,28462.0,2448.248012,166.913043,28.0,342909.5,156930.5
3,53.0,0,0,33.50577,126.49252,1448,6,1,79.0,0.0,49.0,0.0,2.864166,27.997494,31020,53682,11538,49,1189,46,25,1,1,0.391576,1.12,0.49246,0.72,0.543855,1.52,4,22,6,0,79.0,23.0,0.0,56.0,0.0,23.0,0.0,50.0,8.0,23.0,0.0,2.0,0.32,4,3,2.0,0.0,3650.490741,49.0,,,,0.2,0.0,88,2980,15673.0,7904.0,7769.0,3961.540412,89.130435,3.6,518315.0,210660.0
4,0.0,0,0,33.25579,126.4126,1510,6,1,0.0,1.0,0.0,1.0,29.040353,13.574693,31022,53686,11538,386,1189,39,25,10,1,0.391576,1.12,0.49246,0.72,0.543855,1.52,5,21,6,0,0.0,0.0,0.0,0.0,1.0,1.0,5.0,50.0,8.0,0.1,0.5,2.0,0.32,85,136,3.0,0.0,3650.490741,4.0,,5.0,,0.2,0.0,88,562,4414.0,2252.0,2162.0,1058.012605,6.847826,27.456522,98750.5,93507.5


In [6]:
# Before modeling
# Separate each frame into a feature matrix (everything except the target)
# and the target column itself.
# `columns=` replaces the positional axis argument (`drop(target_col, 1)`),
# which pandas 2.0 no longer accepts.
train_set = train.drop(columns=target_col)
test_set = test.drop(columns=target_col)

train_label = train[target_col]
test_label = test[target_col]

In [7]:
# Upper bound on boosting iterations and number of CV folds.
n_rounds, n_splits = 100000, 5

# Keyword arguments forwarded to CatBoostRegressor.
# 'random_seed' is only a starting value: the training loop overwrites it
# with a freshly drawn seed on every seed iteration.
cat_params = dict(
    n_estimators=n_rounds,
    learning_rate=0.05,
    eval_metric='RMSE',
    loss_function='RMSE',
    random_seed=42,
    metric_period=500,
    od_wait=500,
    task_type='GPU',
    l2_leaf_reg=3,
    depth=8,
)

In [8]:
# Preview of `train` again; no statement between the earlier preview and this
# one assigns to or mutates `train`, so the output is the same.
train.head()

Unnamed: 0,18~20_ride,bus_route_id,in_out,latitude,longitude,station_code,dayofweek,weekend,ride_total,takeoff_total,ride_go_to_work,takeoff_go_to_work,dis_jejusi,dis_seoquipo,bus_route_id_station_code,bus_route_id_station_code_weekend,date_fq_enc,station_code_fq_enc,bus_route_id_fq_enc,bus_route_id_station_code_fq_enc,date_bus_route_id_fq_enc,date_station_code_fq_enc,date_bus_route_id_station_code_fq_enc,7~8_ride_date_mean,7~8_ride_date_bus_route_id_mean,8~9_ride_date_mean,8~9_ride_date_bus_route_id_mean,9~10_ride_date_mean,9~10_ride_date_bus_route_id_mean,station_sequence,station_reverse_sequence,weekday,is_national_holiday,getin_total,morning_getin,morning_takeoff,noon_getin,noon_takeoff,station_morning_getin_sum,station_morning_takeoff_sum,bus_route_getin_sum,bus_route_takeoff_sum,station_morning_getin_mean,station_morning_takeoff_mean,bus_route_getin_mean,bus_route_takeoff_mean,kmeans1,kmeans2,regular_commuter_count,afternoon_takeoff,next_bus_time_diff,getin_user_count1_morning,getin_user_count2_morning,takeoff_user_count1_noon,takeoff_user_count2_noon,hourly_rain,prev_daily_rain,hourly_cloud,latlong_second,total_population,man_population,woman_population,avg_time_diff,passengers_in,passengers_out,latitude_rank,longitude_rank
0,0.0,0,1,33.4899,126.49373,322,6,1,16.0,0.0,8.0,0.0,2.95492,26.256744,31053,53745,11538,46,1189,46,25,1,1,0.391576,1.12,0.49246,0.72,0.543855,1.52,1,25,6,0,16.0,3.0,0.0,13.0,0.0,3.0,0.0,50.0,8.0,3.0,0.0,2.0,0.32,15,9,9.0,0.0,3650.490741,7.0,1.0,,,0.2,0.0,88,2411,43217.0,21189.0,22028.0,3113.015748,15.23913,1.909091,404772.5,214964.5
1,5.0,0,1,33.48944,126.48508,335,6,1,22.0,0.0,10.0,0.0,3.720275,26.403025,31054,53747,11538,2303,1189,46,25,45,1,0.391576,1.12,0.49246,0.72,0.543855,1.52,2,24,6,0,22.0,9.0,0.0,13.0,0.0,183.0,116.0,50.0,8.0,4.066667,2.577778,2.0,0.32,15,9,466.0,0.0,3650.490741,197.0,,95.0,,0.2,0.0,88,2403,43217.0,21189.0,22028.0,1837.940774,594.0,428.0,396531.5,189710.0
2,2.0,0,1,33.48181,126.47352,408,6,1,4.0,0.0,3.0,0.0,5.036124,25.893305,31057,53753,11538,1154,1189,46,25,21,1,0.391576,1.12,0.49246,0.72,0.543855,1.52,3,23,6,0,4.0,2.0,0.0,2.0,0.0,71.0,11.0,50.0,8.0,3.380952,0.52381,2.0,0.32,15,45,164.0,0.0,3650.490741,76.0,,11.0,2.0,0.2,0.0,88,2347,56223.0,27761.0,28462.0,2448.248012,166.913043,28.0,342909.5,156930.5
3,53.0,0,0,33.50577,126.49252,1448,6,1,79.0,0.0,49.0,0.0,2.864166,27.997494,31020,53682,11538,49,1189,46,25,1,1,0.391576,1.12,0.49246,0.72,0.543855,1.52,4,22,6,0,79.0,23.0,0.0,56.0,0.0,23.0,0.0,50.0,8.0,23.0,0.0,2.0,0.32,4,3,2.0,0.0,3650.490741,49.0,,,,0.2,0.0,88,2980,15673.0,7904.0,7769.0,3961.540412,89.130435,3.6,518315.0,210660.0
4,0.0,0,0,33.25579,126.4126,1510,6,1,0.0,1.0,0.0,1.0,29.040353,13.574693,31022,53686,11538,386,1189,39,25,10,1,0.391576,1.12,0.49246,0.72,0.543855,1.52,5,21,6,0,0.0,0.0,0.0,0.0,1.0,1.0,5.0,50.0,8.0,0.1,0.5,2.0,0.32,85,136,3.0,0.0,3650.490741,4.0,,5.0,,0.2,0.0,88,562,4414.0,2252.0,2162.0,1058.012605,6.847826,27.456522,98750.5,93507.5


In [9]:
# Every feature column is declared categorical for the first CatBoost run.
cat_cols = train_set.columns.tolist()

In [10]:
# Cast every feature column to string in both frames, so that all of them can
# be passed to CatBoost as categorical features.
# The builtin `str` is used because the `np.str` alias was removed from NumPy
# (AttributeError on NumPy >= 1.24); the resulting dtype is the same.
for col in tqdm_notebook( list(train_set) ):
    train_set[col] = train_set[col].astype(str)
    test_set[col] = test_set[col].astype(str)


HBox(children=(IntProgress(value=0, max=66), HTML(value='')))




In [11]:
# Seed-averaged, stratified K-fold CatBoost training in which EVERY feature is
# passed as categorical (cat_cols).
# NOTE: the recorded run of this cell aborted with `CatBoostError: bad allocation`.
split_col = 'bus_route_id'   # folds are stratified on this column
len_seeds = 1                # number of random seeds to average over

# Predictions accumulated over seeds, divided by len_seeds at the end.
outer_oof_train = np.zeros( train.shape[0] )
outer_oof_test = np.zeros( test.shape[0] )

for _ in tqdm_notebook(range(len_seeds)):

    # One freshly drawn seed drives both the fold shuffling and CatBoost.
    seed = random.randint(1, 100000)
    cat_params['random_seed'] = seed

    cv_list = []   # per-fold validation RMSE

    oof_train = np.zeros( train.shape[0] )    # out-of-fold predictions for this seed
    final_test = np.zeros( test.shape[0] )    # sum of the per-fold test predictions

    kfolds = StratifiedKFold(n_splits = n_splits, shuffle=True, random_state=seed )
    fold_indices = kfolds.split(train_set, train_set[split_col])

    # total= lets the progress bar show real progress (a generator has no length).
    for trn_ind, val_ind in tqdm_notebook(fold_indices, total=n_splits):

        # split() yields POSITIONAL indices, so the labels are sliced with
        # .iloc exactly like the features; plain [] would look them up by
        # index label, which is only equivalent for a default RangeIndex.
        X_train, y_train = train_set.iloc[trn_ind], train_label.iloc[trn_ind]
        X_valid, y_valid = train_set.iloc[val_ind], train_label.iloc[val_ind]

        model = CatBoostRegressor(**cat_params)

        model.fit( X_train, y_train, eval_set = (X_valid, y_valid),
                      cat_features  = cat_cols,
                      use_best_model=True,
                      verbose=True)

        valid_pred = model.predict(X_valid)
        test_pred  = model.predict(test_set)

        # Each training row belongs to exactly one validation fold per seed.
        oof_train[val_ind] += valid_pred
        final_test += test_pred

        cv_list.append( sqrt(mean_squared_error(y_valid, valid_pred)) )

        print('='*80)

    # Average the test predictions over the folds.
    final_test /= n_splits

    print(f"Average CV : {np.mean(cv_list)}")
    print(f"RMSE for OOF: {sqrt(mean_squared_error(train_label, oof_train))}")

    outer_oof_train += oof_train
    outer_oof_test += final_test

# Average over seeds.
outer_oof_train /= len_seeds
outer_oof_test /= len_seeds

print(f"Overall for OOF: {sqrt(mean_squared_error(train_label, outer_oof_train))}")


HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

0:	learn: 4.7560291	test: 4.7079666	best: 4.7079666 (0)	total: 129ms	remaining: 3h 35m 28s
500:	learn: 2.1448649	test: 2.3676031	best: 2.3676031 (500)	total: 58.1s	remaining: 3h 12m 23s
1000:	learn: 1.9623673	test: 2.3012421	best: 2.3012421 (1000)	total: 1m 56s	remaining: 3h 12m 21s
1500:	learn: 1.8511056	test: 2.2689729	best: 2.2688827 (1487)	total: 2m 56s	remaining: 3h 12m 34s
2000:	learn: 1.7686741	test: 2.2528120	best: 2.2527677 (1999)	total: 3m 55s	remaining: 3h 12m 15s
2500:	learn: 1.7001795	test: 2.2401328	best: 2.2400437 (2490)	total: 4m 55s	remaining: 3h 12m 16s
3000:	learn: 1.6424550	test: 2.2327133	best: 2.2326917 (2990)	total: 5m 56s	remaining: 3h 12m 9s
3500:	learn: 1.5921806	test: 2.2258414	best: 2.2258051 (3499)	total: 6m 56s	remaining: 3h 11m 13s
4000:	learn: 1.5478071	test: 2.2211123	best: 2.2210742 (3990)	total: 7m 56s	remaining: 3h 10m 44s
4500:	learn: 1.5082395	test: 2.2182702	best: 2.2182337 (4497)	total: 8m 57s	remaining: 3h 10m 8s
5000:	learn: 1.4697534	test: 2.2

CatBoostError: bad allocation

In [9]:
# Seed-averaged, stratified K-fold CatBoost training with a hand-picked set of
# categorical features; all other columns are passed to CatBoost as they are.
# NOTE(review): the execution counts restart at this cell, so it appears to
# have been run in a session where the blanket string cast of the feature
# columns was NOT applied -- confirm the intended execution order.
split_col = 'bus_route_id'   # folds are stratified on this column
len_seeds = 5                # number of random seeds to average over

# Columns handed to CatBoost as categorical features.
selected_cat_cols = ['bus_route_id','station_code','weekday',
                     'kmeans1','kmeans2',
                     'latlong_second',
                    ]

# Predictions accumulated over seeds, divided by len_seeds at the end.
outer_oof_train = np.zeros( train.shape[0] )
outer_oof_test = np.zeros( test.shape[0] )

for _ in tqdm_notebook(range(len_seeds)):

    # One freshly drawn seed drives both the fold shuffling and CatBoost.
    seed = random.randint(1, 100000)
    cat_params['random_seed'] = seed

    cv_list = []   # per-fold validation RMSE

    oof_train = np.zeros( train.shape[0] )    # out-of-fold predictions for this seed
    final_test = np.zeros( test.shape[0] )    # sum of the per-fold test predictions

    kfolds = StratifiedKFold(n_splits = n_splits, shuffle=True, random_state=seed )
    fold_indices = kfolds.split(train_set, train_set[split_col])

    # total= lets the progress bar show real progress (a generator has no length).
    for trn_ind, val_ind in tqdm_notebook(fold_indices, total=n_splits):

        # split() yields POSITIONAL indices, so the labels are sliced with
        # .iloc exactly like the features; plain [] would look them up by
        # index label, which is only equivalent for a default RangeIndex.
        X_train, y_train = train_set.iloc[trn_ind], train_label.iloc[trn_ind]
        X_valid, y_valid = train_set.iloc[val_ind], train_label.iloc[val_ind]

        model = CatBoostRegressor(**cat_params)

        model.fit( X_train, y_train, eval_set = (X_valid, y_valid),
                      cat_features  = selected_cat_cols,
                      use_best_model=True,
                      verbose=True)

        valid_pred = model.predict(X_valid)
        test_pred  = model.predict(test_set)

        # Each training row belongs to exactly one validation fold per seed.
        oof_train[val_ind] += valid_pred
        final_test += test_pred

        cv_list.append( sqrt(mean_squared_error(y_valid, valid_pred)) )

        print('='*80)

    # Average the test predictions over the folds.
    final_test /= n_splits

    print(f"Average CV : {np.mean(cv_list)}")
    print(f"RMSE for OOF: {sqrt(mean_squared_error(train_label, oof_train))}")

    outer_oof_train += oof_train
    outer_oof_test += final_test

# Average over seeds.
outer_oof_train /= len_seeds
outer_oof_test /= len_seeds

print(f"Overall for OOF: {sqrt(mean_squared_error(train_label, outer_oof_train))}")


HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

0:	learn: 4.7528929	test: 4.7073132	best: 4.7073132 (0)	total: 49.2ms	remaining: 1h 21m 59s
500:	learn: 2.0483653	test: 2.3513805	best: 2.3513805 (500)	total: 26.1s	remaining: 1h 26m 23s
1000:	learn: 1.8414126	test: 2.2808700	best: 2.2808341 (999)	total: 52.7s	remaining: 1h 26m 47s
1500:	learn: 1.7187993	test: 2.2521478	best: 2.2521478 (1500)	total: 1m 20s	remaining: 1h 27m 39s
2000:	learn: 1.6279027	test: 2.2328754	best: 2.2328294 (1998)	total: 1m 46s	remaining: 1h 26m 55s
2500:	learn: 1.5534381	test: 2.2198667	best: 2.2198560 (2499)	total: 2m 14s	remaining: 1h 27m 40s
3000:	learn: 1.4900717	test: 2.2106071	best: 2.2104149 (2994)	total: 2m 42s	remaining: 1h 27m 24s
3500:	learn: 1.4370770	test: 2.2038840	best: 2.2038617 (3496)	total: 3m 9s	remaining: 1h 27m 6s
4000:	learn: 1.3896626	test: 2.2006121	best: 2.2006121 (4000)	total: 3m 37s	remaining: 1h 26m 56s
4500:	learn: 1.3474541	test: 2.1967779	best: 2.1967779 (4500)	total: 4m 4s	remaining: 1h 26m 32s
5000:	learn: 1.3075878	test: 2.193

bestTest = 2.170021443
bestIteration = 7569
Shrink model to first 7570 iterations.
0:	learn: 4.7909257	test: 4.5440202	best: 4.5440202 (0)	total: 53.2ms	remaining: 1h 28m 43s
500:	learn: 2.0678869	test: 2.2746570	best: 2.2744826 (499)	total: 26.6s	remaining: 1h 28m 10s
1000:	learn: 1.8531099	test: 2.2002501	best: 2.2000586 (992)	total: 53.1s	remaining: 1h 27m 28s
1500:	learn: 1.7240586	test: 2.1665013	best: 2.1665013 (1500)	total: 1m 20s	remaining: 1h 28m 12s
2000:	learn: 1.6264431	test: 2.1461437	best: 2.1461437 (2000)	total: 1m 48s	remaining: 1h 28m 31s
2500:	learn: 1.5520176	test: 2.1373578	best: 2.1373578 (2500)	total: 2m 15s	remaining: 1h 28m 3s
3000:	learn: 1.4907823	test: 2.1304129	best: 2.1303855 (2999)	total: 2m 42s	remaining: 1h 27m 25s
3500:	learn: 1.4363038	test: 2.1241176	best: 2.1240568 (3491)	total: 3m 9s	remaining: 1h 27m 12s
4000:	learn: 1.3870460	test: 2.1194923	best: 2.1194856 (3965)	total: 3m 37s	remaining: 1h 27m 5s
4500:	learn: 1.3446318	test: 2.1160420	best: 2.11

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

0:	learn: 4.7765025	test: 4.6320814	best: 4.6320814 (0)	total: 49.6ms	remaining: 1h 22m 35s
500:	learn: 2.0590063	test: 2.2584680	best: 2.2584680 (500)	total: 26.6s	remaining: 1h 28m
1000:	learn: 1.8522815	test: 2.1773819	best: 2.1773699 (999)	total: 53.3s	remaining: 1h 27m 53s
1500:	learn: 1.7266425	test: 2.1395198	best: 2.1395198 (1500)	total: 1m 20s	remaining: 1h 28m 13s
2000:	learn: 1.6332472	test: 2.1176036	best: 2.1176036 (2000)	total: 1m 48s	remaining: 1h 28m 24s
2500:	learn: 1.5591058	test: 2.1030951	best: 2.1030358 (2497)	total: 2m 15s	remaining: 1h 27m 57s
3000:	learn: 1.4983523	test: 2.0948993	best: 2.0948644 (2999)	total: 2m 42s	remaining: 1h 27m 38s
3500:	learn: 1.4435054	test: 2.0880842	best: 2.0880663 (3498)	total: 3m 9s	remaining: 1h 27m 8s
4000:	learn: 1.3939831	test: 2.0810745	best: 2.0810745 (4000)	total: 3m 36s	remaining: 1h 26m 39s
4500:	learn: 1.3514569	test: 2.0762532	best: 2.0762532 (4500)	total: 4m 4s	remaining: 1h 26m 28s
5000:	learn: 1.3122924	test: 2.0739495

4000:	learn: 1.4080929	test: 2.1641229	best: 2.1641229 (4000)	total: 3m 35s	remaining: 1h 26m 7s
4500:	learn: 1.3619550	test: 2.1592967	best: 2.1592967 (4500)	total: 4m 3s	remaining: 1h 26m 5s
5000:	learn: 1.3210730	test: 2.1556245	best: 2.1556245 (5000)	total: 4m 31s	remaining: 1h 25m 58s
5500:	learn: 1.2834969	test: 2.1526329	best: 2.1526191 (5496)	total: 4m 58s	remaining: 1h 25m 31s
6000:	learn: 1.2499085	test: 2.1507098	best: 2.1505103 (5946)	total: 5m 27s	remaining: 1h 25m 26s
6500:	learn: 1.2172045	test: 2.1489341	best: 2.1489211 (6499)	total: 5m 54s	remaining: 1h 25m
7000:	learn: 1.1877513	test: 2.1474914	best: 2.1474490 (6973)	total: 6m 23s	remaining: 1h 24m 59s
7500:	learn: 1.1602472	test: 2.1469658	best: 2.1465073 (7361)	total: 6m 51s	remaining: 1h 24m 39s
bestTest = 2.146507297
bestIteration = 7361
Shrink model to first 7362 iterations.
Average CV : 2.178881640887623
RMSE for OOF: 2.18038021636416


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

0:	learn: 4.7508750	test: 4.7061686	best: 4.7061686 (0)	total: 56.5ms	remaining: 1h 34m 5s
500:	learn: 2.0513688	test: 2.2809964	best: 2.2809800 (499)	total: 28.3s	remaining: 1h 33m 41s
1000:	learn: 1.8434357	test: 2.1939801	best: 2.1939310 (999)	total: 57.1s	remaining: 1h 34m 5s
1500:	learn: 1.7144285	test: 2.1544285	best: 2.1544285 (1500)	total: 1m 26s	remaining: 1h 34m 23s
2000:	learn: 1.6209962	test: 2.1350727	best: 2.1350727 (2000)	total: 1m 56s	remaining: 1h 34m 48s
2500:	learn: 1.5466854	test: 2.1205084	best: 2.1205084 (2500)	total: 2m 25s	remaining: 1h 34m 45s
3000:	learn: 1.4845230	test: 2.1126478	best: 2.1126478 (3000)	total: 2m 55s	remaining: 1h 34m 43s
3500:	learn: 1.4300780	test: 2.1070194	best: 2.1070044 (3498)	total: 3m 25s	remaining: 1h 34m 32s
4000:	learn: 1.3834652	test: 2.1028267	best: 2.1028129 (3999)	total: 3m 55s	remaining: 1h 34m 17s
4500:	learn: 1.3410150	test: 2.0999318	best: 2.0999318 (4500)	total: 4m 25s	remaining: 1h 34m 1s
5000:	learn: 1.3007694	test: 2.097

bestTest = 2.259763337
bestIteration = 10150
Shrink model to first 10151 iterations.
0:	learn: 4.7575331	test: 4.6682511	best: 4.6682511 (0)	total: 54.3ms	remaining: 1h 30m 34s
500:	learn: 2.0687405	test: 2.3274164	best: 2.3274011 (499)	total: 27.1s	remaining: 1h 29m 40s
1000:	learn: 1.8573355	test: 2.2526145	best: 2.2526145 (1000)	total: 54.7s	remaining: 1h 30m 11s
1500:	learn: 1.7294084	test: 2.2179026	best: 2.2179026 (1500)	total: 1m 23s	remaining: 1h 30m 47s
2000:	learn: 1.6322549	test: 2.1973480	best: 2.1972949 (1995)	total: 1m 51s	remaining: 1h 30m 56s
2500:	learn: 1.5556309	test: 2.1822555	best: 2.1822555 (2500)	total: 2m 19s	remaining: 1h 30m 47s
3000:	learn: 1.4903624	test: 2.1735379	best: 2.1735379 (3000)	total: 2m 48s	remaining: 1h 30m 42s
3500:	learn: 1.4369500	test: 2.1677457	best: 2.1677457 (3500)	total: 3m 17s	remaining: 1h 30m 37s
4000:	learn: 1.3868717	test: 2.1620637	best: 2.1620637 (4000)	total: 3m 45s	remaining: 1h 30m 17s
4500:	learn: 1.3437005	test: 2.1585737	best

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

0:	learn: 4.7409406	test: 4.7556002	best: 4.7556002 (0)	total: 61ms	remaining: 1h 41m 38s
500:	learn: 2.0729659	test: 2.3769801	best: 2.3769801 (500)	total: 28.9s	remaining: 1h 35m 45s
1000:	learn: 1.8580893	test: 2.2931869	best: 2.2931597 (999)	total: 57.5s	remaining: 1h 34m 43s
1500:	learn: 1.7209467	test: 2.2569407	best: 2.2569407 (1500)	total: 1m 26s	remaining: 1h 34m 53s
2000:	learn: 1.6271561	test: 2.2346577	best: 2.2346577 (2000)	total: 1m 55s	remaining: 1h 34m 23s
2500:	learn: 1.5549585	test: 2.2202862	best: 2.2202722 (2498)	total: 2m 24s	remaining: 1h 33m 57s
3000:	learn: 1.4928982	test: 2.2102330	best: 2.2102330 (3000)	total: 2m 53s	remaining: 1h 33m 12s
3500:	learn: 1.4368467	test: 2.2021427	best: 2.2020054 (3461)	total: 3m 21s	remaining: 1h 32m 34s
4000:	learn: 1.3889871	test: 2.1962626	best: 2.1962626 (4000)	total: 3m 55s	remaining: 1h 33m 58s
4500:	learn: 1.3453402	test: 2.1921352	best: 2.1921352 (4500)	total: 4m 31s	remaining: 1h 36m 3s
5000:	learn: 1.3065267	test: 2.188

1500:	learn: 1.7005191	test: 2.2103721	best: 2.2103721 (1500)	total: 1m 48s	remaining: 1h 58m 35s
2000:	learn: 1.6089166	test: 2.1947911	best: 2.1947911 (2000)	total: 2m 25s	remaining: 1h 58m 23s
2500:	learn: 1.5330107	test: 2.1847222	best: 2.1847222 (2500)	total: 3m 2s	remaining: 1h 58m 19s
3000:	learn: 1.4694314	test: 2.1770020	best: 2.1769264 (2999)	total: 3m 39s	remaining: 1h 58m 10s
3500:	learn: 1.4162186	test: 2.1712921	best: 2.1712527 (3494)	total: 4m 15s	remaining: 1h 57m 28s
4000:	learn: 1.3696372	test: 2.1669766	best: 2.1668960 (3979)	total: 4m 52s	remaining: 1h 56m 54s
4500:	learn: 1.3266015	test: 2.1622067	best: 2.1621155 (4485)	total: 5m 30s	remaining: 1h 56m 47s
5000:	learn: 1.2870714	test: 2.1586122	best: 2.1585705 (4993)	total: 6m 8s	remaining: 1h 56m 39s
5500:	learn: 1.2511869	test: 2.1562021	best: 2.1561850 (5484)	total: 6m 45s	remaining: 1h 56m 8s
6000:	learn: 1.2170136	test: 2.1542208	best: 2.1541490 (5951)	total: 7m 23s	remaining: 1h 55m 49s
6500:	learn: 1.1870198	

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

0:	learn: 4.7103177	test: 4.8721602	best: 4.8721602 (0)	total: 70.9ms	remaining: 1h 58m 6s
500:	learn: 2.0377925	test: 2.3009132	best: 2.3007990 (499)	total: 35.1s	remaining: 1h 56m 16s
1000:	learn: 1.8292089	test: 2.2329478	best: 2.2329478 (1000)	total: 1m 11s	remaining: 1h 58m
1500:	learn: 1.7014420	test: 2.2068762	best: 2.2068762 (1500)	total: 1m 48s	remaining: 1h 58m 47s
2000:	learn: 1.6101896	test: 2.1907669	best: 2.1907669 (2000)	total: 2m 24s	remaining: 1h 57m 57s
2500:	learn: 1.5367277	test: 2.1844466	best: 2.1842614 (2475)	total: 3m	remaining: 1h 57m 15s
3000:	learn: 1.4736178	test: 2.1787790	best: 2.1787339 (2997)	total: 3m 38s	remaining: 1h 57m 38s
3500:	learn: 1.4202539	test: 2.1748931	best: 2.1746971 (3472)	total: 4m 13s	remaining: 1h 56m 26s
4000:	learn: 1.3739902	test: 2.1714031	best: 2.1712606 (3956)	total: 4m 43s	remaining: 1h 53m 13s
4500:	learn: 1.3319819	test: 2.1693393	best: 2.1693375 (4498)	total: 5m 12s	remaining: 1h 50m 38s
5000:	learn: 1.2941092	test: 2.1667670

10500:	learn: 1.0177978	test: 2.1222037	best: 2.1220555 (10390)	total: 9m 39s	remaining: 1h 22m 21s
11000:	learn: 0.9980066	test: 2.1218594	best: 2.1218559 (10764)	total: 10m 7s	remaining: 1h 21m 58s
11500:	learn: 0.9800342	test: 2.1217837	best: 2.1216093 (11216)	total: 10m 35s	remaining: 1h 21m 33s
12000:	learn: 0.9622046	test: 2.1218615	best: 2.1214641 (11733)	total: 11m 4s	remaining: 1h 21m 12s
bestTest = 2.121464089
bestIteration = 11733
Shrink model to first 11734 iterations.
0:	learn: 4.7411077	test: 4.7874334	best: 4.7874334 (0)	total: 51.8ms	remaining: 1h 26m 19s
500:	learn: 2.0538275	test: 2.4068882	best: 2.4068882 (500)	total: 26.5s	remaining: 1h 27m 44s
1000:	learn: 1.8485256	test: 2.3353961	best: 2.3353961 (1000)	total: 53.6s	remaining: 1h 28m 20s
1500:	learn: 1.7223595	test: 2.3052090	best: 2.3050187 (1496)	total: 1m 21s	remaining: 1h 28m 38s
2000:	learn: 1.6310496	test: 2.2878480	best: 2.2878480 (2000)	total: 1m 48s	remaining: 1h 28m 48s
2500:	learn: 1.5575869	test: 2.279

In [10]:
# PostProcessing
# Clamp the predictions at zero: every value that is not strictly positive
# becomes 0, positive values are kept unchanged.
# np.where does this in one vectorised pass and keeps the results as NumPy
# arrays instead of rebuilding Python lists element by element; re-running
# the cell gives the same values.
outer_oof_train = np.where(np.asarray(outer_oof_train) > 0, outer_oof_train, 0)
outer_oof_test = np.where(np.asarray(outer_oof_test) > 0, outer_oof_test, 0)

# OOF score after clamping.
print(f"RMSE for OOF: {sqrt(mean_squared_error(train_label, outer_oof_train))}")

RMSE for OOF: 2.132076609656555


In [11]:
# Save the out-of-fold predictions: read 'id' and the target column from the
# raw training file, overwrite the target with outer_oof_train, write to ../oof.
df_oof = (
    pd.read_csv('../raw_dataset/train.csv', usecols = ['id','18~20_ride'])
      .assign(**{'18~20_ride': outer_oof_train})
)
df_oof.to_csv('../oof/cat_5_seeds_stractified5k_bus_route_id.csv',index=False)

In [12]:
# Build the submission: take the sample submission file, fill its target
# column with the seed-averaged test predictions, write to ../submission.
df_sub = (
    pd.read_csv('../raw_dataset/submission_sample.csv')
      .assign(**{'18~20_ride': outer_oof_test})
)
df_sub.to_csv('../submission/cat_5_seeds_stractified5k_bus_route_id.csv',index=False)