In [65]:
%%time
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # Linear algebra
import pandas as pd # Data processing, CSV file I/O (e.g. pd.read_csv)
import lightgbm as lgb # LightGBM is a gradient boosting framework that uses tree based learning algorithms

from bayes_opt import BayesianOptimization # Pure Python implementation of bayesian global optimization with gaussian processes

import warnings
warnings.filterwarnings('ignore')

from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split,KFold
from sklearn.preprocessing import StandardScaler


# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('../input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.

Wall time: 0 ns


In [66]:
%%time
# Define a seed for everything
def seed_everything(seed = 42):
    rn.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

Wall time: 0 ns


In [67]:
%%time
# Configure notebook display settings
pd.options.display.float_format = '{:,.2f}'.format
pd.set_option('display.max_columns', 20) 
pd.set_option('display.max_rows', 50) 

Wall time: 0 ns


In [68]:
%%time
# Model parameters 
TRAIN_PATH =  '../data/input/train.csv'
TEST_PATH  = '../data/input/test.csv'
train = pd.read_csv(TRAIN_PATH)
test = pd.read_csv(TEST_PATH)

Wall time: 4.03 s


In [69]:
%%time
train.head()

Wall time: 0 ns


Unnamed: 0,RowId,IntersectionId,Latitude,Longitude,EntryStreetName,ExitStreetName,EntryHeading,ExitHeading,Hour,Weekend,...,TimeFromFirstStop_p40,TimeFromFirstStop_p50,TimeFromFirstStop_p60,TimeFromFirstStop_p80,DistanceToFirstStop_p20,DistanceToFirstStop_p40,DistanceToFirstStop_p50,DistanceToFirstStop_p60,DistanceToFirstStop_p80,City
0,1920335,0,33.79,-84.43,Marietta Boulevard Northwest,Marietta Boulevard Northwest,NW,NW,0,0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,Atlanta
1,1920336,0,33.79,-84.43,Marietta Boulevard Northwest,Marietta Boulevard Northwest,SE,SE,0,0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,Atlanta
2,1920337,0,33.79,-84.43,Marietta Boulevard Northwest,Marietta Boulevard Northwest,NW,NW,1,0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,Atlanta
3,1920338,0,33.79,-84.43,Marietta Boulevard Northwest,Marietta Boulevard Northwest,SE,SE,1,0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,Atlanta
4,1920339,0,33.79,-84.43,Marietta Boulevard Northwest,Marietta Boulevard Northwest,NW,NW,2,0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,Atlanta


In [70]:
%%time
test.head()

Wall time: 0 ns


Unnamed: 0,RowId,IntersectionId,Latitude,Longitude,EntryStreetName,ExitStreetName,EntryHeading,ExitHeading,Hour,Weekend,Month,Path,City
0,0,1,33.75,-84.39,Peachtree Street Southwest,Mitchell Street Southwest,SW,SE,0,0,6,Peachtree Street Southwest_SW_Mitchell Street ...,Atlanta
1,1,1,33.75,-84.39,Peachtree Street Southwest,Peachtree Street Southwest,SW,SW,0,0,6,Peachtree Street Southwest_SW_Peachtree Street...,Atlanta
2,2,1,33.75,-84.39,Peachtree Street Southwest,Peachtree Street Southwest,NE,NE,1,0,6,Peachtree Street Southwest_NE_Peachtree Street...,Atlanta
3,3,1,33.75,-84.39,Peachtree Street Southwest,Peachtree Street Southwest,SW,SW,1,0,6,Peachtree Street Southwest_SW_Peachtree Street...,Atlanta
4,4,1,33.75,-84.39,Peachtree Street Southwest,Peachtree Street Southwest,NE,NE,2,0,6,Peachtree Street Southwest_NE_Peachtree Street...,Atlanta


In [71]:
%%time
# Provides a list of the null columns in the dataframe and the quantity
null_columns = train.columns[train.isnull().any()]
train[null_columns].isnull().sum()

Wall time: 217 ms


EntryStreetName    8189
ExitStreetName     5534
dtype: int64

In [72]:
%%time
train[['EntryStreetName', 'ExitStreetName', 'Path']].sample(10)

Wall time: 41 ms


Unnamed: 0,EntryStreetName,ExitStreetName,Path
562823,North 33rd Street,West Montgomery Avenue,North 33rd Street_N_West Montgomery Avenue_E
751349,West Lehigh Avenue,West Lehigh Avenue,West Lehigh Avenue_E_West Lehigh Avenue_E
117081,Boulevard Southeast,Boulevard Northeast,Boulevard Southeast_N_Boulevard Northeast_N
24887,Piedmont Avenue Northeast,North Avenue Northeast,Piedmont Avenue Northeast_N_North Avenue North...
315508,Charter Street,Commercial Street,Charter Street_NW_Commercial Street_SW
607143,Baltimore Avenue,Baltimore Avenue,Baltimore Avenue_W_Baltimore Avenue_W
55350,Crown Road Southeast,Crown Road Southeast,Crown Road Southeast_SE_Crown Road Southeast_SE
409835,West Ida B. Wells Drive,West Ida B. Wells Drive,West Ida B. Wells Drive_E_West Ida B. Wells Dr...
419123,North Milwaukee Avenue,North Milwaukee Avenue,North Milwaukee Avenue_NW_North Milwaukee Aven...
672012,Pratt Street,Pratt Street,Pratt Street_SE_Pratt Street_SE


In [73]:
%%time
entrySummary = train.groupby(['EntryStreetName'])['ExitStreetName'].nunique().reset_index()
exitSummary = train.groupby(['ExitStreetName'])['EntryStreetName'].nunique().reset_index()

Wall time: 331 ms


In [74]:
%%time
entrySummary

Wall time: 0 ns


Unnamed: 0,EntryStreetName,ExitStreetName
0,10th Street Northeast,8
1,10th Street Northwest,12
2,12th Street Northeast,1
3,14th Street Northeast,6
4,14th Street Northwest,11
...,...,...
1702,Worcester Drive NE,1
1703,Wylie Street Southeast,2
1704,Wyman Street Southeast,2
1705,Wynnefield Avenue,4


In [75]:
%%time
exitSummary

Wall time: 0 ns


Unnamed: 0,ExitStreetName,EntryStreetName
0,10th Street Northeast,8
1,10th Street Northwest,12
2,12th Street Northeast,1
3,14th Street Northeast,5
4,14th Street Northwest,11
...,...,...
1688,Wylie Street Southeast,2
1689,Wyman Street Southeast,2
1690,Wynnefield Avenue,4
1691,Wyoming Street,1


In [76]:
%%time
# Display the first few rows of the dataset
train.describe()

Wall time: 704 ms


Unnamed: 0,RowId,IntersectionId,Latitude,Longitude,Hour,Weekend,Month,TotalTimeStopped_p20,TotalTimeStopped_p40,TotalTimeStopped_p50,...,TimeFromFirstStop_p20,TimeFromFirstStop_p40,TimeFromFirstStop_p50,TimeFromFirstStop_p60,TimeFromFirstStop_p80,DistanceToFirstStop_p20,DistanceToFirstStop_p40,DistanceToFirstStop_p50,DistanceToFirstStop_p60,DistanceToFirstStop_p80
count,857409.0,857409.0,857409.0,857409.0,857409.0,857409.0,857409.0,857409.0,857409.0,857409.0,...,857409.0,857409.0,857409.0,857409.0,857409.0,857409.0,857409.0,857409.0,857409.0,857409.0
mean,2349039.0,836.33,39.66,-77.9,12.43,0.28,9.1,1.73,5.36,7.68,...,3.13,9.05,12.61,18.8,34.04,6.56,19.87,28.26,43.27,81.92
std,247512.8,644.2,2.92,5.98,6.06,0.45,1.99,7.08,12.86,15.55,...,11.63,20.12,23.85,29.4,40.62,28.0,56.56,71.72,96.93,152.68
min,1920335.0,0.0,33.65,-87.84,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2134687.0,306.0,39.94,-84.39,8.0,0.0,7.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,2349039.0,685.0,39.99,-75.17,13.0,0.0,9.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,27.0,0.0,0.0,0.0,0.0,60.4
75%,2563391.0,1254.0,41.92,-75.08,17.0,1.0,11.0,0.0,0.0,10.0,...,0.0,0.0,22.0,31.0,49.0,0.0,0.0,52.9,64.1,85.6
max,2777743.0,2875.0,42.38,-71.02,23.0,1.0,12.0,273.0,318.0,343.0,...,334.0,347.0,355.0,358.0,359.0,1902.7,3099.5,3099.5,3581.6,4064.3


In [77]:
%%time
# Display the first few rows of the dataset
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 857409 entries, 0 to 857408
Data columns (total 28 columns):
RowId                      857409 non-null int64
IntersectionId             857409 non-null int64
Latitude                   857409 non-null float64
Longitude                  857409 non-null float64
EntryStreetName            849220 non-null object
ExitStreetName             851875 non-null object
EntryHeading               857409 non-null object
ExitHeading                857409 non-null object
Hour                       857409 non-null int64
Weekend                    857409 non-null int64
Month                      857409 non-null int64
Path                       857409 non-null object
TotalTimeStopped_p20       857409 non-null int64
TotalTimeStopped_p40       857409 non-null int64
TotalTimeStopped_p50       857409 non-null int64
TotalTimeStopped_p60       857409 non-null int64
TotalTimeStopped_p80       857409 non-null int64
TimeFromFirstStop_p20      857409 non-null int6

In [78]:
%%time
# Display the first few rows of the dataset
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1920335 entries, 0 to 1920334
Data columns (total 13 columns):
RowId              int64
IntersectionId     int64
Latitude           float64
Longitude          float64
EntryStreetName    object
ExitStreetName     object
EntryHeading       object
ExitHeading        object
Hour               int64
Weekend            int64
Month              int64
Path               object
City               object
dtypes: float64(2), int64(5), object(6)
memory usage: 190.5+ MB
Wall time: 2 ms


In [79]:
%%time
def describe_categorical(df, sample_fields = 5):
    ''' Describe categoricals datasets, in more details to have ageneral idea of the values on each column '''
    text_variables = []
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    df = df.select_dtypes(exclude = numerics)
    for col in df.columns:
        text_variables.append(col)
        pct_valid_data = 1 - (df[col].isnull().sum() / df.shape[0])
        unique_fields = list(set(df[col]))
        print('Variable Name: {:<18} % Data: {:0.2f} # Unique Fields:{:<6} Samples: {}'.format(col,pct_valid_data, len(unique_fields), unique_fields[:sample_fields]))
    return text_variables

Wall time: 0 ns


In [80]:
%%time
# Load and utilize the describe_categorical funtion on the dataframe
categorical_feat = describe_categorical(train, 6)

Variable Name: EntryStreetName    % Data: 0.99 # Unique Fields:1708   Samples: [nan, 'South 67th Street', 'Lagrange Street', 'Brown Street', 'Bridge Street', 'North Kilpatrick Avenue']
Variable Name: ExitStreetName     % Data: 0.99 # Unique Fields:1694   Samples: [nan, 'South 67th Street', 'Lagrange Street', 'Brown Street', 'Bridge Street', 'North Kilpatrick Avenue']
Variable Name: EntryHeading       % Data: 1.00 # Unique Fields:8      Samples: ['SE', 'W', 'N', 'SW', 'E', 'S']
Variable Name: ExitHeading        % Data: 1.00 # Unique Fields:8      Samples: ['SE', 'W', 'N', 'SW', 'E', 'S']
Variable Name: Path               % Data: 1.00 # Unique Fields:15111  Samples: ['Plymouth Street_W_Hampshire Street_NW', 'North Beacon Street_NW_Soldiers Field Road_E', 'Lombard Street_W_South 17th Street_W', 'Milk Street_E_Milk Street_E', 'North Ashland Avenue_N_North Milwaukee Avenue_NW', 'Massachusetts Avenue_NW_Harrison Avenue_NE']
Variable Name: City               % Data: 1.00 # Unique Fields:4    

In [81]:
%%time
# Identify the unique EntryStreetName 
def encode_headings(df):
    '''Encode the heading of the streets'''
    directions_dict = {
        'N' : 0,
        'NE': 0.25,
        'E' : 0.5,
        'SE': 0.75,
        'S' : 1,
        'SW': 1.25,
        'W' : 1.5,
        'NW': 1.75
    }
    
    df['EntryHeadingCode'] = train['EntryHeading'].map(directions_dict)
    df['ExitHeadingCode'] = train['ExitHeading'].map(directions_dict)
    
    return df

Wall time: 0 ns


In [82]:
%%time
# Apply the encode_headings function
train = encode_headings(train)
test = encode_headings(test)

Wall time: 263 ms


In [83]:
%%time
# Identify the unique EntryStreetName 
def encode_streets(row):
    '''Encode the heading of the streets'''
    street_dict = {
        'Street'   : 0,
        'St'       : 0,
        'Avenue'   : 1,
        'Ave'      : 1,
        'Boulevard': 2,
        'Road'     : 3,
        'Drive'    : 4,
        'Lane'     : 5,
        'Tunnel'   : 6,
        'Highway'  : 7,
        'Way'      : 8,
        'Parkway'  : 9,
        'Parking'  : 10,
        'Oval'     : 11,
        'Square'   : 12,
        'Place'    : 13,
        'Bridge'   : 14
    }
    
    if pd.isna(row):
        return 0
    else:
        for road in street_dict.keys():
            if road in row:
                return street_dict[road]
    return 0

Wall time: 0 ns


In [84]:
%%time
# Apply the encode_streets function to the train dataset
train['EntryStreetNameCode'] = train['EntryStreetName'].apply(encode_streets)
train['ExitStreetNameCode'] = train['ExitStreetName'].apply(encode_streets)

# Apply the encode_streets function to the test dataset
test['EntryStreetNameCode'] = test['EntryStreetName'].apply(encode_streets)
test['ExitStreetNameCode'] = test['ExitStreetName'].apply(encode_streets)

Wall time: 8.27 s


In [85]:
%%time
# Identify the unique EntryStreetName 
def encode_city(df):
    '''Encode the heading of the streets'''
    city_dict = {
        'Chicago'      : 0,
        'Atlanta'      : 1,
        'Philadelphia' : 2,
        'Boston'       : 3,
    }

    df['CityCode'] = train['City'].map(city_dict)    
    return df

Wall time: 0 ns


In [86]:
%%time
train = encode_city(train)
test = encode_city(test)

Wall time: 153 ms


In [87]:
%%time
features = ['IntersectionId', 'Latitude', 'Longitude', 'Hour', 'Weekend',
            'Month', 'EntryHeadingCode', 'ExitHeadingCode', 'EntryStreetNameCode',
            'ExitStreetNameCode', 'CityCode']

categorical = ['Hour', 'Weekend', 'Month','EntryHeadingCode', 'ExitHeadingCode', 
               'EntryStreetNameCode', 'ExitStreetNameCode', 'CityCode']

Wall time: 0 ns


In [88]:
%%time
target1 = train['TotalTimeStopped_p20']
target2 = train['TotalTimeStopped_p50']
target3 = train['TotalTimeStopped_p80']
target4 = train['DistanceToFirstStop_p20']
target5 = train['DistanceToFirstStop_p50']
target6 = train['DistanceToFirstStop_p80']

Wall time: 0 ns


In [89]:
%%time
predictions = {1:[], 2:[], 3:[], 4:[], 5:[], 6:[]}
targets = [target1, target2, target3, target4, target5, target6]

Wall time: 1 ms


In [90]:
%%time
# Hyper-Param Optimization Model...
train_data = lgb.Dataset(data = train[features], label = target3)

# Definition of the objective function...
def hyper_lgbm(num_leaves, feature_fraction, bagging_fraction, max_depth, min_split_gain, min_child_weight, lambda_l1, lambda_l2):
    params = {'application'     :'regression',
              'num_iterations'  : 450,
              'learning_rate'   : 0.02,
              'metric'          :'rmse'
             } # Default parameters configuration
    
    params["num_leaves"]       = int(round(num_leaves))
    params['feature_fraction'] = max(min(feature_fraction, 1), 0)
    params['bagging_fraction'] = max(min(bagging_fraction, 1), 0)
    params['max_depth']        = int(round(max_depth))
    params['min_split_gain']   = min_split_gain
    params['min_child_weight'] = min_child_weight
    params['lambda_l1']        = lambda_l1
    params['lambda_l2']        = lambda_l2
    
    cv_results = lgb.cv(params, train_data, nfold = 5, seed = 42, categorical_feature = categorical, stratified = False, verbose_eval = None)
    return -np.min(cv_results['rmse-mean'])

Wall time: 184 ms


In [56]:
%%time
# Domain space -- Range of hyperparameters
param_domain_space = {'num_leaves'       :(120, 230),
                      'feature_fraction' :(0.3, 0.9),
                      'bagging_fraction' :(0.8, 1),
                      'lambda_l1'        :(0,3),
                      'lambda_l2'        :(0,5),
                      'max_depth'        :(8, 19),
                      'min_split_gain'   :(0.001, 0.1),
                      'min_child_weight' :(1, 20)
                     }

Wall time: 0 ns


In [57]:
%%time
# Surrogate model
optimizer = BayesianOptimization(hyper_lgbm, param_domain_space, random_state = 42)

# Optimize
optimizer.maximize(init_points = 3, n_iter = 5)

|   iter    |  target   | baggin... | featur... | lambda_l1 | lambda_l2 | max_depth | min_ch... | min_sp... | num_le... |
-------------------------------------------------------------------------------------------------------------------------
| [0m 1       [0m | [0m-22.21   [0m | [0m 0.8749  [0m | [0m 0.8704  [0m | [0m 2.196   [0m | [0m 2.993   [0m | [0m 9.716   [0m | [0m 3.964   [0m | [0m 0.00675 [0m | [0m 215.3   [0m |
| [0m 2       [0m | [0m-22.4    [0m | [0m 0.9202  [0m | [0m 0.7248  [0m | [0m 0.06175 [0m | [0m 4.85    [0m | [0m 17.16   [0m | [0m 5.034   [0m | [0m 0.019   [0m | [0m 140.2   [0m |
| [0m 3       [0m | [0m-22.33   [0m | [0m 0.8608  [0m | [0m 0.6149  [0m | [0m 1.296   [0m | [0m 1.456   [0m | [0m 14.73   [0m | [0m 3.65    [0m | [0m 0.02992 [0m | [0m 160.3   [0m |
| [0m 4       [0m | [0m-23.2    [0m | [0m 0.8     [0m | [0m 0.3     [0m | [0m 3.0     [0m | [0m 0.0     [0m | [0m 19.0    [0m | [0m 20.

In [91]:
%%time
optimizer.max
p = optimizer.max['params']

Wall time: 0 ns


In [92]:
%%time
param = {'num_leaves'       : int(round(p['num_leaves'])),
         'feature_fraction' : p['feature_fraction'],
         'bagging_fraction' : p['bagging_fraction'],
         'max_depth'        : int(round(p['max_depth'])),
         'lambda_l1'        : p['lambda_l1'],
         'lambda_l2'        : p['lambda_l2'],
         'min_split_gain'   : p['min_split_gain'],
         'min_child_weight' : p['min_child_weight'],
         'learning_rate'    : 0.05,
         'objective'        : 'regression',
         'boosting_type'    : 'gbdt',
         'verbose'          : 1,
         'metric'           : 'rmse',
         'seed'             : 42,
        }

Wall time: 0 ns


In [None]:
%%time
nfold = 3
kf = KFold(n_splits = nfold, random_state = 42, shuffle = True)

for i in range(len(predictions)):
    print('Training and predicting for target {}'.format(i+1))
    oof = np.zeros(len(train))
    predictions[i] = np.zeros(len(test))
    
    n = 1
    for train_index, valid_index in kf.split(targets[i]):
        print("fold {}".format(n))
        xg_train = lgb.Dataset(train[features].iloc[train_index], label = targets[i][train_index])
        xg_valid = lgb.Dataset(train[features].iloc[valid_index], label = targets[i][valid_index])   

        clf = lgb.train(param, xg_train, 15000, valid_sets = [xg_valid], categorical_feature = categorical, verbose_eval = 200, early_stopping_rounds = 500)
        oof[valid_index] = clf.predict(train[features].iloc[valid_index], num_iteration = clf.best_iteration) 

        predictions[i] += clf.predict(test[features], num_iteration = clf.best_iteration) / nfold
        n = n + 1
        
    print("\n\nCV RMSE: {:<0.4f}".format(np.sqrt(mean_squared_error(targets[i], oof))))  

Training and predicting for target 1
fold 1
Training until validation scores don't improve for 500 rounds
[200]	valid_0's rmse: 6.39182
[400]	valid_0's rmse: 6.21926
[600]	valid_0's rmse: 6.14613
[800]	valid_0's rmse: 6.10555
[1000]	valid_0's rmse: 6.08477
[1200]	valid_0's rmse: 6.06663
[1400]	valid_0's rmse: 6.05288
[1600]	valid_0's rmse: 6.04268
[1800]	valid_0's rmse: 6.03606
[2000]	valid_0's rmse: 6.03161
[2200]	valid_0's rmse: 6.02711
[2400]	valid_0's rmse: 6.02451
[2600]	valid_0's rmse: 6.02323
[2800]	valid_0's rmse: 6.02209
[3000]	valid_0's rmse: 6.02301
[3200]	valid_0's rmse: 6.02467
Early stopping, best iteration is:
[2789]	valid_0's rmse: 6.02205
fold 2
Training until validation scores don't improve for 500 rounds
[200]	valid_0's rmse: 6.2613
[400]	valid_0's rmse: 6.10211
[600]	valid_0's rmse: 6.03839
[800]	valid_0's rmse: 5.99869
[1000]	valid_0's rmse: 5.97383
[1200]	valid_0's rmse: 5.95818
[1400]	valid_0's rmse: 5.94751
[1600]	valid_0's rmse: 5.94191
[1800]	valid_0's rmse: 5