In [1]:
from google.colab import drive
# Mount Google Drive at /content/drive (prompts for an authorization code).
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
!pip install catboost

In [0]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from pathlib import Path
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
import pickle

from catboost import CatBoostRegressor

In [0]:
# Project data folder; the path is relative, so it is resolved against the
# current working directory of the kernel.
PATH_TO_DATA = Path('drive/My Drive/sendy/data')
# Random seed value — TODO confirm which cells are meant to consume it.
SEED = 42

In [0]:
def rmse(y_true, y_pred):
    """Root mean squared error: the square root of `mean_squared_error`."""
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

In [0]:
def write_to_submission_file(preds, preds_idx, file_name):
    """Write predictions to ``PATH_TO_DATA/../submissions/<file_name>`` as CSV.

    Args:
        preds: predicted values, one per entry of ``preds_idx``.
        preds_idx: identifiers used as the frame index and written out as the
            ``Order_No`` column.
        file_name: name of the CSV file to create in the submissions folder.
    """
    target_path = PATH_TO_DATA / f'../submissions/{file_name}'
    submission = pd.DataFrame(preds, index=preds_idx)
    # Move the identifiers out of the index so they become the first column.
    submission = submission.reset_index()
    submission.columns = ['Order_No', 'Time from Pickup to Arrival']
    submission.to_csv(target_path, index=False)

In [0]:
class Model:
    """K-fold cross-validation wrapper around a single regressor.

    The wrapped ``model`` is refitted on every fold. For each fold the class
    records the validation metric, the validation predictions, the predictions
    for ``test`` and the feature importances. After the last fold the metric,
    the test predictions and the importances are averaged over the folds.
    """

    def __init__(self, X, y, test, model, cat_ids, model_type, metric, test_size=0.3, random_state=42):
        """Store the data, the estimator and the settings; zero the accumulators.

        Args:
            X: training features as a DataFrame (``.values``, ``.columns`` and
                ``.shape`` are used).
            y: training target, matched to the rows of ``X`` by position.
            test: features to predict; handed to ``model.predict`` unchanged.
            model: estimator with ``predict`` and a ``fit`` that accepts the
                ``eval_set``, ``verbose`` and ``cat_features`` keywords.
            cat_ids: forwarded to ``fit`` as ``cat_features``.
            model_type: ``'lgb'`` or ``'cat'``; selects how feature importances
                are read from the model. Any other value leaves them at zero.
            metric: callable ``metric(y_true, y_pred)`` returning a number.
            test_size: stored, but not used anywhere in this class.
            random_state: seed for the fold shuffle; only used when
                ``fitModel`` is called with ``shuffle=True``.
        """
        self.X = X
        self.y = y
        self.test = test
        self.model = model
        self.cat_ids = cat_ids
        self.model_type = model_type
        self.test_size = test_size
        self.metric = metric
        self.random_state = random_state
        self._reset()

    def _reset(self):
        """Zero every per-run accumulator so that each fit starts clean."""
        # Name kept for backward compatibility: `oof` holds the fold-averaged
        # predictions for `test`, not out-of-fold predictions.
        self.oof = np.zeros(len(self.test))
        # Out-of-fold predictions for the training rows (see getKfoldsPreds).
        self.kfolds = np.zeros(len(self.X))
        self.error = 0
        self.features_importances = np.zeros(self.X.shape[1])

    def fitModel(self, n_splits=5, shuffle=False):
        """Run the cross-validation and fill the accumulators.

        Args:
            n_splits: number of folds.
            shuffle: when True, rows are shuffled before splitting, seeded with
                ``random_state``. The default keeps the original contiguous,
                unshuffled folds.
        """
        # Start from zero so that calling fitModel again does not add to the
        # results of a previous run.
        self._reset()

        # Plain arrays make the fold indices positional, so `y` may carry any
        # index (e.g. after filtering rows) without a prior reset_index.
        X_values = self.X.values
        y_values = np.asarray(self.y)

        # KFold only accepts a random_state together with shuffle=True.
        splitter = KFold(n_splits=n_splits, shuffle=shuffle,
                         random_state=self.random_state if shuffle else None)
        for train_idx, valid_idx in splitter.split(self.X, self.y):
            X_train, X_valid = X_values[train_idx], X_values[valid_idx]
            y_train, y_valid = y_values[train_idx], y_values[valid_idx]

            self.model.fit(X_train, y_train, eval_set=(X_valid, y_valid), verbose=500, cat_features=self.cat_ids)

            preds_val = self.model.predict(X_valid)
            # Each training row is in the validation part of exactly one fold.
            self.kfolds[valid_idx] = preds_val
            error = self.metric(y_valid, preds_val)
            print('eval_metric: ', error)
            self.error += error

            self.oof += self.model.predict(self.test)
            if self.model_type == 'lgb':
                self.features_importances += self.model.feature_importances_
            elif self.model_type == 'cat':
                self.features_importances += self.model.get_feature_importance()

        self.error /= n_splits
        self.oof /= n_splits
        self.features_importances /= n_splits

    def getFeaturesImportances(self):
        """Return the fold-averaged importances as a DataFrame, largest first."""
        data = pd.DataFrame(self.features_importances, index=self.X.columns).sort_values(by=0, ascending=False)
        data.columns = ['features_importance']
        return data

    def getError(self):
        """Return the validation metric averaged over the folds."""
        return self.error

    def getOof(self):
        """Return the predictions for ``test`` averaged over the folds."""
        return self.oof

    def getKfoldsPreds(self):
        """Return the out-of-fold predictions for the training rows."""
        return self.kfolds

In [0]:
# Load the provided train and test tables from the `given` folder.
train = pd.read_csv(PATH_TO_DATA / 'given/Train.csv')
test = pd.read_csv(PATH_TO_DATA / 'given/Test.csv')

In [0]:
# Load the riders table; it is joined to the orders on 'Rider Id' below.
riders = pd.read_csv(PATH_TO_DATA / 'given/Riders.csv')

In [10]:
# Preview the first rows of the raw training table.
train.head()

Unnamed: 0,Order No,User Id,Vehicle Type,Platform Type,Personal or Business,Placement - Day of Month,Placement - Weekday (Mo = 1),Placement - Time,Confirmation - Day of Month,Confirmation - Weekday (Mo = 1),Confirmation - Time,Arrival at Pickup - Day of Month,Arrival at Pickup - Weekday (Mo = 1),Arrival at Pickup - Time,Pickup - Day of Month,Pickup - Weekday (Mo = 1),Pickup - Time,Arrival at Destination - Day of Month,Arrival at Destination - Weekday (Mo = 1),Arrival at Destination - Time,Distance (KM),Temperature,Precipitation in millimeters,Pickup Lat,Pickup Long,Destination Lat,Destination Long,Rider Id,Time from Pickup to Arrival
0,Order_No_4211,User_Id_633,Bike,3,Business,9,5,9:35:46 AM,9,5,9:40:10 AM,9,5,10:04:47 AM,9,5,10:27:30 AM,9,5,10:39:55 AM,4,20.4,,-1.317755,36.83037,-1.300406,36.829741,Rider_Id_432,745
1,Order_No_25375,User_Id_2285,Bike,3,Personal,12,5,11:16:16 AM,12,5,11:23:21 AM,12,5,11:40:22 AM,12,5,11:44:09 AM,12,5,12:17:22 PM,16,26.4,,-1.351453,36.899315,-1.295004,36.814358,Rider_Id_856,1993
2,Order_No_1899,User_Id_265,Bike,3,Business,30,2,12:39:25 PM,30,2,12:42:44 PM,30,2,12:49:34 PM,30,2,12:53:03 PM,30,2,1:00:38 PM,3,,,-1.308284,36.843419,-1.300921,36.828195,Rider_Id_155,455
3,Order_No_9336,User_Id_1402,Bike,3,Business,15,5,9:25:34 AM,15,5,9:26:05 AM,15,5,9:37:56 AM,15,5,9:43:06 AM,15,5,10:05:27 AM,9,19.2,,-1.281301,36.832396,-1.257147,36.795063,Rider_Id_855,1341
4,Order_No_27883,User_Id_1737,Bike,1,Personal,13,1,9:55:18 AM,13,1,9:56:18 AM,13,1,10:03:53 AM,13,1,10:05:23 AM,13,1,10:25:37 AM,9,15.4,,-1.266597,36.792118,-1.295041,36.809817,Rider_Id_770,1214


In [0]:
# Left-join the rider columns onto every training order via 'Rider Id'.
# Note: this rebinds `train`, so re-running the cell merges the rider columns a second time.
train = train.merge(riders, on='Rider Id', how='left')

In [0]:
# Same left join for the test orders.
# Note: this rebinds `test`, so re-running the cell merges the rider columns a second time.
test = test.merge(riders, on='Rider Id', how='left')

In [0]:
# Load the pickled train/test coords objects from the `preprocessed` folder.
# NOTE: unpickling can execute arbitrary code — only load files from a trusted source.
with open(PATH_TO_DATA / 'preprocessed/train_coords.pkl', 'rb') as f:
    train_coords = pickle.load(f)

with open(PATH_TO_DATA / 'preprocessed/test_coords.pkl', 'rb') as f:
    test_coords = pickle.load(f)

In [0]:
# Append the coords columns to the order tables; an axis=1 concat pairs rows by index label.
# NOTE(review): only test_coords gets its index reset here — confirm that train_coords
# already carries the same index as `train`, otherwise rows would not line up.
train_w_coords = pd.concat((train, train_coords), axis=1)
test_w_coords = pd.concat((test, test_coords.reset_index(drop=True)), axis=1)

In [16]:
# Preview the training table with the rider and coords columns attached.
train_w_coords.head()

Unnamed: 0,Order No,User Id,Vehicle Type,Platform Type,Personal or Business,Placement - Day of Month,Placement - Weekday (Mo = 1),Placement - Time,Confirmation - Day of Month,Confirmation - Weekday (Mo = 1),Confirmation - Time,Arrival at Pickup - Day of Month,Arrival at Pickup - Weekday (Mo = 1),Arrival at Pickup - Time,Pickup - Day of Month,Pickup - Weekday (Mo = 1),Pickup - Time,Arrival at Destination - Day of Month,Arrival at Destination - Weekday (Mo = 1),Arrival at Destination - Time,Distance (KM),Temperature,Precipitation in millimeters,Pickup Lat,Pickup Long,Destination Lat,Destination Long,Rider Id,Time from Pickup to Arrival,No_Of_Orders,Age,Average_Rating,No_of_Ratings,ascent,avg_speed,descent,detourfactor,distance,steps_length,duration
0,Order_No_4211,User_Id_633,Bike,3,Business,9,5,9:35:46 AM,9,5,9:40:10 AM,9,5,10:04:47 AM,9,5,10:27:30 AM,9,5,10:39:55 AM,4,20.4,,-1.317755,36.83037,-1.300406,36.829741,Rider_Id_432,745,1637,1309,13.8,549,9.845833,21.45,11.495833,1.59,3.037,11,509.8
1,Order_No_25375,User_Id_2285,Bike,3,Personal,12,5,11:16:16 AM,12,5,11:23:21 AM,12,5,11:40:22 AM,12,5,11:44:09 AM,12,5,12:17:22 PM,16,26.4,,-1.351453,36.899315,-1.295004,36.814358,Rider_Id_856,1993,396,339,13.6,69,130.242525,18.67,53.240025,1.42,16.246,25,3133.3
2,Order_No_1899,User_Id_265,Bike,3,Business,30,2,12:39:25 PM,30,2,12:42:44 PM,30,2,12:49:34 PM,30,2,12:53:03 PM,30,2,1:00:38 PM,3,,,-1.308284,36.843419,-1.300921,36.828195,Rider_Id_155,455,1023,242,12.5,114,31.738324,21.49,15.863324,1.71,3.274,12,548.4
3,Order_No_9336,User_Id_1402,Bike,3,Business,15,5,9:25:34 AM,15,5,9:26:05 AM,15,5,9:37:56 AM,15,5,9:43:06 AM,15,5,10:05:27 AM,9,19.2,,-1.281301,36.832396,-1.257147,36.795063,Rider_Id_855,1341,886,283,14.5,113,111.500595,21.02,33.350595,1.49,7.259,39,1243.2
4,Order_No_27883,User_Id_1737,Bike,1,Personal,13,1,9:55:18 AM,13,1,9:56:18 AM,13,1,10:03:53 AM,13,1,10:05:23 AM,13,1,10:25:37 AM,9,15.4,,-1.266597,36.792118,-1.295041,36.809817,Rider_Id_770,1214,2311,872,14.1,533,92.232147,21.34,77.855224,1.66,6.226,21,1050.3


In [0]:
# Identifier and clock-time columns removed from both tables.
_shared_cols_to_drop = [
    'Order No',
    'User Id',
    'Vehicle Type',
    'Placement - Time',
    'Confirmation - Time',
    'Arrival at Pickup - Time',
    'Pickup - Time',
]
# Arrival-at-destination columns; these appear only in the train drop list.
_arrival_cols_to_drop = [
    'Arrival at Destination - Day of Month',
    'Arrival at Destination - Weekday (Mo = 1)',
    'Arrival at Destination - Time',
]

cols_to_drop_train = _shared_cols_to_drop + _arrival_cols_to_drop + ['Rider Id']
cols_to_drop_test = _shared_cols_to_drop + ['Rider Id']

In [0]:
# Training table without the columns listed in cols_to_drop_train.
train_wt_cols = train_w_coords.drop(columns=cols_to_drop_train)

In [0]:
# Test table without the columns listed in cols_to_drop_test.
test_wt_cols = test_w_coords.drop(columns=cols_to_drop_test)

In [20]:
# Preview the training table after the column drop.
train_wt_cols.head()

Unnamed: 0,Platform Type,Personal or Business,Placement - Day of Month,Placement - Weekday (Mo = 1),Confirmation - Day of Month,Confirmation - Weekday (Mo = 1),Arrival at Pickup - Day of Month,Arrival at Pickup - Weekday (Mo = 1),Pickup - Day of Month,Pickup - Weekday (Mo = 1),Distance (KM),Temperature,Precipitation in millimeters,Pickup Lat,Pickup Long,Destination Lat,Destination Long,Time from Pickup to Arrival,No_Of_Orders,Age,Average_Rating,No_of_Ratings,ascent,avg_speed,descent,detourfactor,distance,steps_length,duration
0,3,Business,9,5,9,5,9,5,9,5,4,20.4,,-1.317755,36.83037,-1.300406,36.829741,745,1637,1309,13.8,549,9.845833,21.45,11.495833,1.59,3.037,11,509.8
1,3,Personal,12,5,12,5,12,5,12,5,16,26.4,,-1.351453,36.899315,-1.295004,36.814358,1993,396,339,13.6,69,130.242525,18.67,53.240025,1.42,16.246,25,3133.3
2,3,Business,30,2,30,2,30,2,30,2,3,,,-1.308284,36.843419,-1.300921,36.828195,455,1023,242,12.5,114,31.738324,21.49,15.863324,1.71,3.274,12,548.4
3,3,Business,15,5,15,5,15,5,15,5,9,19.2,,-1.281301,36.832396,-1.257147,36.795063,1341,886,283,14.5,113,111.500595,21.02,33.350595,1.49,7.259,39,1243.2
4,1,Personal,13,1,13,1,13,1,13,1,9,15.4,,-1.266597,36.792118,-1.295041,36.809817,1214,2311,872,14.1,533,92.232147,21.34,77.855224,1.66,6.226,21,1050.3


In [21]:
# Sanity check: shapes of the coords table, the merged test table and their concatenation.
test_coords.shape, test.shape, test_w_coords.shape

((7068, 7), (7068, 29), (7068, 36))

In [0]:
# Encode 'Personal or Business' as integers. The encoder is fitted on the training
# column and reused for test, so both tables share one mapping.
encoder = LabelEncoder()
train_wt_cols['Personal or Business'] = encoder.fit_transform(train_wt_cols['Personal or Business'])
test_wt_cols['Personal or Business'] = encoder.transform(test_wt_cols['Personal or Business'])

In [23]:
# Preview the training table after encoding 'Personal or Business'.
train_wt_cols.head()

Unnamed: 0,Platform Type,Personal or Business,Placement - Day of Month,Placement - Weekday (Mo = 1),Confirmation - Day of Month,Confirmation - Weekday (Mo = 1),Arrival at Pickup - Day of Month,Arrival at Pickup - Weekday (Mo = 1),Pickup - Day of Month,Pickup - Weekday (Mo = 1),Distance (KM),Temperature,Precipitation in millimeters,Pickup Lat,Pickup Long,Destination Lat,Destination Long,Time from Pickup to Arrival,No_Of_Orders,Age,Average_Rating,No_of_Ratings,ascent,avg_speed,descent,detourfactor,distance,steps_length,duration
0,3,0,9,5,9,5,9,5,9,5,4,20.4,,-1.317755,36.83037,-1.300406,36.829741,745,1637,1309,13.8,549,9.845833,21.45,11.495833,1.59,3.037,11,509.8
1,3,1,12,5,12,5,12,5,12,5,16,26.4,,-1.351453,36.899315,-1.295004,36.814358,1993,396,339,13.6,69,130.242525,18.67,53.240025,1.42,16.246,25,3133.3
2,3,0,30,2,30,2,30,2,30,2,3,,,-1.308284,36.843419,-1.300921,36.828195,455,1023,242,12.5,114,31.738324,21.49,15.863324,1.71,3.274,12,548.4
3,3,0,15,5,15,5,15,5,15,5,9,19.2,,-1.281301,36.832396,-1.257147,36.795063,1341,886,283,14.5,113,111.500595,21.02,33.350595,1.49,7.259,39,1243.2
4,1,1,13,1,13,1,13,1,13,1,9,15.4,,-1.266597,36.792118,-1.295041,36.809817,1214,2311,872,14.1,533,92.232147,21.34,77.855224,1.66,6.226,21,1050.3


In [24]:
# Preview the test table after encoding 'Personal or Business'.
test_wt_cols.head()

Unnamed: 0,Platform Type,Personal or Business,Placement - Day of Month,Placement - Weekday (Mo = 1),Confirmation - Day of Month,Confirmation - Weekday (Mo = 1),Arrival at Pickup - Day of Month,Arrival at Pickup - Weekday (Mo = 1),Pickup - Day of Month,Pickup - Weekday (Mo = 1),Distance (KM),Temperature,Precipitation in millimeters,Pickup Lat,Pickup Long,Destination Lat,Destination Long,No_Of_Orders,Age,Average_Rating,No_of_Ratings,ascent,avg_speed,descent,detourfactor,distance,steps_length,duration
0,3,0,27,3,27,3,27,3,27,3,8,,,-1.333275,36.870815,-1.305249,36.82239,439,1511,13.3,171,58.666481,20.54,38.258148,1.32,8.52,17,1493.0
1,3,0,17,5,17,5,17,5,17,5,5,,,-1.272639,36.794723,-1.277007,36.823907,488,273,14.4,45,67.101756,21.5,134.751756,2.11,6.891,36,1153.8
2,3,0,27,4,27,4,27,4,27,4,5,22.8,,-1.290894,36.822971,-1.276574,36.851365,833,460,13.1,67,20.283028,21.63,43.740171,1.34,4.732,22,787.4
3,3,0,17,1,17,1,17,1,17,1,5,24.5,,-1.290503,36.809646,-1.303382,36.790658,487,560,13.7,44,62.858571,21.83,8.870336,1.55,4.071,15,671.5
4,3,0,11,2,11,2,11,2,11,2,6,24.4,,-1.281081,36.814423,-1.266467,36.792161,4761,1120,14.1,1010,89.355,21.42,46.98,1.4,4.143,14,696.3


In [0]:
# Split the training table into features (X) and target (y).
X = train_wt_cols.drop('Time from Pickup to Arrival', axis=1)
y = train_wt_cols['Time from Pickup to Arrival']
# Note: rebinds `test`, which held the merged raw test table up to this point.
test = test_wt_cols.copy()

In [0]:
# Keep only the training rows whose target is above 20; the mask is taken from
# the unfiltered y and applied to X and y alike.
is_kept = y > 20
X = X.loc[is_kept, :]
y = y[is_kept]

In [0]:
# Keyword arguments for CatBoostRegressor.
params_cat = {
    'task_type': 'GPU',  # NOTE(review): presumably requires a GPU runtime — confirm
    'loss_function': 'RMSE',
    'iterations': 10000,  # upper bound; early stopping normally ends training sooner
    'early_stopping_rounds': 300,
    'learning_rate': 0.1,
    'random_seed': SEED,  # single source of truth for the seed (was a duplicated literal 42)
    'use_best_model': True,  # the fitted model is shrunk to its best iteration on the eval set
}

In [0]:
# Build the regressor from the parameter dict defined above.
catboost = CatBoostRegressor(**params_cat)

In [0]:
# Positional indices of the columns treated as categorical: the first nine columns (0-8).
# NOTE(review): going by the train_wt_cols preview, index 9 is 'Pickup - Weekday (Mo = 1)',
# which is left out while the other day/weekday columns are included — confirm this is intended.
cat_ids = list(range(9))

In [0]:
def makeStrCatCats(data, cat_ids):
    """Convert the columns at the given positions to strings, in place.

    Args:
        data: DataFrame to modify.
        cat_ids: iterable of integer column positions to convert.

    Returns:
        The same ``data`` object, with the selected columns holding ``str`` values.
    """
    for i in cat_ids:
        column = data.columns[i]
        # Replace the whole column by label: writing strings into a numeric
        # column through `.iloc[:, i] = ...` is an in-place set with an
        # incompatible dtype, which recent pandas versions deprecate.
        data[column] = data[column].apply(str)
    return data

In [0]:
# Cast the categorical columns to str BEFORE building the Model: reset_index()
# hands the Model a new frame, so a cast applied to X afterwards would never
# reach the training data and a fresh Restart & Run All would train on the
# uncast columns.
X = makeStrCatCats(X, cat_ids)
test = makeStrCatCats(test, cat_ids)

# Indices are reset because rows were filtered out of X and y above.
model = Model(X.reset_index(drop=True), y.reset_index(drop=True), test, catboost, cat_ids, 'cat', rmse)

In [59]:
# Run the cross-validated fit with the default number of folds; the metric of every fold is printed.
model.fitModel()

0:	learn: 925.9646182	test: 965.3150871	best: 965.3150871 (0)	total: 65.1ms	remaining: 10m 51s
500:	learn: 681.0646091	test: 770.6119900	best: 770.6050455 (495)	total: 24.2s	remaining: 7m 37s
1000:	learn: 655.9712268	test: 769.5277302	best: 769.4775125 (999)	total: 48.2s	remaining: 7m 13s
bestTest = 769.3723762
bestIteration = 1014
Shrink model to first 1015 iterations.
eval_metric:  769.3724424058519
0:	learn: 932.3470075	test: 938.4486518	best: 938.4486518 (0)	total: 48.1ms	remaining: 8m
500:	learn: 685.1827279	test: 749.1769826	best: 749.0487375 (489)	total: 24.1s	remaining: 7m 36s
1000:	learn: 655.0658020	test: 746.4918416	best: 746.4606989 (990)	total: 48.3s	remaining: 7m 14s
1500:	learn: 635.2478283	test: 745.7527386	best: 745.4089230 (1462)	total: 1m 12s	remaining: 6m 52s
bestTest = 745.2780252
bestIteration = 1597
Shrink model to first 1598 iterations.
eval_metric:  745.2780681995475
0:	learn: 933.8657257	test: 929.5366939	best: 929.5366939 (0)	total: 53.9ms	remaining: 8m 59s
5

In [0]:
# Order ids for the submission file; only that column is parsed from Test.csv.
test_idx = pd.read_csv(PATH_TO_DATA / 'given/Test.csv', usecols=['Order No'])['Order No']

In [0]:
# Save the fold-averaged test predictions (getOof) together with the order ids.
write_to_submission_file(model.getOof(), test_idx, 'cat_boost_init.csv')

In [0]:
# Validation metric (rmse) averaged over the folds.
model.getError()

748.189724181231

In [60]:
# Fold-averaged feature importances, largest first.
model.getFeaturesImportances()

Unnamed: 0,features_importance
Distance (KM),21.673651
Average_Rating,7.75897
Age,7.31792
Pickup Lat,6.367657
duration,6.186264
Destination Lat,6.053177
steps_length,5.177292
No_Of_Orders,5.169047
No_of_Ratings,4.646993
Destination Long,4.414627
