In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, explained_variance_score, median_absolute_error, r2_score, mean_absolute_percentage_error
from sklearn.model_selection import train_test_split
from scipy.stats import *
import h3
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from datetime import datetime
from math import floor
import matplotlib.pyplot as plt
import mlflow
import mlflow.sklearn

In [2]:
ride_columns = ['accepted_driver_id', 'created_at', 'passenger_id', 'source_lat',
       'source_lng', 'destination_lat', 'destination_lng', 'eta', 'provider',
       'ata', 'id', 'city']

khatkesh_columns = ['ride_id','driver_id','a_t_a_result.arrival_a_t_a','a_t_a_result.boarding_a_t_a','a_t_a_result.ride_a_t_a','a_t_a_result.arrival_probe_result.probe.point.lat','a_t_a_result.arrival_probe_result.probe.point.lon','a_t_a_result.arrival_probe_result.probe.timestamp','a_t_a_result.arrival_probe_result.confidence','a_t_a_result.arrival_probe_result.h3_index', 'a_t_a_result.arrival_probe_result.k_ring_level','a_t_a_result.boarding_probe_result.probe.point.lat','a_t_a_result.boarding_probe_result.probe.point.lon','a_t_a_result.boarding_probe_result.probe.timestamp','a_t_a_result.boarding_probe_result.confidence','a_t_a_result.boarding_probe_result.h3_index','a_t_a_result.boarding_probe_result.k_ring_level','a_t_a_result.final_destination_probe_result.probe.point.lat','a_t_a_result.final_destination_probe_result.probe.point.lon','a_t_a_result.final_destination_probe_result.probe.timestamp','a_t_a_result.final_destination_probe_result.confidence','a_t_a_result.final_destination_probe_result.h3_index','a_t_a_result.final_destination_probe_result.k_ring_level','a_t_a_result.destination_probe_result.probe.point.lat','a_t_a_result.destination_probe_result.probe.point.lon','a_t_a_result.destination_probe_result.probe.timestamp','a_t_a_result.destination_probe_result.confidence','a_t_a_result.destination_probe_result.h3_index','a_t_a_result.destination_probe_result.k_ring_level','a_t_a_result.extra_destination_probe_result.probe.point.lat','a_t_a_result.extra_destination_probe_result.probe.point.lon','a_t_a_result.extra_destination_probe_result.probe.timestamp','a_t_a_result.extra_destination_probe_result.confidence','a_t_a_result.extra_destination_probe_result.h3_index','a_t_a_result.extra_destination_probe_result.k_ring_level','pickup_a_d_d_result.distance','pickup_a_d_d_result.confidence','pickup_a_d_d_result.route_ratio','pickup_a_d_d_result.g_p_s_ratio','ride_a_d_d_result.distance','ride_a_d_d_result.confidence','ride_a_d_d_result.route_ratio','ride_a_d_d_result.g_p_s_ratio','total_a_d_d_confidence','in_ride_allotment','e_d_d','clickhouse_time','hash']

train_dates = ['2022-08-02', '2022-08-03', '2022-08-04', '2022-08-05', '2022-08-06', '2022-08-07', '2022-08-08',
               '2022-08-09', '2022-08-10', '2022-08-11', '2022-08-12', '2022-08-13', '2022-08-14', '2022-08-15',
               '2022-08-16', '2022-08-17', '2022-08-18', '2022-08-19', '2022-08-20', '2022-08-21', '2022-08-22',
               '2022-08-23', '2022-08-24', '2022-08-25', '2022-08-26', '2022-08-27', '2022-08-28', '2022-08-29',
               '2022-08-30', '2022-08-31', '2022-09-01', '2022-09-02', '2022-09-03', '2022-09-04', '2022-09-05',
               '2022-09-06', '2022-09-07', '2022-09-08', '2022-09-09', '2022-09-10']

train_holidays = ['2022-08-07', '2022-08-08']

test_dates = ['2022-09-11', '2022-09-12', '2022-09-13', '2022-09-14', '2022-09-15', '2022-09-16', '2022-09-17',
              '2022-09-18', '2022-09-19', '2022-09-20', '2022-09-21', '2022-09-22', '2022-09-23', '2022-09-24',
              '2022-09-25', '2022-09-26', '2022-09-27', '2022-09-28', '2022-09-29', '2022-09-30', '2022-10-01',
              '2022-10-02', '2022-10-03', '2022-10-04', '2022-10-05', '2022-10-06', '2022-10-07', '2022-10-08',
              '2022-10-09']

test_holidays = ['2022-09-17', '2022-09-25', '2022-09-27', '2022-10-05']

In [3]:
rides_train = pd.read_csv('../rides_train.csv')
rides_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4063341 entries, 0 to 4063340
Data columns (total 59 columns):
 #   Column                                                       Dtype  
---  ------                                                       -----  
 0   accepted_driver_id                                           int64  
 1   created_at                                                   object 
 2   passenger_id                                                 int64  
 3   source_lat                                                   float64
 4   source_lng                                                   float64
 5   destination_lat                                              float64
 6   destination_lng                                              float64
 7   eta                                                          int64  
 8   provider                                                     object 
 9   ata                                                          int64  

In [4]:
rides_test = pd.read_csv('../rides_test.csv')
rides_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2729988 entries, 0 to 2729987
Data columns (total 59 columns):
 #   Column                                                       Dtype  
---  ------                                                       -----  
 0   accepted_driver_id                                           int64  
 1   created_at                                                   object 
 2   passenger_id                                                 int64  
 3   source_lat                                                   float64
 4   source_lng                                                   float64
 5   destination_lat                                              float64
 6   destination_lng                                              float64
 7   eta                                                          int64  
 8   provider                                                     object 
 9   ata                                                          int64  

In [6]:
rides_train_p3 = rides_train[(rides_train['eta'] >= 1200) & (rides_train['eta'] < 1800)]
rides_train_p3

Unnamed: 0,accepted_driver_id,created_at,passenger_id,source_lat,source_lng,destination_lat,destination_lng,eta,provider,ata,...,pickup_a_d_d_result.g_p_s_ratio,ride_a_d_d_result.distance,ride_a_d_d_result.confidence,ride_a_d_d_result.route_ratio,ride_a_d_d_result.g_p_s_ratio,total_a_d_d_confidence,in_ride_allotment,e_d_d,clickhouse_time,hash
4,1389231,2022-08-02 08:34:03,7470502,35.760803,51.412613,35.680412,51.406525,1295,smapp-same-dc,1320,...,0.007752,24.997999,1.000000,0.0,0.976744,1.000000,0.0,0.000,2022-08-02 09:01:23,16898267108648726188
7,2728438,2022-08-02 07:57:27,46320838,35.551586,51.251617,35.647949,51.399227,1721,smapp-same-dc,1726,...,0.006329,46.188000,1.000000,0.0,1.000000,1.000000,0.0,0.000,2022-08-02 08:29:58,10312041573682854014
8,3929597,2022-08-02 14:30:25,50000201,35.417717,51.795067,35.465965,51.675755,1230,smapp-same-dc,1048,...,0.009901,26.224001,0.950122,0.0,1.000000,0.957399,0.0,0.000,2022-08-02 14:52:35,5706011284955171486
14,1747330,2022-08-02 15:10:14,14512880,35.768822,51.305519,35.765411,51.442303,1641,smapp-same-dc,2785,...,0.003663,41.141998,1.000000,0.0,0.963370,1.000000,0.0,0.000,2022-08-02 16:09:17,8400418949954600245
17,3225425,2022-08-02 14:27:08,37976832,35.748501,51.373596,35.784950,51.459675,1419,smapp-same-dc,2136,...,0.004292,22.626093,0.769841,0.0,0.545064,0.810101,0.0,0.000,2022-08-02 15:15:30,6831010361265584132
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4063317,3600338,2022-09-10 13:22:41,45600791,35.310020,51.730579,35.423836,51.585243,1565,smapp-same-dc,1638,...,0.006329,30.001736,0.768606,0.0,0.506329,0.734278,0.0,21.211,2022-09-10 13:57:58,13937156972589334868
4063318,721503,2022-09-10 02:05:49,18806359,35.719933,51.335495,35.727440,51.526875,1344,smapp-same-dc,1139,...,0.004975,24.430000,0.594638,0.0,0.079602,0.591892,0.0,22.188,2022-09-10 02:38:12,8941506555186050771
4063330,2547634,2022-09-10 14:17:31,18081143,35.720974,51.498474,35.708061,51.401943,1301,smapp-same-dc,1487,...,0.006944,20.761999,0.992727,0.0,0.937500,0.982424,0.0,10.682,2022-09-10 14:51:03,15050861885246709008
4063337,2368196,2022-09-10 15:42:45,5676377,35.724361,51.402294,35.794140,51.472240,1239,smapp-same-dc,1567,...,0.006452,33.134998,0.877245,0.0,0.748387,0.893184,0.0,14.393,2022-09-10 16:13:23,15480662044417010408


In [7]:
rides_test_p3 = rides_test[(rides_test['eta'] >= 1200) & (rides_test['eta'] < 1800)]
rides_test_p3

Unnamed: 0,accepted_driver_id,created_at,passenger_id,source_lat,source_lng,destination_lat,destination_lng,eta,provider,ata,...,pickup_a_d_d_result.g_p_s_ratio,ride_a_d_d_result.distance,ride_a_d_d_result.confidence,ride_a_d_d_result.route_ratio,ride_a_d_d_result.g_p_s_ratio,total_a_d_d_confidence,in_ride_allotment,e_d_d,clickhouse_time,hash
4,2255413,2022-09-11 16:18:50,13204345,35.754002,51.502956,35.633537,51.494732,1586,smapp-same-dc,1663,...,0.005952,29.419765,0.887584,0.0,0.779762,0.887654,0.000,17.757999,2022-09-11 18:07:01,3093743379629491928
9,1835396,2022-09-11 08:00:59,53997073,35.706886,51.198895,35.734352,51.351475,1409,smapp-same-dc,2047,...,0.004386,36.624611,0.966836,0.0,0.864035,0.971825,0.000,19.396999,2022-09-11 08:42:53,2284898990298444222
14,995872,2022-09-11 10:37:12,28136537,35.792473,51.425320,35.698429,51.456745,1708,smapp-same-dc,2254,...,0.004484,31.429775,0.955925,0.0,0.959641,0.942513,0.412,19.094999,2022-09-11 11:21:00,6241151625673717460
26,1551060,2022-09-11 07:58:38,41732133,35.639339,51.357048,35.687080,51.414032,1284,smapp-same-dc,1643,...,0.006173,16.183001,0.739202,0.0,0.469136,0.801064,0.000,11.752000,2022-09-11 08:35:17,8035995659418123778
32,4096383,2022-09-11 07:03:23,684714,35.720085,51.301212,35.710041,51.175381,1275,smapp-same-dc,1435,...,0.006944,34.138000,0.968188,0.0,0.979167,0.972344,0.000,18.556000,2022-09-11 07:32:28,5890795519639149617
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2729961,1009979,2022-10-09 13:29:46,20463806,35.735298,51.317776,35.680313,51.388023,1253,smapp-same-dc,1612,...,0.006289,28.757999,0.960011,0.0,0.911950,0.966451,0.000,14.957000,2022-10-09 14:02:58,16088522187439589912
2729962,658795,2022-10-09 15:03:36,130209,35.807129,51.428375,35.721817,51.440979,1690,smapp-same-dc,2673,...,0.003774,27.115614,0.692608,0.0,0.554717,0.699067,0.000,12.689000,2022-10-09 16:00:13,14193172852111088267
2729973,108962,2022-10-09 11:41:52,19648,35.770760,51.352173,35.711819,51.408939,1581,smapp-same-dc,1624,...,0.005952,31.447001,0.988059,0.0,0.958333,0.989762,0.000,14.896000,2022-10-09 12:13:46,15613762872068344084
2729978,233709,2022-10-09 19:53:50,9499000,35.800331,51.489254,35.755852,51.543327,1312,smapp-same-dc,1202,...,0.000000,12.179000,0.500000,0.0,0.000000,0.450000,0.000,12.179000,2022-10-09 20:20:12,12251248972094970402


There are some duplicates in the data

In [8]:
rides_train_p3 = rides_train_p3.drop_duplicates(subset=['ride_id', 'ata'])
len(rides_train_p3)

698924

In [9]:
rides_test_p3 = rides_test_p3.drop_duplicates(subset=['ride_id', 'ata'])
len(rides_test_p3)

459060

In [10]:
rides_train_p3 = rides_train_p3[(rides_train_p3['a_t_a_result.ride_a_t_a'] > 180) &
                                (rides_train_p3['a_t_a_result.ride_a_t_a'] < 10800)]
rides_train_p3.describe()

Unnamed: 0,accepted_driver_id,passenger_id,source_lat,source_lng,destination_lat,destination_lng,eta,ata,ride_id,city,...,pickup_a_d_d_result.route_ratio,pickup_a_d_d_result.g_p_s_ratio,ride_a_d_d_result.distance,ride_a_d_d_result.confidence,ride_a_d_d_result.route_ratio,ride_a_d_d_result.g_p_s_ratio,total_a_d_d_confidence,in_ride_allotment,e_d_d,hash
count,698251.0,698251.0,698251.0,698251.0,698251.0,698251.0,698251.0,698251.0,698251.0,698251.0,...,698251.0,698251.0,698251.0,698251.0,698251.0,698251.0,698251.0,698251.0,698251.0,698251.0
mean,1857428.0,21687310.0,35.710893,51.399347,35.712309,51.395567,1456.451147,1725.10225,4824418000.0,1.0,...,0.000156,0.005585,26.897764,0.854451,1.9e-05,0.719288,0.86349,0.074902,3.885754,9.220974e+18
std,1409675.0,17875870.0,0.074141,0.100507,0.073239,0.106259,169.848543,575.757119,60121780.0,0.0,...,0.00976,0.002304,10.484821,0.172755,0.000484,0.333526,0.164481,0.608518,7.312222,5.324649e+18
min,9.0,76.0,35.178387,50.901024,35.173473,50.764294,1200.0,143.0,4714480000.0,1.0,...,0.0,0.0,1.398,0.0,0.0,0.0,0.0,0.0,0.0,76924880000000.0
25%,447417.0,3664554.0,35.681526,51.349068,35.685211,51.349876,1308.0,1386.0,4774177000.0,1.0,...,0.0,0.004717,19.204,0.756863,0.0,0.5,0.781687,0.0,0.0,4.613144e+18
50%,1746312.0,20155980.0,35.72385,51.407139,35.724022,51.40818,1436.0,1624.0,4825935000.0,1.0,...,0.0,0.005882,25.815001,0.937937,0.0,0.893443,0.940537,0.0,0.0,9.214321e+18
75%,3138568.0,36039660.0,35.759724,51.453899,35.760132,51.449703,1594.0,1930.0,4872342000.0,1.0,...,0.0,0.006993,33.271999,0.987221,0.0,0.97931,0.987554,0.0,0.0,1.38387e+19
max,4330757.0,57377540.0,35.83987,52.1077,35.943981,52.106018,1799.0,20274.0,4933870000.0,1.0,...,1.0,0.052632,1042.181519,1.0,0.072162,1.0,1.0,84.599289,69.93,1.844674e+19


In [11]:
rides_train_p3 = rides_train_p3[(rides_train_p3['a_t_a_result.destination_probe_result.confidence'] > 0) & (rides_train_p3['a_t_a_result.boarding_probe_result.confidence'] > 0)]
rides_train_p3.describe()

Unnamed: 0,accepted_driver_id,passenger_id,source_lat,source_lng,destination_lat,destination_lng,eta,ata,ride_id,city,...,pickup_a_d_d_result.route_ratio,pickup_a_d_d_result.g_p_s_ratio,ride_a_d_d_result.distance,ride_a_d_d_result.confidence,ride_a_d_d_result.route_ratio,ride_a_d_d_result.g_p_s_ratio,total_a_d_d_confidence,in_ride_allotment,e_d_d,hash
count,418611.0,418611.0,418611.0,418611.0,418611.0,418611.0,418611.0,418611.0,418611.0,418611.0,...,418611.0,418611.0,418611.0,418611.0,418611.0,418611.0,418611.0,418611.0,418611.0,418611.0
mean,1903604.0,21477330.0,35.711474,51.399068,35.713951,51.395514,1455.52727,1748.044648,4824397000.0,1.0,...,7.6e-05,0.006217,28.394496,0.904068,1.1e-05,0.82622,0.91317,0.081313,3.960848,9.213893e+18
std,1419565.0,17835460.0,0.07331,0.100062,0.072304,0.106199,169.783044,567.404541,60249310.0,0.0,...,0.006515,0.001535,10.024319,0.137919,0.000339,0.255499,0.12315,0.623839,7.372856,5.324936e+18
min,9.0,76.0,35.178387,50.901024,35.173473,50.764294,1200.0,143.0,4714480000.0,1.0,...,0.0,0.000904,1.398,0.0,0.0,0.003937,0.0,0.0,0.0,76924880000000.0
25%,473833.0,3543176.0,35.681633,51.348881,35.68692,51.350353,1307.0,1412.0,4773912000.0,1.0,...,0.0,0.005208,21.094999,0.883406,0.0,0.784946,0.890993,0.0,0.0,4.606591e+18
50%,1824698.0,19783290.0,35.724007,51.406883,35.725761,51.407803,1435.0,1646.0,4825453000.0,1.0,...,0.0,0.006135,27.467377,0.964323,0.0,0.948617,0.965939,0.0,0.0,9.200858e+18
75%,3207710.0,35782050.0,35.759914,51.453754,35.761288,51.450859,1592.0,1950.0,4872820000.0,1.0,...,0.0,0.007143,34.560001,0.994926,0.0,0.992308,0.994692,0.0,0.0,1.383479e+19
max,4330757.0,57377540.0,35.839024,52.1077,35.943981,52.103642,1799.0,20274.0,4933870000.0,1.0,...,1.0,0.052632,260.281769,1.0,0.048962,1.0,1.0,22.429001,66.652,1.844674e+19


In [12]:
rides_test_p3 = rides_test_p3[(rides_test_p3['a_t_a_result.destination_probe_result.confidence'] > 0) & (rides_test_p3['a_t_a_result.boarding_probe_result.confidence'] > 0)]
rides_test_p3.describe()

Unnamed: 0,accepted_driver_id,passenger_id,source_lat,source_lng,destination_lat,destination_lng,eta,ata,ride_id,city,...,pickup_a_d_d_result.route_ratio,pickup_a_d_d_result.g_p_s_ratio,ride_a_d_d_result.distance,ride_a_d_d_result.confidence,ride_a_d_d_result.route_ratio,ride_a_d_d_result.g_p_s_ratio,total_a_d_d_confidence,in_ride_allotment,e_d_d,hash
count,277921.0,277921.0,277921.0,277921.0,277921.0,277921.0,277921.0,277921.0,277921.0,277921.0,...,277921.0,277921.0,277921.0,277921.0,277921.0,277921.0,277921.0,277921.0,277921.0,277921.0
mean,1950764.0,22064430.0,35.707878,51.398545,35.710745,51.395072,1453.74868,1745.764156,5010238000.0,1.0,...,5.7e-05,0.006274,29.568464,0.905545,1e-05,0.835038,0.914267,0.023702,16.671833,9.232512e+18
std,1456583.0,18067660.0,0.076061,0.103353,0.075421,0.110683,169.292906,605.452738,44045170.0,0.0,...,0.005579,0.00161,10.869953,0.137264,0.000304,0.254168,0.122694,0.241595,5.479094,5.329329e+18
min,9.0,17.0,35.181488,50.900341,35.168644,50.867809,1200.0,128.0,4933925000.0,1.0,...,0.0,0.00063,0.124,0.0,0.0,0.004914,0.0,0.0,0.421,14929870000000.0
25%,485463.0,3826310.0,35.676189,51.345062,35.681149,51.34626,1306.0,1397.0,4972242000.0,1.0,...,0.0,0.005263,21.969,0.888027,0.0,0.813433,0.89449,0.0,12.729,4.613929e+18
50%,1874054.0,20664210.0,35.722034,51.405483,35.724213,51.407406,1432.0,1629.0,5011284000.0,1.0,...,0.0,0.006211,28.691999,0.9657,0.0,0.955882,0.967212,0.0,16.07,9.233618e+18
75%,3280994.0,36480220.0,35.758289,51.455685,35.760063,51.453274,1589.0,1932.0,5048534000.0,1.0,...,0.0,0.007246,36.049297,0.995148,0.0,0.99359,0.994865,0.0,19.937,1.384941e+19
max,4420741.0,58137520.0,35.83984,52.081783,35.896008,52.107143,1799.0,18031.0,5083628000.0,1.0,...,1.0,0.066667,1277.150391,1.0,0.036465,1.0,1.0,40.013607,90.994003,1.844671e+19


In [13]:
rides_train_p3 = rides_train_p3[['eta', 'ata', 'a_t_a_result.ride_a_t_a', 'created_at', 'source_lat', 'source_lng', 'destination_lat', 'destination_lng', 'e_d_d', 'clickhouse_time']]
rides_train_p3

Unnamed: 0,eta,ata,a_t_a_result.ride_a_t_a,created_at,source_lat,source_lng,destination_lat,destination_lng,e_d_d,clickhouse_time
4,1295,1320,1288,2022-08-02 08:34:03,35.760803,51.412613,35.680412,51.406525,0.000,2022-08-02 09:01:23
7,1721,1726,1580,2022-08-02 07:57:27,35.551586,51.251617,35.647949,51.399227,0.000,2022-08-02 08:29:58
8,1230,1048,1004,2022-08-02 14:30:25,35.417717,51.795067,35.465965,51.675755,0.000,2022-08-02 14:52:35
14,1641,2785,2727,2022-08-02 15:10:14,35.768822,51.305519,35.765411,51.442303,0.000,2022-08-02 16:09:17
18,1390,2010,1964,2022-08-02 11:08:50,35.730118,51.382473,35.758369,51.441551,0.000,2022-08-02 12:05:23
...,...,...,...,...,...,...,...,...,...,...
4063264,1712,1875,1830,2022-09-10 12:24:51,35.669792,51.302319,35.685867,51.414448,13.342,2022-09-10 13:04:13
4063317,1565,1638,1579,2022-09-10 13:22:41,35.310020,51.730579,35.423836,51.585243,21.211,2022-09-10 13:57:58
4063330,1301,1487,1435,2022-09-10 14:17:31,35.720974,51.498474,35.708061,51.401943,10.682,2022-09-10 14:51:03
4063337,1239,1567,1544,2022-09-10 15:42:45,35.724361,51.402294,35.794140,51.472240,14.393,2022-09-10 16:13:23


In [14]:
rides_test_p3 = rides_test_p3[['eta', 'ata', 'a_t_a_result.ride_a_t_a', 'created_at', 'source_lat', 'source_lng', 'destination_lat', 'destination_lng', 'e_d_d', 'clickhouse_time']]
rides_test_p3

Unnamed: 0,eta,ata,a_t_a_result.ride_a_t_a,created_at,source_lat,source_lng,destination_lat,destination_lng,e_d_d,clickhouse_time
4,1586,1663,1672,2022-09-11 16:18:50,35.754002,51.502956,35.633537,51.494732,17.757999,2022-09-11 18:07:01
14,1708,2254,2224,2022-09-11 10:37:12,35.792473,51.425320,35.698429,51.456745,19.094999,2022-09-11 11:21:00
32,1275,1435,1436,2022-09-11 07:03:23,35.720085,51.301212,35.710041,51.175381,18.556000,2022-09-11 07:32:28
34,1389,1729,1685,2022-09-11 08:14:49,35.684532,51.409050,35.756382,51.395046,11.348000,2022-09-11 08:52:25
46,1432,1450,1556,2022-09-11 12:16:11,35.765507,51.418747,35.710609,51.315659,18.350000,2022-09-11 12:47:13
...,...,...,...,...,...,...,...,...,...,...
2729932,1605,1964,1888,2022-10-09 11:23:07,35.718967,51.314106,35.742729,51.408863,13.023000,2022-10-09 12:01:00
2729934,1508,1672,1620,2022-10-09 11:43:55,35.661739,51.383343,35.787380,51.375748,18.952999,2022-10-09 12:18:13
2729939,1783,1984,1932,2022-10-09 20:13:35,35.774361,51.377525,35.692543,51.365459,12.733000,2022-10-09 20:51:04
2729953,1762,2073,2231,2022-10-09 18:35:58,35.628841,51.337055,35.557434,51.229160,14.194000,2022-10-09 19:16:11


In [15]:
rides_train_p3['source_h3_4'] = rides_train_p3.apply(lambda row: h3.geo_to_h3(row.source_lat, row.source_lng, 4), axis=1)
rides_train_p3['dest_h3_4'] = rides_train_p3.apply(lambda row: h3.geo_to_h3(row.destination_lat, row.destination_lng, 4), axis=1)
rides_train_p3['hour'] = rides_train_p3.apply(lambda row: ((datetime.timestamp(datetime.strptime(row.created_at,"%Y-%m-%d %H:%M:%S")) - 1662147000) % 604800) / 3600, axis=1)
rides_train_p3['holiday'] = rides_train_p3.apply(lambda row: int(row.created_at.split()[0] in train_holidays), axis=1)
rides_train_p3

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rides_train_p3['source_h3_4'] = rides_train_p3.apply(lambda row: h3.geo_to_h3(row.source_lat, row.source_lng, 4), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rides_train_p3['dest_h3_4'] = rides_train_p3.apply(lambda row: h3.geo_to_h3(row.destination_lat, row.destination_lng, 4), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/

Unnamed: 0,eta,ata,a_t_a_result.ride_a_t_a,created_at,source_lat,source_lng,destination_lat,destination_lng,e_d_d,clickhouse_time,source_h3_4,dest_h3_4,hour,holiday
4,1295,1320,1288,2022-08-02 08:34:03,35.760803,51.412613,35.680412,51.406525,0.000,2022-08-02 09:01:23,842cf31ffffffff,842cf31ffffffff,80.567500,0
7,1721,1726,1580,2022-08-02 07:57:27,35.551586,51.251617,35.647949,51.399227,0.000,2022-08-02 08:29:58,842cf35ffffffff,842cf31ffffffff,79.957500,0
8,1230,1048,1004,2022-08-02 14:30:25,35.417717,51.795067,35.465965,51.675755,0.000,2022-08-02 14:52:35,842cf37ffffffff,842cf37ffffffff,86.506944,0
14,1641,2785,2727,2022-08-02 15:10:14,35.768822,51.305519,35.765411,51.442303,0.000,2022-08-02 16:09:17,842cf31ffffffff,842cf31ffffffff,87.170556,0
18,1390,2010,1964,2022-08-02 11:08:50,35.730118,51.382473,35.758369,51.441551,0.000,2022-08-02 12:05:23,842cf31ffffffff,842cf31ffffffff,83.147222,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4063264,1712,1875,1830,2022-09-10 12:24:51,35.669792,51.302319,35.685867,51.414448,13.342,2022-09-10 13:04:13,842cf31ffffffff,842cf31ffffffff,12.414167,0
4063317,1565,1638,1579,2022-09-10 13:22:41,35.310020,51.730579,35.423836,51.585243,21.211,2022-09-10 13:57:58,842cf37ffffffff,842cf35ffffffff,13.378056,0
4063330,1301,1487,1435,2022-09-10 14:17:31,35.720974,51.498474,35.708061,51.401943,10.682,2022-09-10 14:51:03,842cf31ffffffff,842cf31ffffffff,14.291944,0
4063337,1239,1567,1544,2022-09-10 15:42:45,35.724361,51.402294,35.794140,51.472240,14.393,2022-09-10 16:13:23,842cf31ffffffff,842cf31ffffffff,15.712500,0


In [16]:
rides_test_p3['source_h3_4'] = rides_test_p3.apply(lambda row: h3.geo_to_h3(row.source_lat, row.source_lng, 4), axis=1)
rides_test_p3['dest_h3_4'] = rides_test_p3.apply(lambda row: h3.geo_to_h3(row.destination_lat, row.destination_lng, 4), axis=1)
rides_test_p3['hour'] = rides_test_p3.apply(lambda row: ((datetime.timestamp(datetime.strptime(row.created_at,"%Y-%m-%d %H:%M:%S")) - 1662147000) % 604800) / 3600, axis=1)
rides_test_p3['holiday'] = rides_test_p3.apply(lambda row: int(row.created_at.split()[0] in test_holidays), axis=1)
rides_test_p3

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rides_test_p3['source_h3_4'] = rides_test_p3.apply(lambda row: h3.geo_to_h3(row.source_lat, row.source_lng, 4), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rides_test_p3['dest_h3_4'] = rides_test_p3.apply(lambda row: h3.geo_to_h3(row.destination_lat, row.destination_lng, 4), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/inde

Unnamed: 0,eta,ata,a_t_a_result.ride_a_t_a,created_at,source_lat,source_lng,destination_lat,destination_lng,e_d_d,clickhouse_time,source_h3_4,dest_h3_4,hour,holiday
4,1586,1663,1672,2022-09-11 16:18:50,35.754002,51.502956,35.633537,51.494732,17.757999,2022-09-11 18:07:01,842cf31ffffffff,842cf31ffffffff,40.313889,0
14,1708,2254,2224,2022-09-11 10:37:12,35.792473,51.425320,35.698429,51.456745,19.094999,2022-09-11 11:21:00,842cf31ffffffff,842cf31ffffffff,34.620000,0
32,1275,1435,1436,2022-09-11 07:03:23,35.720085,51.301212,35.710041,51.175381,18.556000,2022-09-11 07:32:28,842cf31ffffffff,842cf3dffffffff,31.056389,0
34,1389,1729,1685,2022-09-11 08:14:49,35.684532,51.409050,35.756382,51.395046,11.348000,2022-09-11 08:52:25,842cf31ffffffff,842cf31ffffffff,32.246944,0
46,1432,1450,1556,2022-09-11 12:16:11,35.765507,51.418747,35.710609,51.315659,18.350000,2022-09-11 12:47:13,842cf31ffffffff,842cf31ffffffff,36.269722,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2729932,1605,1964,1888,2022-10-09 11:23:07,35.718967,51.314106,35.742729,51.408863,13.023000,2022-10-09 12:01:00,842cf31ffffffff,842cf31ffffffff,36.385278,0
2729934,1508,1672,1620,2022-10-09 11:43:55,35.661739,51.383343,35.787380,51.375748,18.952999,2022-10-09 12:18:13,842cf31ffffffff,842cf31ffffffff,36.731944,0
2729939,1783,1984,1932,2022-10-09 20:13:35,35.774361,51.377525,35.692543,51.365459,12.733000,2022-10-09 20:51:04,842cf31ffffffff,842cf31ffffffff,45.226389,0
2729953,1762,2073,2231,2022-10-09 18:35:58,35.628841,51.337055,35.557434,51.229160,14.194000,2022-10-09 19:16:11,842cf31ffffffff,842cf35ffffffff,43.599444,0


In [26]:
train_p3_geo_x = rides_train_p3[["eta", "source_lat", "source_lng", "destination_lat", "destination_lng", "hour", "holiday"]]
train_p3_h3_x = rides_train_p3[["eta", "source_h3_4", "dest_h3_4", "hour", "holiday"]]
train_p3_geo_y = rides_train_p3["a_t_a_result.ride_a_t_a"]
train_p3_h3_y = rides_train_p3["a_t_a_result.ride_a_t_a"]

In [27]:
test_p3_geo_x = rides_test_p3[["eta", "source_lat", "source_lng", "destination_lat", "destination_lng", "hour", "holiday"]]
test_p3_h3_x = rides_test_p3[["eta", "source_h3_4", "dest_h3_4", "hour", "holiday"]]
test_p3_geo_y = rides_test_p3["a_t_a_result.ride_a_t_a"]
test_p3_h3_y = rides_test_p3["a_t_a_result.ride_a_t_a"]

In [28]:
dummy_train_rides = pd.get_dummies(train_p3_h3_x, prefix={'source_h3_4':'source_h3_4',
                                                             'dest_h3_4':'dest_h3_4'})
dummy_train_rides.describe()

Unnamed: 0,eta,hour,holiday,source_h3_4_842cd4bffffffff,source_h3_4_842cd59ffffffff,source_h3_4_842cf31ffffffff,source_h3_4_842cf33ffffffff,source_h3_4_842cf35ffffffff,source_h3_4_842cf37ffffffff,source_h3_4_842cf3dffffffff,dest_h3_4_842cd4bffffffff,dest_h3_4_842cd59ffffffff,dest_h3_4_842cf07ffffffff,dest_h3_4_842cf23ffffffff,dest_h3_4_842cf31ffffffff,dest_h3_4_842cf33ffffffff,dest_h3_4_842cf35ffffffff,dest_h3_4_842cf37ffffffff,dest_h3_4_842cf39ffffffff,dest_h3_4_842cf3dffffffff
count,418611.0,418611.0,418611.0,418611.0,418611.0,418611.0,418611.0,418611.0,418611.0,418611.0,418611.0,418611.0,418611.0,418611.0,418611.0,418611.0,418611.0,418611.0,418611.0,418611.0
mean,1455.52727,82.690465,0.016753,0.000344,0.000437,0.926932,0.006242,0.028661,0.009823,0.02756,0.000389,0.000528,2e-06,5e-06,0.914943,0.005396,0.028392,0.009149,0.000755,0.040441
std,169.783044,45.894171,0.128345,0.018544,0.020904,0.260248,0.07876,0.166853,0.098623,0.163709,0.019729,0.022971,0.001546,0.002186,0.278968,0.073262,0.166089,0.095214,0.027465,0.196991
min,1200.0,0.008889,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1307.0,40.830556,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
50%,1435.0,85.934444,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
75%,1592.0,116.331667,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
max,1799.0,167.586111,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [29]:
print(np.count_nonzero(dummy_train_rides, axis=0))
print(dummy_train_rides.columns)

[418611 418611   7013    144    183 388024   2613  11998   4112  11537
    163    221      1      2 383005   2259  11885   3830    316  16929]
Index(['eta', 'hour', 'holiday', 'source_h3_4_842cd4bffffffff',
       'source_h3_4_842cd59ffffffff', 'source_h3_4_842cf31ffffffff',
       'source_h3_4_842cf33ffffffff', 'source_h3_4_842cf35ffffffff',
       'source_h3_4_842cf37ffffffff', 'source_h3_4_842cf3dffffffff',
       'dest_h3_4_842cd4bffffffff', 'dest_h3_4_842cd59ffffffff',
       'dest_h3_4_842cf07ffffffff', 'dest_h3_4_842cf23ffffffff',
       'dest_h3_4_842cf31ffffffff', 'dest_h3_4_842cf33ffffffff',
       'dest_h3_4_842cf35ffffffff', 'dest_h3_4_842cf37ffffffff',
       'dest_h3_4_842cf39ffffffff', 'dest_h3_4_842cf3dffffffff'],
      dtype='object')


In [30]:
geo_noise = np.count_nonzero(dummy_train_rides, axis=0) < 100

cols = dummy_train_rides.columns
for col, noise in zip(cols, geo_noise):
    if noise:
        # DOESN'T REINDEX
        # remove the rows where rides_train_p3_x is in the noisy geo (geo is 1)
        train_p3_h3_x = train_p3_h3_x[dummy_train_rides[col] == 0]
        train_p3_h3_y = train_p3_h3_y[dummy_train_rides[col] == 0]

train_p3_h3_x.describe()

  train_p3_h3_x = train_p3_h3_x[dummy_train_rides[col] == 0]


Unnamed: 0,eta,hour,holiday
count,418608.0,418608.0,418608.0
mean,1455.525575,82.69044,0.016751
std,169.782417,45.894241,0.128336
min,1200.0,0.008889,0.0
25%,1307.0,40.830278,0.0
50%,1435.0,85.934306,0.0
75%,1592.0,116.331667,0.0
max,1799.0,167.586111,1.0


In [31]:
ohe = ColumnTransformer(
    [('OHE', OneHotEncoder(handle_unknown='ignore', sparse=False),['source_h3_4', 'dest_h3_4'])],
    remainder = 'passthrough'
    ).fit(train_p3_h3_x)

In [32]:
ohe.get_feature_names_out()

array(['OHE__source_h3_4_842cd4bffffffff',
       'OHE__source_h3_4_842cd59ffffffff',
       'OHE__source_h3_4_842cf31ffffffff',
       'OHE__source_h3_4_842cf33ffffffff',
       'OHE__source_h3_4_842cf35ffffffff',
       'OHE__source_h3_4_842cf37ffffffff',
       'OHE__source_h3_4_842cf3dffffffff',
       'OHE__dest_h3_4_842cd4bffffffff', 'OHE__dest_h3_4_842cd59ffffffff',
       'OHE__dest_h3_4_842cf31ffffffff', 'OHE__dest_h3_4_842cf33ffffffff',
       'OHE__dest_h3_4_842cf35ffffffff', 'OHE__dest_h3_4_842cf37ffffffff',
       'OHE__dest_h3_4_842cf39ffffffff', 'OHE__dest_h3_4_842cf3dffffffff',
       'remainder__eta', 'remainder__hour', 'remainder__holiday'],
      dtype=object)

In [33]:
train_p3_h3_x = ohe.transform(train_p3_h3_x)

In [34]:
test_p3_h3_x = ohe.transform(test_p3_h3_x)

In [35]:
reg_geo_p3 = LinearRegression().fit(train_p3_geo_x, train_p3_geo_y)

In [36]:
print(reg_geo_p3.coef_)
print(reg_geo_p3.intercept_)

[  1.25130281 135.20598499  24.93671027 451.76374288  84.72800247
  -0.5420216  -85.75198965]
-26661.645050796564


In [37]:
train_pred_geo = reg_geo_p3.predict(train_p3_geo_x)
test_pred_geo = reg_geo_p3.predict(test_p3_geo_x)

In [38]:
reg_h3_p3 = LinearRegression().fit(train_p3_h3_x, train_p3_h3_y)

In [39]:
print(reg_h3_p3.coef_)
print(reg_h3_p3.intercept_)

[-7.42906563e+11 -7.42906563e+11 -7.42906563e+11 -7.42906563e+11
 -7.42906563e+11 -7.42906563e+11 -7.42906563e+11  2.25564338e+11
  2.25564338e+11  2.25564338e+11  2.25564338e+11  2.25564338e+11
  2.25564338e+11  2.25564338e+11  2.25564338e+11  1.25261658e+00
 -5.54887253e-01 -8.74726453e+01]
517342225087.9334


In [40]:
train_pred_h3 = reg_h3_p3.predict(train_p3_h3_x)
test_pred_h3 = reg_h3_p3.predict(test_p3_h3_x)

In [41]:
known_h3_index = np.count_nonzero(test_p3_h3_x[:, 0:17], axis=1) == 2

In [42]:
test_p3_h3_y = test_p3_h3_y[known_h3_index]
test_p3_h3_x = test_p3_h3_x[known_h3_index]
test_pred_h3 = test_pred_h3[known_h3_index]

In [43]:
train_p3_geo_x

Unnamed: 0,eta,source_lat,source_lng,destination_lat,destination_lng,hour,holiday
4,1295,35.760803,51.412613,35.680412,51.406525,80.567500,0
7,1721,35.551586,51.251617,35.647949,51.399227,79.957500,0
8,1230,35.417717,51.795067,35.465965,51.675755,86.506944,0
14,1641,35.768822,51.305519,35.765411,51.442303,87.170556,0
18,1390,35.730118,51.382473,35.758369,51.441551,83.147222,0
...,...,...,...,...,...,...,...
4063264,1712,35.669792,51.302319,35.685867,51.414448,12.414167,0
4063317,1565,35.310020,51.730579,35.423836,51.585243,13.378056,0
4063330,1301,35.720974,51.498474,35.708061,51.401943,14.291944,0
4063337,1239,35.724361,51.402294,35.794140,51.472240,15.712500,0


In [44]:
print("ETA train MAE", mean_absolute_error(train_p3_geo_y, train_p3_geo_x['eta']))
print("prediction train MAE", mean_absolute_error(train_p3_geo_y, train_pred_geo))

print("ETA test MAE ", mean_absolute_error(test_p3_geo_y, test_p3_geo_x['eta']))
print("prediction test MAE", mean_absolute_error(test_p3_geo_y, test_pred_geo))


print("ETA train MSE", mean_squared_error(train_p3_geo_y, train_p3_geo_x['eta']))
print("prediction train MSE", mean_squared_error(train_p3_geo_y, train_pred_geo))

print("ETA test MSE ", mean_squared_error(test_p3_geo_y, test_p3_geo_x['eta']))
print("prediction test MSE", mean_squared_error(test_p3_geo_y, test_pred_geo))


print("ETA train explained_variance_score", explained_variance_score(train_p3_geo_y, train_p3_geo_x['eta']))
print("prediction train explained_variance_score", explained_variance_score(train_p3_geo_y, train_pred_geo))

print("ETA test explained_variance_score ", explained_variance_score(test_p3_geo_y, test_p3_geo_x['eta']))
print("prediction test explained_variance_score", explained_variance_score(test_p3_geo_y, test_pred_geo))


print("ETA train median_absolute_error", median_absolute_error(train_p3_geo_y, train_p3_geo_x['eta']))
print("prediction train median_absolute_error", median_absolute_error(train_p3_geo_y, train_pred_geo))

print("ETA test median_absolute_error ", median_absolute_error(test_p3_geo_y, test_p3_geo_x['eta']))
print("prediction test median_absolute_error", median_absolute_error(test_p3_geo_y, test_pred_geo))


print("ETA train r2_score", r2_score(train_p3_geo_y, train_p3_geo_x['eta']))
print("prediction train r2_score", r2_score(train_p3_geo_y, train_pred_geo))

print("ETA test r2_score ", r2_score(test_p3_geo_y, test_p3_geo_x['eta']))
print("prediction test r2_score", r2_score(test_p3_geo_y, test_pred_geo))


print("ETA train MAPE", mean_absolute_percentage_error(train_p3_geo_y, train_p3_geo_x['eta']))
print("prediction train MAPE", mean_absolute_percentage_error(train_p3_geo_y, train_pred_geo))

print("ETA test MAPE ", mean_absolute_percentage_error(test_p3_geo_y, test_p3_geo_x['eta']))
print("prediction test MAPE", mean_absolute_percentage_error(test_p3_geo_y, test_pred_geo))

ETA train MAE 328.5940455458648
prediction train MAE 297.90919973258815
ETA test MAE  334.4388657208343
prediction test MAE 313.0926640552622
ETA train MSE 267538.4672356914
prediction train MSE 197278.02374889096
ETA test MSE  307447.5143727894
prediction test MSE 236500.08092897446
ETA train explained_variance_score 0.17476575601716515
prediction train explained_variance_score 0.19219475004177133
ETA test explained_variance_score  0.15194470364736523
prediction test explained_variance_score 0.17348450223625478
ETA train median_absolute_error 205.0
prediction train median_absolute_error 224.17002217840854
ETA test median_absolute_error  197.0
prediction test median_absolute_error 226.25554632038256
ETA train r2_score -0.0955045792321001
prediction train r2_score 0.19219475004177133
ETA test r2_score  -0.07452246219765657
prediction test r2_score 0.17343729453082235
ETA train MAPE 0.16670909089984084
prediction train MAPE 0.17256322803076857
ETA test MAPE  0.16769182531397583
predictio

In [46]:
train_p4_geo_x_normalized = (train_p4_geo_x - train_p4_geo_x.min()) / (train_p4_geo_x.max() - train_p4_geo_x.min())
train_p4_geo_y_normalized = (train_p4_geo_y - train_p4_geo_y.min()) / (train_p4_geo_y.max() - train_p4_geo_y.min())
# test_p4_geo_x_normalized = (test_p4_geo_x - train_p4_geo_x.min()) / (train_p4_geo_x.max() - train_p4_geo_x.min())
# test_p4_geo_y_normalized = (test_p4_geo_y - train_p4_geo_y.min()) / (train_p4_geo_y.max() - train_p4_geo_y.min())

reg_p4_geo_normalized = LinearRegression().fit(train_p4_geo_x_normalized, train_p4_geo_y_normalized)
print(reg_p4_geo_normalized.coef_)
print(reg_p4_geo_normalized.intercept_)

[ 0.95267888  0.03992745  0.01639903  0.14161629  0.07104881 -0.00869313
 -0.01985985]
0.027375837176890544


In [39]:
print("ETA train MAE", mean_absolute_error(train_p4_h3_y, train_p4_h3_x[:, 17]))
print("prediction train MAE", mean_absolute_error(train_p4_h3_y, train_pred_h3))

print("ETA test MAE ", mean_absolute_error(test_p4_h3_y, test_p4_h3_x[:, 17]))
print("prediction test MAE", mean_absolute_error(test_p4_h3_y, test_pred_h3))


print("ETA train MSE", mean_squared_error(train_p4_h3_y, train_p4_h3_x[:, 17]))
print("prediction train MSE", mean_squared_error(train_p4_h3_y, train_pred_h3))

print("ETA test MSE ", mean_squared_error(test_p4_h3_y, test_p4_h3_x[:, 17]))
print("prediction test MSE", mean_squared_error(test_p4_h3_y, test_pred_h3))


print("ETA train explained_variance_score", explained_variance_score(train_p4_h3_y, train_p4_h3_x[:, 17]))
print("prediction train explained_variance_score", explained_variance_score(train_p4_h3_y, train_pred_h3))

print("ETA test explained_variance_score ", explained_variance_score(test_p4_h3_y, test_p4_h3_x[:, 17]))
print("prediction test explained_variance_score", explained_variance_score(test_p4_h3_y, test_pred_h3))


print("ETA train median_absolute_error", median_absolute_error(train_p4_h3_y, train_p4_h3_x[:, 17]))
print("prediction train median_absolute_error", median_absolute_error(train_p4_h3_y, train_pred_h3))

print("ETA test median_absolute_error ", median_absolute_error(test_p4_h3_y, test_p4_h3_x[:, 17]))
print("prediction test median_absolute_error", median_absolute_error(test_p4_h3_y, test_pred_h3))


print("ETA train r2_score", r2_score(train_p4_h3_y, train_p4_h3_x[:, 17]))
print("prediction train r2_score", r2_score(train_p4_h3_y, train_pred_h3))

print("ETA test r2_score ", r2_score(test_p4_h3_y, test_p4_h3_x[:, 17]))
print("prediction test r2_score", r2_score(test_p4_h3_y, test_pred_h3))


print("ETA train MAPE", mean_absolute_percentage_error(train_p4_h3_y, train_p4_h3_x[:, 17]))
print("prediction train MAPE", mean_absolute_percentage_error(train_p4_h3_y, train_pred_h3))

print("ETA test MAPE ", mean_absolute_percentage_error(test_p4_h3_y, test_p4_h3_x[:, 17]))
print("prediction test MAPE", mean_absolute_percentage_error(test_p4_h3_y, test_pred_h3))

ETA train MAE 558.6899742605314
prediction train MAE 468.32320172854264
ETA test MAE  609.4802658229795
prediction test MAE 532.3905549659162
ETA train MSE 724259.5492995734
prediction train MSE 474510.4137146755
ETA test MSE  1005379.4640005743
prediction test MSE 706504.3378821977
ETA train explained_variance_score 0.5486781823287357
prediction train explained_variance_score 0.5735345690702949
ETA test explained_variance_score  0.4753275329161858
prediction test explained_variance_score 0.5084530615897137
ETA train median_absolute_error 353.0
prediction train median_absolute_error 347.19482421875
ETA test median_absolute_error  352.0
prediction test median_absolute_error 366.8814697265625
ETA train r2_score 0.3490729563150865
prediction train r2_score 0.5735345690702917
ETA test r2_score  0.2984759208241079
prediction test r2_score 0.5070221515223838
ETA train MAPE 0.16697790395524195
prediction train MAPE 0.1610154609737745
ETA test MAPE  0.1730999583791752
prediction test MAPE 0.17

In [47]:
train_p4_h3_x_normalized = (train_p4_h3_x - train_p4_h3_x.min()) / (train_p4_h3_x.max() - train_p4_h3_x.min())
train_p4_h3_y_normalized = (train_p4_h3_y - train_p4_h3_y.min()) / (train_p4_h3_y.max() - train_p4_h3_y.min())
# test_p4_h3_x_normalized = (test_p4_h3_x - train_p4_h3_x.min()) / (train_p4_h3_x.max() - train_p4_h3_x.min())
# test_p4_h3_y_normalized = (test_p4_h3_y - train_p4_h3_y.min()) / (train_p4_h3_y.max() - train_p4_h3_y.min())

reg_p4_h3_normalized = LinearRegression().fit(train_p4_h3_x_normalized, train_p4_h3_y_normalized)
print(reg_p4_h3_normalized.coef_)
print(reg_p4_h3_normalized.intercept_)

[ 3.38839243e+11  3.38839243e+11  3.38839244e+11  3.38839243e+11
  3.38839243e+11  3.38839243e+11  3.38839243e+11 -1.81469415e+11
 -1.81469415e+11 -1.81469415e+11 -1.81469415e+11 -1.81469415e+11
 -1.81469415e+11 -1.81469415e+11 -1.81469415e+11 -1.81469415e+11
 -1.81469415e+11  1.10072538e+00 -4.58049363e-01 -1.50679642e+02]
-17055362.3876918


In [None]:
%set_env AWS_ACCESS_KEY_ID=SokXIEc1g9vNqCJt4CSObyk6vumoOOPQ
%set_env AWS_SECRET_ACCESS_KEY=QNyTpGhFjUTYSP9VKmfhpUizwKr0t8gk
%set_env MLFLOW_S3_ENDPOINT_URL=https://minio-clustered-smapp-storage.apps.private.teh-1.snappcloud.io

# create experiment
%set_env MLFLOW_TRACKING_URI=https://mlflow.apps.private.okd4.teh-1.snappcloud.io/
# mlflow experiments create --experiment-name elahe

# run script under experiment
%set_env MLFLOW_EXPERIMENT_NAME=elahe
# cd save/
# python test.py

In [None]:
! echo $AWS_ACCESS_KEY_ID
! echo $AWS_SECRET_ACCESS_KEY
! echo $MLFLOW_S3_ENDPOINT_URL
! echo $MLFLOW_TRACKING_URI
! echo $MLFLOW_EXPERIMENT_NAME

In [None]:
# with mlflow.start_run(run_name="regression") as run:
#     # Log the sklearn model and register as version 1
#     mlflow.sklearn.log_model(
#         sk_model=reg_p4,
#         artifact_path="regression",
#         registered_model_name="reg-model"
#     )

In [None]:
# with mlflow.start_run(run_name="YOUR_RUN_NAME") as run:
#     params = {"n_estimators": 5, "random_state": 42}
#     sk_learn_rfr = RandomForestRegressor(**params)
#
#     # Log parameters and metrics using the MLflow APIs
#     mlflow.log_params(params)
#     mlflow.log_param("param_1", randint(0, 100))
#     mlflow.log_metrics({"metric_1": random(), "metric_2": random() + 1})
#
#     # Log the sklearn model and register as version 1
#     mlflow.sklearn.log_model(
#         sk_model=sk_learn_rfr,
#         artifact_path="sklearn-model",
#         registered_model_name="sk-learn-random-forest-reg-model"
#     )