In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, explained_variance_score, median_absolute_error, r2_score, mean_absolute_percentage_error
from sklearn.model_selection import train_test_split
from scipy.stats import *
import h3
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from datetime import datetime
from math import floor
import matplotlib.pyplot as plt
import mlflow
import mlflow.sklearn

In [2]:
ride_columns = ['accepted_driver_id', 'created_at', 'passenger_id', 'source_lat',
       'source_lng', 'destination_lat', 'destination_lng', 'eta', 'provider',
       'ata', 'id', 'city']

khatkesh_columns = ['ride_id','driver_id','a_t_a_result.arrival_a_t_a','a_t_a_result.boarding_a_t_a','a_t_a_result.ride_a_t_a','a_t_a_result.arrival_probe_result.probe.point.lat','a_t_a_result.arrival_probe_result.probe.point.lon','a_t_a_result.arrival_probe_result.probe.timestamp','a_t_a_result.arrival_probe_result.confidence','a_t_a_result.arrival_probe_result.h3_index', 'a_t_a_result.arrival_probe_result.k_ring_level','a_t_a_result.boarding_probe_result.probe.point.lat','a_t_a_result.boarding_probe_result.probe.point.lon','a_t_a_result.boarding_probe_result.probe.timestamp','a_t_a_result.boarding_probe_result.confidence','a_t_a_result.boarding_probe_result.h3_index','a_t_a_result.boarding_probe_result.k_ring_level','a_t_a_result.final_destination_probe_result.probe.point.lat','a_t_a_result.final_destination_probe_result.probe.point.lon','a_t_a_result.final_destination_probe_result.probe.timestamp','a_t_a_result.final_destination_probe_result.confidence','a_t_a_result.final_destination_probe_result.h3_index','a_t_a_result.final_destination_probe_result.k_ring_level','a_t_a_result.destination_probe_result.probe.point.lat','a_t_a_result.destination_probe_result.probe.point.lon','a_t_a_result.destination_probe_result.probe.timestamp','a_t_a_result.destination_probe_result.confidence','a_t_a_result.destination_probe_result.h3_index','a_t_a_result.destination_probe_result.k_ring_level','a_t_a_result.extra_destination_probe_result.probe.point.lat','a_t_a_result.extra_destination_probe_result.probe.point.lon','a_t_a_result.extra_destination_probe_result.probe.timestamp','a_t_a_result.extra_destination_probe_result.confidence','a_t_a_result.extra_destination_probe_result.h3_index','a_t_a_result.extra_destination_probe_result.k_ring_level','pickup_a_d_d_result.distance','pickup_a_d_d_result.confidence','pickup_a_d_d_result.route_ratio','pickup_a_d_d_result.g_p_s_ratio','ride_a_d_d_result.distance','ride_a_d_d_result.confidence','ride_a_d_d_result.route_ratio','ride_a_d_d_result.g_p_s_ratio','total_a_d_d_confidence','in_ride_allotment','e_d_d','clickhouse_time','hash']

train_dates = ['2022-08-02', '2022-08-03', '2022-08-04', '2022-08-05', '2022-08-06', '2022-08-07', '2022-08-08',
               '2022-08-09', '2022-08-10', '2022-08-11', '2022-08-12', '2022-08-13', '2022-08-14', '2022-08-15',
               '2022-08-16', '2022-08-17', '2022-08-18', '2022-08-19', '2022-08-20', '2022-08-21', '2022-08-22',
               '2022-08-23', '2022-08-24', '2022-08-25', '2022-08-26', '2022-08-27', '2022-08-28', '2022-08-29',
               '2022-08-30', '2022-08-31', '2022-09-01', '2022-09-02', '2022-09-03', '2022-09-04', '2022-09-05',
               '2022-09-06', '2022-09-07', '2022-09-08', '2022-09-09', '2022-09-10']

train_holidays = ['2022-08-07', '2022-08-08']

test_dates = ['2022-09-11', '2022-09-12', '2022-09-13', '2022-09-14', '2022-09-15', '2022-09-16', '2022-09-17',
              '2022-09-18', '2022-09-19', '2022-09-20', '2022-09-21', '2022-09-22', '2022-09-23', '2022-09-24',
              '2022-09-25', '2022-09-26', '2022-09-27', '2022-09-28', '2022-09-29', '2022-09-30', '2022-10-01',
              '2022-10-02', '2022-10-03', '2022-10-04', '2022-10-05', '2022-10-06', '2022-10-07', '2022-10-08',
              '2022-10-09']

test_holidays = ['2022-09-17', '2022-09-25', '2022-09-27', '2022-10-05']

In [3]:
rides_train = pd.read_csv('../rides_train.csv')
rides_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4063341 entries, 0 to 4063340
Data columns (total 59 columns):
 #   Column                                                       Dtype  
---  ------                                                       -----  
 0   accepted_driver_id                                           int64  
 1   created_at                                                   object 
 2   passenger_id                                                 int64  
 3   source_lat                                                   float64
 4   source_lng                                                   float64
 5   destination_lat                                              float64
 6   destination_lng                                              float64
 7   eta                                                          int64  
 8   provider                                                     object 
 9   ata                                                          int64  

In [4]:
rides_test = pd.read_csv('../rides_test.csv')
rides_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2729988 entries, 0 to 2729987
Data columns (total 59 columns):
 #   Column                                                       Dtype  
---  ------                                                       -----  
 0   accepted_driver_id                                           int64  
 1   created_at                                                   object 
 2   passenger_id                                                 int64  
 3   source_lat                                                   float64
 4   source_lng                                                   float64
 5   destination_lat                                              float64
 6   destination_lng                                              float64
 7   eta                                                          int64  
 8   provider                                                     object 
 9   ata                                                          int64  

In [6]:
rides_train_p2 = rides_train[(rides_train['eta'] >= 600) & (rides_train['eta'] < 1200)]
rides_train_p2

Unnamed: 0,accepted_driver_id,created_at,passenger_id,source_lat,source_lng,destination_lat,destination_lng,eta,provider,ata,...,pickup_a_d_d_result.g_p_s_ratio,ride_a_d_d_result.distance,ride_a_d_d_result.confidence,ride_a_d_d_result.route_ratio,ride_a_d_d_result.g_p_s_ratio,total_a_d_d_confidence,in_ride_allotment,e_d_d,clickhouse_time,hash
2,408918,2022-08-02 09:54:15,5213539,35.779682,51.418243,35.759449,51.411560,759,smapp-same-dc,604,...,0.017241,5.497,0.966254,0.0,1.000000,0.862177,0.0,0.000000,2022-08-02 10:08:24,2313090424092922132
9,53191,2022-08-02 18:50:27,584403,35.727207,51.414001,35.685680,51.417545,1013,smapp-same-dc,1033,...,0.009174,10.750,0.897674,0.0,0.935780,0.920802,0.0,0.000000,2022-08-02 19:15:08,5451641608039862966
10,1067390,2022-08-02 09:03:31,30326364,35.793903,51.533375,35.746414,51.465214,1159,smapp-same-dc,1122,...,0.000000,13.055,0.500000,0.0,0.000000,0.450000,0.0,0.000000,2022-08-02 09:27:25,4680654792485305243
11,2965628,2022-08-02 08:12:13,3706933,35.784466,51.469406,35.745651,51.574635,762,smapp-same-dc,1112,...,0.000000,24.347,0.978827,0.0,0.866071,0.954214,0.0,0.000000,2022-08-02 08:37:36,8567897489074288511
19,3585225,2022-08-02 06:55:51,10865738,35.668488,51.471096,35.737984,51.507107,1088,smapp-same-dc,1324,...,0.006944,31.246,1.000000,0.0,0.986111,1.000000,0.0,0.000000,2022-08-02 07:28:51,729957630370450801
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4063328,2039638,2022-09-10 06:46:01,36290596,35.736115,51.313038,35.758835,51.443092,1187,smapp-same-dc,1503,...,0.006757,34.778,0.980505,0.0,0.972973,0.984175,0.0,17.391001,2022-09-10 07:19:37,1051319816819413442
4063329,1130885,2022-09-10 14:24:21,9395527,35.783543,51.381916,35.802864,51.403702,631,smapp-same-dc,811,...,0.015625,13.448,1.000000,0.0,1.000000,1.000000,0.0,6.729000,2022-09-10 14:42:34,5503017915922955727
4063331,72763,2022-09-10 12:56:23,33622661,35.706825,51.420853,35.716705,51.401714,638,smapp-same-dc,550,...,0.000000,4.574,0.773502,0.0,0.500000,0.696152,0.0,3.307000,2022-09-10 13:09:02,13930658555766840196
4063333,2876463,2022-09-10 14:42:19,39332253,35.695030,51.254135,35.716125,51.285088,781,smapp-same-dc,1271,...,0.008772,23.270,0.974860,0.0,0.964912,0.978154,0.0,12.272000,2022-09-10 15:08:30,1923152195668709992


In [7]:
rides_test_p2 = rides_test[(rides_test['eta'] >= 600) & (rides_test['eta'] < 1200)]
rides_test_p2

Unnamed: 0,accepted_driver_id,created_at,passenger_id,source_lat,source_lng,destination_lat,destination_lng,eta,provider,ata,...,pickup_a_d_d_result.g_p_s_ratio,ride_a_d_d_result.distance,ride_a_d_d_result.confidence,ride_a_d_d_result.route_ratio,ride_a_d_d_result.g_p_s_ratio,total_a_d_d_confidence,in_ride_allotment,e_d_d,clickhouse_time,hash
0,2014057,2022-09-11 11:24:07,49398,35.747234,51.398994,35.776882,51.351730,854,smapp-same-dc,737,...,0.000000,11.169000,0.500000,0.0,0.000000,0.450000,0.00,11.161,2022-09-11 11:39:53,5137367095109055683
1,1391799,2022-09-11 18:11:25,35845971,35.700195,51.331490,35.676605,51.313297,977,smapp-same-dc,2336,...,0.004149,11.010000,0.727702,0.0,0.273859,0.654932,0.00,7.444,2022-09-11 19:07:47,10059080565474924133
3,3976967,2022-09-11 22:11:05,44963507,35.442017,51.564331,35.461838,51.656166,1067,smapp-same-dc,1095,...,0.009259,22.372000,1.000000,0.0,1.000000,1.000000,0.00,11.072,2022-09-11 22:35:23,11510188098138702444
7,747553,2022-09-11 08:41:48,32278185,35.686687,51.353626,35.676292,51.306538,661,smapp-same-dc,689,...,0.014085,7.608000,0.569532,0.0,0.112676,0.592628,0.00,6.406,2022-09-11 09:00:15,14066967287691026815
8,178820,2022-09-11 08:47:42,47663548,35.737583,51.817417,35.716343,51.786205,767,smapp-same-dc,815,...,0.014286,14.842000,1.000000,0.0,0.985714,1.000000,0.00,7.461,2022-09-11 09:05:35,2262467388776729805
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2729979,2631962,2022-10-09 07:26:33,52511684,35.688557,51.241486,35.557350,51.250626,1185,smapp-same-dc,1374,...,0.006623,37.939999,0.988271,0.0,0.953642,0.989648,0.00,20.011,2022-10-09 08:42:35,8652886350700956763
2729981,441313,2022-10-09 11:07:50,5316811,35.722595,51.386192,35.694836,51.395294,708,smapp-same-dc,1034,...,0.000000,3.358000,0.500000,0.0,0.000000,0.450000,0.00,3.346,2022-10-09 11:34:18,100947978219073146
2729983,80573,2022-10-09 06:18:11,3282774,35.802494,51.362171,35.772793,51.409565,747,smapp-same-dc,978,...,0.006667,21.190001,1.000000,0.0,0.993333,1.000000,3.98,9.157,2022-10-09 06:53:49,2386757117456033695
2729984,409819,2022-10-09 18:09:08,218416,35.745148,51.398754,35.775940,51.348122,1041,smapp-same-dc,1672,...,0.005814,19.576000,1.000000,0.0,0.988372,1.000000,0.00,11.953,2022-10-09 18:40:16,16994287713772196650


There are some duplicates in the data

In [8]:
rides_train_p2 = rides_train_p2.drop_duplicates(subset=['ride_id', 'ata'])
len(rides_train_p2)

1483456

In [9]:
rides_test_p2 = rides_test_p2.drop_duplicates(subset=['ride_id', 'ata'])
len(rides_test_p2)

1002411

In [10]:
rides_train_p2 = rides_train_p2[(rides_train_p2['a_t_a_result.ride_a_t_a'] > 180) &
                                (rides_train_p2['a_t_a_result.ride_a_t_a'] < 10800)]
rides_train_p2.describe()

Unnamed: 0,accepted_driver_id,passenger_id,source_lat,source_lng,destination_lat,destination_lng,eta,ata,ride_id,city,...,pickup_a_d_d_result.route_ratio,pickup_a_d_d_result.g_p_s_ratio,ride_a_d_d_result.distance,ride_a_d_d_result.confidence,ride_a_d_d_result.route_ratio,ride_a_d_d_result.g_p_s_ratio,total_a_d_d_confidence,in_ride_allotment,e_d_d,hash
count,1482198.0,1482198.0,1482198.0,1482198.0,1482198.0,1482198.0,1482198.0,1482198.0,1482198.0,1482198.0,...,1482198.0,1482198.0,1482198.0,1482198.0,1482198.0,1482198.0,1482198.0,1482198.0,1482198.0,1482198.0
mean,1804205.0,21717380.0,35.71072,51.40306,35.71117,51.40223,870.4163,994.5292,4822747000.0,1.0,...,0.0001524715,0.01000983,14.94397,0.8741344,3.60012e-05,0.746241,0.8820352,0.08392331,2.100558,9.222813e+18
std,1421937.0,17827150.0,0.07757467,0.1109245,0.0774789,0.1114322,170.3211,386.7778,60660490.0,0.0,...,0.01000942,0.004562369,12.39521,0.1732799,0.0009504525,0.3313338,0.1645525,0.6122273,4.110544,5.321304e+18
min,0.0,32.0,35.17376,50.9003,35.17174,50.9002,600.0,120.0,4714478000.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3479688000000.0
25%,381360.0,3637001.0,35.69078,51.3487,35.69168,51.34868,722.0,751.0,4770465000.0,1.0,...,0.0,0.007874016,9.764,0.8024615,0.0,0.5672514,0.8243964,0.0,0.0,4.620294e+18
50%,1621141.0,20390970.0,35.72703,51.40816,35.7275,51.40788,857.0,933.0,4824182000.0,1.0,...,0.0,0.01010101,13.807,0.9665777,0.0,0.9295775,0.9665393,0.0,0.0,9.223497e+18
75%,3103450.0,36097510.0,35.75837,51.45697,35.75884,51.45494,1011.0,1155.0,4871744000.0,1.0,...,0.0,0.01282051,18.994,1.0,0.0,0.990566,1.0,0.0,0.0,1.383019e+19
max,4330661.0,57376530.0,35.83982,52.79733,35.8535,52.81633,1199.0,12682.0,4933891000.0,1.0,...,1.0,0.05263158,8432.877,1.0,0.1383614,1.0,1.0,86.75999,47.035,1.844674e+19


In [11]:
rides_train_p2 = rides_train_p2[(rides_train_p2['a_t_a_result.destination_probe_result.confidence'] > 0) & (rides_train_p2['a_t_a_result.boarding_probe_result.confidence'] > 0)]
rides_train_p2.describe()

Unnamed: 0,accepted_driver_id,passenger_id,source_lat,source_lng,destination_lat,destination_lng,eta,ata,ride_id,city,...,pickup_a_d_d_result.route_ratio,pickup_a_d_d_result.g_p_s_ratio,ride_a_d_d_result.distance,ride_a_d_d_result.confidence,ride_a_d_d_result.route_ratio,ride_a_d_d_result.g_p_s_ratio,total_a_d_d_confidence,in_ride_allotment,e_d_d,hash
count,906580.0,906580.0,906580.0,906580.0,906580.0,906580.0,906580.0,906580.0,906580.0,906580.0,...,906580.0,906580.0,906580.0,906580.0,906580.0,906580.0,906580.0,906580.0,906580.0,906580.0
mean,1851133.0,21560390.0,35.711987,51.403064,35.712743,51.402181,869.374405,1013.314658,4822600000.0,1.0,...,7.1e-05,0.011114,15.8226,0.927236,2.1e-05,0.860681,0.934179,0.090931,2.137494,9.221702e+18
std,1433300.0,17791080.0,0.076325,0.110533,0.076275,0.111088,170.274985,380.911892,60830040.0,0.0,...,0.006356,0.003436,11.349151,0.130749,0.000665,0.238079,0.115209,0.632849,4.146516,5.321024e+18
min,0.0,49.0,35.206142,50.900383,35.175785,50.900665,600.0,120.0,4714478000.0,1.0,...,0.0,0.000935,0.428,0.0,0.0,0.006472,0.0,0.0,0.0,3479688000000.0
25%,407867.0,3549853.0,35.692047,51.348618,35.692902,51.348824,721.0,771.0,4769845000.0,1.0,...,0.0,0.008696,10.688,0.925845,0.0,0.850467,0.924569,0.0,0.0,4.621696e+18
50%,1708776.0,20108160.0,35.727703,51.408211,35.728825,51.407547,855.0,952.0,4823460000.0,1.0,...,0.0,0.010638,14.793,0.989349,0.0,0.975806,0.988141,0.0,0.0,9.223621e+18
75%,3178465.0,35905700.0,35.758945,51.457214,35.75972,51.455956,1010.0,1171.0,4872254000.0,1.0,...,0.0,0.013158,19.950001,1.0,0.0,1.0,1.0,0.0,0.0,1.382726e+19
max,4330661.0,57375380.0,35.839825,52.793839,35.841194,52.791519,1199.0,12682.0,4933891000.0,1.0,...,1.0,0.052632,8432.876953,1.0,0.095434,1.0,1.0,26.954994,47.035,1.844672e+19


In [12]:
rides_test_p2 = rides_test_p2[(rides_test_p2['a_t_a_result.destination_probe_result.confidence'] > 0) & (rides_test_p2['a_t_a_result.boarding_probe_result.confidence'] > 0)]
rides_test_p2.describe()

Unnamed: 0,accepted_driver_id,passenger_id,source_lat,source_lng,destination_lat,destination_lng,eta,ata,ride_id,city,...,pickup_a_d_d_result.route_ratio,pickup_a_d_d_result.g_p_s_ratio,ride_a_d_d_result.distance,ride_a_d_d_result.confidence,ride_a_d_d_result.route_ratio,ride_a_d_d_result.g_p_s_ratio,total_a_d_d_confidence,in_ride_allotment,e_d_d,hash
count,622227.0,622227.0,622227.0,622227.0,622227.0,622227.0,622227.0,622227.0,622227.0,622227.0,...,622227.0,622227.0,622227.0,622227.0,622227.0,622227.0,622227.0,622227.0,622227.0,622227.0
mean,1906119.0,22081020.0,35.709644,51.403075,35.710542,51.402358,867.782197,1012.403205,5010023000.0,1.0,...,6.9e-05,0.011174,16.309521,0.929128,1.9e-05,0.867449,0.935625,0.024346,9.060947,9.223734e+18
std,1470427.0,18036810.0,0.078239,0.112598,0.078295,0.113354,169.791134,394.066216,43692670.0,0.0,...,0.006185,0.003598,8.09603,0.129124,0.000634,0.234847,0.113758,0.237764,3.63668,5.324037e+18
min,0.0,17.0,35.186165,50.900536,35.183189,50.871979,600.0,120.0,4933925000.0,1.0,...,0.0,0.00065,0.0,0.0,0.0,0.004914,0.0,0.0,0.362,14971740000000.0
25%,425406.0,3765864.0,35.689022,51.347374,35.690048,51.348095,720.0,768.0,4972128000.0,1.0,...,0.0,0.008772,11.013731,0.92977,0.0,0.865546,0.927801,0.0,6.286,4.610225e+18
50%,1773074.0,20876290.0,35.726734,51.407803,35.727859,51.407433,853.0,947.0,5011286000.0,1.0,...,0.0,0.010753,15.284,0.990241,0.0,0.979452,0.988894,0.0,8.55,9.216589e+18
75%,3261491.0,36491750.0,35.758026,51.458729,35.759079,51.457523,1007.0,1164.0,5048147000.0,1.0,...,0.0,0.013158,20.614,1.0,0.0,1.0,1.0,0.0,11.335,1.383342e+19
max,4422059.0,58138570.0,35.839855,52.778404,35.843197,52.797073,1199.0,12334.0,5083647000.0,1.0,...,1.0,0.5,2599.429199,1.0,0.132202,1.0,1.0,55.098999,56.626999,1.844674e+19


In [13]:
rides_train_p2 = rides_train_p2[['eta', 'ata', 'a_t_a_result.ride_a_t_a', 'created_at', 'source_lat', 'source_lng', 'destination_lat', 'destination_lng', 'e_d_d', 'clickhouse_time']]
rides_train_p2

Unnamed: 0,eta,ata,a_t_a_result.ride_a_t_a,created_at,source_lat,source_lng,destination_lat,destination_lng,e_d_d,clickhouse_time
2,759,604,576,2022-08-02 09:54:15,35.779682,51.418243,35.759449,51.411560,0.000000,2022-08-02 10:08:24
19,1088,1324,1436,2022-08-02 06:55:51,35.668488,51.471096,35.737984,51.507107,0.000000,2022-08-02 07:28:51
24,930,1637,1327,2022-08-02 09:55:29,35.743820,51.465885,35.758488,51.441872,0.000000,2022-08-02 10:30:20
27,783,982,956,2022-08-02 19:58:17,35.753742,51.367241,35.735798,51.405380,0.000000,2022-08-02 20:23:58
32,888,843,810,2022-08-02 21:13:47,35.638977,51.334576,35.671574,51.302036,0.000000,2022-08-02 21:32:32
...,...,...,...,...,...,...,...,...,...,...
4063322,1140,4362,4332,2022-09-10 17:12:02,35.675362,51.467590,35.727814,51.416153,23.992001,2022-09-10 18:29:51
4063323,995,951,1116,2022-09-10 22:35:28,35.682247,51.462456,35.641071,51.500271,10.609000,2022-09-10 23:01:50
4063328,1187,1503,1476,2022-09-10 06:46:01,35.736115,51.313038,35.758835,51.443092,17.391001,2022-09-10 07:19:37
4063329,631,811,634,2022-09-10 14:24:21,35.783543,51.381916,35.802864,51.403702,6.729000,2022-09-10 14:42:34


In [14]:
rides_test_p2 = rides_test_p2[['eta', 'ata', 'a_t_a_result.ride_a_t_a', 'created_at', 'source_lat', 'source_lng', 'destination_lat', 'destination_lng', 'e_d_d', 'clickhouse_time']]
rides_test_p2

Unnamed: 0,eta,ata,a_t_a_result.ride_a_t_a,created_at,source_lat,source_lng,destination_lat,destination_lng,e_d_d,clickhouse_time
3,1067,1095,1072,2022-09-11 22:11:05,35.442017,51.564331,35.461838,51.656166,11.072,2022-09-11 22:35:23
8,767,815,696,2022-09-11 08:47:42,35.737583,51.817417,35.716343,51.786205,7.461,2022-09-11 09:05:35
10,798,826,772,2022-09-11 12:47:43,35.735069,51.860020,35.737427,51.817623,7.522,2022-09-11 13:04:42
13,745,768,736,2022-09-11 16:02:11,35.712906,51.367687,35.759655,51.376125,7.483,2022-09-11 16:49:31
16,1088,2192,2203,2022-09-11 07:40:03,35.731453,51.521435,35.752289,51.441925,13.943,2022-09-11 08:20:56
...,...,...,...,...,...,...,...,...,...,...
2729969,1118,871,855,2022-10-09 20:28:09,35.717136,51.331192,35.775993,51.331837,9.915,2022-10-09 20:49:39
2729974,1027,832,980,2022-10-09 16:25:23,35.759262,51.329350,35.726742,51.324627,6.720,2022-10-09 16:49:19
2729983,747,978,1500,2022-10-09 06:18:11,35.802494,51.362171,35.772793,51.409565,9.157,2022-10-09 06:53:49
2729984,1041,1672,1717,2022-10-09 18:09:08,35.745148,51.398754,35.775940,51.348122,11.953,2022-10-09 18:40:16


In [15]:
rides_train_p2['source_h3_4'] = rides_train_p2.apply(lambda row: h3.geo_to_h3(row.source_lat, row.source_lng, 4), axis=1)
rides_train_p2['dest_h3_4'] = rides_train_p2.apply(lambda row: h3.geo_to_h3(row.destination_lat, row.destination_lng, 4), axis=1)
rides_train_p2['hour'] = rides_train_p2.apply(lambda row: ((datetime.timestamp(datetime.strptime(row.created_at,"%Y-%m-%d %H:%M:%S")) - 1662147000) % 604800) / 3600, axis=1)
rides_train_p2['holiday'] = rides_train_p2.apply(lambda row: int(row.created_at.split()[0] in train_holidays), axis=1)
rides_train_p2

Unnamed: 0,eta,ata,a_t_a_result.ride_a_t_a,created_at,source_lat,source_lng,destination_lat,destination_lng,e_d_d,clickhouse_time,source_h3_4,dest_h3_4,hour,holiday
2,759,604,576,2022-08-02 09:54:15,35.779682,51.418243,35.759449,51.411560,0.000000,2022-08-02 10:08:24,842cf31ffffffff,842cf31ffffffff,81.904167,0
19,1088,1324,1436,2022-08-02 06:55:51,35.668488,51.471096,35.737984,51.507107,0.000000,2022-08-02 07:28:51,842cf31ffffffff,842cf31ffffffff,78.930833,0
24,930,1637,1327,2022-08-02 09:55:29,35.743820,51.465885,35.758488,51.441872,0.000000,2022-08-02 10:30:20,842cf31ffffffff,842cf31ffffffff,81.924722,0
27,783,982,956,2022-08-02 19:58:17,35.753742,51.367241,35.735798,51.405380,0.000000,2022-08-02 20:23:58,842cf31ffffffff,842cf31ffffffff,91.971389,0
32,888,843,810,2022-08-02 21:13:47,35.638977,51.334576,35.671574,51.302036,0.000000,2022-08-02 21:32:32,842cf31ffffffff,842cf31ffffffff,93.229722,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4063322,1140,4362,4332,2022-09-10 17:12:02,35.675362,51.467590,35.727814,51.416153,23.992001,2022-09-10 18:29:51,842cf31ffffffff,842cf31ffffffff,17.200556,0
4063323,995,951,1116,2022-09-10 22:35:28,35.682247,51.462456,35.641071,51.500271,10.609000,2022-09-10 23:01:50,842cf31ffffffff,842cf31ffffffff,22.591111,0
4063328,1187,1503,1476,2022-09-10 06:46:01,35.736115,51.313038,35.758835,51.443092,17.391001,2022-09-10 07:19:37,842cf31ffffffff,842cf31ffffffff,6.766944,0
4063329,631,811,634,2022-09-10 14:24:21,35.783543,51.381916,35.802864,51.403702,6.729000,2022-09-10 14:42:34,842cf31ffffffff,842cf31ffffffff,14.405833,0


In [16]:
rides_test_p2['source_h3_4'] = rides_test_p2.apply(lambda row: h3.geo_to_h3(row.source_lat, row.source_lng, 4), axis=1)
rides_test_p2['dest_h3_4'] = rides_test_p2.apply(lambda row: h3.geo_to_h3(row.destination_lat, row.destination_lng, 4), axis=1)
rides_test_p2['hour'] = rides_test_p2.apply(lambda row: ((datetime.timestamp(datetime.strptime(row.created_at,"%Y-%m-%d %H:%M:%S")) - 1662147000) % 604800) / 3600, axis=1)
rides_test_p2['holiday'] = rides_test_p2.apply(lambda row: int(row.created_at.split()[0] in test_holidays), axis=1)
rides_test_p2

Unnamed: 0,eta,ata,a_t_a_result.ride_a_t_a,created_at,source_lat,source_lng,destination_lat,destination_lng,e_d_d,clickhouse_time,source_h3_4,dest_h3_4,hour,holiday
3,1067,1095,1072,2022-09-11 22:11:05,35.442017,51.564331,35.461838,51.656166,11.072,2022-09-11 22:35:23,842cf35ffffffff,842cf37ffffffff,46.184722,0
8,767,815,696,2022-09-11 08:47:42,35.737583,51.817417,35.716343,51.786205,7.461,2022-09-11 09:05:35,842cf33ffffffff,842cf33ffffffff,32.795000,0
10,798,826,772,2022-09-11 12:47:43,35.735069,51.860020,35.737427,51.817623,7.522,2022-09-11 13:04:42,842cf33ffffffff,842cf33ffffffff,36.795278,0
13,745,768,736,2022-09-11 16:02:11,35.712906,51.367687,35.759655,51.376125,7.483,2022-09-11 16:49:31,842cf31ffffffff,842cf31ffffffff,40.036389,0
16,1088,2192,2203,2022-09-11 07:40:03,35.731453,51.521435,35.752289,51.441925,13.943,2022-09-11 08:20:56,842cf31ffffffff,842cf31ffffffff,31.667500,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2729969,1118,871,855,2022-10-09 20:28:09,35.717136,51.331192,35.775993,51.331837,9.915,2022-10-09 20:49:39,842cf31ffffffff,842cf31ffffffff,45.469167,0
2729974,1027,832,980,2022-10-09 16:25:23,35.759262,51.329350,35.726742,51.324627,6.720,2022-10-09 16:49:19,842cf31ffffffff,842cf31ffffffff,41.423056,0
2729983,747,978,1500,2022-10-09 06:18:11,35.802494,51.362171,35.772793,51.409565,9.157,2022-10-09 06:53:49,842cf31ffffffff,842cf31ffffffff,31.303056,0
2729984,1041,1672,1717,2022-10-09 18:09:08,35.745148,51.398754,35.775940,51.348122,11.953,2022-10-09 18:40:16,842cf31ffffffff,842cf31ffffffff,43.152222,0


In [17]:
train_p2_geo_x = rides_train_p2[["eta", "source_lat", "source_lng", "destination_lat", "destination_lng", "hour", "holiday"]]
train_p2_h3_x = rides_train_p2[["eta", "source_h3_4", "dest_h3_4", "hour", "holiday"]]
train_p2_geo_y = rides_train_p2["a_t_a_result.ride_a_t_a"]
train_p2_h3_y = rides_train_p2["a_t_a_result.ride_a_t_a"]

In [18]:
test_p2_geo_x = rides_test_p2[["eta", "source_lat", "source_lng", "destination_lat", "destination_lng", "hour", "holiday"]]
test_p2_h3_x = rides_test_p2[["eta", "source_h3_4", "dest_h3_4", "hour", "holiday"]]
test_p2_geo_y = rides_test_p2["a_t_a_result.ride_a_t_a"]
test_p2_h3_y = rides_test_p2["a_t_a_result.ride_a_t_a"]

In [19]:
dummy_train_rides = pd.get_dummies(train_p2_h3_x, prefix={'source_h3_4':'source_h3_4',
                                                             'dest_h3_4':'dest_h3_4'})
dummy_train_rides.describe()

Unnamed: 0,eta,hour,holiday,source_h3_4_842cd4bffffffff,source_h3_4_842cd59ffffffff,source_h3_4_842cd5bffffffff,source_h3_4_842cf31ffffffff,source_h3_4_842cf33ffffffff,source_h3_4_842cf35ffffffff,source_h3_4_842cf37ffffffff,source_h3_4_842cf3dffffffff,dest_h3_4_842cd4bffffffff,dest_h3_4_842cd59ffffffff,dest_h3_4_842cd5bffffffff,dest_h3_4_842cf31ffffffff,dest_h3_4_842cf33ffffffff,dest_h3_4_842cf35ffffffff,dest_h3_4_842cf37ffffffff,dest_h3_4_842cf39ffffffff,dest_h3_4_842cf3dffffffff
count,906580.0,906580.0,906580.0,906580.0,906580.0,906580.0,906580.0,906580.0,906580.0,906580.0,906580.0,906580.0,906580.0,906580.0,906580.0,906580.0,906580.0,906580.0,906580.0,906580.0
mean,869.374405,82.669705,0.020807,0.000415,0.00119,3e-06,0.916016,0.01312,0.028867,0.013185,0.027204,0.000627,0.001142,3e-06,0.914442,0.013163,0.028246,0.013134,1.7e-05,0.029227
std,170.274985,46.614441,0.142737,0.020361,0.034479,0.001819,0.277364,0.113787,0.167432,0.114065,0.162679,0.025023,0.033769,0.001819,0.27971,0.113971,0.165674,0.113849,0.004068,0.168444
min,600.0,0.001389,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,721.0,40.240278,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
50%,855.0,85.841667,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
75%,1010.0,117.4025,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
max,1199.0,167.777222,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [20]:
print(np.count_nonzero(dummy_train_rides, axis=0))
print(dummy_train_rides.columns)

[906580 906580  18863    376   1079      3 830442  11894  26170  11953
  24663    568   1035      3 829015  11933  25607  11907     15  26497]
Index(['eta', 'hour', 'holiday', 'source_h3_4_842cd4bffffffff',
       'source_h3_4_842cd59ffffffff', 'source_h3_4_842cd5bffffffff',
       'source_h3_4_842cf31ffffffff', 'source_h3_4_842cf33ffffffff',
       'source_h3_4_842cf35ffffffff', 'source_h3_4_842cf37ffffffff',
       'source_h3_4_842cf3dffffffff', 'dest_h3_4_842cd4bffffffff',
       'dest_h3_4_842cd59ffffffff', 'dest_h3_4_842cd5bffffffff',
       'dest_h3_4_842cf31ffffffff', 'dest_h3_4_842cf33ffffffff',
       'dest_h3_4_842cf35ffffffff', 'dest_h3_4_842cf37ffffffff',
       'dest_h3_4_842cf39ffffffff', 'dest_h3_4_842cf3dffffffff'],
      dtype='object')


In [21]:
geo_noise = np.count_nonzero(dummy_train_rides, axis=0) < 100

cols = dummy_train_rides.columns
for col, noise in zip(cols, geo_noise):
    if noise:
        # DOESN'T REINDEX
        # remove the rows where rides_train_p2_x is in the noisy geo (geo is 1)
        train_p2_h3_x = train_p2_h3_x[dummy_train_rides[col] == 0]
        train_p2_h3_y = train_p2_h3_y[dummy_train_rides[col] == 0]

train_p2_h3_x.describe()

  train_p2_h3_x = train_p2_h3_x[dummy_train_rides[col] == 0]


Unnamed: 0,eta,hour,holiday
count,906562.0,906562.0,906562.0
mean,869.371288,82.669818,0.020806
std,170.273111,46.614555,0.142735
min,600.0,0.001389,0.0
25%,721.0,40.240278,0.0
50%,855.0,85.841667,0.0
75%,1010.0,117.402986,0.0
max,1199.0,167.777222,1.0


In [22]:
ohe = ColumnTransformer(
    [('OHE', OneHotEncoder(handle_unknown='ignore', sparse=False),['source_h3_4', 'dest_h3_4'])],
    remainder = 'passthrough'
    ).fit(train_p2_h3_x)

In [23]:
ohe.get_feature_names_out()

array(['OHE__source_h3_4_842cd4bffffffff',
       'OHE__source_h3_4_842cd59ffffffff',
       'OHE__source_h3_4_842cf31ffffffff',
       'OHE__source_h3_4_842cf33ffffffff',
       'OHE__source_h3_4_842cf35ffffffff',
       'OHE__source_h3_4_842cf37ffffffff',
       'OHE__source_h3_4_842cf3dffffffff',
       'OHE__dest_h3_4_842cd4bffffffff', 'OHE__dest_h3_4_842cd59ffffffff',
       'OHE__dest_h3_4_842cf31ffffffff', 'OHE__dest_h3_4_842cf33ffffffff',
       'OHE__dest_h3_4_842cf35ffffffff', 'OHE__dest_h3_4_842cf37ffffffff',
       'OHE__dest_h3_4_842cf3dffffffff', 'remainder__eta',
       'remainder__hour', 'remainder__holiday'], dtype=object)

In [24]:
train_p2_h3_x = ohe.transform(train_p2_h3_x)

In [25]:
test_p2_h3_x = ohe.transform(test_p2_h3_x)

In [26]:
reg_geo_p2 = LinearRegression().fit(train_p2_geo_x, train_p2_geo_y)

In [27]:
print(reg_geo_p2.coef_)
print(reg_geo_p2.intercept_)

[  1.20311512 -44.048468    20.0090256  205.23249916  34.36445879
  -0.28966438 -29.90940304]
-8584.249072201364


In [28]:
train_pred_geo = reg_geo_p2.predict(train_p2_geo_x)
test_pred_geo = reg_geo_p2.predict(test_p2_geo_x)

In [29]:
reg_h3_p2 = LinearRegression().fit(train_p2_h3_x, train_p2_h3_y)

In [30]:
print(reg_h3_p2.coef_)
print(reg_h3_p2.intercept_)

[-1.17315119e+11 -1.17315119e+11 -1.17315119e+11 -1.17315119e+11
 -1.17315119e+11 -1.17315119e+11 -1.17315119e+11 -9.81033836e+10
 -9.81033835e+10 -9.81033836e+10 -9.81033836e+10 -9.81033836e+10
 -9.81033836e+10 -9.81033836e+10  1.20414930e+00 -2.92968484e-01
 -3.09155959e+01]
215418502656.19525


In [31]:
train_pred_h3 = reg_h3_p2.predict(train_p2_h3_x)
test_pred_h3 = reg_h3_p2.predict(test_p2_h3_x)

In [32]:
known_h3_index = np.count_nonzero(test_p2_h3_x[:, 0:17], axis=1) == 2

In [33]:
test_p2_h3_y = test_p2_h3_y[known_h3_index]
test_p2_h3_x = test_p2_h3_x[known_h3_index]
test_pred_h3 = test_pred_h3[known_h3_index]

In [34]:
train_p2_geo_x

Unnamed: 0,eta,source_lat,source_lng,destination_lat,destination_lng,hour,holiday
2,759,35.779682,51.418243,35.759449,51.411560,81.904167,0
19,1088,35.668488,51.471096,35.737984,51.507107,78.930833,0
24,930,35.743820,51.465885,35.758488,51.441872,81.924722,0
27,783,35.753742,51.367241,35.735798,51.405380,91.971389,0
32,888,35.638977,51.334576,35.671574,51.302036,93.229722,0
...,...,...,...,...,...,...,...
4063322,1140,35.675362,51.467590,35.727814,51.416153,17.200556,0
4063323,995,35.682247,51.462456,35.641071,51.500271,22.591111,0
4063328,1187,35.736115,51.313038,35.758835,51.443092,6.766944,0
4063329,631,35.783543,51.381916,35.802864,51.403702,14.405833,0


In [35]:
print("ETA train MAE", mean_absolute_error(train_p2_geo_y, train_p2_geo_x['eta']))
print("prediction train MAE", mean_absolute_error(train_p2_geo_y, train_pred_geo))

print("ETA test MAE ", mean_absolute_error(test_p2_geo_y, test_p2_geo_x['eta']))
print("prediction test MAE", mean_absolute_error(test_p2_geo_y, test_pred_geo))


print("ETA train MSE", mean_squared_error(train_p2_geo_y, train_p2_geo_x['eta']))
print("prediction train MSE", mean_squared_error(train_p2_geo_y, train_pred_geo))

print("ETA test MSE ", mean_squared_error(test_p2_geo_y, test_p2_geo_x['eta']))
print("prediction test MSE", mean_squared_error(test_p2_geo_y, test_pred_geo))


print("ETA train explained_variance_score", explained_variance_score(train_p2_geo_y, train_p2_geo_x['eta']))
print("prediction train explained_variance_score", explained_variance_score(train_p2_geo_y, train_pred_geo))

print("ETA test explained_variance_score ", explained_variance_score(test_p2_geo_y, test_p2_geo_x['eta']))
print("prediction test explained_variance_score", explained_variance_score(test_p2_geo_y, test_pred_geo))


print("ETA train median_absolute_error", median_absolute_error(train_p2_geo_y, train_p2_geo_x['eta']))
print("prediction train median_absolute_error", median_absolute_error(train_p2_geo_y, train_pred_geo))

print("ETA test median_absolute_error ", median_absolute_error(test_p2_geo_y, test_p2_geo_x['eta']))
print("prediction test median_absolute_error", median_absolute_error(test_p2_geo_y, test_pred_geo))


print("ETA train r2_score", r2_score(train_p2_geo_y, train_p2_geo_x['eta']))
print("prediction train r2_score", r2_score(train_p2_geo_y, train_pred_geo))

print("ETA test r2_score ", r2_score(test_p2_geo_y, test_p2_geo_x['eta']))
print("prediction test r2_score", r2_score(test_p2_geo_y, test_pred_geo))


print("ETA train MAPE", mean_absolute_percentage_error(train_p2_geo_y, train_p2_geo_x['eta']))
print("prediction train MAPE", mean_absolute_percentage_error(train_p2_geo_y, train_pred_geo))

print("ETA test MAPE ", mean_absolute_percentage_error(test_p2_geo_y, test_p2_geo_x['eta']))
print("prediction test MAPE", mean_absolute_percentage_error(test_p2_geo_y, test_pred_geo))

ETA train MAE 181.1573429813144
prediction train MAE 177.55149375080015
ETA test MAE  182.14698815705523
prediction test MAE 181.01447786079595
ETA train MSE 93722.89682543185
prediction train MSE 77895.36257470411
ETA test MSE  102572.98804455608
prediction test MSE 86905.54962074457
ETA train explained_variance_score 0.339926594598459
prediction train explained_variance_score 0.3536449294103323
ETA test explained_variance_score  0.3115969693696259
prediction test explained_variance_score 0.3253427331027492
ETA train median_absolute_error 108.0
prediction train median_absolute_error 129.96738825080502
ETA test median_absolute_error  106.0
prediction test median_absolute_error 129.8712364018029
ETA train r2_score 0.22231224566964858
prediction train r2_score 0.3536449294103323
ETA test r2_score  0.20370830568559584
prediction test r2_score 0.325337316655269
ETA train MAPE 0.16111555184589296
prediction train MAPE 0.1782277015090005
ETA test MAPE  0.16145966180090804
prediction test MAP

In [46]:
train_p4_geo_x_normalized = (train_p4_geo_x - train_p4_geo_x.min()) / (train_p4_geo_x.max() - train_p4_geo_x.min())
train_p4_geo_y_normalized = (train_p4_geo_y - train_p4_geo_y.min()) / (train_p4_geo_y.max() - train_p4_geo_y.min())
# test_p4_geo_x_normalized = (test_p4_geo_x - train_p4_geo_x.min()) / (train_p4_geo_x.max() - train_p4_geo_x.min())
# test_p4_geo_y_normalized = (test_p4_geo_y - train_p4_geo_y.min()) / (train_p4_geo_y.max() - train_p4_geo_y.min())

reg_p4_geo_normalized = LinearRegression().fit(train_p4_geo_x_normalized, train_p4_geo_y_normalized)
print(reg_p4_geo_normalized.coef_)
print(reg_p4_geo_normalized.intercept_)

[ 0.95267888  0.03992745  0.01639903  0.14161629  0.07104881 -0.00869313
 -0.01985985]
0.027375837176890544


In [39]:
print("ETA train MAE", mean_absolute_error(train_p4_h3_y, train_p4_h3_x[:, 17]))
print("prediction train MAE", mean_absolute_error(train_p4_h3_y, train_pred_h3))

print("ETA test MAE ", mean_absolute_error(test_p4_h3_y, test_p4_h3_x[:, 17]))
print("prediction test MAE", mean_absolute_error(test_p4_h3_y, test_pred_h3))


print("ETA train MSE", mean_squared_error(train_p4_h3_y, train_p4_h3_x[:, 17]))
print("prediction train MSE", mean_squared_error(train_p4_h3_y, train_pred_h3))

print("ETA test MSE ", mean_squared_error(test_p4_h3_y, test_p4_h3_x[:, 17]))
print("prediction test MSE", mean_squared_error(test_p4_h3_y, test_pred_h3))


print("ETA train explained_variance_score", explained_variance_score(train_p4_h3_y, train_p4_h3_x[:, 17]))
print("prediction train explained_variance_score", explained_variance_score(train_p4_h3_y, train_pred_h3))

print("ETA test explained_variance_score ", explained_variance_score(test_p4_h3_y, test_p4_h3_x[:, 17]))
print("prediction test explained_variance_score", explained_variance_score(test_p4_h3_y, test_pred_h3))


print("ETA train median_absolute_error", median_absolute_error(train_p4_h3_y, train_p4_h3_x[:, 17]))
print("prediction train median_absolute_error", median_absolute_error(train_p4_h3_y, train_pred_h3))

print("ETA test median_absolute_error ", median_absolute_error(test_p4_h3_y, test_p4_h3_x[:, 17]))
print("prediction test median_absolute_error", median_absolute_error(test_p4_h3_y, test_pred_h3))


print("ETA train r2_score", r2_score(train_p4_h3_y, train_p4_h3_x[:, 17]))
print("prediction train r2_score", r2_score(train_p4_h3_y, train_pred_h3))

print("ETA test r2_score ", r2_score(test_p4_h3_y, test_p4_h3_x[:, 17]))
print("prediction test r2_score", r2_score(test_p4_h3_y, test_pred_h3))


print("ETA train MAPE", mean_absolute_percentage_error(train_p4_h3_y, train_p4_h3_x[:, 17]))
print("prediction train MAPE", mean_absolute_percentage_error(train_p4_h3_y, train_pred_h3))

print("ETA test MAPE ", mean_absolute_percentage_error(test_p4_h3_y, test_p4_h3_x[:, 17]))
print("prediction test MAPE", mean_absolute_percentage_error(test_p4_h3_y, test_pred_h3))

ETA train MAE 558.6899742605314
prediction train MAE 468.32320172854264
ETA test MAE  609.4802658229795
prediction test MAE 532.3905549659162
ETA train MSE 724259.5492995734
prediction train MSE 474510.4137146755
ETA test MSE  1005379.4640005743
prediction test MSE 706504.3378821977
ETA train explained_variance_score 0.5486781823287357
prediction train explained_variance_score 0.5735345690702949
ETA test explained_variance_score  0.4753275329161858
prediction test explained_variance_score 0.5084530615897137
ETA train median_absolute_error 353.0
prediction train median_absolute_error 347.19482421875
ETA test median_absolute_error  352.0
prediction test median_absolute_error 366.8814697265625
ETA train r2_score 0.3490729563150865
prediction train r2_score 0.5735345690702917
ETA test r2_score  0.2984759208241079
prediction test r2_score 0.5070221515223838
ETA train MAPE 0.16697790395524195
prediction train MAPE 0.1610154609737745
ETA test MAPE  0.1730999583791752
prediction test MAPE 0.17

In [47]:
train_p4_h3_x_normalized = (train_p4_h3_x - train_p4_h3_x.min()) / (train_p4_h3_x.max() - train_p4_h3_x.min())
train_p4_h3_y_normalized = (train_p4_h3_y - train_p4_h3_y.min()) / (train_p4_h3_y.max() - train_p4_h3_y.min())
# test_p4_h3_x_normalized = (test_p4_h3_x - train_p4_h3_x.min()) / (train_p4_h3_x.max() - train_p4_h3_x.min())
# test_p4_h3_y_normalized = (test_p4_h3_y - train_p4_h3_y.min()) / (train_p4_h3_y.max() - train_p4_h3_y.min())

reg_p4_h3_normalized = LinearRegression().fit(train_p4_h3_x_normalized, train_p4_h3_y_normalized)
print(reg_p4_h3_normalized.coef_)
print(reg_p4_h3_normalized.intercept_)

[ 3.38839243e+11  3.38839243e+11  3.38839244e+11  3.38839243e+11
  3.38839243e+11  3.38839243e+11  3.38839243e+11 -1.81469415e+11
 -1.81469415e+11 -1.81469415e+11 -1.81469415e+11 -1.81469415e+11
 -1.81469415e+11 -1.81469415e+11 -1.81469415e+11 -1.81469415e+11
 -1.81469415e+11  1.10072538e+00 -4.58049363e-01 -1.50679642e+02]
-17055362.3876918


In [None]:
%set_env AWS_ACCESS_KEY_ID=SokXIEc1g9vNqCJt4CSObyk6vumoOOPQ
%set_env AWS_SECRET_ACCESS_KEY=QNyTpGhFjUTYSP9VKmfhpUizwKr0t8gk
%set_env MLFLOW_S3_ENDPOINT_URL=https://minio-clustered-smapp-storage.apps.private.teh-1.snappcloud.io

# create experiment
%set_env MLFLOW_TRACKING_URI=https://mlflow.apps.private.okd4.teh-1.snappcloud.io/
# mlflow experiments create --experiment-name elahe

# run script under experiment
%set_env MLFLOW_EXPERIMENT_NAME=elahe
# cd save/
# python test.py

In [None]:
! echo $AWS_ACCESS_KEY_ID
! echo $AWS_SECRET_ACCESS_KEY
! echo $MLFLOW_S3_ENDPOINT_URL
! echo $MLFLOW_TRACKING_URI
! echo $MLFLOW_EXPERIMENT_NAME

In [None]:
# with mlflow.start_run(run_name="regression") as run:
#     # Log the sklearn model and register as version 1
#     mlflow.sklearn.log_model(
#         sk_model=reg_p4,
#         artifact_path="regression",
#         registered_model_name="reg-model"
#     )

In [None]:
# with mlflow.start_run(run_name="YOUR_RUN_NAME") as run:
#     params = {"n_estimators": 5, "random_state": 42}
#     sk_learn_rfr = RandomForestRegressor(**params)
#
#     # Log parameters and metrics using the MLflow APIs
#     mlflow.log_params(params)
#     mlflow.log_param("param_1", randint(0, 100))
#     mlflow.log_metrics({"metric_1": random(), "metric_2": random() + 1})
#
#     # Log the sklearn model and register as version 1
#     mlflow.sklearn.log_model(
#         sk_model=sk_learn_rfr,
#         artifact_path="sklearn-model",
#         registered_model_name="sk-learn-random-forest-reg-model"
#     )