In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, explained_variance_score, median_absolute_error, r2_score, mean_absolute_percentage_error
from sklearn.model_selection import train_test_split
from scipy.stats import *
import h3
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from datetime import datetime
from math import floor
import matplotlib.pyplot as plt
import mlflow
import mlflow.sklearn

In [2]:
ride_columns = ['accepted_driver_id', 'created_at', 'passenger_id', 'source_lat',
       'source_lng', 'destination_lat', 'destination_lng', 'eta', 'provider',
       'ata', 'id', 'city']

khatkesh_columns = ['ride_id','driver_id','a_t_a_result.arrival_a_t_a','a_t_a_result.boarding_a_t_a','a_t_a_result.ride_a_t_a','a_t_a_result.arrival_probe_result.probe.point.lat','a_t_a_result.arrival_probe_result.probe.point.lon','a_t_a_result.arrival_probe_result.probe.timestamp','a_t_a_result.arrival_probe_result.confidence','a_t_a_result.arrival_probe_result.h3_index', 'a_t_a_result.arrival_probe_result.k_ring_level','a_t_a_result.boarding_probe_result.probe.point.lat','a_t_a_result.boarding_probe_result.probe.point.lon','a_t_a_result.boarding_probe_result.probe.timestamp','a_t_a_result.boarding_probe_result.confidence','a_t_a_result.boarding_probe_result.h3_index','a_t_a_result.boarding_probe_result.k_ring_level','a_t_a_result.final_destination_probe_result.probe.point.lat','a_t_a_result.final_destination_probe_result.probe.point.lon','a_t_a_result.final_destination_probe_result.probe.timestamp','a_t_a_result.final_destination_probe_result.confidence','a_t_a_result.final_destination_probe_result.h3_index','a_t_a_result.final_destination_probe_result.k_ring_level','a_t_a_result.destination_probe_result.probe.point.lat','a_t_a_result.destination_probe_result.probe.point.lon','a_t_a_result.destination_probe_result.probe.timestamp','a_t_a_result.destination_probe_result.confidence','a_t_a_result.destination_probe_result.h3_index','a_t_a_result.destination_probe_result.k_ring_level','a_t_a_result.extra_destination_probe_result.probe.point.lat','a_t_a_result.extra_destination_probe_result.probe.point.lon','a_t_a_result.extra_destination_probe_result.probe.timestamp','a_t_a_result.extra_destination_probe_result.confidence','a_t_a_result.extra_destination_probe_result.h3_index','a_t_a_result.extra_destination_probe_result.k_ring_level','pickup_a_d_d_result.distance','pickup_a_d_d_result.confidence','pickup_a_d_d_result.route_ratio','pickup_a_d_d_result.g_p_s_ratio','ride_a_d_d_result.distance','ride_a_d_d_result.confidence','ride_a_d_d_result.route_ratio','ride_a_d_d_result.g_p_s_ratio','total_a_d_d_confidence','in_ride_allotment','e_d_d','clickhouse_time','hash']

train_dates = ['2022-08-02', '2022-08-03', '2022-08-04', '2022-08-05', '2022-08-06', '2022-08-07', '2022-08-08',
               '2022-08-09', '2022-08-10', '2022-08-11', '2022-08-12', '2022-08-13', '2022-08-14', '2022-08-15',
               '2022-08-16', '2022-08-17', '2022-08-18', '2022-08-19', '2022-08-20', '2022-08-21', '2022-08-22',
               '2022-08-23', '2022-08-24', '2022-08-25', '2022-08-26', '2022-08-27', '2022-08-28', '2022-08-29',
               '2022-08-30', '2022-08-31', '2022-09-01', '2022-09-02', '2022-09-03', '2022-09-04', '2022-09-05',
               '2022-09-06', '2022-09-07', '2022-09-08', '2022-09-09', '2022-09-10']

train_holidays = ['2022-08-07', '2022-08-08']

test_dates = ['2022-09-11', '2022-09-12', '2022-09-13', '2022-09-14', '2022-09-15', '2022-09-16', '2022-09-17',
              '2022-09-18', '2022-09-19', '2022-09-20', '2022-09-21', '2022-09-22', '2022-09-23', '2022-09-24',
              '2022-09-25', '2022-09-26', '2022-09-27', '2022-09-28', '2022-09-29', '2022-09-30', '2022-10-01',
              '2022-10-02', '2022-10-03', '2022-10-04', '2022-10-05', '2022-10-06', '2022-10-07', '2022-10-08',
              '2022-10-09']

test_holidays = ['2022-09-17', '2022-09-25', '2022-09-27', '2022-10-05']

In [3]:
rides_train = pd.read_csv('../rides_train.csv')
rides_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4063341 entries, 0 to 4063340
Data columns (total 59 columns):
 #   Column                                                       Dtype  
---  ------                                                       -----  
 0   accepted_driver_id                                           int64  
 1   created_at                                                   object 
 2   passenger_id                                                 int64  
 3   source_lat                                                   float64
 4   source_lng                                                   float64
 5   destination_lat                                              float64
 6   destination_lng                                              float64
 7   eta                                                          int64  
 8   provider                                                     object 
 9   ata                                                          int64  

In [4]:
rides_test = pd.read_csv('../rides_test.csv')
rides_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2729988 entries, 0 to 2729987
Data columns (total 59 columns):
 #   Column                                                       Dtype  
---  ------                                                       -----  
 0   accepted_driver_id                                           int64  
 1   created_at                                                   object 
 2   passenger_id                                                 int64  
 3   source_lat                                                   float64
 4   source_lng                                                   float64
 5   destination_lat                                              float64
 6   destination_lng                                              float64
 7   eta                                                          int64  
 8   provider                                                     object 
 9   ata                                                          int64  

In [None]:
rides_train_p1 = rides_train[rides_train['eta'] < 600]
rides_train_p1

In [None]:
rides_test_p1 = rides_test[(rides_test['eta'] < 600)]
rides_test_p1

There are some duplicates in the data

In [None]:
rides_train_p1 = rides_train_p1.drop_duplicates(subset=['ride_id', 'ata'])
len(rides_train_p1)

In [None]:
rides_test_p1 = rides_test_p1.drop_duplicates(subset=['ride_id', 'ata'])
len(rides_test_p1)

In [None]:
rides_train_p1 = rides_train_p1[(rides_train_p1['a_t_a_result.ride_a_t_a'] < 10800)]
rides_train_p1.describe()

In [None]:
rides_train_p1 = rides_train_p1[(rides_train_p1['a_t_a_result.destination_probe_result.confidence'] > 0) & (rides_train_p1['a_t_a_result.boarding_probe_result.confidence'] > 0)]
rides_train_p1.describe()

In [None]:
rides_test_p1 = rides_test_p1[(rides_test_p1['a_t_a_result.destination_probe_result.confidence'] > 0) & (rides_test_p1['a_t_a_result.boarding_probe_result.confidence'] > 0)]
rides_test_p1.describe()

In [None]:
rides_train_p1 = rides_train_p1[['eta', 'ata', 'a_t_a_result.ride_a_t_a', 'created_at', 'source_lat', 'source_lng', 'destination_lat', 'destination_lng', 'e_d_d', 'clickhouse_time']]
rides_train_p1

In [None]:
rides_test_p1 = rides_test_p1[['eta', 'ata', 'a_t_a_result.ride_a_t_a', 'created_at', 'source_lat', 'source_lng', 'destination_lat', 'destination_lng', 'e_d_d', 'clickhouse_time']]
rides_test_p1

In [None]:
rides_train_p1['source_h3_4'] = rides_train_p1.apply(lambda row: h3.geo_to_h3(row.source_lat, row.source_lng, 4), axis=1)
rides_train_p1['dest_h3_4'] = rides_train_p1.apply(lambda row: h3.geo_to_h3(row.destination_lat, row.destination_lng, 4), axis=1)
rides_train_p1['hour'] = rides_train_p1.apply(lambda row: ((datetime.timestamp(datetime.strptime(row.created_at,"%Y-%m-%d %H:%M:%S")) - 1662147000) % 604800) / 3600, axis=1)
rides_train_p1['holiday'] = rides_train_p1.apply(lambda row: int(row.created_at.split()[0] in train_holidays), axis=1)
rides_train_p1

In [None]:
rides_test_p1['source_h3_4'] = rides_test_p1.apply(lambda row: h3.geo_to_h3(row.source_lat, row.source_lng, 4), axis=1)
rides_test_p1['dest_h3_4'] = rides_test_p1.apply(lambda row: h3.geo_to_h3(row.destination_lat, row.destination_lng, 4), axis=1)
rides_test_p1['hour'] = rides_test_p1.apply(lambda row: ((datetime.timestamp(datetime.strptime(row.created_at,"%Y-%m-%d %H:%M:%S")) - 1662147000) % 604800) / 3600, axis=1)
rides_test_p1['holiday'] = rides_test_p1.apply(lambda row: int(row.created_at.split()[0] in test_holidays), axis=1)
rides_test_p1

In [None]:
train_p1_geo_x = rides_train_p1[["eta", "source_lat", "source_lng", "destination_lat", "destination_lng", "hour", "holiday"]]
train_p1_h3_x = rides_train_p1[["eta", "source_h3_4", "dest_h3_4", "hour", "holiday"]]
train_p1_geo_y = rides_train_p1["a_t_a_result.ride_a_t_a"]
train_p1_h3_y = rides_train_p1["a_t_a_result.ride_a_t_a"]

In [None]:
test_p1_geo_x = rides_test_p1[["eta", "source_lat", "source_lng", "destination_lat", "destination_lng", "hour", "holiday"]]
test_p1_h3_x = rides_test_p1[["eta", "source_h3_4", "dest_h3_4", "hour", "holiday"]]
test_p1_geo_y = rides_test_p1["a_t_a_result.ride_a_t_a"]
test_p1_h3_y = rides_test_p1["a_t_a_result.ride_a_t_a"]

In [None]:
dummy_train_rides = pd.get_dummies(train_p1_h3_x, prefix={'source_h3_4':'source_h3_4',
                                                             'dest_h3_4':'dest_h3_4'})
dummy_train_rides.describe()

In [None]:
print(np.count_nonzero(dummy_train_rides, axis=0))
print(dummy_train_rides.columns)

In [None]:
geo_noise = np.count_nonzero(dummy_train_rides, axis=0) < 100

cols = dummy_train_rides.columns
for col, noise in zip(cols, geo_noise):
    if noise:
        # DOESN'T REINDEX
        # remove the rows where rides_train_p1_x is in the noisy geo (geo is 1)
        train_p1_h3_x = train_p1_h3_x[dummy_train_rides[col] == 0]
        train_p1_h3_y = train_p1_h3_y[dummy_train_rides[col] == 0]

train_p1_h3_x.describe()

In [None]:
ohe = ColumnTransformer(
    [('OHE', OneHotEncoder(handle_unknown='ignore', sparse=False),['source_h3_4', 'dest_h3_4'])],
    remainder = 'passthrough'
    ).fit(train_p1_h3_x)

In [None]:
ohe.get_feature_names_out()

In [None]:
train_p1_h3_x = ohe.transform(train_p1_h3_x)

In [None]:
test_p1_h3_x = ohe.transform(test_p1_h3_x)

In [None]:
reg_geo_p1 = LinearRegression().fit(train_p1_geo_x, train_p1_geo_y)

In [None]:
print(reg_geo_p1.coef_)
print(reg_geo_p1.intercept_)

In [None]:
train_pred_geo = reg_geo_p1.predict(train_p1_geo_x)
test_pred_geo = reg_geo_p1.predict(test_p1_geo_x)

In [None]:
reg_h3_p1 = LinearRegression().fit(train_p1_h3_x, train_p1_h3_y)

In [None]:
print(reg_h3_p1.coef_)
print(reg_h3_p1.intercept_)

In [None]:
train_pred_h3 = reg_h3_p1.predict(train_p1_h3_x)
test_pred_h3 = reg_h3_p1.predict(test_p1_h3_x)

In [None]:
known_h3_index = np.count_nonzero(test_p1_h3_x[:, 0:17], axis=1) == 2

In [None]:
test_p1_h3_y = test_p1_h3_y[known_h3_index]
test_p1_h3_x = test_p1_h3_x[known_h3_index]
test_pred_h3 = test_pred_h3[known_h3_index]

In [None]:
train_p1_geo_x

In [44]:
print("ETA train MAE", mean_absolute_error(train_p1_geo_y, train_p1_geo_x['eta']))
print("prediction train MAE", mean_absolute_error(train_p1_geo_y, train_pred_geo))

print("ETA test MAE ", mean_absolute_error(test_p1_geo_y, test_p1_geo_x['eta']))
print("prediction test MAE", mean_absolute_error(test_p1_geo_y, test_pred_geo))


print("ETA train MSE", mean_squared_error(train_p1_geo_y, train_p1_geo_x['eta']))
print("prediction train MSE", mean_squared_error(train_p1_geo_y, train_pred_geo))

print("ETA test MSE ", mean_squared_error(test_p1_geo_y, test_p1_geo_x['eta']))
print("prediction test MSE", mean_squared_error(test_p1_geo_y, test_pred_geo))


print("ETA train explained_variance_score", explained_variance_score(train_p1_geo_y, train_p1_geo_x['eta']))
print("prediction train explained_variance_score", explained_variance_score(train_p1_geo_y, train_pred_geo))

print("ETA test explained_variance_score ", explained_variance_score(test_p1_geo_y, test_p1_geo_x['eta']))
print("prediction test explained_variance_score", explained_variance_score(test_p1_geo_y, test_pred_geo))


print("ETA train median_absolute_error", median_absolute_error(train_p1_geo_y, train_p1_geo_x['eta']))
print("prediction train median_absolute_error", median_absolute_error(train_p1_geo_y, train_pred_geo))

print("ETA test median_absolute_error ", median_absolute_error(test_p1_geo_y, test_p1_geo_x['eta']))
print("prediction test median_absolute_error", median_absolute_error(test_p1_geo_y, test_pred_geo))


print("ETA train r2_score", r2_score(train_p1_geo_y, train_p1_geo_x['eta']))
print("prediction train r2_score", r2_score(train_p1_geo_y, train_pred_geo))

print("ETA test r2_score ", r2_score(test_p1_geo_y, test_p1_geo_x['eta']))
print("prediction test r2_score", r2_score(test_p1_geo_y, test_pred_geo))


print("ETA train MAPE", mean_absolute_percentage_error(train_p1_geo_y, train_p1_geo_x['eta']))
print("prediction train MAPE", mean_absolute_percentage_error(train_p1_geo_y, train_pred_geo))

print("ETA test MAPE ", mean_absolute_percentage_error(test_p1_geo_y, test_p1_geo_x['eta']))
print("prediction test MAPE", mean_absolute_percentage_error(test_p1_geo_y, test_pred_geo))

ETA train MAE 328.5940455458648
prediction train MAE 297.90919973258815
ETA test MAE  334.4388657208343
prediction test MAE 313.0926640552622
ETA train MSE 267538.4672356914
prediction train MSE 197278.02374889096
ETA test MSE  307447.5143727894
prediction test MSE 236500.08092897446
ETA train explained_variance_score 0.17476575601716515
prediction train explained_variance_score 0.19219475004177133
ETA test explained_variance_score  0.15194470364736523
prediction test explained_variance_score 0.17348450223625478
ETA train median_absolute_error 205.0
prediction train median_absolute_error 224.17002217840854
ETA test median_absolute_error  197.0
prediction test median_absolute_error 226.25554632038256
ETA train r2_score -0.0955045792321001
prediction train r2_score 0.19219475004177133
ETA test r2_score  -0.07452246219765657
prediction test r2_score 0.17343729453082235
ETA train MAPE 0.16670909089984084
prediction train MAPE 0.17256322803076857
ETA test MAPE  0.16769182531397583
predictio

In [46]:
train_p4_geo_x_normalized = (train_p4_geo_x - train_p4_geo_x.min()) / (train_p4_geo_x.max() - train_p4_geo_x.min())
train_p4_geo_y_normalized = (train_p4_geo_y - train_p4_geo_y.min()) / (train_p4_geo_y.max() - train_p4_geo_y.min())
# test_p4_geo_x_normalized = (test_p4_geo_x - train_p4_geo_x.min()) / (train_p4_geo_x.max() - train_p4_geo_x.min())
# test_p4_geo_y_normalized = (test_p4_geo_y - train_p4_geo_y.min()) / (train_p4_geo_y.max() - train_p4_geo_y.min())

reg_p4_geo_normalized = LinearRegression().fit(train_p4_geo_x_normalized, train_p4_geo_y_normalized)
print(reg_p4_geo_normalized.coef_)
print(reg_p4_geo_normalized.intercept_)

[ 0.95267888  0.03992745  0.01639903  0.14161629  0.07104881 -0.00869313
 -0.01985985]
0.027375837176890544


In [39]:
print("ETA train MAE", mean_absolute_error(train_p4_h3_y, train_p4_h3_x[:, 17]))
print("prediction train MAE", mean_absolute_error(train_p4_h3_y, train_pred_h3))

print("ETA test MAE ", mean_absolute_error(test_p4_h3_y, test_p4_h3_x[:, 17]))
print("prediction test MAE", mean_absolute_error(test_p4_h3_y, test_pred_h3))


print("ETA train MSE", mean_squared_error(train_p4_h3_y, train_p4_h3_x[:, 17]))
print("prediction train MSE", mean_squared_error(train_p4_h3_y, train_pred_h3))

print("ETA test MSE ", mean_squared_error(test_p4_h3_y, test_p4_h3_x[:, 17]))
print("prediction test MSE", mean_squared_error(test_p4_h3_y, test_pred_h3))


print("ETA train explained_variance_score", explained_variance_score(train_p4_h3_y, train_p4_h3_x[:, 17]))
print("prediction train explained_variance_score", explained_variance_score(train_p4_h3_y, train_pred_h3))

print("ETA test explained_variance_score ", explained_variance_score(test_p4_h3_y, test_p4_h3_x[:, 17]))
print("prediction test explained_variance_score", explained_variance_score(test_p4_h3_y, test_pred_h3))


print("ETA train median_absolute_error", median_absolute_error(train_p4_h3_y, train_p4_h3_x[:, 17]))
print("prediction train median_absolute_error", median_absolute_error(train_p4_h3_y, train_pred_h3))

print("ETA test median_absolute_error ", median_absolute_error(test_p4_h3_y, test_p4_h3_x[:, 17]))
print("prediction test median_absolute_error", median_absolute_error(test_p4_h3_y, test_pred_h3))


print("ETA train r2_score", r2_score(train_p4_h3_y, train_p4_h3_x[:, 17]))
print("prediction train r2_score", r2_score(train_p4_h3_y, train_pred_h3))

print("ETA test r2_score ", r2_score(test_p4_h3_y, test_p4_h3_x[:, 17]))
print("prediction test r2_score", r2_score(test_p4_h3_y, test_pred_h3))


print("ETA train MAPE", mean_absolute_percentage_error(train_p4_h3_y, train_p4_h3_x[:, 17]))
print("prediction train MAPE", mean_absolute_percentage_error(train_p4_h3_y, train_pred_h3))

print("ETA test MAPE ", mean_absolute_percentage_error(test_p4_h3_y, test_p4_h3_x[:, 17]))
print("prediction test MAPE", mean_absolute_percentage_error(test_p4_h3_y, test_pred_h3))

ETA train MAE 558.6899742605314
prediction train MAE 468.32320172854264
ETA test MAE  609.4802658229795
prediction test MAE 532.3905549659162
ETA train MSE 724259.5492995734
prediction train MSE 474510.4137146755
ETA test MSE  1005379.4640005743
prediction test MSE 706504.3378821977
ETA train explained_variance_score 0.5486781823287357
prediction train explained_variance_score 0.5735345690702949
ETA test explained_variance_score  0.4753275329161858
prediction test explained_variance_score 0.5084530615897137
ETA train median_absolute_error 353.0
prediction train median_absolute_error 347.19482421875
ETA test median_absolute_error  352.0
prediction test median_absolute_error 366.8814697265625
ETA train r2_score 0.3490729563150865
prediction train r2_score 0.5735345690702917
ETA test r2_score  0.2984759208241079
prediction test r2_score 0.5070221515223838
ETA train MAPE 0.16697790395524195
prediction train MAPE 0.1610154609737745
ETA test MAPE  0.1730999583791752
prediction test MAPE 0.17

In [47]:
train_p4_h3_x_normalized = (train_p4_h3_x - train_p4_h3_x.min()) / (train_p4_h3_x.max() - train_p4_h3_x.min())
train_p4_h3_y_normalized = (train_p4_h3_y - train_p4_h3_y.min()) / (train_p4_h3_y.max() - train_p4_h3_y.min())
# test_p4_h3_x_normalized = (test_p4_h3_x - train_p4_h3_x.min()) / (train_p4_h3_x.max() - train_p4_h3_x.min())
# test_p4_h3_y_normalized = (test_p4_h3_y - train_p4_h3_y.min()) / (train_p4_h3_y.max() - train_p4_h3_y.min())

reg_p4_h3_normalized = LinearRegression().fit(train_p4_h3_x_normalized, train_p4_h3_y_normalized)
print(reg_p4_h3_normalized.coef_)
print(reg_p4_h3_normalized.intercept_)

[ 3.38839243e+11  3.38839243e+11  3.38839244e+11  3.38839243e+11
  3.38839243e+11  3.38839243e+11  3.38839243e+11 -1.81469415e+11
 -1.81469415e+11 -1.81469415e+11 -1.81469415e+11 -1.81469415e+11
 -1.81469415e+11 -1.81469415e+11 -1.81469415e+11 -1.81469415e+11
 -1.81469415e+11  1.10072538e+00 -4.58049363e-01 -1.50679642e+02]
-17055362.3876918


In [None]:
%set_env AWS_ACCESS_KEY_ID=SokXIEc1g9vNqCJt4CSObyk6vumoOOPQ
%set_env AWS_SECRET_ACCESS_KEY=QNyTpGhFjUTYSP9VKmfhpUizwKr0t8gk
%set_env MLFLOW_S3_ENDPOINT_URL=https://minio-clustered-smapp-storage.apps.private.teh-1.snappcloud.io

# create experiment
%set_env MLFLOW_TRACKING_URI=https://mlflow.apps.private.okd4.teh-1.snappcloud.io/
# mlflow experiments create --experiment-name elahe

# run script under experiment
%set_env MLFLOW_EXPERIMENT_NAME=elahe
# cd save/
# python test.py

In [None]:
! echo $AWS_ACCESS_KEY_ID
! echo $AWS_SECRET_ACCESS_KEY
! echo $MLFLOW_S3_ENDPOINT_URL
! echo $MLFLOW_TRACKING_URI
! echo $MLFLOW_EXPERIMENT_NAME

In [None]:
# with mlflow.start_run(run_name="regression") as run:
#     # Log the sklearn model and register as version 1
#     mlflow.sklearn.log_model(
#         sk_model=reg_p4,
#         artifact_path="regression",
#         registered_model_name="reg-model"
#     )

In [None]:
# with mlflow.start_run(run_name="YOUR_RUN_NAME") as run:
#     params = {"n_estimators": 5, "random_state": 42}
#     sk_learn_rfr = RandomForestRegressor(**params)
#
#     # Log parameters and metrics using the MLflow APIs
#     mlflow.log_params(params)
#     mlflow.log_param("param_1", randint(0, 100))
#     mlflow.log_metrics({"metric_1": random(), "metric_2": random() + 1})
#
#     # Log the sklearn model and register as version 1
#     mlflow.sklearn.log_model(
#         sk_model=sk_learn_rfr,
#         artifact_path="sklearn-model",
#         registered_model_name="sk-learn-random-forest-reg-model"
#     )