In [1]:
import pandas as pd

In [2]:
# Load the raw inputs. The three CSV tables go through the same reader, so they
# are read in one pass, in the same order as the names on the left-hand side.
orders_df, nodes_df, geolocation_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "datasets/orders.csv",
        "datasets/nodes.csv",
        "datasets/geolocation.csv",
    )
)
# The weather file is JSON. Per the shape/head() output recorded further down,
# it loads as a 3x1 frame whose single `data` column holds nested lists, so it
# needs flattening before it can be used as tabular data.
weather_df = pd.read_json("datasets/weather.json")

### Orders info

In [3]:
orders_df.shape

(5000, 5)

In [4]:
orders_df.dtypes

Id                     int64
running_time          object
completed_time        object
route_distance_km    float64
delta_time           float64
dtype: object

In [5]:
orders_df.describe()

Unnamed: 0,Id,route_distance_km,delta_time
count,5000.0,5000.0,5000.0
mean,1.437166e+17,3.705945,613.1616
std,5.322209e+18,1.767206,213.802608
min,-9.220688e+18,0.0,50.0
25%,-4.422074e+18,2.35975,441.0
50%,1.826612e+17,3.394,614.0
75%,4.817401e+18,4.75325,788.0
max,9.222287e+18,15.954,999.0


In [6]:
orders_df.head()

Unnamed: 0,Id,running_time,completed_time,route_distance_km,delta_time
0,-4773019581999572651,2022-01-24 18:30:21,2022-01-24 18:44:43,3.74,862.0
1,-7575630690398473489,2022-01-24 06:53:53,2022-01-24 07:06:26,3.526,753.0
2,-6264582368520213833,2022-01-24 10:00:59,2022-01-24 10:15:58,5.071,899.0
3,5964315354301636538,2022-01-24 14:28:05,2022-01-24 14:35:08,2.867,423.0
4,1372379574816145639,2022-01-24 11:57:29,2022-01-24 12:06:29,3.751,540.0


### Nodes info

In [7]:
nodes_df.shape

(480291, 5)

In [8]:
nodes_df.dtypes

Id               int64
node_start       int64
node_finish      int64
distance       float64
speed          float64
dtype: object

In [9]:
nodes_df.describe()

Unnamed: 0,Id,node_start,node_finish,distance,speed
count,480291.0,480291.0,480291.0,480291.0,476307.0
mean,1.325593e+17,3831874000.0,3831719000.0,37.992984,31.293073
std,5.329873e+18,2702820000.0,2702986000.0,43.718081,10.131391
min,-9.221524e+18,10980420.0,10980420.0,0.076575,5.0
25%,-4.399445e+18,1570777000.0,1570777000.0,9.548104,24.0
50%,1.556848e+17,4439628000.0,4439628000.0,22.23788,30.0
75%,4.817189e+18,5214945000.0,5215058000.0,50.181445,37.0
max,9.222287e+18,8952487000.0,8952487000.0,1374.510868,100.0


In [10]:
nodes_df.head()

Unnamed: 0,Id,node_start,node_finish,distance,speed
0,-2627062893189810184,10980432,2133368107,17.414917,32.0
1,-2627062893189810184,10980433,5212387954,17.186539,26.0
2,-2627062893189810184,10980445,5221700954,28.513481,26.0
3,-2627062893189810184,10980498,10980445,154.266122,25.0
4,-2627062893189810184,10980647,1986137911,8.542824,29.0


### Weather info

In [11]:
weather_df.shape

(3, 1)

In [12]:
weather_df.head()

Unnamed: 0,data
nearest_area,"[{'areaName': [{'value': 'Odessa'}], 'country'..."
request,"[{'type': 'City', 'query': 'Odessa, Ukraine'}]"
weather,"[{'date': '2022-01-24', 'astronomy': [{'sunris..."


### Geolocation

In [13]:
geolocation_df.head()

Unnamed: 0.1,Unnamed: 0,Id,node_start,node_finish,distance,speed,lon_start,lat_start,tags_start,lon_end,lat_end,tags_end
0,0,1403210517021349791,290773715,2041636852,64.755864,36.0,30.724242,46.438158,,30.724242,46.438158,
1,1,-6121885842330106250,290773715,2041636852,64.755799,36.0,30.724242,46.438158,,30.724242,46.438158,
2,2,1360686813931198539,290773715,2041636852,64.755867,41.0,30.724242,46.438158,,30.724242,46.438158,
3,3,-6585029545990233408,290773715,2041636852,64.755776,46.0,30.724242,46.438158,,30.724242,46.438158,
4,4,-7417366892351501483,290773715,2041636852,64.755791,51.0,30.724242,46.438158,,30.724242,46.438158,


In [14]:
# Keep only the identifier and the start/end coordinates. This drops the two
# unnamed index columns as well as node_start, node_finish, distance, speed and
# the tag columns, so from here on `Id` is the only key left to join on.
geolocation_df = geolocation_df.loc[
    :, ["Id", "lon_start", "lat_start", "lon_end", "lat_end"]
]

In [15]:
geolocation_df.shape

(428737, 5)

In [16]:
geolocation_df.head()

Unnamed: 0,Id,lon_start,lat_start,lon_end,lat_end
0,1403210517021349791,30.724242,46.438158,30.724242,46.438158
1,-6121885842330106250,30.724242,46.438158,30.724242,46.438158
2,1360686813931198539,30.724242,46.438158,30.724242,46.438158
3,-6585029545990233408,30.724242,46.438158,30.724242,46.438158
4,-7417366892351501483,30.724242,46.438158,30.724242,46.438158


### Preprocessing

1. Compute the average speed during the running time: for each `Id`, take the mean of `speed` over all of its rows in `nodes_df`.

In [17]:
# One row per `Id`: the mean of `speed` over that Id's rows in nodes_df.
# `mean` skips missing values by default, which matters here because the
# describe() output above reports fewer non-null speeds than rows.
speed_mean_df = (
    nodes_df
    .groupby("Id", as_index=False)
    .agg(average_speed=("speed", "mean"))
)

In [18]:
speed_mean_df.shape

(6000, 2)

### Final info

In [19]:
# Build the final table by inner-joining every source on `Id`:
# orders -> per-row node data -> per-Id average speed -> coordinates.
# NOTE(review): the recorded output below is an empty frame, shape (0, 14), so
# at least one of these joins currently matches no `Id` at all. Check the
# overlap of Id values between the frames (for example an outer merge with
# `indicator=True`) before relying on `df`.
# NOTE(review): nodes_df has many rows per Id (480291 rows, 6000 distinct Ids),
# and geolocation_df appears to as well (TODO confirm). Joining them on `Id`
# alone would pair every nodes row with every geolocation row of that Id.
df = (
    orders_df
    .merge(nodes_df, how="inner", on="Id")
    .merge(speed_mean_df, how="inner", on="Id")
    .merge(geolocation_df, how="inner", on="Id")
)

In [20]:
df.head(n=100)

Unnamed: 0,Id,running_time,completed_time,route_distance_km,delta_time,node_start,node_finish,distance,speed,average_speed,lon_start,lat_start,lon_end,lat_end


In [21]:
df.shape

(0, 14)