## Load Libraries

In [1]:
import pandas          as pd
import numpy           as np
import pyarrow.parquet as pq
import plotly.express  as px
import plotly.io       as pio

# Set plotly's default renderer to 'notebook' for every fig.show() call below.
pio.renderers.default='notebook'

## Read data

### NM2 TRIP DATA (2021-03-06)

In [4]:
%%time
nm2_trip_data_file = '/home/darkoe1/notebooks/Telematics/nm2_get_data/NM2_RT_TripData_OneFilePerDay/trip_summary_nm2_2021-03-06.parquet'


nm2_trip_data = (pq.read_table(nm2_trip_data_file)
                 .to_pandas()
                 .sort_values('trip_start_datetime_utc')
                 .reset_index(drop=True)
                 )

print(f"Number of Trip Days: {(nm2_trip_data.trip_start_datetime_utc.dt.date).nunique()}")
print(f"Number of Load Days: {len(pd.date_range('2020-06-05', '2021-03-02'))}")
print(f"Trip Count: {len(nm2_trip_data):,}")
print(f"Driver Count: {nm2_trip_data.short_user_id.nunique():,}")
nm2_trip_data.head(3)

Number of Trip Days: 11
Number of Load Days: 271
Trip Count: 105,553
Driver Count: 20,201
CPU times: user 240 ms, sys: 91.6 ms, total: 331 ms
Wall time: 310 ms


Unnamed: 0,drive_id,trip_label,trip_start_datetime_utc,trip_end_datetime_utc,trip_utc_offset,adjusted_distance_km,sum_rq5,moving_sec,percent_trip_missing,short_user_id,account_id,load_datetime_utc
0,AE323D6D-1369-486E-BBB0-4EB78A20B1BA,car,2021-02-19 21:22:17,2021-02-19 21:35:03,-06:00:00,14.1216,2120.075093,674,11.34,23579523,af01e3b8-87e6-4f6a-a655-2c7665cd6149,2021-05-21 07:32:03
1,423B4B53-23EF-49EC-B8C1-DC063BD8DBDD,car,2021-02-19 23:56:09,2021-02-20 00:10:52,-06:00:00,36.7948,2472.642917,668,21.83,23579523,af01e3b8-87e6-4f6a-a655-2c7665cd6149,2021-05-20 20:25:45
2,02F5D1EB-CD2E-49D0-B173-EB8736E8EB1A,bike,2021-02-21 04:45:12,2021-02-21 04:49:24,-05:00:00,1.4124,287.676992,250,0.0,80668142,0fbd7b01-14c3-4d3f-aa03-03cf467c6274,2021-05-20 16:15:15


Check whether any records have distance < 0 (i.e., whether trip details are in ascending order)

In [8]:
# Show every trip whose adjusted distance is negative (an empty frame means none).
nm2_trip_data[nm2_trip_data['adjusted_distance_km'] < 0]

Unnamed: 0,drive_id,trip_label,trip_start_datetime_utc,trip_end_datetime_utc,trip_utc_offset,adjusted_distance_km,sum_rq5,moving_sec,percent_trip_missing,short_user_id,account_id,load_datetime_utc


Check whether each user has only one trip label (no, they don't: the user below has both `bike` and `car` trips)

In [11]:
# List all trips for one example account so its trip_label values can be compared.
nm2_trip_data[nm2_trip_data['account_id'] == "0fbd7b01-14c3-4d3f-aa03-03cf467c6274"]

Unnamed: 0,drive_id,trip_label,trip_start_datetime_utc,trip_end_datetime_utc,trip_utc_offset,adjusted_distance_km,sum_rq5,moving_sec,percent_trip_missing,short_user_id,account_id,load_datetime_utc
2,02F5D1EB-CD2E-49D0-B173-EB8736E8EB1A,bike,2021-02-21 04:45:12,2021-02-21 04:49:24,-05:00:00,1.4124,287.676992,250,0.0,80668142,0fbd7b01-14c3-4d3f-aa03-03cf467c6274,2021-05-20 16:15:15
3,CA0B75F9-F067-4908-8CCF-A0599AF401A7,bike,2021-02-21 15:38:05,2021-02-21 15:49:46,-05:00:00,8.445,1077.079619,691,1.57,80668142,0fbd7b01-14c3-4d3f-aa03-03cf467c6274,2021-05-21 09:21:34
4,F4FBED47-BBE2-44E5-ACBE-3CDA1F15A21E,car,2021-02-21 17:23:50,2021-02-21 17:58:45,-05:00:00,62.613,3959.649822,2082,0.19,80668142,0fbd7b01-14c3-4d3f-aa03-03cf467c6274,2021-05-21 12:07:26
5,CD57F686-DB5C-44B5-A34D-03773E8D9C77,car,2021-02-21 18:01:03,2021-02-21 18:05:52,-05:00:00,7.298,1045.688135,232,0.69,80668142,0fbd7b01-14c3-4d3f-aa03-03cf467c6274,2021-05-21 09:33:58
6,F18042D8-A21D-41CD-B734-2B5DAAF87936,car,2021-02-22 16:21:30,2021-02-22 16:23:11,-05:00:00,0.9612,336.768629,69,15.69,80668142,0fbd7b01-14c3-4d3f-aa03-03cf467c6274,2021-05-21 11:53:36
7,4BD88816-19F3-47E2-9872-486F92F5E6E1,car,2021-02-22 17:21:19,2021-02-22 17:25:24,-05:00:00,4.4508,1258.454578,230,0.0,80668142,0fbd7b01-14c3-4d3f-aa03-03cf467c6274,2021-05-20 21:03:32
68477,AA6D5876-099B-4D89-AF1E-DC25BB8FC42F,car,2021-03-06 19:02:53,2021-03-06 19:10:27,-05:00:00,5.169,2252.388379,396,0.66,80668142,0fbd7b01-14c3-4d3f-aa03-03cf467c6274,2021-05-21 07:17:13


### NM2 User Label Data

In [None]:
%%time
nm2_user_label_file = '/home/alis15/notebooks/Telematics/1.2 CMT/nm2_UserLabel_06052020_03022021.parquet'

nm2_user_label_data = (pq.read_table(nm2_user_label_file)
                       .to_pandas()
                       .sort_values('load_date')
                       .reset_index(drop=True)
                       )

print(f"Record Count: {len(nm2_user_label_data):,}")
nm2_user_label_data.head(3)

### Driver Summary

In [13]:
%%time
nm2_driver_summary = (nm2_trip_data
                      .set_index('short_user_id')
                      .reset_index()
                      .groupby(["short_user_id", "account_id"])
                      .agg(adjusted_distance_km=("adjusted_distance_km", "sum"),
                           sum_rq5=("sum_rq5", "sum"),
                           moving_sec=("moving_sec", "sum"),
                           last_trip_date = ("trip_end_datetime_utc", "max"),
                           number_of_trips = ("trip_end_datetime_utc", "count"))
                      .assign(Rq5a=lambda df: df.sum_rq5/df.moving_sec)
                      .reset_index()
                      .filter(["short_user_id", "account_id", "adjusted_distance_km", "Rq5a", "number_of_trips"], axis=1)
                      )
print(f"Driver Count: {len(nm2_driver_summary):,}")
nm2_driver_summary.head(3)

Driver Count: 20,201
CPU times: user 170 ms, sys: 9.54 ms, total: 179 ms
Wall time: 178 ms


Unnamed: 0,short_user_id,account_id,adjusted_distance_km,Rq5a,number_of_trips
0,10000357,37b36adb-708a-4e4e-a142-4d27dabff397,99.4888,5.114341,7
1,10021061,8dd694ac-6783-4526-bc01-c715380bab52,94.7279,4.254542,4
2,10021470,2e3b0771-d5e4-4895-8dc3-ac06934c2eb0,144.3952,3.431643,22


In [14]:
# Show the summary row for the same example account inspected earlier.
nm2_driver_summary[nm2_driver_summary['account_id'] == "0fbd7b01-14c3-4d3f-aa03-03cf467c6274"]

Unnamed: 0,short_user_id,account_id,adjusted_distance_km,Rq5a,number_of_trips
15819,80668142,0fbd7b01-14c3-4d3f-aa03-03cf467c6274,90.3494,2.586761,7


### Distribution of sum_rq5

In [6]:
# Histogram of the per-trip sum_rq5 column. The layout tweaks (monospace font,
# hover disabled) are chained onto the figure constructor.
fig_rq5a = (
    px.histogram(
        nm2_trip_data,
        x='sum_rq5',
        title='<b>sum_rq5</b>',
        labels={'sum_rq5': '<b>sum_rq5</b>'},
        template="plotly_white",
        width=800,
        height=600,
    )
    .update_layout(
        font={'family': "Courier New, monospace", 'size': 12},
        hovermode=False,
    )
)

# Render with the default plotly renderer.
fig_rq5a.show()

### Distribution of Number of Trips

In [15]:
# Histogram of trips per driver, taken from the driver summary table. The
# layout tweaks (monospace font, hover disabled) are chained onto the constructor.
fig_trips_count = (
    px.histogram(
        nm2_driver_summary,
        x='number_of_trips',
        title='<b>Distribution of Number of Trips</b>',
        labels={'number_of_trips': '<b>Number of Trips</b>'},
        template="plotly_white",
        width=800,
        height=600,
    )
    .update_layout(
        font={'family': "Courier New, monospace", 'size': 12},
        hovermode=False,
    )
)

# Render with the default plotly renderer.
fig_trips_count.show()

## Scoring

### Number of days scored

In [None]:
def _scoring_days(heartbeat):
    """Return, per row, the inclusive day count from data_collection_start to the latest date.

    The latest date is the row-wise maximum of data_collection_start,
    last_trip_date and last_request_date, taken with np.nanmax.
    NOTE(review): presumably all three columns are datetime-like -- confirm.
    """
    date_columns = ['data_collection_start', 'last_trip_date', 'last_request_date']
    latest_date = np.nanmax(heartbeat[date_columns].values, axis=1)
    elapsed = latest_date - heartbeat.data_collection_start
    # +1 so that the start day itself is counted.
    return elapsed.dt.days + 1


# NOTE(review): nm2_driver_heartbeat is not created in the cells shown here --
# confirm it is defined earlier in the notebook before running this cell.
nm2_driver_scoring = nm2_driver_heartbeat.assign(scoring_days=_scoring_days)

print(f"Driver Count: {len(nm2_driver_scoring):,}")
nm2_driver_scoring.head(3)