In [2]:
import numpy as np
import pandas as pd
from scipy.special import kl_div
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import datetime
import os

if not os.path.exists("images"):
    os.mkdir("images")
path = '/Users/georgiinusuev/PycharmProjects/SurgeVSLongRides'

In [91]:
df = pd.read_csv("/2comp/data/Result_134.csv")

In [92]:
kl_df = pd.DataFrame(columns=df['created_date_order_part'].unique(), index=df['created_date_order_part'].unique())
for dist1 in df['created_date_order_part'].unique():
    dist1_df = df[df['created_date_order_part'] == dist1]
    dist1_kl = []
    for dist2 in df['created_date_order_part'].unique():
        dist2_df = df[df['created_date_order_part'] == dist2]
        dist1_kl = dist1_kl + [sum(kl_div(dist1_df['normalized_frequency'][:min(len(dist1_df), len(dist2_df))].reset_index(drop=True), 
                                          dist2_df['normalized_frequency'][:min(len(dist1_df), len(dist2_df))].reset_index(drop=True)))]
    kl_df.loc[dist1] = dist1_kl
   
kl_df = kl_df.astype(float)

fig = px.imshow(kl_df,
                labels={'x': 'Date', 'y': 'Date', 'color': 'KL Divergence'},
                x=kl_df.columns,
                y=kl_df.index,
                color_continuous_scale='Viridis',
                title='KL Divergence Heatmap')

fig.update_layout(
    width=800,  
    height=800,  
    title={'x': 0.5, 'xanchor': 'center'})
fig.show()
fig.write_image(path+"/images/fig3.svg")

In [93]:
fig = px.line(df, 
              x='bin_start', 
              y='normalized_frequency', 
              color='created_date_order_part',
              labels={'bin_start': 'Distance Bin Start (meters)', 
                      'normalized_frequency': 'Normalized Frequency'},
              title='Normalized Distribution of Distance in Meters by Day (Binned)')
fig.update_layout(xaxis_range=[0,25000], width=1200)
fig.show()
fig.write_image(path+"/images/fig2.svg")

In [88]:
filtered_df = df[df['created_date_order_part'].isin(['2024-03-19', '2024-05-12', '2024-05-13'])]
filtered_df = filtered_df.replace({'created_date_order_part': {'2024-03-19':'Tuesday, 2024-03-19', '2024-05-12': 'Sunday, 2024-05-12', '2024-05-13': 'Monday, 2024-05-13'}})
fig = px.line(filtered_df, 
              x='bin_start', 
              y='normalized_frequency', 
              color='created_date_order_part',
              labels={'bin_start': 'Distance Bin Start (meters)', 
                      'normalized_frequency': 'Normalized Frequency'},
              title='Normalized Distribution of Distance in Meters by Day (Binned)')
fig.update_layout(xaxis_range=[0,25000], width=1200)
fig.show()
fig.write_image(path+"/images/fig4.svg")

KeyError: 'created_date_order_part'

In [89]:
df_rides = pd.read_csv("/2comp/data/indriver_e6e40_emart_incity_detail.csv")
df_rides['created_date_order_part'] = pd.to_datetime(df_rides['created_date_order_part']).dt.date
df_rides['day_of_week'] = pd.to_datetime(df_rides['created_date_order_part']).dt.day_name()

In [77]:
fig = px.line(df_rides.groupby(['created_date_order_part']).count()['#'].reset_index(),
              x='created_date_order_part',
              y='#')
fig.update_layout(width=1200, title_text='# of Rides',
                  showlegend=False)
fig.show()
fig.write_image(path+"/images/fig1.svg")

In [98]:
def q20(x):
    return x.quantile(0.2)
def q50(x):
    return x.quantile(0.5)
def q80(x):
    return x.quantile(0.8)

quantile_df = df_rides.groupby(['created_date_order_part']).agg(distance_in_meters_q02=('distance_in_meters', q20),
                                                                distance_in_meters_q05=('distance_in_meters', q50), 
                                                                distance_in_meters_q08=('distance_in_meters', q80)).reset_index()

quantile_df['created_date_order_part'] = pd.to_datetime(quantile_df['created_date_order_part'])
quantile_df['day_of_week'] = quantile_df['created_date_order_part'].dt.day_name()
quantile_df['created_date_order_part'] = quantile_df['created_date_order_part'].dt.date

fig = make_subplots(rows=3, cols=1, shared_xaxes=True, vertical_spacing=0.1,
                    subplot_titles=("0.2 Quantile", "0.5 Quantile (Median)", "0.8 Quantile"))

fig.add_trace(
    go.Scatter(x=quantile_df['created_date_order_part'], 
               y=quantile_df['distance_in_meters_q02'], 
               mode='lines+markers',
               name='0.2 Quantile',
               customdata=quantile_df[['created_date_order_part', 'distance_in_meters_q02', 'day_of_week']],
               hovertemplate='Date: %{customdata[0]}<br>Day of the week: %{customdata[2]}<br>Distance: %{customdata[1]} meters<extra></extra>'),
    row=1, col=1
)

fig.add_trace(
    go.Scatter(x=quantile_df['created_date_order_part'], 
               y=quantile_df['distance_in_meters_q05'], 
               mode='lines+markers',
               name='0.5 Quantile (Median)',
               customdata=quantile_df[['created_date_order_part', 'distance_in_meters_q05', 'day_of_week']],
               hovertemplate='Date: %{customdata[0]}<br>Day of the week: %{customdata[2]}<br>Distance: %{customdata[1]} meters<extra></extra>'),
    row=2, col=1
)

fig.add_trace(
    go.Scatter(x=quantile_df['created_date_order_part'], 
               y=quantile_df['distance_in_meters_q08'], 
               mode='lines+markers',
               name='0.8 Quantile',
               customdata=quantile_df[['created_date_order_part', 'distance_in_meters_q08', 'day_of_week']],
               hovertemplate='Date: %{customdata[0]}<br>Day of the week: %{customdata[2]}<br>Distance: %{customdata[1]} meters<extra></extra>'),
    row=3, col=1
)

fig.update_layout(height=1200, width=900, title_text="Quantiles of Distance in Meters Over Time", hoversubplots="axis", hovermode="x", showlegend=False)
fig.show()
fig.write_image(path+"/images/fig5.svg")

In [79]:
df['created_date_order_part'] = pd.to_datetime(df['created_date_order_part'])
df['day_of_week'] = df['created_date_order_part'].dt.day_name()
distance_delta = abs(df['bin_start'][1] - df['bin_start'][0])

minlong_dist_df = {}
for date_sunday in df[df['day_of_week'] == 'Sunday']['created_date_order_part'].unique():
    df_sunday = df[df['created_date_order_part'] == date_sunday]
    date_monday = date_sunday + datetime.timedelta(days=1)
    df_monday = df[df['created_date_order_part'] == date_monday]
    minlong_dist = df_monday.loc[df_monday['normalized_frequency'].idxmax()]['bin_start']
    while df_monday[df_monday['bin_start'] == minlong_dist]['normalized_frequency'].values > df_sunday[df_sunday['bin_start'] == minlong_dist]['normalized_frequency'].values:
        minlong_dist = minlong_dist + distance_delta
    minlong_dist_df[date_sunday] = minlong_dist

minlong_dist_df = pd.DataFrame(minlong_dist_df.items(), columns=['Date', 'Minimal_Long_Ride'])

fig = px.line(minlong_dist_df, x='Date', y='Minimal_Long_Ride', markers=True)
fig.add_hline(y=minlong_dist_df['Minimal_Long_Ride'].mean(), 
              line_dash='dot',
              annotation_text=f'Mean value: {minlong_dist_df['Minimal_Long_Ride'].mean():.0f}')
fig.update_layout(width=1200, title_text='Monday and Sunday Distribution Intersection Bin')
fig.show()
fig.write_image(path+"/images/fig6.svg")

In [80]:
filtered_df = pd.read_csv("/2comp/data/Result_149.csv")
filtered_df = filtered_df.replace({'day_of_week': {1:'Sunday', 2: 'Monday'}})
fig = px.line(filtered_df,
              x='bin_start',
              y='normalized_frequency',
              color='day_of_week',
              labels={'bin_start': 'Distance Bin Start (meters)',
                      'normalized_frequency': 'Normalized Frequency'},
              title='Normalized Distribution of Distance in Meters by Day (Binned): Wednesdays vs Sundays')
fig.update_layout(xaxis_range=[0,25000])
fig.show()

In [95]:
df_weekdays = pd.read_csv('/2comp/data/MEP-1656/distance_dist_grouped_by_weekday.csv')
fig = px.line(df_weekdays, 
              x='bin_start', 
              y='normalized_frequency', 
              color='week_day',
              labels={'bin_start': 'Distance Bins (meters)', 
                      'normalized_frequency': 'Normalized Frequency'},
              title='Distribution of Distance by Day of Week')
fig.update_layout(xaxis_range=[0,25000], width=1200)
fig.show()

In [82]:
# df_mondays = pd.read_csv(
#     '/Users/georgiinusuev/PycharmProjects/SurgeVSLongRides/data/MEP-1656/distance_dist_grouped_by_date.csv')
# df_mondays = df_mondays[df_mondays['week_day'] == 'Monday']
# fig = px.line(df_mondays,
#               x='bin_start',
#               y='normalized_frequency',
#               color='created_date_order_part',
#               labels={'bin_start': 'Distance Bins (meters)',
#                       'normalized_frequency': 'Normalized Frequency'},
#               title='Distribution of Distance by Date (Mondays only)')
# fig.add_trace(go.Scatter(
#     x=df_weekdays[df_weekdays['week_day'] == 'Monday']['bin_start'],
#     y=df_weekdays[df_weekdays['week_day'] == 'Monday']['normalized_frequency'],
#     name='Aggregated Monday'
# ))
# fig.update_layout(xaxis_range=[0, 25000], width=800)
# fig.show()
# 
# df_aggr = df_weekdays[(df_weekdays['week_day'] == 'Monday') & (df_weekdays['bin_start'] <= 25000)]
# df_true = df_mondays[df_mondays['bin_start'] <= 25000]
# df_merged = pd.merge(df_true, df_aggr, on='bin_start', suffixes=('_true', '_aggr'))
# df_merged['delta_normalized_frequency'] = df_merged['normalized_frequency_true'] - df_merged[
#     'normalized_frequency_aggr']
# df_merged['rel_delta_normalized_frequency'] = (df_merged['normalized_frequency_true'] - df_merged[
#     'normalized_frequency_aggr']) / df_merged['normalized_frequency_true']
# 
# fig = px.line(df_merged.groupby(by=['bin_start'])[['delta_normalized_frequency']].sum().reset_index(),
#               x='bin_start',
#               y='delta_normalized_frequency',
#               labels={'bin_start': 'Distance Bins (meters)',
#                       'delta_normalized_frequency': 'Sum of Deltas'},
#               title='Sum of Diffs Between Normalized Frequencies, Compared to Aggregated Monday')
# fig.update_layout(xaxis_range=[0, 25000], width=800)
# fig.show()
# 
# fig = px.line(df_merged,
#               x='bin_start',
#               y='delta_normalized_frequency',
#               color='created_date_order_part',
#               labels={'bin_start': 'Distance Bins (meters)',
#                       'delta_normalized_frequency': 'Delta'},
#               title='Difference Between Normalized Frequencies, Compared to Aggregated Monday')
# fig.update_layout(xaxis_range=[0, 25000], width=800)
# fig.show()
# 
# fig = px.line(df_merged,
#               x='bin_start',
#               y='rel_delta_normalized_frequency',
#               color='created_date_order_part',
#               labels={'bin_start': 'Distance Bins (meters)',
#                       'rel_delta_normalized_frequency': 'Relative Delta'},
#               title='Relative Difference Between Normalized Frequencies, Compared to Aggregated Monday')
# fig.update_layout(xaxis_range=[0, 25000], width=800)
# fig.show()

In [96]:
df_DayEnd = pd.read_csv('/2comp/data/MEP-1656/distance_dist_grouped_weekend_weekday.csv')
df_SatSun = df_weekdays[df_weekdays['week_day'].isin(['Saturday', 'Sunday'])].rename(columns={'week_day': 'day_type'})
df = pd.concat([df_DayEnd, df_SatSun])

fig = px.line(df, 
              x='bin_start', 
              y='normalized_frequency', 
              color='day_type',
              labels={'bin_start': 'Distance Bin Start (meters)', 
                      'normalized_frequency': 'Normalized Frequency'},
              title='Normalized Distribution of Distance in Meters by Day (Binned)')
fig.update_layout(xaxis_range=[0,25000], width=1200)
fig.show()

In [97]:
df_share = pd.read_csv('/2comp/data/Result_4.csv').sort_values(by='created_date_order_part')
fig=px.line(x=df_share['created_date_order_part'], y=df_share['frequency'])
fig.update_layout(xaxis=dict(title='Date'),
                  yaxis=dict(title='Sahre'),
                  title='Sahre of Rides Longer than 8.7km',
                  width=1200)
fig.show()

In [85]:
df_share.sort_values(by='created_date_order_part')

Unnamed: 0,#,created_date_order_part,frequency
39,40,2024-03-20,0.346451
23,24,2024-03-21,0.348675
38,39,2024-03-22,0.350363
7,8,2024-03-23,0.375004
0,1,2024-03-24,0.383197
...,...,...,...
31,32,2024-05-27,0.350617
61,62,2024-05-28,0.345821
16,17,2024-05-29,0.344969
34,35,2024-05-30,0.344432


In [8]:
pptx_df = pd.read_csv("/2comp/data/pptx/preza3.csv")
fig = px.line(pptx_df,
              x='bin_start',
              y='normalized_frequency',
              color='week_day',
              labels={'bin_start': '',
                      'normalized_frequency': 'Normalized Frequency'},
              title='Normalized Distribution of Distance in Meters')
fig.update_layout(xaxis_range=[0,25000], width=800)
fig.show()

pptx_df = pd.read_csv('/2comp/data/pptx/preza2.csv')
fig = px.line(pptx_df, 
              x='bin_start', 
              y='normalized_frequency', 
              color='day_type',
              labels={'bin_start': 'Distance Bins (meters)', 
                      'normalized_frequency': 'Normalized Frequency'})
fig.update_layout(xaxis_range=[0,25000], width=800)
fig.show()

In [14]:
pptx_df = pd.read_csv("/2comp/data/pptx/preza4.csv")
fig = px.line(pptx_df,
              x='created_date_order_part',
              y='LR_share',
              labels={'created_date_order_part': '',
                      'LR_share': 'Share'},
              title='Share of Long Rides and Total Rides')
fig.update_layout(width=800)
fig.show()

pptx_df = pd.read_csv('/2comp/data/pptx/preza5.csv')
fig = px.line(pptx_df, 
              x='created_date_order_part', 
              y='cnt', 
              labels={'created_date_order_part': '', 
                      'cnt': 'Rides'})
fig.update_layout(width=800)
fig.show()