In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Base data directory, and the sub-directory where this notebook reads and
# writes its generated CSV files.
file_path = '../Data/'
output_file_path = f'{file_path}Output/'

## Attach Uber travel time and hour-of-day information to the sampled 119w OD pairs that have a reference in the 2020 Uber Movement data

In [None]:
# Load the sampled OD pairs that have a matching record in the 2020 Uber
# Movement data.
all_uber = pd.read_csv(
    filepath_or_buffer=output_file_path + 'OD_pairs_uber_all_strongly_119w.csv',
)

In [None]:
# Keep only the first row for each (oid, did) pair.
# The explicit .copy() makes `uber_dedup` an independent frame: a later cell
# assigns to uber_dedup['uber_OD'], and without the copy pandas can raise
# SettingWithCopyWarning because drop_duplicates filters `all_uber` internally.
uber_dedup = all_uber.drop_duplicates(subset=['oid', 'did'], keep='first').copy()

In [None]:
# Load the Uber 2020 hourly-aggregate travel-time table. The source and
# destination ids are read as floats.
uber_2020 = pd.read_csv(
    file_path + "los_angeles-censustracts-2020-1-All-HourlyAggregate.csv",
    dtype=dict(sourceid=float, dstid=float),
)

In [None]:
# Build one (sourceid, dstid) tuple per row to serve as the OD-pair key.
uber_2020['uber_OD'] = [
    (src, dst)
    for src, dst in zip(uber_2020['sourceid'], uber_2020['dstid'])
]

In [None]:
# Cast the OD key to a string in both frames so they share one dtype for the
# merge on 'uber_OD'.
for _frame in (uber_2020, uber_dedup):
    _frame['uber_OD'] = _frame['uber_OD'].astype(str)

In [None]:
# Left-join the Uber Movement rows onto the sampled OD pairs by 'uber_OD'.
# Every sampled pair is kept, and a pair matching several Uber rows yields
# several output rows.
uber_dedup_merge = pd.merge(
    uber_dedup,
    uber_2020,
    on='uber_OD',
    how='left',
)

In [None]:
# Plot how many merged rows fall in each hour of day ('hod'), one bar per hour.
plt.figure(figsize=(12, 6))
ax = sns.histplot(data=uber_dedup_merge, x="hod", stat='count', discrete=True)

# Write each bar's count just above it, centred horizontally on the bar.
for bar in ax.patches:
    bar_height = bar.get_height()
    bar_centre = bar.get_x() + bar.get_width() / 2
    ax.text(
        bar_centre,
        bar_height,
        str(int(bar_height)),
        ha='center',
        va='bottom',
        size='x-small',
        color='b',
        weight='semibold',
    )

# One x tick for every hour 0..23.
bins = range(24)
plt.xticks(bins)
plt.title('Number of observations by hour of day of all sampled OD pairs')
plt.ylabel('Number of Observations')
plt.xlabel('hour of the day')

In [None]:
# If an OD pair has travel times for several hours of day, keep one row chosen
# at random.
#
# GroupBy.sample seeds one random generator with `random_state` and advances it
# from group to group, so each OD pair gets its own draw while the overall
# result stays reproducible. The previous per-group
# `x.sample(1, random_state=123)` re-seeded every group with 123, which made the
# chosen row position depend only on the group's size: all OD pairs with the
# same number of rows kept the row at the same position.
# Requires pandas >= 1.1 (GroupBy.sample).
uber_dedup_merge_sample = (
    uber_dedup_merge
    .groupby(['oid', 'did'])
    .sample(n=1, random_state=123)
    .reset_index(drop=True)
)

In [None]:
# Save the one-row-per-OD-pair table to CSV.
# index=False: the index was just reset to a plain 0..n-1 range, so writing it
# would only add a redundant 'Unnamed: 0' column when the file is read back.
uber_dedup_merge_sample.to_csv(output_file_path + 'uber_dedup_hod_119w.csv', index=False)

## Sample 10,000 3am OD pairs

In [None]:
# Reload the one-row-per-OD-pair table saved earlier in this notebook.
df = pd.read_csv(
    filepath_or_buffer=output_file_path + 'uber_dedup_hod_119w.csv',
)

In [None]:
# Keep the OD pairs whose sampled hour of day is 3, draw a reproducible
# sample of 10,000 of them, and save that sample.
is_3am = df['hod'] == 3
df3am = df[is_3am]
df3am_10000 = df3am.sample(n=10000, random_state=123).copy()
df3am_10000.to_csv(path_or_buf=output_file_path + 'OD3am_10000.csv')

In [None]:
# Remove the rows already drawn, then take a second reproducible sample of
# 10,000 from the remaining 3am OD pairs and save it as the add-on set.
new_df3am = df3am.drop(index=df3am_10000.index)
new_df3am_10000 = new_df3am.sample(n=10000, random_state=123).copy()
new_df3am_10000.to_csv(path_or_buf=output_file_path + 'OD3am_10000_addon.csv')

In [None]:
# Stack the first sample and the add-on sample into one frame, first sample on top.
frames = [
    df3am_10000,
    new_df3am_10000,
]
combined_df = pd.concat(objs=frames)

In [None]:
# Save the combined sample (first draw plus add-on) to CSV.
combined_df.to_csv(
    path_or_buf=output_file_path + 'OD3am_20000.csv',
)