In [1]:
!pip install pandas
!pip install tqdm

Collecting tqdm
  Downloading tqdm-4.64.1-py2.py3-none-any.whl (78 kB)
[K     |████████████████████████████████| 78 kB 6.3 MB/s eta 0:00:011
[?25hInstalling collected packages: tqdm
Successfully installed tqdm-4.64.1


In [1]:
import pandas as pd
import os
from tqdm import tqdm
from src.enum.setup.Paths import Paths
from src.enum.setup.FileFormat import FileFormat
from src.enum.setup.FileName import FileName
from src.enum.setup.Dataset import Dataset
from src.enum.setup.City import City
from src.utils import utils
import json
from pandas import json_normalize
from itertools import product
from joblib import Parallel, delayed
from src.model.Provider import Provider
from src.enum.setup.FileSetup import FileSetup
import sumolib

In [2]:
#sumo_net = sumolib.net.readNet(FileSetup.NET_SUMO.value)
#provider = Provider(utils.read_setup(FileSetup.PROVIDER.value), sumo_net)

In [3]:
input_absolute_path_to_file = utils.generate_absolute_path_to_file(
    Paths.TAZ,
    FileName.TAZ_POLY_DICT,
    FileFormat.JSON,
    Dataset.UBER,
    City.SAN_FRANCISCO
)

json_data = utils.read_file_from_absolute_path_to_file(input_absolute_path_to_file, FileFormat.JSON)
df_boundary = json_normalize(json_data)

In [4]:
concat_dataset_absolute_path_to_file = utils.generate_absolute_path_to_file(
    Paths.MOBILITY,
    FileName.TRAVEL_TIME_CONCAT,
    FileFormat.CSV,
    Dataset.UBER,
    City.SAN_FRANCISCO
)

removed = 0

if not os.path.exists(concat_dataset_absolute_path_to_file):
    path_to_dir = utils.generate_absolute_path_to_dir(
        Paths.MOBILITY,
        FileFormat.CSV,
        Dataset.UBER
    )

    file_name_list = utils.list_data_files_in_dir(path_to_dir, "sf_uber_speed")
    df_list = []
    for file_name in tqdm(file_name_list):
        input_absolute_path_to_file = os.path.join(
            Paths.MOBILITY.value,
            Dataset.UBER.value,
            FileFormat.CSV.value,
            file_name
        )
        df = pd.read_csv(input_absolute_path_to_file)
        initial_rows = df.shape[0]
        df = df.dropna().astype({'sourceid': 'int64', 'dstid': 'int64'})
        after_rows = df.shape[0]
        removed += initial_rows - after_rows
        df_list.append(df)
    df_travel_time = pd.concat(df_list, axis=0, ignore_index=True)
    #df_travel_time.to_csv(concat_dataset_absolute_path_to_file, index=False)
else:
    df_travel_time = pd.read_csv(concat_dataset_absolute_path_to_file)

100%|██████████| 20/20 [06:24<00:00, 19.22s/it]


In [5]:
print(f"Removed {removed} rows from datasets.")

Removed 2 rows from datasets.


In [6]:
df_boundary = df_boundary.rename(columns={"@id" :"taz_id", "param.MOVEMENT_I": "movement_id"})
df_boundary = df_boundary[["taz_id","movement_id"]].astype({'taz_id': "string", 'movement_id': 'int64'})
df_boundary.head()

Unnamed: 0,taz_id,movement_id
0,100011,3578
1,100012,3583
2,100013,3586
3,100014,3599
4,100015,3602


In [35]:
df_boundary.shape

(731, 2)

In [7]:
df_travel_time.shape

(727903593, 7)

In [8]:
df_travel_time.head()


Unnamed: 0,sourceid,dstid,hod,mean_travel_time,standard_deviation_travel_time,geometric_mean_travel_time,geometric_standard_deviation_travel_time
0,497,532,22.0,105.17,30.26,97.98,1.55
1,497,576,22.0,529.71,376.59,402.15,2.2
2,532,495,16.0,378.86,336.16,295.15,1.86
3,500,852,20.0,461.5,256.63,368.43,2.09
4,576,495,16.0,772.0,537.45,653.48,1.69


In [9]:
columns = ["sourceid", "dstid", "mean_travel_time","standard_deviation_travel_time"]
df_travel_time = df_travel_time[columns].groupby(by=["sourceid", "dstid"]).mean().reset_index()

In [10]:
df_travel_time.shape

(3647502, 4)

In [81]:
columns = ["src_movement_id","dst_movement_id", "sourceid", "dstid", "mean_travel_time","standard_deviation_travel_time"]
df_travel_time_taz_id = pd.merge(df_travel_time, df_boundary, how="inner", left_on="sourceid", right_on="movement_id").rename(columns={"sourceid": "src_movement_id","taz_id": "sourceid"})
df_travel_time_taz_id = pd.merge(df_travel_time_taz_id, df_boundary, how="inner", left_on="dstid", right_on="movement_id").rename(columns={"dstid": "dst_movement_id", "taz_id": "dstid"})
df_travel_time_taz_id = df_travel_time_taz_id[columns]
df_travel_time_taz_id.shape

(506359, 6)

In [82]:
df_travel_time_taz_id.head()


Unnamed: 0,src_movement_id,dst_movement_id,sourceid,dstid,mean_travel_time,standard_deviation_travel_time
0,3370,3370,100281,100281,158.787222,250.462222
1,3372,3370,100301,100281,300.791106,160.16601
2,3375,3370,100324,100281,550.289084,169.947614
3,3377,3370,100392,100281,938.935842,268.909035
4,3382,3370,100393,100281,807.820289,278.18212


In [83]:
df_travel_time_taz_id.shape

(506359, 6)

In [84]:
boundary_product = set(product(df_boundary['taz_id'], df_boundary['taz_id']))
dataset_couples = set(zip(df_travel_time_taz_id['sourceid'], df_travel_time_taz_id['dstid']))

missing_couples = list(boundary_product - dataset_couples)

print(f"Missing couples {len(missing_couples)}")

Missing couples 28002


In [87]:
df_missing_couples = pd.DataFrame(missing_couples, columns=['sourceid', 'dstid'])
df_missing_couples.head()

Unnamed: 0,sourceid,dstid
0,100996,100325
1,101003,100022
2,10017D,101181
3,100274,101071
4,101182,10001F


In [88]:
df_missing_couples.shape


(28002, 2)

In [95]:
output_absolute_path_to_file = utils.generate_absolute_path_to_file(
    Paths.MOBILITY,
    FileName.TRAVEL_TIME_OUT,
    FileFormat.CSV,
    Dataset.UBER,
    City.SAN_FRANCISCO
)
utils.check_path_exists(output_absolute_path_to_file, is_path_file=True)
df_travel_time_taz_id.to_csv(output_absolute_path_to_file, index=False)

/Users/davidemolinelli/Downloads/ride-sharing-simulator-taz/src/enum/setup/../../../data/sf/mobility/uber/csv/sf_uber_travel_time.csv


In [97]:
output_absolute_path_to_file = utils.generate_absolute_path_to_file(
    Paths.MOBILITY,
    FileName.TRAVEL_TIME_MISSING_COUPLES,
    FileFormat.CSV,
    Dataset.UBER,
    City.SAN_FRANCISCO
)

utils.check_path_exists(output_absolute_path_to_file, is_path_file=True)
df_missing_couples.to_csv(output_absolute_path_to_file, index=False)

In [107]:
travel_time_dict = {}

for row in df_travel_time_taz_id.to_dict(orient="records"):
    if not row["sourceid"] in travel_time_dict:
        travel_time_dict[row["sourceid"]] = {}

    """expected_price_surge_1x = provider.compute_price(
        row["mean_travel_time"],

    )"""

    travel_time_dict[row["sourceid"]] = {
        **travel_time_dict[row["sourceid"]],
        row["dstid"]: {
            "mean_travel_time": row["mean_travel_time"],
            "std_travel_time": row["standard_deviation_travel_time"]
        }
    }

output_absolute_path_to_file = utils.generate_absolute_path_to_file(
    Paths.MOBILITY,
    FileName.TRAVEL_TIME_OUT,
    FileFormat.JSON,
    Dataset.UBER,
    City.SAN_FRANCISCO
)
utils.export_file_from_absolute_path(output_absolute_path_to_file, FileFormat.JSON, travel_time_dict)

In [105]:
travel_time_missing_couples_dict = {}

for row in df_missing_couples.to_dict(orient="records"):
    if row["sourceid"] not in travel_time_missing_couples_dict:
        travel_time_missing_couples_dict[row["sourceid"]] = []
    travel_time_missing_couples_dict[row["sourceid"]].append(row["dstid"])

output_absolute_path_to_file = utils.generate_absolute_path_to_file(
    Paths.MOBILITY,
    FileName.TRAVEL_TIME_MISSING_COUPLES,
    FileFormat.JSON,
    Dataset.UBER,
    City.SAN_FRANCISCO
)

utils.export_file_from_absolute_path(output_absolute_path_to_file, FileFormat.JSON, travel_time_missing_couples_dict)


In [13]:
concat_mean_dataset_absolute_path_to_file = utils.generate_absolute_path_to_file(
    Paths.MOBILITY,
    FileName.TRAVEL_TIME_CONCAT_MEAN,
    FileFormat.CSV,
    Dataset.UBER,
    City.SAN_FRANCISCO
)

df_travel_time.to_csv(concat_dataset_absolute_path_to_file, index=False)