In [1]:
import os

# Landing directory for the raw downloads, expressed relative to the
# notebook's working directory (`..` steps up one level from it).
output_relative_dir = '../data/landing/'

# `exist_ok=True` makes the call a no-op when the directory is already there,
# so this cell can be re-run safely without a separate existence check.
os.makedirs(output_relative_dir, exist_ok=True)

# now, for each type of data set we will need, create its sub-directory
for target_dir in ('wifi_data', 'tlc_data'):  # taxi_zones should already exist
    os.makedirs(output_relative_dir + target_dir, exist_ok=True)

In [2]:
# Months to download, keyed by year. The year is kept as a string because it
# is pasted directly into the remote file name. `range(1, 7)` is half-open,
# i.e. months 1 through 6.
DATA_RANGE = {'2022': range(1, 7)}

# URL prefix as of 07/2023; the download loop completes it with
# "<year>-<month>.parquet".
URL_TEMPLATE = (
    "https://d37ci6vzurychx.cloudfront.net/trip-data/"
    "fhvhv_tripdata_"
)

In [3]:
from urllib.request import urlretrieve
# Folder that receives one parquet file per downloaded month.
tlc_output_dir = f"{output_relative_dir}tlc_data"

for year, months in DATA_RANGE.items():
    for month_number in months:
        # two-digit month as used in the remote file names, e.g. 1 -> "01"
        month = f"{month_number:02d}"
        print(f"Begin month {month}")

        # remote source for this year/month ...
        url = f'{URL_TEMPLATE}{year}-{month}.parquet'
        # ... and the local file it is saved as (a file path, despite the name)
        output_dir = f"{tlc_output_dir}/{year}-{month}.parquet"

        # fetch the file straight to disk
        urlretrieve(url, output_dir)

        print(f"Completed {month}-{year}")

Begin month 01
Completed 01-2022
Begin month 02
Completed 02-2022
Begin month 03
Completed 03-2022
Begin month 04
Completed 04-2022
Begin month 05
Completed 05-2022
Begin month 06
Completed 06-2022


In [4]:
# Paging parameters for the Socrata download below.
# NUM_ROWS_PLUTO is the upper bound of the offset range that gets requested;
# NOTE(review): hard-coded row count - confirm it still matches the data set.
NUM_ROWS_PLUTO = 3_319
# Page size: used both as the `limit` of each request and as the offset step.
MAX_ROWS_PER_CALL = 1_000
# Columns requested from the API (joined into the `select` clause).
RELEVANT_COLUMNS = [
    "latitude",
    "longitude",
    "Activated",
    "OBJECTID",
]

In [5]:
import pandas as pd
from sodapy import Socrata
from tqdm import tqdm

# Unauthenticated client only works with public data sets. Note 'None'
# in place of application token, and no username or password:
client = Socrata("data.cityofnewyork.us", None)

try:
    # Page through the data set MAX_ROWS_PER_CALL rows at a time and write
    # each page to its own parquet file, named after the page's start offset.
    for i in tqdm(range(0, NUM_ROWS_PLUTO, MAX_ROWS_PER_CALL)):
        results = client.get(
            "yjub-udmw",
            select=",".join(RELEVANT_COLUMNS),
            # Offset/limit paging is only reliable over a fixed ordering;
            # without an explicit order, consecutive pages may overlap or
            # skip rows. `:id` is Socrata's internal row identifier.
            order=":id",
            limit=MAX_ROWS_PER_CALL,
            offset=i,
        )
        results_df = pd.DataFrame.from_records(results)
        results_df.to_parquet(
            f"{output_relative_dir}wifi_data/wifi_data_{i}.parquet",
            index=False,
        )
finally:
    # release the underlying HTTP session even if a request or write fails
    client.close()

100%|██████████| 4/4 [00:05<00:00,  1.31s/it]
