In [1]:
import os
import pandas as pd
import numpy as np
import zipfile
import shutil

import dask
import dask.dataframe as dd

In [2]:
from IPython.display import Audio, display

# Sound played by allDone() when a long-running cell finishes.
# Exactly one assignment is live; the alternatives are kept as comments so that
# switching sounds is a one-line change instead of relying on "last one wins".
# url_done = "https://sound.peal.io/ps/audios/000/000/537/original/woo_vu_luvub_dub_dub.wav"
# url_done = "https://www.myinstants.com/media/sounds/taco-bell-bong-sfx.mp3"
# url_done = "https://www.myinstants.com/media/sounds/tindeck_1.mp3"
# url_done = "https://www.myinstants.com/media/sounds/dun_dun_1.mp3"
url_done = "https://www.myinstants.com/media/sounds/magic_immune.mp3"


def allDone():
    """Play the ``url_done`` sound in the notebook output.

    Builds an autoplaying ``Audio`` widget for the module-level ``url_done``
    URL and displays it; called at the end of long cells as a "finished" cue.
    """
    finished_sound = Audio(url=url_done, autoplay=True)
    display(finished_sound)


# Sound played by play_sound() when a cell raises an exception.
# Exactly one assignment is live; the alternative is kept as a comment.
# url_exception = "http://www.wav-sounds.com/movie/austinpowers.wav"
url_exception = "https://www.myinstants.com/media/sounds/roblox-death-sound_1.mp3"


def play_sound(self, etype, value, tb, tb_offset=None):
    """Exception handler: show the traceback, then play the failure sound.

    Parameters
    ----------
    self
        Object providing ``showtraceback``; the traceback display is
        delegated to it.
    etype, value, tb
        Exception type, instance and traceback, forwarded to
        ``showtraceback`` as a single tuple.
    tb_offset : optional
        Forwarded unchanged to ``showtraceback``.
    """
    exc_info = (etype, value, tb)
    self.showtraceback(exc_info, tb_offset=tb_offset)
    failure_sound = Audio(url=url_exception, autoplay=True)
    display(failure_sound)


# Register play_sound as the shell's custom handler for Exception, so a
# failing cell shows its traceback and then plays the failure sound.
get_ipython().set_custom_exc((Exception,), play_sound)

In [3]:
# Data folder: "data" next to the current working directory's parent.
data_dir = os.path.join(os.getcwd(), os.pardir, "data")

In [4]:
# Folder (inside data_dir) that holds the stop-event archives and the files
# extracted from them.
stop_events = os.path.join(data_dir, "stop_event")

In [12]:
# Unpack every zip archive found in the stop-event folder into that same folder.
for entry_name in os.listdir(stop_events):
    archive_path = os.path.join(stop_events, entry_name)
    # Anything that is not a readable zip archive is left alone.
    if not zipfile.is_zipfile(archive_path):
        continue
    print(f'extracting: {archive_path}')
    with zipfile.ZipFile(archive_path) as archive:
        archive.extractall(stop_events)

Summer 2019 Stop Event and Pass Census.zip
yes
trimet_stop_event - Fall 2018 v2.zip
yes
trimet_stop_event - Winter 2018.zip
yes
2 trimet_stop_event - Aug 31 and Sep 1 2020.zip
yes
2 trimet_stop_event - Mar to Aug 2020.zip
yes
2 trimet_stop_event - Fall 2019.zip
yes
2 TriMet Stop Event and Census Spring 2019.zip
yes
Winter2019 StopEvent PassCen.zip
yes
2 trimet_stop_event - Sep 2020 to Mar 2021.zip
yes


In [21]:
# Clean up after extraction: delete the original archives and every file whose
# name contains "census" (case-sensitive match).
stop_event_files = os.listdir(stop_events)

for item in stop_event_files:
    item_path = os.path.join(stop_events, item)
    # os.remove() raises on directories, so only regular files are candidates.
    if not os.path.isfile(item_path):
        continue
    # ".zip" is matched as an extension; a substring test would also delete
    # files that merely contain ".zip" somewhere in their name.
    if item.endswith(".zip") or "census" in item:
        os.remove(item_path)

In [22]:
import datetime
from functools import lru_cache


def lower_case_sort_columns(df):
    """Return ``df`` with lower-cased column names, reduced to a fixed column set.

    Column labels are lower-cased and the result keeps only the columns listed
    in ``expected_columns``, in that (alphabetical) order. Every other column
    is dropped. The caller's frame is not modified: the rename is applied to a
    new frame instead of assigning to ``df.columns`` in place.

    Parameters
    ----------
    df : DataFrame
        Frame with string column labels that contain the expected names in
        any letter case. Works with any pandas-style frame that provides
        ``columns``, ``rename(columns=...)`` and list-based column selection.

    Returns
    -------
    DataFrame
        New frame holding exactly ``expected_columns``, in order.

    Raises
    ------
    KeyError
        Raised by pandas when an expected column is missing after
        lower-casing.
    """
    expected_columns = [
        "arrive_time",
        "data_source",
        "direction",
        "door",
        "dwell",
        "estimated_load",
        "leave_time",
        "lift",
        "location_id",
        "maximum_speed",
        "offs",
        "ons",
        "pattern_distance",
        "route_number",
        "schedule_status",
        "service_date",
        "service_key",
        "stop_time",
        "train",
        "train_mileage",
        "trip_number",
        "vehicle_number",
        "x_coordinate",
        "y_coordinate",
    ]

    # Explicit old -> new mapping, so the rename yields a new frame and leaves
    # the labels of the frame passed in untouched.
    lowered = dict(zip(df.columns, df.columns.str.lower()))
    return df.rename(columns=lowered)[expected_columns]


def parse_stop_event(df, minutes_per_time_cat=5):
    """Add parsed-date, deviation and time-bucket columns to a stop-event frame.

    ``df`` is modified in place and also returned.

    Columns written:

    * ``service_date`` - replaced by a ``datetime64`` column.
    * ``day_of_year``, ``day_of_week`` - derived from ``service_date``.
    * ``arrival_deviance`` - ``stop_time - arrive_time``.
    * ``arrive_deviance_departure_delta`` -
      ``arrival_deviance + leave_time - arrive_time``.
    * ``time_cat_stop_time``, ``time_cat_arrive_time``,
      ``time_cat_leave_time`` - the matching time column floor-divided by 60
      and then by ``minutes_per_time_cat``.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a string ``service_date`` column shaped like ``DDMONYYYY``,
        optionally followed by ``:`` and more text (only the part before the
        first ``:`` is used), with an upper-case three-letter English month.
        Also needs numeric ``stop_time``, ``arrive_time`` and ``leave_time``
        columns (presumably seconds since midnight - TODO confirm).
    minutes_per_time_cat : int, default 5
        Divisor applied after the division by 60, i.e. the bucket width of
        the ``time_cat_*`` columns.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.

    Raises
    ------
    ValueError
        If ``minutes_per_time_cat`` is not positive.
    KeyError
        If a date contains a month abbreviation missing from ``month_dict``.
    """
    if minutes_per_time_cat <= 0:
        raise ValueError("minutes_per_time_cat must be positive")

    month_dict = {
        "JAN": "01",
        "FEB": "02",
        "MAR": "03",
        "APR": "04",
        "MAY": "05",
        "JUN": "06",
        "JUL": "07",
        "AUG": "08",
        "SEP": "09",
        "OCT": "10",
        "NOV": "11",
        "DEC": "12",
    }

    # Each distinct date string is parsed once. The cache is unbounded because
    # it only lives for this call and a bounded cache could evict entries
    # when a file has many distinct dates.
    @lru_cache(maxsize=None)
    def parse_date(date_str):
        date = date_str.split(":")[0]
        day = date[:2]
        month = month_dict[date[2:5]]
        year = date[5:]
        return year + "/" + month + "/" + day

    df["service_date"] = df["service_date"].apply(parse_date)
    df["service_date"] = pd.to_datetime(df["service_date"], format="%Y/%m/%d")

    df["day_of_year"] = df["service_date"].dt.day_of_year
    df["day_of_week"] = df["service_date"].dt.day_of_week

    df["arrival_deviance"] = df["stop_time"] - df["arrive_time"]
    df["arrive_deviance_departure_delta"] = (
        df["arrival_deviance"] + df["leave_time"] - df["arrive_time"]
    )

    # One bucket column per time column, assigned column by column so that no
    # multi-column DataFrame assignment to new labels is needed.
    times = ["stop_time", "arrive_time", "leave_time"]
    for time_col in times:
        df["time_cat_" + time_col] = df[time_col] // 60 // minutes_per_time_cat

    return df

In [23]:

# CSV files to load, given as names relative to the stop_events folder.
stop_event_file_names = [
    "2 trimet_stop_event - Sep 2020 to Mar 2021.csv",
    "2 trimet_stop_event - Aug 31 and Sep 1 2020.csv",
    "2 trimet_stop_event - Fall 2019.csv",
    "2 trimet_stop_event - Mar to Aug 2020.csv",
    "2 trimet_stop_event - Spring 2019.csv",
    "2 trimet_stop_event - Summer 2019.csv",
    "2 trimet_stop_event - Winter 2018.csv",
    "2 trimet_stop_event - Winter 2019-20.csv",
    "trimet_stop_event - Fall 2018 v2.csv",
]


# dtype overrides passed to dask for every CSV; identical for each file, so
# the mapping is built once outside the loop.
read_csv_dtypes = {
    "LOCATION_DISTANCE": "float32",
    "PATTERN_DISTANCE": "float32",
    "TRAIN_MILEAGE": "float32",
    "X_COORDINATE": "float32",
    "Y_COORDINATE": "float32",
    "DATA_SOURCE": "float64",
    "LOCATION_ID": "float64",
    "SCHEDULE_STATUS": "float64",
}

# One parsed pandas frame per input file, route 9 rows only.
stop_event_dfs = []
for file_name in stop_event_file_names:
    print("working on", file_name)
    file_name = os.path.join(stop_events, file_name)
    lazy_events = dd.read_csv(file_name, dtype=read_csv_dtypes)
    lazy_events = lower_case_sort_columns(lazy_events)
    # Filter while still lazy, then materialise the result as a pandas frame.
    route_mask = lazy_events["route_number"] == 9
    route_events = lazy_events[route_mask].compute()
    stop_event_dfs.append(parse_stop_event(route_events))

# Stack the per-file frames into a single table with a fresh 0..n-1 index.
df = pd.concat(stop_event_dfs, axis=0, ignore_index=True)

times = ["stop_time", "arrive_time", "leave_time"]
time_cats = [f"time_cat_{name}" for name in times]
categories = [
    "vehicle_number",
    "train",
    "route_number",
    "direction",
    "service_key",
    "location_id",
    "door",
    "lift",
    "day_of_year",
    "day_of_week",
] + time_cats

# Convert the identifier-like and bucket columns to the category dtype.
df = df.astype({column: "category" for column in categories})

# NOTE(review): reset_index() without drop=True keeps the pre-sort row labels
# as an extra "index" column in the saved frame - confirm this is intended.
df = df.sort_values(
    ["service_date", "train", "trip_number", "stop_time"]
).reset_index()

# Despite its name, pickle_dir is the path of the pickle file itself.
pickle_dir = os.path.join(data_dir, "mega_pickle")
df.to_pickle(pickle_dir)

allDone()

working on 2 trimet_stop_event - Sep 2020 to Mar 2021.csv
working on 2 trimet_stop_event - Aug 31 and Sep 1 2020.csv
working on 2 trimet_stop_event - Fall 2019.csv
working on 2 trimet_stop_event - Mar to Aug 2020.csv


  df = pandas_read_text(


working on 2 trimet_stop_event - Spring 2019.csv
working on 2 trimet_stop_event - Summer 2019.csv
working on 2 trimet_stop_event - Winter 2018.csv
working on 2 trimet_stop_event - Winter 2019-20.csv
working on trimet_stop_event - Fall 2018 v2.csv


In [None]:
# save_file = os.path.join(data_dir, "mega_stop_event_1.hdf")
# df = pd.read_hdf(save_file, "/df")

# pickle_dir = os.path.join(data_dir, "mega_pickle")
# df.to_pickle(pickle_dir)