In [7]:
from datetime import datetime
from dateutil.relativedelta import relativedelta
import os
import pandas as pd
import requests
pd.set_option("display.max_columns", 30)

In [8]:
"""
1. get data from S3
2. weather data transformaiton
3. taxi trips transformations
4. update payment_type
5. update company_master
6. update taxi_trips with company and payment_type
7. upload weather data to S3
8. upload taxi data to S3
9. upload the newest payment_type_master and company_master
"""

'\n1. get data from S3\n2. weather data transformaiton\n3. taxi trips transformations\n4. update payment_type\n5. update company_master\n6. update taxi_trips with company and payment_type\n7. upload weather data to S3\n8. upload taxi data to S3\n9. upload the newest payment_type_master and company_master\n'

Taxi trips transformation codes

In [9]:
current_datetime = datetime.now() - relativedelta(months=2)
formatted_datetime = current_datetime.strftime("%Y-%m-%d")

url = (
    f"https://data.cityofchicago.org/resource/ajtu-isnz.json?"
    f"$where=trip_start_timestamp >= '{formatted_datetime}T00:00:00' "
    f"AND trip_start_timestamp <= '{formatted_datetime}T23:59:59'&$limit=30000"
)
response = requests.get(url)

data = response.json()


In [10]:
taxi_trips = pd.DataFrame(data)

In [11]:
taxi_trips.drop(["pickup_census_tract", "dropoff_census_tract"], axis=1, inplace=True)
taxi_trips.drop(["pickup_centroid_location", "dropoff_centroid_location"], axis=1, inplace=True)

taxi_trips.info()

taxi_trips.rename(columns={"pickup_community_area": "pickup_community_area_id",
                           "dropoff_community_area": "dropoff_community_area_id"}, inplace=True)

taxi_trips["datetime_for_weather"] = pd.to_datetime(taxi_trips["trip_start_timestamp"]).dt.floor("H")


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23278 entries, 0 to 23277
Data columns (total 19 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   trip_id                     23278 non-null  object
 1   taxi_id                     23278 non-null  object
 2   trip_start_timestamp        23278 non-null  object
 3   trip_end_timestamp          23278 non-null  object
 4   trip_seconds                23275 non-null  object
 5   trip_miles                  23278 non-null  object
 6   pickup_community_area       22725 non-null  object
 7   dropoff_community_area      21335 non-null  object
 8   fare                        23214 non-null  object
 9   tips                        23214 non-null  object
 10  tolls                       23214 non-null  object
 11  extras                      23214 non-null  object
 12  trip_total                  23214 non-null  object
 13  payment_type                23278 non-null  ob

  taxi_trips["datetime_for_weather"] = pd.to_datetime(taxi_trips["trip_start_timestamp"]).dt.floor("H")


In [12]:
taxi_trips.head()

Unnamed: 0,trip_id,taxi_id,trip_start_timestamp,trip_end_timestamp,trip_seconds,trip_miles,pickup_community_area_id,dropoff_community_area_id,fare,tips,tolls,extras,trip_total,payment_type,company,pickup_centroid_latitude,pickup_centroid_longitude,dropoff_centroid_latitude,dropoff_centroid_longitude,datetime_for_weather
0,9a0eee16b60afd495822f50ab2b7559f775777d5,de9289a1fce135051f6e9206044f8211c86a1a6acd68d4...,2024-06-12T23:45:00.000,2024-06-13T00:15:00.000,1616,12.86,8,48.0,34.0,0.0,0,0.0,34.0,Prcard,Flash Cab,41.899602111,-87.633308037,41.729676423,-87.572717134,2024-06-12 23:00:00
1,99a385190a881371dcde15bd12a49e3b7865b1f0,8dbaf4164524af926ddeab5dd084183e585f4cb8cc47fe...,2024-06-12T23:45:00.000,2024-06-13T00:15:00.000,1210,12.65,76,4.0,32.0,7.3,0,4.0,43.8,Credit Card,Medallion Leasin,41.980264315,-87.913624596,41.975170943,-87.687515515,2024-06-12 23:00:00
2,9414b92a1cf66b8b8b18ade545acc66559f635a6,294ae9e2f433bb460731f9199b1837de3cf88ad987a42b...,2024-06-12T23:45:00.000,2024-06-13T00:00:00.000,1260,12.4,76,,32.0,0.0,0,6.0,38.0,Cash,Taxi Affiliation Services,41.980264315,-87.913624596,,,2024-06-12 23:00:00
3,8c83163b2a8d6dd025276d1b9ae201f86152b010,1dca7e7332893fa1836f448df24637325574b860632c18...,2024-06-12T23:45:00.000,2024-06-13T00:15:00.000,1589,5.93,76,10.0,21.75,0.0,0,7.5,29.25,Cash,Flash Cab,41.980264315,-87.913624596,41.985015101,-87.804532006,2024-06-12 23:00:00
4,8c12322c21accd3d99f37bf7b01bd5b2bf5204e9,545ac2dfd5b722e0f0d884cc68ec27d5b0537410edd8bf...,2024-06-12T23:45:00.000,2024-06-12T23:45:00.000,275,0.57,28,32.0,9.0,1.15,0,0.0,10.15,Mobile,Tac - American United Dispatch,41.874005383,-87.66351755,41.878865584,-87.625192142,2024-06-12 23:00:00


taxi trips transformation function

In [None]:
def taxi_trips_transformations(taxi_trips: pd.DataFrame) -> pd.DataFrame:
    """ Perform transformation with the taxi data

    Parameters:
    -----------
        taxi_trips : pd.DataFrame
            The dataframe holding the daly taxi trips

    Returns:
    --------
        pd.dataframe
        The cleaned transformed Dataframe holding the taxi trips.
    """

    if not isinstance(taxi_trips, pd.DataFrame):
        raise TypeError("taxi_trips is not a valid pandas datarfame.")

    taxi_trips.drop(["pickup_census_tract", "dropoff_census_tract"], axis=1, inplace=True)
    taxi_trips.drop(["pickup_centroid_location", "dropoff_centroid_location"], axis=1, inplace=True)

    taxi_trips.info()

    taxi_trips.rename(columns={"pickup_community_area": "pickup_community_area_id",
                            "dropoff_community_area": "dropoff_community_area_id"}, inplace=True)

    taxi_trips["datetime_for_weather"] = pd.to_datetime(taxi_trips["trip_start_timestamp"]).dt.floor("H")

    return taxi_trips

company update codes

In [14]:
company_master = taxi_trips["company"].drop_duplicates().reset_index(drop=True)

company_master = pd.DataFrame(
    {
        "company_id": range(1, len(company_master) + 1),
        "company": company_master
    }
)

company_master.tail()

Unnamed: 0,company_id,company
29,30,4623 - 27290 Jay Kim
30,31,3556 - 36214 RC Andrews Cab
31,32,Petani Cab Corp
32,33,Tac - Yellow Non Color
33,34,Metro Jet Taxi A.


In [16]:
new_company_data = [
    {"company": "5167 - 71969 5167 Taxi Inc"},
    {"company": "X"},
    {"company": "Y"},
]

new_company_mapping = pd.DataFrame(new_company_data)

new_company_mapping

Unnamed: 0,company
0,5167 - 71969 5167 Taxi Inc
1,X
2,Y


In [18]:
company_max_id = company_master["company_id"].max()
company_max_id

np.int64(34)

In [20]:
new_companies_list = []

for company in new_company_mapping["company"].values:
    if company not in company_master["company"].values:
        new_companies_list.append(company)

# one line
# new_companies_list_one_line = [company for company in new_company_mapping["company"].values if company not in company_master["company"].values]

new_companies_list

['X', 'Y']

In [22]:
new_companies_df = pd.DataFrame({
    "company_id": range(company_max_id + 1, company_max_id + len(new_companies_list) + 1),
    "company": new_companies_list
})

new_companies_df

Unnamed: 0,company_id,company
0,35,X
1,36,Y
