In [42]:
from io import StringIO
import os

import boto3
import pandas as pd

# Render up to 50 columns when a wide DataFrame is displayed.
pd.options.display.max_columns = 50

In [9]:
# Read the AWS credentials from the environment so they never live in the notebook.
# Either value is None when its environment variable is not set.
aws_acces_key_id, aws_secret_key = (
    os.environ.get("AWS_ACCESS_KEY"),
    os.environ.get("AWS_SECRET_KEY"),
)

In [10]:
def read_csv_from_s3(bucket: str, path: str, filename: str) -> pd.DataFrame:
    """Download a CSV file from an S3 bucket and parse it into a DataFrame.

    Parameters:
    -----------
    bucket : str
        Name of the bucket that holds the file.
    path : str
        Key prefix ("folder" path) of the file. A trailing "/" is appended
        when it is missing; an empty string means no prefix.
    filename : str
        Name of the file.

    Returns:
    --------
    pd.DataFrame
        The parsed content of the downloaded file.
    """

    # The credentials are the notebook-level values read from the environment.
    s3 = boto3.client(
        "s3",
        aws_access_key_id=aws_acces_key_id,
        aws_secret_access_key=aws_secret_key,
    )

    # Make sure prefix and file name are separated by "/" so that a prefix
    # passed without the trailing slash does not produce a wrong key.
    prefix = path if not path or path.endswith("/") else f"{path}/"
    key = f"{prefix}{filename}"

    # The body is read fully into memory and decoded as UTF-8 before parsing.
    response = s3.get_object(Bucket=bucket, Key=key)
    csv_text = response["Body"].read().decode("utf-8")

    return pd.read_csv(StringIO(csv_text))
    


In [19]:
# Bucket and key prefixes of the transformed datasets.
bucket = "cubix-chicago-taxi-goga"

community_areas_path = "transformed_data/community_areas/"
company_path = "transformed_data/company/"
date_path = "transformed_data/date/"
payment_type_path = "transformed_data/payment_type/"
taxi_trips_path = "transformed_data/taxi_trips/"
weather_path = "transformed_data/weather/"

# Notebook-level client, used further down to list the objects under a prefix.
s3 = boto3.client(
    "s3",
    aws_access_key_id=aws_acces_key_id,
    aws_secret_access_key=aws_secret_key,
)


In [20]:
# Load the four single-file reference tables; the calls run in the listed order.
community_areas, company, date, payment_type = (
    read_csv_from_s3(bucket=bucket, path=table_path, filename=table_file)
    for table_path, table_file in (
        (community_areas_path, "community_areas_master.csv"),
        (company_path, "company_master.csv"),
        (date_path, "date_dimension.csv"),
        (payment_type_path, "payment_type_master.csv"),
    )
)


In [21]:
# Accumulators for the per-file DataFrames gathered by the loading loops.
trips_list, weather_list = [], []

In [22]:
# Start from an empty list so re-running this cell does not add the same files twice.
trips_list.clear()

# Paginate the listing: a single list call returns at most 1,000 keys, and the
# "Contents" entry is absent from a response when nothing matches the prefix.
paginator = s3.get_paginator("list_objects_v2")
for page in paginator.paginate(Bucket=bucket, Prefix=taxi_trips_path):
    for file in page.get("Contents", []):
        taxi_trip_key = file["Key"]
        filename = taxi_trip_key.split("/")[-1]

        # Skip keys that end in "/" (empty file name) and anything that is not
        # a CSV file; checking the suffix also copes with keys that contain
        # no dot or more than one dot.
        if not filename.strip() or not filename.endswith(".csv"):
            continue

        trip = read_csv_from_s3(bucket, taxi_trips_path, filename)

        trips_list.append(trip)
        print(f"{filename} has been added")


taxi_2024-06-09.csv has been added
taxi_2024-06-10.csv has been added
taxi_2024-06-11.csv has been added
taxi_2024-06-12.csv has been added
taxi_2024-06-13.csv has been added
taxi_2024-06-14.csv has been added
taxi_2024-06-15.csv has been added
taxi_2024-06-16.csv has been added
taxi_2024-06-17.csv has been added
taxi_2024-06-18.csv has been added
taxi_2024-06-19.csv has been added
taxi_2024-06-20.csv has been added


In [25]:
# Stack the per-file frames row-wise into one table with a fresh 0..n-1 index.
trips = pd.concat(objs=trips_list, axis=0, ignore_index=True)

In [26]:
# Preview the combined trips table (the rendering is truncated for long frames).
trips

Unnamed: 0,trip_id,taxi_id,trip_start_timestamp,trip_end_timestamp,trip_seconds,trip_miles,pickup_community_area_id,dropoff_community_area_id,fare,tips,tolls,extras,trip_total,pickup_centroid_latitude,pickup_centroid_longitude,dropoff_centroid_latitude,dropoff_centroid_longitude,datetime_for_weather,payment_type_id,company_id
0,ff5658868e41b86c16bd4703278109bc68f261e2,489e78d30b4d287ed76527efc8533b5ef45e7b6c496d92...,2024-06-09T23:45:00.000,2024-06-10T00:00:00.000,660.0,1.70,28.0,28.0,8.50,4.00,0.0,0.0,12.50,41.874005,-87.663518,41.874005,-87.663518,2024-06-09 23:00:00,2,15
1,3b50f837e475c88c0037601b1dba293b75443c2d,61d83eb1d34ccd8dac5bdd962bc13e2ac077b0c7ecbf1d...,2024-06-09T23:45:00.000,2024-06-10T00:15:00.000,1440.0,14.30,76.0,7.0,52.75,0.00,0.0,4.0,57.25,41.980264,-87.913625,41.922686,-87.649489,2024-06-09 23:00:00,2,14
2,3dbf9607c7e56885bab3f3d5ce65c5780866185e,4f4cae403660ec4c1e2e3b03128bd7a5af139619e3e513...,2024-06-09T23:45:00.000,2024-06-09T23:45:00.000,0.0,0.00,76.0,76.0,3.25,0.00,0.0,0.0,3.25,41.979071,-87.903040,41.979071,-87.903040,2024-06-09 23:00:00,1,3
3,46d4c7730778ff19bf485ee3e70c0c4ac14da22b,13016372e777da1289d557edbe4ce2be8a68e77bc64768...,2024-06-09T23:45:00.000,2024-06-10T00:15:00.000,1643.0,15.68,76.0,7.0,39.25,4.38,0.0,4.0,48.13,41.980264,-87.913625,41.922686,-87.649489,2024-06-09 23:00:00,3,6
4,4a0937943b29b734d7ad8bd1c0011817687b3727,ae21d024a249783394ef3fcd91c8abad23fe127549fe96...,2024-06-09T23:45:00.000,2024-06-10T00:00:00.000,420.0,0.10,8.0,24.0,9.50,3.00,0.0,0.0,12.50,41.892508,-87.626215,41.906026,-87.675312,2024-06-09 23:00:00,2,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
243853,1b0722051c0a9fa9abf60297b4ee3aab6ab4cbda,3b58b6e6c05297a7feacac9a19e049b714eacfa1d49d48...,2024-06-20T00:00:00.000,2024-06-20T00:00:00.000,5.0,0.00,32.0,32.0,20.00,0.00,0.0,0.0,20.50,41.878866,-87.625192,41.878866,-87.625192,2024-06-20 00:00:00,2,6
243854,1809bb9c2786063ef17a3c6eac380cb3edff1053,6fa0364cac8d604e8a7613c08fdcbe5bfa05e361d4a758...,2024-06-20T00:00:00.000,2024-06-20T00:15:00.000,1126.0,12.56,28.0,71.0,32.25,0.00,0.0,0.0,32.25,41.874005,-87.663518,41.744205,-87.656306,2024-06-20 00:00:00,4,4
243855,1548b04603c0017ce7fbda16c37efe85ce9f6ad3,e6ec7ccfda661f2050f932f672395ff77cd1359b3ae62b...,2024-06-20T00:00:00.000,2024-06-20T00:15:00.000,1587.0,18.22,76.0,32.0,45.25,7.61,0.0,5.0,58.36,41.979071,-87.903040,41.884987,-87.620993,2024-06-20 00:00:00,3,8
243856,12aa210addda8e79f8bbc3f5baf74ea801441241,bd54bee5660726988fc54355ab54fc7cfd5c3917d911f9...,2024-06-20T00:00:00.000,2024-06-20T00:45:00.000,2474.0,36.09,76.0,,85.50,0.00,4.0,47.0,137.00,41.979071,-87.903040,,,2024-06-20 00:00:00,2,6


In [23]:
# Start from an empty list so re-running this cell does not add the same files twice.
weather_list.clear()

# Paginate the listing: a single list call returns at most 1,000 keys, and the
# "Contents" entry is absent from a response when nothing matches the prefix.
paginator = s3.get_paginator("list_objects_v2")
for page in paginator.paginate(Bucket=bucket, Prefix=weather_path):
    for file in page.get("Contents", []):
        weather_key = file["Key"]
        filename = weather_key.split("/")[-1]

        # Skip keys that end in "/" (empty file name) and anything that is not
        # a CSV file; checking the suffix also copes with keys that contain
        # no dot or more than one dot.
        if not filename.strip() or not filename.endswith(".csv"):
            continue

        weather_daily = read_csv_from_s3(bucket, weather_path, filename)

        weather_list.append(weather_daily)
        print(f"{filename} has been added")



weather_2024-06-09.csv has been added
weather_2024-06-10.csv has been added
weather_2024-06-11.csv has been added
weather_2024-06-12.csv has been added
weather_2024-06-13.csv has been added
weather_2024-06-14.csv has been added
weather_2024-06-15.csv has been added
weather_2024-06-16.csv has been added
weather_2024-06-17.csv has been added
weather_2024-06-18.csv has been added
weather_2024-06-19.csv has been added
weather_2024-06-20.csv has been added


In [27]:
# Stack the per-file frames row-wise into one table with a fresh 0..n-1 index.
weather = pd.concat(objs=weather_list, axis=0, ignore_index=True)

In [28]:
# Preview the combined weather table (the rendering is truncated for long frames).
weather

Unnamed: 0,datetime,temperature,wind_speed,rain,precipitation
0,2024-06-09 00:00:00,15.8,22.1,0.0,0.0
1,2024-06-09 01:00:00,15.0,16.4,0.1,0.1
2,2024-06-09 02:00:00,14.5,17.8,0.0,0.0
3,2024-06-09 03:00:00,13.9,14.2,0.0,0.0
4,2024-06-09 04:00:00,13.4,8.1,0.0,0.0
...,...,...,...,...,...
283,2024-06-20 19:00:00,23.9,17.7,0.0,0.0
284,2024-06-20 20:00:00,23.6,19.3,0.0,0.0
285,2024-06-20 21:00:00,21.7,20.1,0.0,0.0
286,2024-06-20 22:00:00,22.1,17.9,0.0,0.0


Join the trips with the weather, company, payment type, and community area tables

In [62]:
# Attach the weather readings to each trip via the trip's weather timestamp.
# The weather table's own "datetime" column repeats the join key, so it is
# dropped once the join is done.
trips_full = trips.merge(
    weather,
    how="inner",
    left_on="datetime_for_weather",
    right_on="datetime",
).drop(columns=["datetime"])

In [63]:
# Replace the company id with the columns of the company table.
trips_full = (
    trips_full
    .merge(company, how="inner", on="company_id")
    .drop(columns=["company_id"])
)

In [64]:
# Replace the payment type id with the columns of the payment type table.
trips_full = (
    trips_full
    .merge(payment_type, how="inner", on="payment_type_id")
    .drop(columns=["payment_type_id"])
)

In [65]:
# Resolve the pickup community area code to its name.
# A left join keeps trips whose pickup area code is missing or has no match
# (their name becomes NaN); an inner join would silently drop those trips.
trips_full = (
    trips_full
    .merge(
        community_areas,
        how="left",
        left_on="pickup_community_area_id",
        right_on="area_code",
    )
    .drop(columns=["pickup_community_area_id", "area_code"])
    .rename(columns={"community_name": "pickup_community_area_name"})
)

In [66]:
# Resolve the dropoff community area code to its name.
# Some trips have no dropoff community area id; a left join keeps them with a
# NaN name, whereas an inner join would silently drop them from the result.
trips_full = (
    trips_full
    .merge(
        community_areas,
        how="left",
        left_on="dropoff_community_area_id",
        right_on="area_code",
    )
    .drop(columns=["dropoff_community_area_id", "area_code"])
    .rename(columns={"community_name": "dropoff_community_area_name"})
)

In [67]:
# Spot-check the first rows of the joined table.
trips_full.head()

Unnamed: 0,trip_id,taxi_id,trip_start_timestamp,trip_end_timestamp,trip_seconds,trip_miles,fare,tips,tolls,extras,trip_total,pickup_centroid_latitude,pickup_centroid_longitude,dropoff_centroid_latitude,dropoff_centroid_longitude,datetime_for_weather,temperature,wind_speed,rain,precipitation,company,payment_type,pickup_community_area_name,dropoff_community_area_name
0,ff5658868e41b86c16bd4703278109bc68f261e2,489e78d30b4d287ed76527efc8533b5ef45e7b6c496d92...,2024-06-09T23:45:00.000,2024-06-10T00:00:00.000,660.0,1.7,8.5,4.0,0.0,0.0,12.5,41.874005,-87.663518,41.874005,-87.663518,2024-06-09 23:00:00,22.1,29.7,0.0,0.0,Chicago City Taxi Association,Credit Card,Near West Side,Near West Side
1,3b50f837e475c88c0037601b1dba293b75443c2d,61d83eb1d34ccd8dac5bdd962bc13e2ac077b0c7ecbf1d...,2024-06-09T23:45:00.000,2024-06-10T00:15:00.000,1440.0,14.3,52.75,0.0,0.0,4.0,57.25,41.980264,-87.913625,41.922686,-87.649489,2024-06-09 23:00:00,22.1,29.7,0.0,0.0,Globe Taxi,Credit Card,O'Hare,Lincoln Park
2,3dbf9607c7e56885bab3f3d5ce65c5780866185e,4f4cae403660ec4c1e2e3b03128bd7a5af139619e3e513...,2024-06-09T23:45:00.000,2024-06-09T23:45:00.000,0.0,0.0,3.25,0.0,0.0,0.0,3.25,41.979071,-87.90304,41.979071,-87.90304,2024-06-09 23:00:00,22.1,29.7,0.0,0.0,Taxi Affiliation Services,Cash,O'Hare,O'Hare
3,46d4c7730778ff19bf485ee3e70c0c4ac14da22b,13016372e777da1289d557edbe4ce2be8a68e77bc64768...,2024-06-09T23:45:00.000,2024-06-10T00:15:00.000,1643.0,15.68,39.25,4.38,0.0,4.0,48.13,41.980264,-87.913625,41.922686,-87.649489,2024-06-09 23:00:00,22.1,29.7,0.0,0.0,Taxicab Insurance Agency Llc,Mobile,O'Hare,Lincoln Park
4,4a0937943b29b734d7ad8bd1c0011817687b3727,ae21d024a249783394ef3fcd91c8abad23fe127549fe96...,2024-06-09T23:45:00.000,2024-06-10T00:00:00.000,420.0,0.1,9.5,3.0,0.0,0.0,12.5,41.892508,-87.626215,41.906026,-87.675312,2024-06-09 23:00:00,22.1,29.7,0.0,0.0,Taxi Affiliation Services,Credit Card,Near North Side,West Town
