In [1]:
from io import StringIO
import os

import boto3
import pandas as pd

pd.set_option("display.max_columns", 50)

In [2]:
aws_acces_key_id = os.getenv("AWS_ACCESS_KEY")
aws_secret_key = os.getenv("AWS_SECRET_KEY")

In [3]:
def read_csv_from_s3(bucket: str, path: str, filename: str) -> pd.DataFrame:
    """Downloads a csv file from an S3 bucket
    
    Parameters:
    -----------
    bucket : str
        The bucket where the files at.
    path : str
        The folders to the files.
    filename : str
        Name of the file.
        
    Returns:
    --------
    pd.DataFrame
        A Dataframe of the downloaded file.
    """
    
    s3 = boto3.client("s3", aws_access_key_id=aws_acces_key_id, aws_secret_access_key=aws_secret_key)

    full_path = f"{path}{filename}"
    
    object = s3.get_object(Bucket=bucket, Key=full_path)
    object = object["Body"].read().decode("utf-8")
    output_df = pd.read_csv(StringIO(object))

    return output_df
    


In [4]:
s3 = boto3.client("s3", aws_access_key_id=aws_acces_key_id, aws_secret_access_key=aws_secret_key)
bucket = "cubix-chicago-taxi-goga"

payment_type_path = "transformed_data/payment_type/"
community_areas_path = "transformed_data/community_areas/"
company_path = "transformed_data/company/"
date_path = "transformed_data/date/"
taxi_trips_path = "transformed_data/taxi_trips/"
weather_path = "transformed_data/weather/"


In [5]:
community_areas = read_csv_from_s3(bucket=bucket, path=community_areas_path, filename="community_areas_master.csv")
company = read_csv_from_s3(bucket=bucket, path=company_path, filename="company_master.csv")
date = read_csv_from_s3(bucket=bucket, path=date_path, filename="date_dimension.csv")
payment_type = read_csv_from_s3(bucket=bucket, path=payment_type_path, filename="payment_type_master.csv")


In [6]:
trips_list = []
weather_list = []

In [7]:
for file in  s3.list_objects(Bucket=bucket, Prefix=taxi_trips_path)['Contents']:
    taxi_trip_key = file["Key"]

    if taxi_trip_key.split("/")[-1].strip() != "":
        if taxi_trip_key.split(".")[1] == "csv":
            
            filename = taxi_trip_key.split("/")[-1]
            trip = read_csv_from_s3(bucket, taxi_trips_path, filename)

            trips_list.append(trip)
            print(f"{filename} has been added")


taxi_2024-06-09.csv has been added
taxi_2024-06-10.csv has been added
taxi_2024-06-11.csv has been added
taxi_2024-06-12.csv has been added
taxi_2024-06-13.csv has been added
taxi_2024-06-14.csv has been added
taxi_2024-06-15.csv has been added
taxi_2024-06-16.csv has been added
taxi_2024-06-17.csv has been added
taxi_2024-06-18.csv has been added
taxi_2024-06-19.csv has been added
taxi_2024-06-20.csv has been added
taxi_2024-06-21.csv has been added


In [8]:
trips = pd.concat(trips_list, ignore_index=True)

In [9]:
trips

Unnamed: 0,trip_id,taxi_id,trip_start_timestamp,trip_end_timestamp,trip_seconds,trip_miles,pickup_community_area_id,dropoff_community_area_id,fare,tips,tolls,extras,trip_total,pickup_centroid_latitude,pickup_centroid_longitude,dropoff_centroid_latitude,dropoff_centroid_longitude,datetime_for_weather,payment_type_id,company_id
0,ff5658868e41b86c16bd4703278109bc68f261e2,489e78d30b4d287ed76527efc8533b5ef45e7b6c496d92...,2024-06-09T23:45:00.000,2024-06-10T00:00:00.000,660.0,1.70,28.0,28.0,8.50,4.00,0.0,0.0,12.50,41.874005,-87.663518,41.874005,-87.663518,2024-06-09 23:00:00,2,15
1,3b50f837e475c88c0037601b1dba293b75443c2d,61d83eb1d34ccd8dac5bdd962bc13e2ac077b0c7ecbf1d...,2024-06-09T23:45:00.000,2024-06-10T00:15:00.000,1440.0,14.30,76.0,7.0,52.75,0.00,0.0,4.0,57.25,41.980264,-87.913625,41.922686,-87.649489,2024-06-09 23:00:00,2,14
2,3dbf9607c7e56885bab3f3d5ce65c5780866185e,4f4cae403660ec4c1e2e3b03128bd7a5af139619e3e513...,2024-06-09T23:45:00.000,2024-06-09T23:45:00.000,0.0,0.00,76.0,76.0,3.25,0.00,0.0,0.0,3.25,41.979071,-87.903040,41.979071,-87.903040,2024-06-09 23:00:00,1,3
3,46d4c7730778ff19bf485ee3e70c0c4ac14da22b,13016372e777da1289d557edbe4ce2be8a68e77bc64768...,2024-06-09T23:45:00.000,2024-06-10T00:15:00.000,1643.0,15.68,76.0,7.0,39.25,4.38,0.0,4.0,48.13,41.980264,-87.913625,41.922686,-87.649489,2024-06-09 23:00:00,3,6
4,4a0937943b29b734d7ad8bd1c0011817687b3727,ae21d024a249783394ef3fcd91c8abad23fe127549fe96...,2024-06-09T23:45:00.000,2024-06-10T00:00:00.000,420.0,0.10,8.0,24.0,9.50,3.00,0.0,0.0,12.50,41.892508,-87.626215,41.906026,-87.675312,2024-06-09 23:00:00,2,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
265758,3a39fbfb598a88add8d216814d31e3d60a166b25,0ee86e2a204cc2224e9ff2494686ee474cce6aba093a1b...,2024-06-21T00:00:00.000,2024-06-21T00:30:00.000,2144.0,16.54,28.0,76.0,46.00,0.00,0.0,28.5,74.50,41.874005,-87.663518,41.980264,-87.913625,2024-06-21 00:00:00,1,9
265759,2dad0b10baac8ae9e8d507b99e9289dbdb4adc43,72260bbc4c07a8dcb92f078dd6eca595937f051a64ecfd...,2024-06-21T00:00:00.000,2024-06-21T00:15:00.000,646.0,1.69,8.0,32.0,8.25,2.00,0.0,0.0,10.75,41.899602,-87.633308,41.878866,-87.625192,2024-06-21 00:00:00,2,11
265760,2bc331ac80f6bba6f6b9a37bb2c8efee4297839b,3f7d8054c7c41782afe5526091ff0e5db98ba3652db8e8...,2024-06-21T00:00:00.000,2024-06-21T00:30:00.000,1617.0,17.75,8.0,75.0,27.18,0.00,0.0,0.0,27.18,41.899602,-87.633308,41.689730,-87.669054,2024-06-21 00:00:00,3,9
265761,279446039790a14009f84edef78ba6ce1d6d88bf,6f3c8bee0eb9f2eb5b7bd4c279792dbf7c2e1bd5f5c6db...,2024-06-21T00:00:00.000,2024-06-21T00:15:00.000,1323.0,14.13,76.0,6.0,35.50,0.00,0.0,4.0,40.00,41.980264,-87.913625,41.944227,-87.655998,2024-06-21 00:00:00,2,4


In [10]:
for file in  s3.list_objects(Bucket=bucket, Prefix=weather_path)['Contents']:
    weather_key = file["Key"]
    
    if weather_key.split("/")[-1].strip() != "":
        if weather_key.split(".")[1] == "csv":
            
            filename = weather_key.split("/")[-1]
            weather_daily = read_csv_from_s3(bucket, weather_path, filename)

            weather_list.append(weather_daily)
            print(f"{filename} has been added")



weather_2024-06-09.csv has been added
weather_2024-06-10.csv has been added
weather_2024-06-11.csv has been added
weather_2024-06-12.csv has been added
weather_2024-06-13.csv has been added
weather_2024-06-14.csv has been added
weather_2024-06-15.csv has been added
weather_2024-06-16.csv has been added
weather_2024-06-17.csv has been added
weather_2024-06-18.csv has been added
weather_2024-06-19.csv has been added
weather_2024-06-20.csv has been added
weather_2024-06-21.csv has been added


In [11]:
weather = pd.concat(weather_list, ignore_index=True)

In [12]:
weather

Unnamed: 0,datetime,temperature,wind_speed,rain,precipitation
0,2024-06-09 00:00:00,15.8,22.1,0.0,0.0
1,2024-06-09 01:00:00,15.0,16.4,0.1,0.1
2,2024-06-09 02:00:00,14.5,17.8,0.0,0.0
3,2024-06-09 03:00:00,13.9,14.2,0.0,0.0
4,2024-06-09 04:00:00,13.4,8.1,0.0,0.0
...,...,...,...,...,...
307,2024-06-21 19:00:00,30.0,14.7,0.0,0.0
308,2024-06-21 20:00:00,30.9,14.0,0.0,0.0
309,2024-06-21 21:00:00,31.8,9.4,0.0,0.0
310,2024-06-21 22:00:00,33.5,12.7,0.0,0.0


Join them together

In [13]:
trips_full = pd.merge(trips, weather, left_on="datetime_for_weather", right_on="datetime", how="inner")
trips_full = trips_full.drop(columns=["datetime"])

In [14]:
trips_full = pd.merge(trips_full, company, left_on="company_id", right_on="company_id", how="inner")
trips_full = trips_full.drop(columns=["company_id"])

In [15]:
trips_full = pd.merge(trips_full, payment_type, left_on="payment_type_id", right_on="payment_type_id", how="inner")
trips_full = trips_full.drop(columns=["payment_type_id"])

In [16]:
trips_full = pd.merge(trips_full, community_areas, left_on="pickup_community_area_id", right_on="area_code", how="inner")
trips_full = trips_full.drop(columns=["pickup_community_area_id", "area_code"])
trips_full.rename(columns={"community_name": "pickup_community_area_name"}, inplace=True)

In [17]:
trips_full = pd.merge(trips_full, community_areas, left_on="dropoff_community_area_id", right_on="area_code", how="inner")
trips_full = trips_full.drop(columns=["dropoff_community_area_id", "area_code"])
trips_full.rename(columns={"community_name": "dropoff_community_area_name"}, inplace=True)

In [18]:
trips_full.head()

Unnamed: 0,trip_id,taxi_id,trip_start_timestamp,trip_end_timestamp,trip_seconds,trip_miles,fare,tips,tolls,extras,trip_total,pickup_centroid_latitude,pickup_centroid_longitude,dropoff_centroid_latitude,dropoff_centroid_longitude,datetime_for_weather,temperature,wind_speed,rain,precipitation,company,payment_type,pickup_community_area_name,dropoff_community_area_name
0,ff5658868e41b86c16bd4703278109bc68f261e2,489e78d30b4d287ed76527efc8533b5ef45e7b6c496d92...,2024-06-09T23:45:00.000,2024-06-10T00:00:00.000,660.0,1.7,8.5,4.0,0.0,0.0,12.5,41.874005,-87.663518,41.874005,-87.663518,2024-06-09 23:00:00,22.1,29.7,0.0,0.0,Chicago City Taxi Association,Credit Card,Near West Side,Near West Side
1,3b50f837e475c88c0037601b1dba293b75443c2d,61d83eb1d34ccd8dac5bdd962bc13e2ac077b0c7ecbf1d...,2024-06-09T23:45:00.000,2024-06-10T00:15:00.000,1440.0,14.3,52.75,0.0,0.0,4.0,57.25,41.980264,-87.913625,41.922686,-87.649489,2024-06-09 23:00:00,22.1,29.7,0.0,0.0,Globe Taxi,Credit Card,O'Hare,Lincoln Park
2,3dbf9607c7e56885bab3f3d5ce65c5780866185e,4f4cae403660ec4c1e2e3b03128bd7a5af139619e3e513...,2024-06-09T23:45:00.000,2024-06-09T23:45:00.000,0.0,0.0,3.25,0.0,0.0,0.0,3.25,41.979071,-87.90304,41.979071,-87.90304,2024-06-09 23:00:00,22.1,29.7,0.0,0.0,Taxi Affiliation Services,Cash,O'Hare,O'Hare
3,46d4c7730778ff19bf485ee3e70c0c4ac14da22b,13016372e777da1289d557edbe4ce2be8a68e77bc64768...,2024-06-09T23:45:00.000,2024-06-10T00:15:00.000,1643.0,15.68,39.25,4.38,0.0,4.0,48.13,41.980264,-87.913625,41.922686,-87.649489,2024-06-09 23:00:00,22.1,29.7,0.0,0.0,Taxicab Insurance Agency Llc,Mobile,O'Hare,Lincoln Park
4,4a0937943b29b734d7ad8bd1c0011817687b3727,ae21d024a249783394ef3fcd91c8abad23fe127549fe96...,2024-06-09T23:45:00.000,2024-06-10T00:00:00.000,420.0,0.1,9.5,3.0,0.0,0.0,12.5,41.892508,-87.626215,41.906026,-87.675312,2024-06-09 23:00:00,22.1,29.7,0.0,0.0,Taxi Affiliation Services,Credit Card,Near North Side,West Town


Daily Trip Counts

In [19]:
# Define a custom color palette
color_palette = ['#ff9999', '#66b3ff', '#99ff99', '#ffcc99', '#c2c2f0', '#ffb3e6', '#ff6666', '#ffb366', '#b3b3ff', '#66ff66']

# Plotting the data
trips_per_weekday.plot(kind="bar", figsize=(14, 8), fontsize=12, color=color_palette)

# Adding titles and labels
plt.xlabel("Weekday", fontsize = 13)
plt.ylabel("Count of the trips", fontsize = 13)
plt.title("Daily trip counts", fontsize = 20, pad = 18)

# Show and save the plot into a file
figure = plt.gcf()

plt.show()

NameError: name 'trips_per_weekday' is not defined