#### This notebook is used to download the source data for the rest of the section

In [1]:
import sys
sys.path.append('../preparing_data')
from download_data import download_file

In [4]:
# Define files to download
import os
url_prefix = "https://github.com/DataTalksClub/nyc-tlc-data/releases/download"
taxi_type = "green"
year = "2021"

for month in range(1,13):
    if (month < 10):
        mm = "0" + str(month)
    else:
        mm = str(month)

    url = f"{url_prefix}/{taxi_type}/{taxi_type}_tripdata_{year}-{mm}.csv.gz"
    filepath = f"{os.getcwd()}/data/raw/{taxi_type}/{year}/{mm}/{taxi_type}_tripdata_{year}-{mm}.csv.gz"
    download_file(url=url, filepath=filepath)


File downloaded successfully to '/home/jdelzio/data-engineering-zoomcamp/week_5/data/raw/green/2021/01/green_tripdata_2021-01.csv.gz'
File downloaded successfully to '/home/jdelzio/data-engineering-zoomcamp/week_5/data/raw/green/2021/02/green_tripdata_2021-02.csv.gz'
File downloaded successfully to '/home/jdelzio/data-engineering-zoomcamp/week_5/data/raw/green/2021/03/green_tripdata_2021-03.csv.gz'
File downloaded successfully to '/home/jdelzio/data-engineering-zoomcamp/week_5/data/raw/green/2021/04/green_tripdata_2021-04.csv.gz'
File downloaded successfully to '/home/jdelzio/data-engineering-zoomcamp/week_5/data/raw/green/2021/05/green_tripdata_2021-05.csv.gz'
File downloaded successfully to '/home/jdelzio/data-engineering-zoomcamp/week_5/data/raw/green/2021/06/green_tripdata_2021-06.csv.gz'
File downloaded successfully to '/home/jdelzio/data-engineering-zoomcamp/week_5/data/raw/green/2021/07/green_tripdata_2021-07.csv.gz'
Failed to download file. Status Code: 404
Failed to download f

In [9]:
import pyspark
from pyspark.sql import SparkSession

In [10]:
spark = SparkSession.builder \
    .master("local[*]") \
    .appName('test') \
    .getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/03/04 01:56:47 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/03/04 01:56:48 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [11]:
from pyspark.sql import types
import pandas as pd

In [12]:
# Define data types
green_schema = types.StructType([
    types.StructField("VendorID", types.IntegerType(), True),
    types.StructField("lpep_pickup_datetime", types.TimestampType(), True),
    types.StructField("lpep_dropoff_datetime", types.TimestampType(), True),
    types.StructField("store_and_fwd_flag", types.StringType(), True),
    types.StructField("RatecodeID", types.IntegerType(), True),
    types.StructField("PULocationID", types.IntegerType(), True),
    types.StructField("DOLocationID", types.IntegerType(), True),
    types.StructField("passenger_count", types.IntegerType(), True),
    types.StructField("trip_distance", types.DoubleType(), True),
    types.StructField("fare_amount", types.DoubleType(), True),
    types.StructField("extra", types.DoubleType(), True),
    types.StructField("mta_tax", types.DoubleType(), True),
    types.StructField("tip_amount", types.DoubleType(), True),
    types.StructField("tolls_amount", types.DoubleType(), True),
    types.StructField("ehail_fee", types.DoubleType(), True),
    types.StructField("improvement_surcharge", types.DoubleType(), True),
    types.StructField("total_amount", types.DoubleType(), True),
    types.StructField("payment_type", types.IntegerType(), True),
    types.StructField("trip_type", types.IntegerType(), True),
    types.StructField("congestion_surcharge", types.DoubleType(), True)
])

yellow_schema = types.StructType([
    types.StructField("VendorID", types.IntegerType(), True),
    types.StructField("tpep_pickup_datetime", types.TimestampType(), True),
    types.StructField("tpep_dropoff_datetime", types.TimestampType(), True),
    types.StructField("passenger_count", types.IntegerType(), True),
    types.StructField("trip_distance", types.DoubleType(), True),
    types.StructField("RatecodeID", types.IntegerType(), True),
    types.StructField("store_and_fwd_flag", types.StringType(), True),
    types.StructField("PULocationID", types.IntegerType(), True),
    types.StructField("DOLocationID", types.IntegerType(), True),
    types.StructField("payment_type", types.IntegerType(), True),
    types.StructField("fare_amount", types.DoubleType(), True),
    types.StructField("extra", types.DoubleType(), True),
    types.StructField("mta_tax", types.DoubleType(), True),
    types.StructField("tip_amount", types.DoubleType(), True),
    types.StructField("tolls_amount", types.DoubleType(), True),
    types.StructField("improvement_surcharge", types.DoubleType(), True),
    types.StructField("total_amount", types.DoubleType(), True),
    types.StructField("congestion_surcharge", types.DoubleType(), True)
])

In [13]:
#Create spark data models
year = 2020

for month in range(1, 13):
    print(f'processing data for {year}/{month}')

    input_path = f'data/raw/green/{year}/{month:02d}/'
    output_path = f'data/pq/green/{year}/{month:02d}/'

    df_green = spark.read \
        .option("header", "true") \
        .schema(green_schema) \
        .csv(input_path)

    df_green \
        .repartition(4) \
        .write.parquet(output_path)

processing data for 2020/1


                                                                                

processing data for 2020/2


                                                                                

processing data for 2020/3


                                                                                

processing data for 2020/4
processing data for 2020/5
processing data for 2020/6
processing data for 2020/7
processing data for 2020/8
processing data for 2020/9
processing data for 2020/10


                                                                                

processing data for 2020/11
processing data for 2020/12


In [14]:
year = 2021 

for month in range(1, 8):
    print(f'processing data for {year}/{month}')

    input_path = f'data/raw/green/{year}/{month:02d}/'
    output_path = f'data/pq/green/{year}/{month:02d}/'

    df_green = spark.read \
        .option("header", "true") \
        .schema(green_schema) \
        .csv(input_path)

    df_green \
        .repartition(4) \
        .write.parquet(output_path)

processing data for 2021/1


                                                                                

processing data for 2021/2
processing data for 2021/3


                                                                                

processing data for 2021/4
processing data for 2021/5


                                                                                

processing data for 2021/6
processing data for 2021/7


                                                                                

In [15]:
year = 2020

for month in range(1, 13):
    print(f'processing data for {year}/{month}')

    input_path = f'data/raw/yellow/{year}/{month:02d}/'
    output_path = f'data/pq/yellow/{year}/{month:02d}/'

    df_yellow = spark.read \
        .option("header", "true") \
        .schema(yellow_schema) \
        .csv(input_path)

    df_yellow \
        .repartition(4) \
        .write.parquet(output_path)

processing data for 2020/1


                                                                                

processing data for 2020/2


                                                                                

processing data for 2020/3


                                                                                

processing data for 2020/4


                                                                                

processing data for 2020/5


                                                                                

processing data for 2020/6


                                                                                

processing data for 2020/7


                                                                                

processing data for 2020/8


                                                                                

processing data for 2020/9


                                                                                

processing data for 2020/10


                                                                                

processing data for 2020/11


                                                                                

processing data for 2020/12


                                                                                

In [16]:
year = 2021

for month in range(1, 8):
    print(f'processing data for {year}/{month}')

    input_path = f'data/raw/yellow/{year}/{month:02d}/'
    output_path = f'data/pq/yellow/{year}/{month:02d}/'

    df_yellow = spark.read \
        .option("header", "true") \
        .schema(yellow_schema) \
        .csv(input_path)

    df_yellow \
        .repartition(4) \
        .write.parquet(output_path)

processing data for 2021/1


                                                                                

processing data for 2021/2


                                                                                

processing data for 2021/3


                                                                                

processing data for 2021/4


                                                                                

processing data for 2021/5


                                                                                

processing data for 2021/6


                                                                                

processing data for 2021/7


                                                                                

processing data for 2021/8


AnalysisException: [PATH_NOT_FOUND] Path does not exist: file:/home/jdelzio/data-engineering-zoomcamp/week_5/data/raw/yellow/2021/08.

In [17]:
spark.stop()