In [None]:
import os
from minio import Minio

import pyspark
from pyspark import SparkFiles
from pyspark.sql import SparkSession

In [None]:
# Object-store connection settings, read from the environment.
# Each name is None when its environment variable is not set.
AWS_S3_ENDPOINT, AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY = (
    os.getenv(variable)
    for variable in ('AWS_S3_ENDPOINT', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY')
)

In [None]:
def create_bucket(bucket_name, endpoint=None):
    """Create a bucket on the object store unless it already exists.

    Parameters
    ----------
    bucket_name : str
        Name of the bucket to create.
    endpoint : str, optional
        Address of the S3-compatible server, with or without an
        ``http://`` / ``https://`` scheme. When omitted, ``AWS_S3_ENDPOINT``
        is used so the bucket is created on the same server the Spark S3A
        configuration targets; if that is empty too, ``'localhost:9000'``
        is used.

    Returns
    -------
    None
        The outcome is reported with ``print``.
    """
    target = endpoint or AWS_S3_ENDPOINT or 'localhost:9000'

    # The Minio client takes a bare ``host[:port]`` plus a ``secure`` flag,
    # whereas an S3A endpoint is often written as a full URL. Split the
    # scheme off and let it decide whether TLS is used.
    scheme, separator, remainder = target.partition('://')
    host = (remainder if separator else target).rstrip('/')
    use_tls = bool(separator) and scheme.lower() == 'https'

    client = Minio(
        host,
        access_key=AWS_ACCESS_KEY_ID,
        secret_key=AWS_SECRET_ACCESS_KEY,
        secure=use_tls,
    )

    if client.bucket_exists(bucket_name):
        print(f'Bucket: {bucket_name} already in use')
        return

    client.make_bucket(bucket_name)
    print(f'Creating bucket: {bucket_name}')

In [None]:
# Ensure the bronze bucket, then the silver bucket, exists.
for bucket in ('local-data-lake-bronze', 'local-data-lake-silver'):
    create_bucket(bucket)

In [None]:
# SparkConf.set() stores str(value), so an unset environment variable would be
# configured as the literal string 'None'. Stop here with a clear message
# instead of failing later with an obscure connection or authentication error.
_unset_settings = [
    name
    for name, value in (
        ('AWS_S3_ENDPOINT', AWS_S3_ENDPOINT),
        ('AWS_ACCESS_KEY_ID', AWS_ACCESS_KEY_ID),
        ('AWS_SECRET_ACCESS_KEY', AWS_SECRET_ACCESS_KEY),
    )
    if not value
]
if _unset_settings:
    raise ValueError(
        'Missing required environment variable(s): ' + ', '.join(_unset_settings)
    )

# Spark configuration for reading and writing the object store through S3A.
conf = (
    pyspark.SparkConf()
        # Pull in the Hadoop S3A connector and the AWS SDK it depends on.
        .set('spark.jars.packages','org.apache.hadoop:hadoop-aws:3.3.4,com.amazonaws:aws-java-sdk-bundle:1.12.262')
        .set('spark.hadoop.fs.s3a.impl', 'org.apache.hadoop.fs.s3a.S3AFileSystem')
        # Address buckets as <endpoint>/<bucket> rather than <bucket>.<endpoint>.
        .set('spark.hadoop.fs.s3a.path.style.access', 'true')
        .set('spark.hadoop.fs.s3a.connection.ssl.enabled', 'false')
        .set('spark.hadoop.fs.s3a.endpoint', AWS_S3_ENDPOINT)
        .set('spark.hadoop.fs.s3a.access.key', AWS_ACCESS_KEY_ID)
        .set('spark.hadoop.fs.s3a.secret.key', AWS_SECRET_ACCESS_KEY)
)

In [None]:
# Start (or reuse) a Spark session on the local master, configured with `conf`.
spark = (
    SparkSession.builder
    .master('local[5]')
    .appName('local-data-platform')
    .config(conf=conf)
    .getOrCreate()
)

# Keep a handle on the underlying SparkContext for later cells.
sc = spark.sparkContext
print(f'The Pyspark version {spark.version} is running...')

In [None]:
def load_taxi_data(dt):
    """Download one month of yellow-taxi trip data and load it with Spark.

    The parquet file for the month is registered with ``sc.addFile`` and then
    read from the path reported by ``SparkFiles.get``.

    Parameters
    ----------
    dt : str
        Month to load, formatted ``'YYYY-MM'`` (for example ``'2023-06'``).

    Returns
    -------
    The object returned by ``spark.read.parquet`` for the downloaded file.

    Raises
    ------
    ValueError
        If ``dt`` is not a ``'YYYY-MM'`` string with a month from 01 to 12.
    """
    # Validate first so a malformed month fails here with a clear message
    # rather than as a failed download of a URL that cannot exist.
    is_valid_month = (
        isinstance(dt, str)
        and len(dt) == 7
        and dt.isascii()
        and dt[4] == '-'
        and dt[:4].isdigit()
        and dt[5:].isdigit()
        and 1 <= int(dt[5:]) <= 12
    )
    if not is_valid_month:
        raise ValueError(f"dt must be a 'YYYY-MM' string, got {dt!r}")

    # Build the file name once so the URL and the lookup can never diverge.
    file_name = f'yellow_tripdata_{dt}.parquet'
    sc.addFile(f'https://d37ci6vzurychx.cloudfront.net/trip-data/{file_name}')

    return spark.read.parquet(SparkFiles.get(file_name))

In [None]:
# Month of trip data to ingest, formatted YYYY-MM.
dt = '2023-06'

df = load_taxi_data(dt)
print(f'Total number of file rows: {df.count()}')

# Write the month to the bronze bucket, replacing any existing output at that path.
df.write.parquet(
    f's3a://local-data-lake-bronze/{dt}/yellow_taxi.parquet',
    mode='overwrite',
)

In [None]:
# Shut down the Spark session started earlier in this notebook.
spark.stop()