In [1]:
import os
from minio import Minio

import pyspark
from pyspark import SparkFiles
from pyspark.sql import SparkSession

In [2]:
# Object-store connection settings are read from the environment so that no
# credentials live in the notebook. Each value is None when its variable is unset.
AWS_S3_ENDPOINT = os.getenv('AWS_S3_ENDPOINT')
AWS_ACCESS_KEY_ID = os.getenv('AWS_ACCESS_KEY_ID')
AWS_SECRET_ACCESS_KEY = os.getenv('AWS_SECRET_ACCESS_KEY')

In [3]:
def create_bucket(bucket_name, endpoint=None):
    """Create ``bucket_name`` on the object store unless it already exists.

    Args:
        bucket_name: Name of the bucket to ensure.
        endpoint: Optional ``host:port`` or ``scheme://host:port`` address of
            the object store. When omitted, ``AWS_S3_ENDPOINT`` is used so the
            bucket is created on the same server the Spark session is
            configured to write to; if that is unset too, ``'localhost:9000'``
            is used.

    Returns:
        None. A one-line status message is printed instead.
    """
    target = endpoint or AWS_S3_ENDPOINT or 'localhost:9000'

    # The Minio client expects a bare ``host[:port]``; peel off an optional
    # URL scheme and use it to decide whether to connect over TLS.
    scheme, separator, host = target.partition('://')
    if not separator:
        scheme, host = '', target

    client = Minio(
        host.rstrip('/'),
        access_key=AWS_ACCESS_KEY_ID,
        secret_key=AWS_SECRET_ACCESS_KEY,
        secure=scheme.lower() == 'https',
    )

    if client.bucket_exists(bucket_name):
        print(f'Bucket: {bucket_name} already in use')
        return

    client.make_bucket(bucket_name)
    print(f'Creating bucket: {bucket_name}')

In [4]:
# Ensure a bucket exists for each lake layer: bronze first, then silver.
for bucket in ('local-data-lake-bronze', 'local-data-lake-silver'):
    create_bucket(bucket)

Bucket: local-data-lake-bronze already in use
Bucket: local-data-lake-silver already in use


In [5]:
# Spark configuration for reading and writing s3a:// paths.
conf = pyspark.SparkConf().setAll([
    # Extra jars resolved at session start-up: the Hadoop S3A connector and the AWS SDK.
    ('spark.jars.packages', 'org.apache.hadoop:hadoop-aws:3.3.4,com.amazonaws:aws-java-sdk-bundle:1.12.262'),
    # Filesystem implementation plus path-style addressing over plain HTTP.
    ('spark.hadoop.fs.s3a.impl', 'org.apache.hadoop.fs.s3a.S3AFileSystem'),
    ('spark.hadoop.fs.s3a.path.style.access', 'true'),
    ('spark.hadoop.fs.s3a.connection.ssl.enabled', 'false'),
    # Endpoint and credentials come from the environment-backed constants.
    ('spark.hadoop.fs.s3a.endpoint', AWS_S3_ENDPOINT),
    ('spark.hadoop.fs.s3a.access.key', AWS_ACCESS_KEY_ID),
    ('spark.hadoop.fs.s3a.secret.key', AWS_SECRET_ACCESS_KEY),
])

In [6]:
# Start (or reuse) a local Spark session with five worker threads, applying
# the S3A settings collected in `conf`.
spark = (
    SparkSession.builder
    .master('local[5]')
    .appName('local-data-platform')
    .config(conf=conf)
    .getOrCreate()
)

# Keep a handle on the underlying SparkContext for later cells.
sc = spark.sparkContext
print(f'The Pyspark version {spark.version} is running...')

:: loading settings :: url = jar:file:/Users/dennislafferty/.pyenv/versions/3.10.13/envs/data-platform/lib/python3.10/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /Users/dennislafferty/.ivy2/cache
The jars for the packages stored in: /Users/dennislafferty/.ivy2/jars
org.apache.hadoop#hadoop-aws added as a dependency
com.amazonaws#aws-java-sdk-bundle added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-4f3f5c79-4651-4a0a-bf92-4b332d04f4df;1.0
	confs: [default]
	found org.apache.hadoop#hadoop-aws;3.3.4 in central
	found com.amazonaws#aws-java-sdk-bundle;1.12.262 in central
	found org.wildfly.openssl#wildfly-openssl;1.0.7.Final in central
:: resolution report :: resolve 195ms :: artifacts dl 6ms
	:: modules in use:
	com.amazonaws#aws-java-sdk-bundle;1.12.262 from central in [default]
	org.apache.hadoop#hadoop-aws;3.3.4 from central in [default]
	org.wildfly.openssl#wildfly-openssl;1.0.7.Final from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| s

The Pyspark version 3.5.1 is running...


In [7]:
def load_taxi_data(dt):
    """Fetch one yellow-taxi trip file and open it as a Spark DataFrame.

    Args:
        dt: Period identifier interpolated into the file name,
            e.g. ``'2023-06'``.

    Returns:
        The DataFrame produced by reading the fetched parquet file.
    """
    source_url = f'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_{dt}.parquet'

    # Register the remote file with the SparkContext, then resolve the local
    # path Spark stored it under.
    sc.addFile(source_url)
    local_path = SparkFiles.get(f'yellow_tripdata_{dt}.parquet')

    return spark.read.parquet(local_path)

In [8]:
# Period to ingest.
dt = '2023-06'

df = load_taxi_data(dt)
print(f'Total number of file rows: {df.count()}')

# Write the data to the bronze bucket, replacing any existing output at the
# same path.
(
    df.write
    .mode('overwrite')
    .parquet(f's3a://local-data-lake-bronze/{dt}/yellow_taxi.parquet')
)

                                                                                

Total number of file rows: 3307234


24/06/12 20:56:34 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties
                                                                                

In [9]:
# Stop the Spark session created earlier in this notebook now that the write is done.
spark.stop()