### Ingest riders.csv file

In [0]:
%run "../_src/config"

In [0]:
%run "../_src/bronze_functions"

##### Step 1 - Read the CSV file using the Spark DataFrame reader

In [0]:
# Column names for the riders file, in the order they are handed to read_files.
col_names = [
    "rider_id",
    "first",
    "last",
    "address",
    "birthday",
    "account_start_date",
    "account_end_date",
    "is_member",
]

In [0]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DateType, BooleanType

In [0]:
# Explicit schema for the riders file: rider_id is kept as a string and is the
# only field declared non-nullable; the three date columns are DateType and
# is_member is a boolean.
rider_schema = (
    StructType()
    .add("rider_id", StringType(), False)
    .add("first", StringType(), True)
    .add("last", StringType(), True)
    .add("address", StringType(), True)
    .add("birthday", DateType(), True)
    .add("account_start_date", DateType(), True)
    .add("account_end_date", DateType(), True)
    .add("is_member", BooleanType(), True)
)

In [0]:
# Load the riders CSV from the landing zone, passing the column names and the
# explicit schema defined above.
# read_files is not defined in this notebook — presumably it comes from the
# %run'd bronze_functions notebook; verify its parameter order there.
df = read_files('landingzone', 'riders', 'csv', col_names, rider_schema)

reading file from: /mnt/bikesharedlake/landingzone/riders.csv
file successfully read from:/mnt/bikesharedlake/landingzone/riders.csv


##### Step 2 - Add ingestion date to the dataframe and write it to the bronze layer

In [0]:
# Add the ingestion timestamp column (it appears as ingestion_date in the
# preview output of the next cell).
# NOTE(review): df is rebound to its own transformation, so re-running this
# cell on its own feeds the already-augmented frame back into df_add_columns —
# confirm the helper tolerates that.
# NOTE(review): the preview shows ingestion_date ...07.706 while the silver
# table shows ...08.425, so the timestamp looks like it is evaluated per
# action rather than fixed at this call — confirm this is intended.
df = df_add_columns(df, add_timestamp=True)

In [0]:
# Preview the frame to check the parsed columns and the new ingestion_date column.
display(df)

rider_id,first,last,address,birthday,account_start_date,account_end_date,is_member,ingestion_date
1001,Jennifer,Smith,397 Diana Ferry,1976-08-10,2019-11-01,2020-09-01,True,2023-06-18T16:27:07.706+0000
1002,Karen,Smith,644 Brittany Row Apt. 097,1998-08-10,2022-02-04,,True,2023-06-18T16:27:07.706+0000
1003,Bryan,Roberts,996 Dickerson Turnpike,1999-03-29,2019-08-26,,False,2023-06-18T16:27:07.706+0000
1004,Jesse,Middleton,7009 Nathan Expressway,1969-04-11,2019-09-14,,True,2023-06-18T16:27:07.706+0000
1005,Christine,Rodriguez,224 Washington Mills Apt. 467,1974-08-27,2020-03-24,,False,2023-06-18T16:27:07.706+0000
1006,Alicia,Taylor,1137 Angela Locks,2004-01-30,2020-11-27,2021-12-01,True,2023-06-18T16:27:07.706+0000
1007,Benjamin,Fernandez,979 Phillips Ways,1988-01-11,2016-12-11,,False,2023-06-18T16:27:07.706+0000
1008,John,Crawford,7691 Evans Court,1987-02-21,2021-03-28,2021-07-01,True,2023-06-18T16:27:07.706+0000
1009,Victoria,Ritter,9922 Jim Crest Apt. 319,1981-02-07,2020-06-12,2021-11-01,True,2023-06-18T16:27:07.706+0000
1010,Tracy,Austin,92973 Mary Ville,1996-04-07,2019-12-27,,True,2023-06-18T16:27:07.706+0000


In [0]:
# Persist the ingested frame to the bronze layer under the name 'rider'
# (the cell output reports the target as /mnt/bikesharedlake/bronze/rider).
write_parquet_table(df, 'bronze', 'rider')

writing bronze file: /mnt/bikesharedlake/bronze/rider
Table successfully written: /mnt/bikesharedlake/bronze/rider


##### Step 3 - Read the bronze parquet data and write it to the silver layer as a Delta table

In [0]:
# Re-read the bronze 'rider' parquet data as the input for the silver table.
# No column names or schema are passed here, unlike the CSV read earlier in
# this notebook.
rider_df = read_files('bronze', 'rider', 'parquet')

In [0]:
# Project the bronze columns into the silver layout, renaming first/last to
# first_name/last_name and keeping every other column under its bronze name.
# Columns are referenced by name or through rider_df itself, so this cell does
# not rely on `col` having been imported by a notebook pulled in via %run.
rider_selected_df = rider_df.select(
    'rider_id',
    rider_df['first'].alias('first_name'),
    rider_df['last'].alias('last_name'),
    'address',
    'birthday',
    'account_start_date',
    'account_end_date',
    'is_member',
    'ingestion_date',
)

In [0]:
# Save the selection as the managed Delta table silver.rider, overwriting any
# existing data; overwriteSchema is set so the table schema is replaced too.
(
    rider_selected_df.write
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .format("delta")
    .saveAsTable("silver.rider")
)

In [0]:
%sql
-- Read back the silver.rider table to verify the load and the renamed
-- first_name / last_name columns.
SELECT * FROM silver.rider;

rider_id,first_name,last_name,address,birthday,account_start_date,account_end_date,is_member,ingestion_date
1001,Jennifer,Smith,397 Diana Ferry,1976-08-10,2019-11-01,2020-09-01,True,2023-06-18T16:27:08.425+0000
1002,Karen,Smith,644 Brittany Row Apt. 097,1998-08-10,2022-02-04,,True,2023-06-18T16:27:08.425+0000
1003,Bryan,Roberts,996 Dickerson Turnpike,1999-03-29,2019-08-26,,False,2023-06-18T16:27:08.425+0000
1004,Jesse,Middleton,7009 Nathan Expressway,1969-04-11,2019-09-14,,True,2023-06-18T16:27:08.425+0000
1005,Christine,Rodriguez,224 Washington Mills Apt. 467,1974-08-27,2020-03-24,,False,2023-06-18T16:27:08.425+0000
1006,Alicia,Taylor,1137 Angela Locks,2004-01-30,2020-11-27,2021-12-01,True,2023-06-18T16:27:08.425+0000
1007,Benjamin,Fernandez,979 Phillips Ways,1988-01-11,2016-12-11,,False,2023-06-18T16:27:08.425+0000
1008,John,Crawford,7691 Evans Court,1987-02-21,2021-03-28,2021-07-01,True,2023-06-18T16:27:08.425+0000
1009,Victoria,Ritter,9922 Jim Crest Apt. 319,1981-02-07,2020-06-12,2021-11-01,True,2023-06-18T16:27:08.425+0000
1010,Tracy,Austin,92973 Mary Ville,1996-04-07,2019-12-27,,True,2023-06-18T16:27:08.425+0000
