### Ingest stations.csv file

In [0]:
%run "../_src/config"

In [0]:
%run "../_src/bronze_functions"

##### Step 1 - Read the CSV file using the spark dataframe reader

In [0]:
# Column names handed to read_files for the stations file.
col_names = [
    "station_id",
    "name",
    "latitude",
    "longitude",
]

In [0]:
from pyspark.sql.types import StructType, StructField, VarcharType, StringType, DoubleType

In [0]:
# Explicit schema for the stations file: a required string id, an optional
# string name, and optional double-precision coordinates.
station_schema = StructType([
    StructField("station_id", StringType(), nullable=False),
    StructField("name", StringType(), nullable=True),
    StructField("latitude", DoubleType(), nullable=True),
    StructField("longitude", DoubleType(), nullable=True),
])

In [0]:
# Load the stations CSV from the landing zone through the shared read_files
# helper, passing the column names and schema defined in the cells above.
df = read_files(
    'landingzone',
    'stations',
    'csv',
    col_names,
    station_schema,
)

reading file from: /mnt/bikesharedlake/landingzone/stations.csv
file successfully read from:/mnt/bikesharedlake/landingzone/stations.csv


##### Step 2 - Add ingestion date to the dataframe

In [0]:
# Stamp every row with an ingestion timestamp; it shows up as the
# ingestion_date column in the preview below.
df = df_add_columns(df, add_timestamp=True)

In [0]:
# Preview the ingested stations, including the newly added ingestion_date column.
display(df)

station_id,name,latitude,longitude,ingestion_date
KA1503000012,Clark St & Lake St,41.88579466666667,-87.63110066666668,2023-06-18T14:34:01.098+0000
637,Wood St & Chicago Ave,41.895634,-87.672069,2023-06-18T14:34:01.098+0000
13216,State St & 33rd St,41.8347335,-87.6258275,2023-06-18T14:34:01.098+0000
18003,Fairbanks St & Superior St,41.89580766666667,-87.62025316666669,2023-06-18T14:34:01.098+0000
KP1705001026,LaSalle Dr & Huron St,41.894877,-87.632326,2023-06-18T14:34:01.098+0000
13253,Lincoln Ave & Waveland Ave,41.948797,-87.675278,2023-06-18T14:34:01.098+0000
KA1503000044,Rush St & Hubbard St,41.890173,-87.62618499999999,2023-06-18T14:34:01.098+0000
KA1504000140,Winchester Ave & Elston Ave,41.92403733333333,-87.67641483333334,2023-06-18T14:34:01.098+0000
TA1305000032,Clinton St & Madison St,41.882242,-87.64106600000001,2023-06-18T14:34:01.098+0000
TA1306000012,Wells St & Huron St,41.89475366666667,-87.63440200000001,2023-06-18T14:34:01.098+0000


In [0]:
# Persist the ingested stations to the bronze layer through the shared
# write_parquet_table helper.
write_parquet_table(df, 'bronze', 'station')

writing bronze file: /mnt/bikesharedlake/bronze/station
Table successfully written: /mnt/bikesharedlake/bronze/station


##### Step 3 - Read the bronze data to be written to the datalake as delta

In [0]:
# Read the station data back from the bronze folder.
# Parquet files carry their own schema and have no header row, so the
# CSV-only "header" reader option is not needed here.
station_df = spark.read.parquet(f"{bronze_folder_path}/station")

##### Step 4 - Select only the columns required & rename as required

In [0]:
from pyspark.sql.functions import to_timestamp, concat, col, lit

In [0]:
# Keep the five station columns in their original order, exposing
# `name` under the more explicit `station_name`.
station_selected_df = station_df.select(
    "station_id",
    col("name").alias("station_name"),
    "latitude",
    "longitude",
    "ingestion_date",
)

In [0]:
# Save the selected columns as the Delta table silver.station, overwriting
# any existing data and allowing the table schema to be replaced.
(
    station_selected_df.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable("silver.station")
)

In [0]:
%sql
-- Preview the silver.station table written by the previous cell.
SELECT station_id, station_name, latitude, longitude, ingestion_date
FROM silver.station;

station_id,station_name,latitude,longitude,ingestion_date
KA1503000012,Clark St & Lake St,41.88579466666667,-87.63110066666668,2023-06-18T14:34:01.310+0000
637,Wood St & Chicago Ave,41.895634,-87.672069,2023-06-18T14:34:01.310+0000
13216,State St & 33rd St,41.8347335,-87.6258275,2023-06-18T14:34:01.310+0000
18003,Fairbanks St & Superior St,41.89580766666667,-87.62025316666669,2023-06-18T14:34:01.310+0000
KP1705001026,LaSalle Dr & Huron St,41.894877,-87.632326,2023-06-18T14:34:01.310+0000
13253,Lincoln Ave & Waveland Ave,41.948797,-87.675278,2023-06-18T14:34:01.310+0000
KA1503000044,Rush St & Hubbard St,41.890173,-87.62618499999999,2023-06-18T14:34:01.310+0000
KA1504000140,Winchester Ave & Elston Ave,41.92403733333333,-87.67641483333334,2023-06-18T14:34:01.310+0000
TA1305000032,Clinton St & Madison St,41.882242,-87.64106600000001,2023-06-18T14:34:01.310+0000
TA1306000012,Wells St & Huron St,41.89475366666667,-87.63440200000001,2023-06-18T14:34:01.310+0000
