### Ingest trips.csv file

In [0]:
%run "../_src/config"

In [0]:
%run "../_src/bronze_functions"

##### Step 1 - Read the CSV file using the spark dataframe reader

In [0]:
# Column names handed to read_files for the trips CSV
# (presumably applied positionally - confirm in ../_src/bronze_functions).
col_names = [
    "trip_id",
    "rideable_type",
    "start_at",
    "ended_at",
    "start_station_id",
    "end_station_id",
    "rider_id",
]

In [0]:
from pyspark.sql.types import StructType, StructField, IntegerType, VarcharType, StringType, FloatType, TimestampType

In [0]:
# Explicit schema for the trips CSV. Only trip_id is declared non-nullable.
# Station ids are strings: the sample output contains both numeric-looking
# values (525) and alphanumeric ones (KA1503000012).
trip_schema = (
    StructType()
    .add("trip_id", StringType(), False)
    .add("rideable_type", StringType(), True)
    .add("start_at", TimestampType(), True)
    .add("ended_at", TimestampType(), True)
    .add("start_station_id", StringType(), True)
    .add("end_station_id", StringType(), True)
    .add("rider_id", IntegerType(), True)
)

In [0]:
# Load the raw trips CSV through the shared read_files helper, which is defined
# outside this notebook (presumably in ../_src/bronze_functions - confirm).
df = read_files(
    "landingzone",
    "trips",
    "csv",
    col_names,
    trip_schema,
)

reading file from: /mnt/bikesharedlake/landingzone/trips.csv
file successfully read from:/mnt/bikesharedlake/landingzone/trips.csv


##### Step 2 - Add ingestion date to the dataframe

In [0]:
# Ask the shared helper to add a timestamp column; the preview below shows the
# result carrying an extra ingestion_date column.
df = df_add_columns(
    df,
    add_timestamp=True,
)

In [0]:
# Preview the ingested rows, including the ingestion_date column.
display(df)

trip_id,rideable_type,start_at,ended_at,start_station_id,end_station_id,rider_id,ingestion_date
0FEFDE2603568365,classic_bike,2021-02-14T17:52:38.000+0000,2021-02-14T18:12:09.000+0000,525,16806,47854,2023-06-18T15:13:02.743+0000
E6159D746B2DBB91,electric_bike,2021-02-09T19:10:18.000+0000,2021-02-09T19:19:10.000+0000,KA1503000012,TA1305000029,70870,2023-06-18T15:13:02.743+0000
B32D3199F1C2E75B,classic_bike,2021-02-02T17:49:41.000+0000,2021-02-02T17:54:06.000+0000,637,TA1305000034,58974,2023-06-18T15:13:02.743+0000
83E463F23575F4BF,electric_bike,2021-02-23T15:07:23.000+0000,2021-02-23T15:22:37.000+0000,13216,TA1309000055,39608,2023-06-18T15:13:02.743+0000
BDAA7E3494E8D545,electric_bike,2021-02-24T15:43:33.000+0000,2021-02-24T15:49:05.000+0000,18003,KP1705001026,36267,2023-06-18T15:13:02.743+0000
A772742351171257,classic_bike,2021-02-01T17:47:42.000+0000,2021-02-01T17:48:33.000+0000,KP1705001026,KP1705001026,50104,2023-06-18T15:13:02.743+0000
295476889D9B79F8,classic_bike,2021-02-11T18:33:53.000+0000,2021-02-11T18:35:09.000+0000,18003,18003,19618,2023-06-18T15:13:02.743+0000
362087194BA4CC9A,classic_bike,2021-02-27T15:13:39.000+0000,2021-02-27T15:36:36.000+0000,KP1705001026,KP1705001026,16732,2023-06-18T15:13:02.743+0000
21630F715038CCB0,classic_bike,2021-02-20T08:59:42.000+0000,2021-02-20T09:17:04.000+0000,KP1705001026,KP1705001026,57068,2023-06-18T15:13:02.743+0000
A977EB7FE7F5CD3A,classic_bike,2021-02-20T08:58:16.000+0000,2021-02-20T08:58:41.000+0000,KP1705001026,KP1705001026,32712,2023-06-18T15:13:02.743+0000


In [0]:
# Persist the raw trips to the bronze layer via the shared helper
# (it logs the target path; the storage format is presumably Parquet - confirm).
write_parquet_table(
    df,
    "bronze",
    "trip",
)

writing bronze file: /mnt/bikesharedlake/bronze/trip
Table successfully written: /mnt/bikesharedlake/bronze/trip


##### Step 3 - Read the bronze parquet data from the datalake

In [0]:
# Read the bronze trip data back as a DataFrame. Parquet files carry their own
# schema, so no reader options are needed; the previous "header" option is a
# CSV setting and had no effect on a Parquet read.
# bronze_folder_path is not defined in this notebook - presumably it comes from
# ../_src/config (confirm).
trip_df = spark.read.parquet(f"{bronze_folder_path}/trip")

##### Step 3 - Add and convert the columns required

In [0]:
from pyspark.sql.functions import to_date, to_timestamp, concat, col, lit, substring

In [0]:
# Derive the calendar date on which each trip started. start_at is declared as a
# timestamp in trip_schema, so convert it directly rather than casting it to a
# string, slicing the first 10 characters and parsing that back into a date
# (the old substring call also passed position 0, while Spark positions are 1-based).
trip_added_df = trip_df.withColumn("trip_date", to_date(col("start_at")))


##### Step 4 - Select only the columns required & rename as required

In [0]:
# Project the columns that go into the silver table, in their final order.
# No columns are renamed here; trip_date is placed right after rideable_type.
trip_selected_df = trip_added_df.select(
    "trip_id",
    "rideable_type",
    "trip_date",
    "start_at",
    "ended_at",
    "start_station_id",
    "end_station_id",
    "rider_id",
    "ingestion_date",
)

In [0]:
# Save the selected trips as the Delta table silver.trip. Overwrite mode plus
# overwriteSchema means each run replaces both the data and the table schema.
(
    trip_selected_df.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable("silver.trip")
)

In [0]:
%sql
-- Spot-check the rows just written to silver.trip.
SELECT * FROM silver.trip;

trip_id,rideable_type,trip_date,start_at,ended_at,start_station_id,end_station_id,rider_id,ingestion_date
0FEFDE2603568365,classic_bike,2021-02-14,2021-02-14T17:52:38.000+0000,2021-02-14T18:12:09.000+0000,525,16806,47854,2023-06-18T15:13:03.372+0000
E6159D746B2DBB91,electric_bike,2021-02-09,2021-02-09T19:10:18.000+0000,2021-02-09T19:19:10.000+0000,KA1503000012,TA1305000029,70870,2023-06-18T15:13:03.372+0000
B32D3199F1C2E75B,classic_bike,2021-02-02,2021-02-02T17:49:41.000+0000,2021-02-02T17:54:06.000+0000,637,TA1305000034,58974,2023-06-18T15:13:03.372+0000
83E463F23575F4BF,electric_bike,2021-02-23,2021-02-23T15:07:23.000+0000,2021-02-23T15:22:37.000+0000,13216,TA1309000055,39608,2023-06-18T15:13:03.372+0000
BDAA7E3494E8D545,electric_bike,2021-02-24,2021-02-24T15:43:33.000+0000,2021-02-24T15:49:05.000+0000,18003,KP1705001026,36267,2023-06-18T15:13:03.372+0000
A772742351171257,classic_bike,2021-02-01,2021-02-01T17:47:42.000+0000,2021-02-01T17:48:33.000+0000,KP1705001026,KP1705001026,50104,2023-06-18T15:13:03.372+0000
295476889D9B79F8,classic_bike,2021-02-11,2021-02-11T18:33:53.000+0000,2021-02-11T18:35:09.000+0000,18003,18003,19618,2023-06-18T15:13:03.372+0000
362087194BA4CC9A,classic_bike,2021-02-27,2021-02-27T15:13:39.000+0000,2021-02-27T15:36:36.000+0000,KP1705001026,KP1705001026,16732,2023-06-18T15:13:03.372+0000
21630F715038CCB0,classic_bike,2021-02-20,2021-02-20T08:59:42.000+0000,2021-02-20T09:17:04.000+0000,KP1705001026,KP1705001026,57068,2023-06-18T15:13:03.372+0000
A977EB7FE7F5CD3A,classic_bike,2021-02-20,2021-02-20T08:58:16.000+0000,2021-02-20T08:58:41.000+0000,KP1705001026,KP1705001026,32712,2023-06-18T15:13:03.372+0000
