In [0]:
# %run "../config/Set-up Access to Azure Data Lake"

In [0]:
# Show the storage mounts visible to this workspace. The cells below read from
# and write to paths under /mnt/devkivaloansdl, so that mount must be present.
display(dbutils.fs.mounts())

In [0]:
# Load the raw Kiva loans CSV from the bronze layer.
#   header / inferSchema : column names come from the first row, types are inferred.
#   multiLine, quote, escape : quoted fields may span lines; a doubled quote is a literal quote.
#   mode=PERMISSIVE : malformed records do not abort the read.
df = (
    spark.read
    .format("csv")
    .options(
        header=True,
        inferSchema=True,
        multiLine=True,
        quote='"',
        escape='"',
        mode="PERMISSIVE",
        delimiter=",",
    )
    .load("/mnt/devkivaloansdl/bronze/kiva_loans.csv")
)

display(df)

In [0]:
from pyspark.sql.functions import col

# Keep only the columns needed downstream. `id` is exposed as `loan_id`, and
# `partner_id` / `term_in_months` are cast to int.
df_kiva_loans_selected_fields = df.select(
    'date',
    col('id').alias('loan_id'),
    'funded_amount',
    'loan_amount',
    'sector',
    'country_code',
    'country',
    'region',
    'currency',
    col('partner_id').cast('int'),
    col('term_in_months').cast('int'),
    'repayment_interval',
)

display(df_kiva_loans_selected_fields)

In [0]:
from pyspark.sql.functions import col, to_date, current_date, current_timestamp, from_utc_timestamp, date_format

# Add an audit column, rename `date`, and discard rows without a loan identifier.
#
# processed_timestamp: the current timestamp shifted with from_utc_timestamp to
#   Australia/Melbourne and rendered by date_format as a "yyyy-MM-dd HH:mm:ss" string.
#   NOTE(review): the shift is only correct if the session time zone is UTC - confirm.
# transaction_date: new name for the source `date` column.
# filter: the identifier was aliased from `id` to `loan_id` by the select that built
#   df_kiva_loans_selected_fields, so filter on `loan_id` - the name that actually
#   exists in this frame's schema - rather than on the no-longer-projected `id`.
df_add_processed_date = (
    df_kiva_loans_selected_fields
    .withColumn(
        "processed_timestamp",
        date_format(
            from_utc_timestamp(current_timestamp(), "Australia/Melbourne"),
            "yyyy-MM-dd HH:mm:ss",
        ),
    )
    .withColumnRenamed("date", "transaction_date")
    .filter(col("loan_id").isNotNull())
)

display(df_add_processed_date)

In [0]:
# NOTE(review): `date` was already renamed to `transaction_date` when
# df_add_processed_date was built, so this drop finds no such column and is
# effectively a no-op; the frame keeps `transaction_date`. Confirm whether the
# intent was to drop `transaction_date` or whether this step can be removed.
df_drop_date_column = df_add_processed_date.drop("date")

In [0]:
# Create a MANAGED DELTA table from the dataframe, replacing silver.kiva_loans if it already exists
df_drop_date_column.write.saveAsTable("silver.kiva_loans", format="delta", mode="overwrite")

In [0]:
# Write the dataframe as CSV to the mounted silver container, overwriting whatever is at that path
df_drop_date_column.write.csv("/mnt/devkivaloansdl/silver/kiva_loans", mode="overwrite")

In [0]:
# Remove the CSV output written by the previous cell. The target is a directory,
# hence recurse=True. The same path is reused by the parquet write below.
dbutils.fs.rm("/mnt/devkivaloansdl/silver/kiva_loans", recurse=True)


In [0]:
# Write the dataframe as parquet to the mounted silver container, overwriting whatever is at that path
df_drop_date_column.write.parquet("/mnt/devkivaloansdl/silver/kiva_loans", mode="overwrite")

In [0]:
# Read back the parquet output written to the silver container.
df_kiva_loans_parquet = spark.read.format("parquet").load("/mnt/devkivaloansdl/silver/kiva_loans")

In [0]:
# Read the silver parquet output and display it to verify the write.
df_parquet_kiva_loans = (
    spark.read
    .format("parquet")
    .load("/mnt/devkivaloansdl/silver/kiva_loans")
)

display(df_parquet_kiva_loans)