In [3]:
from pyspark.sql.functions import year, month

In [None]:
import os, findspark
from pyspark.sql import SparkSession

# ——— Environment setup ———
# Point this process at the local JDK and Spark installation (machine-specific
# Windows paths), then initialise findspark.
os.environ["JAVA_HOME"]   = r"C:\Program Files\Java\jdk-17"
os.environ["SPARK_HOME"]  = r"C:\tools\spark-3.5.5-bin-hadoop3"

# Prepend Spark's bin directory only once, so re-running this cell does not
# keep growing PATH with duplicate entries.
spark_bin = os.environ["SPARK_HOME"] + r"\bin"
if spark_bin not in os.environ["PATH"].split(";"):
    os.environ["PATH"] = spark_bin + ";" + os.environ["PATH"]
findspark.init()

# ——— Secrets ———
# Credentials are read from the environment instead of being committed in the
# notebook:
#   AZURE_STORAGE_KEY  - access key of the storage account (required)
#   KIVA_PG_USER       - Postgres user     (optional, defaults to "root")
#   KIVA_PG_PASSWORD   - Postgres password (optional, defaults to "root")
storage_account = "kivastorageacc2"
storage_key = os.environ.get("AZURE_STORAGE_KEY")
if not storage_key:
    # Fail here, before the JVM is started, rather than much later when the
    # write to Azure is attempted with an invalid key.
    raise RuntimeError(
        "Environment variable AZURE_STORAGE_KEY is not set; it must hold the "
        f"access key for storage account '{storage_account}'."
    )
pg_user = os.environ.get("KIVA_PG_USER", "root")
pg_password = os.environ.get("KIVA_PG_PASSWORD", "root")

# ——— JDBC jar path ———
jdbc_jar = r"C:\tools\postgresql-42.7.3.jar"

# ——— Start SparkSession with jar ———
# Local-mode session with the Postgres JDBC driver on the classpath and the
# Azure storage connector packages resolved via spark.jars.packages.
spark = (
    SparkSession.builder
        .appName("KivaAzureWrite")
        .master("local[*]")
        .config("spark.jars", jdbc_jar)
        .config("spark.jars.packages", 
                "org.apache.hadoop:hadoop-azure:3.3.4,"
                "com.microsoft.azure:azure-storage:8.6.6,"
                "com.azure:azure-storage-file-datalake:12.19.1,"
                "com.azure:azure-identity:1.10.1")
        .config("spark.hadoop.fs.abfss.impl", "org.apache.hadoop.fs.azurebfs.SecureAzureBlobFileSystem")
        .getOrCreate()
)

# Register the account key used to authenticate abfss:// access to the storage account.
spark.conf.set(f"fs.azure.account.key.{storage_account}.dfs.core.windows.net", storage_key)


# ——— Connect to Postgres in Docker via localhost ———
jdbc_url = "jdbc:postgresql://localhost:5432/KivaCrowdfunding"

# Read the whole public.kiva_table table through the JDBC data source.
df = (
    spark.read
         .format("jdbc")
         .option("url",      jdbc_url)
         .option("dbtable",  "public.kiva_table")
         .option("user",     pg_user)
         .option("password", pg_password)
         .option("driver",   "org.postgresql.Driver")
         .load()
)

# Show the first row as a quick check that the read works.
df.head()



Row(index=63528, id=716200, funded_amount=900.0, loan_amount=900.0, activity='Motorcycle Transport', sector='Transportation', use='to purchase a motorcycle.', country_code='SV', country='El Salvador', region='Osicala', currency='USD', partner_id=199.0, posted_time=datetime.datetime(2014, 5, 26, 20, 29, 44), disbursed_time=datetime.datetime(2014, 5, 22, 10, 0), funded_time=datetime.datetime(2014, 6, 23, 21, 1, 20), term_in_months=20.0, lender_count=31, tags='volunteer_like', borrower_genders='male', repayment_interval='monthly', date=datetime.datetime(2014, 5, 26, 0, 0))

In [None]:

# Derive `year` and `month` columns from the posted_time column.
posted_year = year("posted_time")
posted_month = month("posted_time")
df_with_date = (
    df
    .withColumn("year", posted_year)
    .withColumn("month", posted_month)
)

# Preview the source timestamp next to the two derived columns.
date_preview = df_with_date.select("posted_time", "year", "month")
date_preview.show(5)

+-------------------+----+-----+
|        posted_time|year|month|
+-------------------+----+-----+
|2014-05-26 20:29:44|2014|    5|
|2014-05-26 15:16:27|2014|    5|
|2014-05-26 12:48:49|2014|    5|
|2014-05-26 16:29:00|2014|    5|
|2014-05-26 12:54:03|2014|    5|
+-------------------+----+-----+
only showing top 5 rows



In [None]:
# Write the DataFrame to Azure Data Lake Gen2 as Parquet files, partitioned by
# year and month; "overwrite" mode replaces whatever already exists at the target path.
bronze_path = "abfss://kiva-bronze@kivastorageacc2.dfs.core.windows.net/kiva-data"
partitioned_writer = df_with_date.write.partitionBy("year", "month").mode("overwrite")
partitioned_writer.parquet(bronze_path)



In [None]:
# Stop the SparkSession created in the setup cell now that the write has finished.
spark.stop()