In [None]:
import os
from dotenv import load_dotenv

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when

load_dotenv()

# Connection settings are read from environment variables so that no
# credentials are stored in the notebook itself. Host and port fall back to a
# local default PostgreSQL instance; the other three have no sensible default.
host = os.getenv('PG_HOST', 'localhost')
port = os.getenv('PG_PORT', '5432')
user = os.getenv('PG_USER')
password = os.getenv('PG_PASSWORD')
dbname = os.getenv('POSTGRES_DB')

# Fail fast with a clear message: os.getenv returns None for an unset variable,
# and a None database name would otherwise be formatted into the JDBC URL as
# the literal text "None", surfacing much later as an obscure connection error.
# NOTE(review): the database variable is POSTGRES_DB while the others use the
# PG_ prefix — confirm this naming is intentional.
_missing_env = [
    name
    for name, value in (
        ('PG_USER', user),
        ('PG_PASSWORD', password),
        ('POSTGRES_DB', dbname),
    )
    if value is None
]
if _missing_env:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(_missing_env)
    )

In [None]:
# Obtain the Spark session for this notebook. The spark.jars.packages setting
# names the PostgreSQL JDBC driver by its Maven coordinates so the driver class
# referenced below is available to Spark.
spark = (
    SparkSession.builder
    .appName("Weather Study")
    .config("spark.jars.packages", "org.postgresql:postgresql:42.2.23")
    .getOrCreate()
)

# Connection properties handed to the JDBC reader: credentials plus the driver
# class to load.
properties = dict(
    user=user,
    password=password,
    driver="org.postgresql.Driver",
)

# JDBC endpoint assembled from the host, port and database name read earlier.
url = f"jdbc:postgresql://{host}:{port}/{dbname}"

In [None]:
# Select every weather column for rows whose city is named 'Canberra'; the
# join and filter are written in SQL so they are part of the query sent over
# JDBC.
query = """
    SELECT weather.* 
    FROM weather 
    JOIN city ON weather.city_id = city.id 
    WHERE city.name = 'Canberra'
"""

# The JDBC reader takes a table expression, so the query is wrapped in
# parentheses and given an alias to be used as a derived table.
weather_subquery = f"({query}) as weather_data"
df = spark.read.jdbc(url, weather_subquery, properties=properties)
df.show()

In [None]:
# Derive a Yes/No rain flag from the recorded rainfall amount.
#
# Rows with a NULL Rainfall keep a NULL RainToday instead of being labelled
# 'No': a comparison against NULL is not true, so a plain otherwise('No') would
# silently record missing measurements as dry days. A when() chain without
# otherwise() yields NULL for rows that match no condition.
# NOTE(review): the threshold is "any rainfall above 0" — confirm this matches
# the intended definition of a rainy day for this dataset.
df = df.withColumn(
    'RainToday',
    when(col('Rainfall') > 0, 'Yes').when(col('Rainfall').isNotNull(), 'No'),
)
df.show()