In [None]:
import os

from pyspark.sql import SparkSession
from pyspark.sql.functions import  col, round
from pyspark.sql.functions import to_timestamp, year, month, dayofmonth

In [None]:
# Databricks creates a session automatically (evaluate `spark` to inspect it).
# Here the session is obtained explicitly: getOrCreate() returns the existing
# session if there is one, otherwise it builds a new one with this app name.
spark = (
    SparkSession.builder
    .appName("WeatherDataProcessing")
    .getOrCreate()
)

In [None]:
# --- S3 access configuration and raw data load ---------------------------------
#
# SECURITY: credentials must never be stored in the notebook. They are read from
# the environment (or inject them from a secrets manager before this cell runs).
# An access key pair was previously committed here in plain text; treat that
# pair as compromised and rotate it.
# A missing variable raises KeyError naming the variable, so a misconfigured
# environment fails here rather than later with an opaque S3 authentication error.
ACCESS_KEY = os.environ["AWS_ACCESS_KEY_ID"]
SECRET_KEY = os.environ["AWS_SECRET_ACCESS_KEY"]

# Source bucket and glob matching every CSV object at the bucket root.
BUCKET_NAME = "openweather-etl-extracted-data"
FILE_PATH = "s3a://" + BUCKET_NAME + "/*.csv"

# Timestamps are rendered/interpreted in this zone for the rest of the session.
spark.conf.set("spark.sql.session.timeZone", "Asia/Kolkata")

# S3A connector settings for this session.
spark.conf.set("fs.s3a.access.key", ACCESS_KEY)
spark.conf.set("fs.s3a.secret.key", SECRET_KEY)
spark.conf.set("fs.s3a.endpoint", "s3.ap-south-1.amazonaws.com")
# NOTE(review): `com.amazonaws.services.s3.enableV4` is normally a JVM system
# property; setting it as a Spark conf may have no effect - confirm it is needed.
spark.conf.set("com.amazonaws.services.s3.enableV4", "true")

# Read all matching CSV files using the first line of each file as the header.
# No schema is supplied here; columns are cast to their types in a later cell.
df_raw = spark.read.option("header", True).csv(FILE_PATH)

In [None]:
# Render the first 10 rows of the raw DataFrame.
display(df_raw.limit(10))

In [None]:
# Print the schema of the raw DataFrame as loaded from the CSV files
# (before any of the casts applied later in the notebook).
df_raw.printSchema()

In [None]:
# Target type for each source column, listed in output order.
# None means the column is selected unchanged.
_COLUMN_CASTS = [
    ("City", None),
    ("Country", None),
    ("Weather_main", None),
    ("Weather_subtype", None),
    ("Temperature", "double"),
    ("Feels_Like", "double"),
    ("Min_Temp", "double"),
    ("Max_Temp", "double"),
    ("Pressure", "int"),
    ("Humidity", "int"),
    ("Visibility", "int"),
    ("Wind_speed", "double"),
    ("cloudiness_percent", "int"),
    ("Rain_mm_hour", "double"),
    ("Snow_mm_hour", "double"),
    ("Time", "long"),
    ("Timezone_offset", "int"),
]

# Columns converted with to_timestamp (default format), keeping their names.
_TIMESTAMP_COLUMNS = ["Time_Recorded_local", "Sunrise_local", "Sunset_local"]

typed_columns = [
    col(name) if dtype is None else col(name).cast(dtype)
    for name, dtype in _COLUMN_CASTS
]
typed_columns += [to_timestamp(name).alias(name) for name in _TIMESTAMP_COLUMNS]

# Build the typed DataFrame from the raw one and show the resulting schema.
df = df_raw.select(*typed_columns)

df.printSchema()

In [None]:
# Print the schema of the typed DataFrame.
# NOTE(review): the cell that builds `df` already ends with this same call,
# so this cell appears to be redundant.
df.printSchema()

In [None]:
# Render the first 5 rows of the typed DataFrame.
display(df.limit(5))

In [None]:
from pyspark.sql.functions import to_timestamp, regexp_replace

# Strip a trailing UTC offset such as "+05:30" from the text form of
# Time_Recorded_local, then parse what is left with an ISO-style pattern into
# a new column.
# NOTE(review): Time_Recorded_local was already passed through to_timestamp
# when `df` was built, so the value seen here is presumably the timestamp's
# string form rather than the raw CSV text. Confirm that the regex and the
# 'T'-separated pattern still match and that this column is not all NULL.
offset_free_text = regexp_replace("Time_Recorded_local", r"\+\d{2}:\d{2}$", "")
df = df.withColumn(
    "Time_Recorded_local_clean",
    to_timestamp(offset_free_text, "yyyy-MM-dd'T'HH:mm:ss"),
)

In [None]:
from pyspark.sql.functions import expr

# Show the SQL type Spark reports for Time_Recorded_local, one value per row.
type_probe = expr("typeof(Time_Recorded_local)")
df.select(type_probe).show()

In [None]:
from pyspark.sql.functions import date_format

# Show Time_Recorded_local rendered as "yyyy-MM-dd HH:mm:ss" text.
formatted_time = date_format("Time_Recorded_local", "yyyy-MM-dd HH:mm:ss")
df.select(formatted_time).show()

In [None]:
# --- Feature derivation, summary statistics and final write --------------------

# Offset subtracted to turn a Kelvin reading into Celsius.
KELVIN_OFFSET = 273.15

# Source temperature column -> name of the Celsius column derived from it.
# (The source columns are taken to be in Kelvin, as this notebook has always
# assumed.)
CELSIUS_COLUMNS = {
    "Temperature": "Temp_C",
    "Feels_Like": "Feels_Like_C",
    "Min_Temp": "Min_Temp_C",
    "Max_Temp": "Max_Temp_C",
}

# Add the Celsius columns, rounded to 2 decimals. `round` here is
# pyspark.sql.functions.round (imported at the top of the notebook), which
# shadows Python's builtin round for the whole notebook.
for kelvin_col, celsius_col in CELSIUS_COLUMNS.items():
    df = df.withColumn(celsius_col, round(col(kelvin_col) - KELVIN_OFFSET, 2))

# Drop rows whose Snow_mm_hour is null.
# NOTE(review): if the source leaves this field empty whenever it is not
# snowing, this removes most of the data - confirm this filter is intended.
df = df.dropna(subset=["Snow_mm_hour"])

# Derive calendar parts from the recorded local timestamp.
df = df.withColumn("Year", year("Time_Recorded_local")) \
       .withColumn("Month", month("Time_Recorded_local")) \
       .withColumn("Day", dayofmonth("Time_Recorded_local"))

# Basic inspection output.
print("=== Sample Rows ===")
df.show(5, truncate=False)

print("=== Schema ===")
df.printSchema()

print("=== Weather Condition Counts ===")
df.groupBy("Weather_main").count().show()

print("=== Average Temperature by Day ===")
# Group by the full calendar date. Grouping on day-of-month alone would merge
# the same day number from different months/years into a single average.
calendar_day = ["Year", "Month", "Day"]
df.groupBy(*calendar_day).avg("Temp_C").orderBy(*calendar_day).show()

# Save the final DataFrame as Parquet. coalesce(1) reduces it to one partition
# so a single part file is written; "overwrite" replaces any existing output.
# The destination can be overridden with WEATHER_OUTPUT_PATH; the default is the
# notebook's original placeholder and must be replaced with a real bucket.
OUTPUT_PATH = os.environ.get(
    "WEATHER_OUTPUT_PATH", "s3a://your-public-bucket-name/final_output/"
)
df.coalesce(1).write.mode("overwrite").parquet(OUTPUT_PATH)