In [80]:
# Run a shell command to ensure pyspark is downloaded in the Python environment.
!python -m pip install pyspark

from pyspark.sql import SparkSession
from pyspark.sql.functions import input_file_name
from pyspark.sql.functions import regexp_replace, split




[notice] A new release of pip is available: 23.0.1 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [81]:
# Create (or reuse, if one already exists) the SparkSession that serves as the
# entry point for every DataFrame operation in this notebook.
spark = (
    SparkSession.builder
    .appName("Weather Insights")
    .getOrCreate()
)

In [None]:
# Load every CSV in the data directory in one read by using a wildcard path.
# Some of the analysis needs to know which source file a row came from, so the
# originating file is recorded in a "filename" column as the frame is created.
# NOTE(review): this is an absolute path on one machine; consider making it
# relative or configurable before sharing the notebook.
filepath = "C:/Users/benci/College/Class/6th Year/2024 Fall (CS)/INTRO TO CLOUD COMPUTING/Projects/P4/rashmi-p4/data/*.csv"
dataframe = (
    spark.read.option("header", "true")
    .csv(filepath)
    .withColumn("filename", input_file_name())
)

# input_file_name() yields the full location of the file, which is noisy in
# results. Keep only the final path segment (the actual file name) by deleting
# everything up to and including the last "/". The greedy ".*" makes the match
# extend to the last slash, so this works for any directory depth, unlike a
# fixed positional index into split(...), which only fits one exact path layout
# (and Spark's [] item access on arrays does not support negative positions).
dataframe = dataframe.withColumn(
    "filename", regexp_replace(dataframe["filename"], r"^.*/", "")
)

In [83]:
# Sanity-check the load: print the inferred column layout, then preview the
# first five rows.
dataframe.printSchema()
dataframe.show(n=5)

root
 |-- STATION: string (nullable = true)
 |-- DATE: string (nullable = true)
 |-- LATITUDE: string (nullable = true)
 |-- LONGITUDE: string (nullable = true)
 |-- ELEVATION: string (nullable = true)
 |-- NAME: string (nullable = true)
 |-- TEMP: string (nullable = true)
 |-- TEMP_ATTRIBUTES: string (nullable = true)
 |-- DEWP: string (nullable = true)
 |-- DEWP_ATTRIBUTES: string (nullable = true)
 |-- SLP: string (nullable = true)
 |-- SLP_ATTRIBUTES: string (nullable = true)
 |-- STP: string (nullable = true)
 |-- STP_ATTRIBUTES: string (nullable = true)
 |-- VISIB: string (nullable = true)
 |-- VISIB_ATTRIBUTES: string (nullable = true)
 |-- WDSP: string (nullable = true)
 |-- WDSP_ATTRIBUTES: string (nullable = true)
 |-- MXSPD: string (nullable = true)
 |-- GUST: string (nullable = true)
 |-- MAX: string (nullable = true)
 |-- MAX_ATTRIBUTES: string (nullable = true)
 |-- MIN: string (nullable = true)
 |-- MIN_ATTRIBUTES: string (nullable = true)
 |-- PRCP: string (nullable = t

In [84]:
# Count how many rows each source file contributed, and print the file names
# in full (no column truncation).
file_counts = (
    dataframe
    .groupBy("filename")
    .count()
)
file_counts.show(truncate=False)

+--------------------+-----+
|filename            |count|
+--------------------+-----+
|2020_72429793812.csv|366  |
|2016_72429793812.csv|366  |
|2017_72429793812.csv|365  |
|2015_72429793812.csv|365  |
|2018_72429793812.csv|365  |
|2019_72429793812.csv|365  |
|2022_72429793812.csv|365  |
|2021_72429793812.csv|365  |
|2020_99495199999.csv|365  |
|2023_72429793812.csv|365  |
|2018_99495199999.csv|363  |
|2015_99495199999.csv|355  |
|2019_99495199999.csv|345  |
|2024_72429793812.csv|301  |
|2023_99495199999.csv|276  |
|2017_99495199999.csv|283  |
|2022_99495199999.csv|259  |
|2024_99495199999.csv|133  |
|2021_99495199999.csv|104  |
+--------------------+-----+

