In [None]:
# Run this cell to import pyspark and to define start_spark() and stop_spark()

import findspark

findspark.init()

import getpass
import pyspark
import pandas
import random
import re

from IPython.display import display, HTML
from pyspark import SparkContext
from pyspark.sql import SparkSession


# Functions used below

def username():
    """Return the login name of the current user with any '@domain' suffix stripped.
    """

    login = getpass.getuser()
    domain_suffix = re.compile('@.*')

    return domain_suffix.sub('', login)


def dict_to_html(d):
    """Convert a Python dictionary into a two column HTML table for display.

    Keys and values are converted with str() and HTML-escaped, so entries that contain
    characters such as '<', '>', '&' or quotes are shown literally instead of being
    interpreted as markup.

    Args:
        d (dict): mapping whose items become the rows of the table (key left, value right)

    Returns:
        str: the table as a single HTML string (an empty table for an empty dictionary)
    """

    # Imported locally under an alias because `html` is also used as a variable name in this file
    from html import escape as html_escape

    rows = [
        f'<tr><td style="text-align:left;">{html_escape(str(k))}</td><td>{html_escape(str(v))}</td></tr>'
        for k, v in d.items()
    ]

    return ''.join([
        '<table width="100%" style="width:100%; font-family: monospace;">',
        *rows,
        '</table>',
    ])


def show_as_html(df, n=20):
    """Preview a spark dataframe as html by converting its first rows to pandas.

    Args:
        df: spark dataframe to preview
        n (int): number of rows to show (default: 20)
    """

    preview = df.limit(n)
    display(preview.toPandas())

    
def display_spark():
    """Render an HTML panel reporting whether the Spark session is active or stopped.

    The session counts as active when both the `spark` and `sc` globals exist.
    """

    # The link to the cluster-wide Spark UI is shown in both states
    spark_ui_item = '<li><a href="http://mathmadslinux2p.canterbury.ac.nz:8080/" target="_blank">Spark UI</a></li>'

    session_active = 'spark' in globals() and 'sc' in globals()

    if not session_active:

        expected_name = username() + " (jupyter)"
        parts = [
            '<p><b>Spark</b></p>',
            f'<p>The spark session is <b><span style="color:red">stopped</span></b>, confirm that <code>{expected_name}</code> is under the completed applications section in the Spark UI.</p>',
            '<ul>',
            spark_ui_item,
            '</ul>',
        ]
        display(HTML(''.join(parts)))
        return

    name = sc.getConf().get("spark.app.name")

    parts = [
        '<p><b>Spark</b></p>',
        f'<p>The spark session is <b><span style="color:green">active</span></b>, look for <code>{name}</code> under the running applications section in the Spark UI.</p>',
        '<ul>',
        spark_ui_item,
        f'<li><a href="{sc.uiWebUrl}" target="_blank">Spark Application UI</a></li>',
        '</ul>',
        '<p><b>Config</b></p>',
        dict_to_html(dict(sc.getConf().getAll())),
        '<p><b>Notes</b></p>',
        '<ul>',
        '<li>The spark session <code>spark</code> and spark context <code>sc</code> global variables have been defined by <code>start_spark()</code>.</li>',
        f'<li>Please run <code>stop_spark()</code> before closing the notebook or restarting the kernel or kill <code>{name}</code> by hand using the link in the Spark UI.</li>',
        '</ul>',
    ]
    display(HTML(''.join(parts)))


# Functions to start and stop spark

def start_spark(executor_instances=2, executor_cores=1, worker_memory=1, master_memory=1):
    """Start a new Spark session and define globals for SparkSession (spark) and SparkContext (sc).

    The application is named "<username> (jupyter)", uses a fixed number of executors
    (dynamic allocation is disabled) and sets the number of shuffle partitions to four
    times the total number of cores. The status panel is displayed once the session exists.

    Args:
        executor_instances (int): number of executors (default: 2)
        executor_cores (int): number of cores per executor (default: 1)
        worker_memory (float): memory per executor in gigabytes, may be fractional (default: 1)
        master_memory (float): driver memory in gigabytes, may be fractional (default: 1)
    """

    global spark
    global sc

    user = username()

    cores = executor_instances * executor_cores
    partitions = cores * 4
    # Pick a random UI port in 4001-4999 to reduce the chance of clashing with other notebooks
    port = 4000 + random.randint(1, 999)

    # Spark size strings take a whole number plus a unit, so a fractional value such as
    # "1.5g" is rejected. Express the requested gigabytes as whole megabytes instead
    # (1 -> "1024m", 1.5 -> "1536m"); integer inputs request exactly the same amount as before.
    executor_memory = f"{int(float(worker_memory) * 1024)}m"
    driver_memory = f"{int(float(master_memory) * 1024)}m"

    spark = (
        SparkSession.builder
        .master("spark://masternode2:7077")
        .config("spark.driver.extraJavaOptions", f"-Dderby.system.home=/tmp/{user}/spark/")
        .config("spark.dynamicAllocation.enabled", "false")
        .config("spark.executor.instances", str(executor_instances))
        .config("spark.executor.cores", str(executor_cores))
        .config("spark.cores.max", str(cores))
        .config("spark.executor.memory", executor_memory)
        .config("spark.driver.memory", driver_memory)
        .config("spark.driver.maxResultSize", "0")
        .config("spark.sql.shuffle.partitions", str(partitions))
        .config("spark.ui.port", str(port))
        .appName(user + " (jupyter)")
        .getOrCreate()
    )
    sc = SparkContext.getOrCreate()

    display_spark()

    
def stop_spark():
    """Stop the Spark session if one is defined, remove the `spark` and `sc` globals, then show the status.
    """

    global spark, sc

    # Only act when both globals created by start_spark() are present
    session_defined = {'spark', 'sc'} <= globals().keys()

    if session_defined:
        spark.stop()
        del spark, sc

    display_spark()


# Inject css overrides so that spark output is easier to read in the notebook:
# keep preformatted text and dataframe cells on one line and hide the pandas index column

html = ['<style>']
html.extend([
    'pre { white-space: pre !important; }',
    'table.dataframe td { white-space: nowrap !important; }',
    'table.dataframe thead th:first-child, table.dataframe tbody th { display: none; }',
])
html.append('</style>')
display(HTML(''.join(html)))

In [None]:
# Run this cell to start a spark session in this notebook
# Requests 4 executors with 2 cores each (8 cores in total) and passes 4 for both the
# executor and driver memory, overriding the smaller defaults of start_spark().

start_spark(executor_instances=4, executor_cores=2, worker_memory=4, master_memory=4)

In [None]:
# Write your imports here or insert cells below

from pyspark.sql import Row, DataFrame, Window, functions as F
from pyspark.sql.types import *
from pyspark.sql.functions import substring, countDistinct, collect_set, broadcast

### Q1

In [None]:
# Explicit schema for the daily csv files; ID, DATE and ELEMENT are declared non-nullable
schema = (
    StructType()
    .add(StructField("ID", StringType(), False))
    .add(StructField("DATE", StringType(), False))
    .add(StructField("ELEMENT", StringType(), False))
    .add(StructField("VALUE", FloatType(), True))
    .add(StructField("MEASUREMENT FLAG", StringType(), True))
    .add(StructField("QUALITY FLAG", StringType(), True))
    .add(StructField("SOURCE FLAG", StringType(), True))
    .add(StructField("OBSERVATION TIME", StringType(), True))
)

# Read every file under the daily directory with that schema and confirm the column types
daily = spark.read.csv("hdfs:///data/ghcnd/daily", schema=schema)
daily.printSchema()

In [None]:
# Keep only the rows whose station ID begins with the two characters "NZ"
daily_nz = daily.filter(F.col("ID").startswith("NZ"))

show_as_html(daily_nz, n=20)

In [None]:
# Keep only the NZ rows whose ELEMENT is TMIN or TMAX and whose VALUE is present.
# The column is referenced as "VALUE", exactly as declared in the schema, so the filter
# does not depend on Spark resolving column names case-insensitively.
tmin_tmax_daily = daily_nz.filter(
    F.col("ELEMENT").isin("TMIN", "TMAX") & F.col("VALUE").isNotNull()
)
show_as_html(tmin_tmax_daily)

In [None]:
# Count the non-null VALUE entries separately for each ELEMENT (TMIN and TMAX)
element_count = tmin_tmax_daily.groupBy("ELEMENT").agg(
    F.count(F.col("VALUE")).alias("observation_count")
)

element_count.show()


In [None]:
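# Recorded output (presumably from element_count.show() in the previous cell), as ELEMENT | observation_count:
#   TMIN | 235380
#   TMAX | 252380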

In [None]:
# Add an integer "year" column taken from the first four characters of the DATE string
tmin_tmax_daily_with_year = tmin_tmax_daily.withColumn("year", F.substring("DATE", 1, 4).cast("int"))

# Compute both bounds in one aggregation so that Spark runs a single job over the data
# instead of one job for the minimum and another for the maximum. An aggregation without
# groupBy always returns exactly one row, which unpacks into the two values.
min_year, max_year = tmin_tmax_daily_with_year.agg(
    F.min("year").alias("min_year"),
    F.max("year").alias("max_year"),
).first()

print(f"Years covered in the plot: {min_year} to {max_year}")
# Recorded output: 1940 to 2024

In [None]:
# Add an integer "year" column taken from the first four characters of DATE and preview the result
tmin_tmax_daily_with_year = tmin_tmax_daily.withColumn(
    "year",
    F.col("DATE").substr(1, 4).cast("int"),
)
show_as_html(tmin_tmax_daily_with_year)

In [None]:
# Export the NZ TMIN/TMAX observations to a local csv file.
# The pandas frame is built here from the spark dataframe because `tmin_tmax_pandas` is
# only created by a later cell, so referencing it at this point fails on a fresh kernel.
# VALUE is written exactly as stored in the spark dataframe (no rescaling or smoothing).
(
    tmin_tmax_daily_with_year
    .select("ID", "DATE", "ELEMENT", "VALUE", "year")
    .toPandas()
    .to_csv("data.csv", header=True, index=False)
)

In [None]:
import pandas as pd
import plotly.graph_objects as go

# Convert to Pandas DataFrame
tmin_tmax_pandas = tmin_tmax_daily_with_year.select("ID", "DATE", "ELEMENT", "VALUE", "year").toPandas()

# Convert DATE (a yyyymmdd string) to datetime format in pandas for plotting
tmin_tmax_pandas['DATE'] = pd.to_datetime(tmin_tmax_pandas['DATE'], format='%Y%m%d')

# Divide VALUE by 10 (presumably the raw values are tenths of a degree Celsius — confirm against the data documentation)
tmin_tmax_pandas['VALUE'] = tmin_tmax_pandas['VALUE'] / 10


def fill_missing_dates(df):
    """Insert a row for every calendar day missing from one station/element series.

    Args:
        df (pandas.DataFrame): rows of a single (ID, ELEMENT) series with a datetime DATE column

    Returns:
        pandas.DataFrame: one row per day between the first and last DATE of the series.
            Inserted rows have a missing VALUE but keep the ID and ELEMENT of the series
            and a year derived from DATE.
    """
    full_range = pd.date_range(start=df['DATE'].min(), end=df['DATE'].max(), freq='D')
    filled = df.set_index('DATE').reindex(full_range).rename_axis('DATE').reset_index()

    # reindex leaves every column of an inserted row empty; restore the identifying columns
    # so that the inserted rows stay attached to their station and element
    for key in ('ID', 'ELEMENT'):
        if key in filled.columns:
            filled[key] = df[key].iloc[0]
    if 'year' in filled.columns:
        filled['year'] = filled['DATE'].dt.year

    return filled


def apply_rolling_average(group):
    """Smooth VALUE with a 7 row rolling mean; expects the rows of one series in date order.

    Args:
        group (pandas.DataFrame): rows of a single series, one row per day

    Returns:
        pandas.DataFrame: copy of the input with VALUE replaced by the rolling mean
    """
    group = group.copy()
    group['VALUE'] = group['VALUE'].rolling(window=7, min_periods=1).mean()
    return group


# Fill gaps and smooth each (station, element) series on its own. Grouping the rolling
# average by ELEMENT alone would let the window run across the boundary between stations.
smoothed_series = [
    apply_rolling_average(fill_missing_dates(series))
    for _, series in tmin_tmax_pandas.groupby(['ID', 'ELEMENT'])
]
if smoothed_series:
    tmin_tmax_pandas = pd.concat(smoothed_series, ignore_index=True)


# Plotting the graphs

stations = tmin_tmax_pandas['ID'].unique()

for station in stations:
    station_data = tmin_tmax_pandas[tmin_tmax_pandas['ID'] == station]

    tmin_data = station_data[station_data['ELEMENT'] == 'TMIN']
    tmax_data = station_data[station_data['ELEMENT'] == 'TMAX']

    # Create a new graph for each station
    fig = go.Figure()

    # Add TMIN trace
    fig.add_trace(go.Scatter(
        x=tmin_data['DATE'],
        y=tmin_data['VALUE'],
        mode='lines',
        name='TMIN',
        line=dict(color='orange', width=2)
    ))

    # Add TMAX trace
    fig.add_trace(go.Scatter(
        x=tmax_data['DATE'],
        y=tmax_data['VALUE'],
        mode='lines',
        name='TMAX',
        line=dict(color='blue', width=2)
    ))

    # Update layout
    fig.update_layout(
        title=f'TMIN and TMAX Over Time for Station {station}',
        xaxis_title='Date',
        yaxis_title='Temperature (°C)',
        legend_title="Temperature Type",
        hovermode='x unified',
        template='plotly_white',
        font=dict(size=12)
    )

    fig.show()


In [None]:
# Save `fig` as a static JPEG. When the cells are run in order, `fig` is the figure created
# in the last iteration of the per-station loop above, so only that one station is saved.
# NOTE(review): static export presumably needs plotly's image export backend — confirm it is installed.
fig.write_image("fig.jpeg")

In [None]:

import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Convert to Pandas DataFrame
tmin_tmax_pandas = tmin_tmax_daily_with_year.select("ID", "DATE", "ELEMENT", "VALUE", "year").toPandas()

# Convert DATE (a yyyymmdd string) to datetime format in pandas for plotting
tmin_tmax_pandas['DATE'] = pd.to_datetime(tmin_tmax_pandas['DATE'], format='%Y%m%d')

# Divide all VALUE entries by 10 (presumably the raw values are tenths of a degree Celsius — confirm)
tmin_tmax_pandas['VALUE'] = tmin_tmax_pandas['VALUE'] / 10


def fill_missing_dates(df):
    """Insert a row for every calendar day missing from one station/element series.

    Args:
        df (pandas.DataFrame): rows of a single (ID, ELEMENT) series with a datetime DATE column

    Returns:
        pandas.DataFrame: one row per day between the first and last DATE of the series.
            Inserted rows have a missing VALUE but keep the ID and ELEMENT of the series
            and a year derived from DATE.
    """
    full_range = pd.date_range(start=df['DATE'].min(), end=df['DATE'].max(), freq='D')
    filled = df.set_index('DATE').reindex(full_range).rename_axis('DATE').reset_index()

    # reindex leaves every column of an inserted row empty; restore the identifying columns
    # so that the inserted rows stay attached to their station and element
    for key in ('ID', 'ELEMENT'):
        if key in filled.columns:
            filled[key] = df[key].iloc[0]
    if 'year' in filled.columns:
        filled['year'] = filled['DATE'].dt.year

    return filled


def apply_rolling_average(group):
    """Smooth VALUE with a 7 row rolling mean; expects the rows of one series in date order.

    Args:
        group (pandas.DataFrame): rows of a single series, one row per day

    Returns:
        pandas.DataFrame: copy of the input with VALUE replaced by the rolling mean
    """
    group = group.copy()
    group['VALUE'] = group['VALUE'].rolling(window=7, min_periods=1).mean()
    return group


# Fill gaps and smooth each (station, element) series on its own. Grouping the rolling
# average by ELEMENT alone would let the window run across the boundary between stations.
smoothed_series = [
    apply_rolling_average(fill_missing_dates(series))
    for _, series in tmin_tmax_pandas.groupby(['ID', 'ELEMENT'])
]
if smoothed_series:
    tmin_tmax_pandas = pd.concat(smoothed_series, ignore_index=True)

# Plotting
stations = tmin_tmax_pandas['ID'].unique()
num_stations = len(stations)

# Define number of rows and columns for the subplots (rows rounded up to fit every station)
num_cols = 3
num_rows = (num_stations + num_cols - 1) // num_cols

# Create subplot figure
fig = make_subplots(rows=num_rows, cols=num_cols, subplot_titles=[f'Station {station}' for station in stations])

for idx, station in enumerate(stations):
    row = idx // num_cols + 1
    col = idx % num_cols + 1

    station_data = tmin_tmax_pandas[tmin_tmax_pandas['ID'] == station]

    tmin_data = station_data[station_data['ELEMENT'] == 'TMIN']
    tmax_data = station_data[station_data['ELEMENT'] == 'TMAX']

    # Only the traces of the first station get a legend entry, otherwise the legend would
    # either be empty or repeat TMIN/TMAX once per station. The shared legend group makes
    # that single entry toggle the matching trace in every subplot.
    show_in_legend = idx == 0

    # Add TMIN trace
    fig.add_trace(go.Scatter(
        x=tmin_data['DATE'],
        y=tmin_data['VALUE'],
        mode='lines',
        name='TMIN',
        legendgroup='TMIN',
        line=dict(color='orange', width=2),
        showlegend=show_in_legend
    ), row=row, col=col)

    # Add TMAX trace
    fig.add_trace(go.Scatter(
        x=tmax_data['DATE'],
        y=tmax_data['VALUE'],
        mode='lines',
        name='TMAX',
        legendgroup='TMAX',
        line=dict(color='blue', width=2),
        showlegend=show_in_legend
    ), row=row, col=col)

# Update layout (axis titles are set per subplot below, not here, because the layout-level
# axis titles would only apply to the first subplot)
fig.update_layout(
    title='TMIN and TMAX Over Time for Each Station',
    legend_title="Temperature Type",
    hovermode='x unified',
    template='plotly_white',
    font=dict(size=12),
    height=600 + 200 * num_rows,  # Adjust height based on number of rows
    showlegend=True  # Show legend in the overall figure
)

# Show the x-axis title on the bottom row only and the y-axis title on the first column only
for i in range(num_cols):
    fig.update_xaxes(title_text='Date', row=num_rows, col=i + 1)
for i in range(num_rows):
    fig.update_yaxes(title_text='Temperature (°C)', row=i + 1, col=1)

fig.show()

In [None]:
# Save `fig` (the subplot grid when the cells are run in order) as a static JPEG.
# NOTE(review): this is the same file name used by the earlier export cell, so that file
# is overwritten — confirm this is intended.
fig.write_image("fig.jpeg")

In [None]:

import pandas as pd
import plotly.graph_objects as go

# Collect the spark dataframe into pandas, parse DATE into datetimes and divide VALUE by 10
tmin_tmax_pandas = (
    tmin_tmax_daily_with_year
    .select("DATE", "ELEMENT", "VALUE", "year")
    .toPandas()
    .assign(
        DATE=lambda frame: pd.to_datetime(frame['DATE'], format='%Y%m%d'),
        VALUE=lambda frame: frame['VALUE'] / 10,
    )
)

# Step 1: average VALUE for every DATE, with one column per ELEMENT (TMIN and TMAX).
# Each point is the mean over all rows that share that date.
pivot_data = (
    tmin_tmax_pandas
    .pivot_table(index='DATE', columns='ELEMENT', values='VALUE', aggfunc='mean')
    .reset_index()
    .rename(columns={'TMIN': 'Average_TMIN', 'TMAX': 'Average_TMAX'})
)
pivot_data.columns.name = None  # Drop the leftover "ELEMENT" label from the column index

# Step 2: build the figure with one line per averaged element
fig1 = go.Figure(data=[
    go.Scatter(
        x=pivot_data['DATE'],
        y=pivot_data['Average_TMIN'],
        mode='lines',
        name='Average TMIN',
        line=dict(color='orange'),
    ),
    go.Scatter(
        x=pivot_data['DATE'],
        y=pivot_data['Average_TMAX'],
        mode='lines',
        name='Average TMAX',
        line=dict(color='blue'),
    ),
])

# Step 3: titles, legend and a unified hover so both values are shown together
fig1.update_layout(
    title='Average TMIN and TMAX Over Time for New Zealand',
    xaxis_title='Date',
    yaxis_title='Temperature (°C)',
    legend_title="Temperature Type",
    hovermode='x unified',
)

# Display the combined plot
fig1.show()


In [None]:
# Save `fig1` (the averaged TMIN/TMAX plot when the cells are run in order) as a static JPEG
fig1.write_image("fig1.jpeg")

In [None]:
# Run this cell before closing the notebook or kill your spark application by hand using the link in the Spark UI
# stop_spark() stops the session, removes the `spark` and `sc` globals and shows the resulting status.

stop_spark()