### Spark notebook ###

This notebook will only work in a Jupyter session running on `mathmadslinux2p`.

You can start your own Jupyter session on `mathmadslinux2p` and open this notebook in Chrome on the MADS Windows server by following the steps below.

**Steps**

1. Login to the MADS Windows server using https://mathportal.canterbury.ac.nz/.
2. Download or copy this notebook to your home directory.
3. Open PowerShell and run `ssh mathmadslinux2p`.
4. Run `start_pyspark_notebook` or `/opt/anaconda3/bin/jupyter-notebook --ip 132.181.129.68 --port $((8000 + $((RANDOM % 999))))`.
5. Copy / paste the url provided in the shell window into Chrome on the MADS Windows server.
6. Open the notebook from the Jupyter root directory (which is your home directory).
7. Run `start_spark()` to start a spark session in the notebook.
8. Run `stop_spark()` before closing the notebook or kill your spark application by hand using the link in the Spark UI.

In [1]:
# Run this cell to import pyspark and to define start_spark() and stop_spark()

import findspark

findspark.init()

import getpass
import pandas
import pyspark
import random
import re

from IPython.display import display, HTML
from pyspark import SparkContext
from pyspark.sql import SparkSession


# Functions used below

def username():
    """Return the current login name with everything from the first ``@`` onwards removed.
    """

    login = getpass.getuser()
    domain_suffix = re.compile('@.*')

    return domain_suffix.sub('', login)


def dict_to_html(d):
    """Convert a Python dictionary into a two column table for display.

    Keys and values are converted to text and HTML-escaped before being placed in the table,
    so entries containing characters such as ``<``, ``>`` or ``&`` are shown literally instead
    of being interpreted as markup.

    Args:
        d (dict): mapping whose items become the table rows, in iteration order

    Returns:
        str: the table as a single HTML string (an empty table when `d` is empty)
    """

    # Imported locally under its own name because this notebook also binds the
    # module-level name `html` to a list of markup fragments.
    from html import escape

    rows = [
        f'<tr><td style="text-align:left;">{escape(str(k))}</td><td>{escape(str(v))}</td></tr>'
        for k, v in d.items()
    ]

    return ''.join([
        '<table width="100%" style="width:100%; font-family: monospace;">',
        *rows,
        '</table>',
    ])


def show_as_html(df, n=20):
    """Show the first rows of a spark dataframe using the pandas rendering in jupyter.

    Args:
        df: spark dataframe to preview
        n (int): number of rows to show (default: 20)
    """

    preview = df.limit(n)
    display(preview.toPandas())

    
def display_spark():
    """Render an HTML status panel describing the Spark session.

    If both the `spark` and `sc` globals exist the panel reports an active session, links to
    the Spark UI and the application UI, and lists the context configuration. Otherwise it
    reports a stopped session and links to the Spark UI only.
    """

    spark_ui_item = '<li><a href="http://mathmadslinux2p.canterbury.ac.nz:8080/" target="_blank">Spark UI</a></li>'

    # Stopped state: neither global is available, so only the expected application name is shown
    if not ('spark' in globals() and 'sc' in globals()):
        expected_name = username() + " (jupyter)"
        stopped_markup = ''.join([
            '<p><b>Spark</b></p>',
            f'<p>The spark session is <b><span style="color:red">stopped</span></b>, confirm that <code>{expected_name}</code> is under the completed applications section in the Spark UI.</p>',
            '<ul>',
            spark_ui_item,
            '</ul>',
        ])
        display(HTML(stopped_markup))
        return

    # Active state: gather everything from the running context before building the markup
    name = sc.getConf().get("spark.app.name")
    application_ui_item = f'<li><a href="{sc.uiWebUrl}" target="_blank">Spark Application UI</a></li>'
    config_table = dict_to_html(dict(sc.getConf().getAll()))

    active_markup = ''.join([
        '<p><b>Spark</b></p>',
        f'<p>The spark session is <b><span style="color:green">active</span></b>, look for <code>{name}</code> under the running applications section in the Spark UI.</p>',
        '<ul>',
        spark_ui_item,
        application_ui_item,
        '</ul>',
        '<p><b>Config</b></p>',
        config_table,
        '<p><b>Notes</b></p>',
        '<ul>',
        '<li>The spark session <code>spark</code> and spark context <code>sc</code> global variables have been defined by <code>start_spark()</code>.</li>',
        f'<li>Please run <code>stop_spark()</code> before closing the notebook or restarting the kernel or kill <code>{name}</code> by hand using the link in the Spark UI.</li>',
        '</ul>',
    ])
    display(HTML(active_markup))


# Functions to start and stop spark

def _spark_memory_size(gigabytes):
    """Format a memory amount given in gigabytes as a Spark size string.

    Whole amounts keep the "<n>g" form. Fractional amounts are converted to whole megabytes
    ("<n>m"), because a string such as "1.5g" is not a valid Spark memory size.
    """

    amount = float(gigabytes)

    if amount.is_integer():
        return f"{int(amount)}g"

    return f"{round(amount * 1024)}m"


def start_spark(executor_instances=2, executor_cores=1, worker_memory=1, master_memory=1):
    """Start a new Spark session and define globals for SparkSession (spark) and SparkContext (sc).

    The application is named "<username> (jupyter)" and the status panel is displayed once
    the session has been created.

    Args:
        executor_instances (int): number of executors (default: 2)
        executor_cores (int): number of cores per executor (default: 1)
        worker_memory (float): executor memory in gigabytes, fractions allowed (default: 1)
        master_memory (float): driver memory in gigabytes, fractions allowed (default: 1)
    """

    global spark
    global sc

    user = username()

    # Total cores requested, with four shuffle partitions per core
    cores = executor_instances * executor_cores
    partitions = cores * 4

    # Random application UI port in 4001-4999 (presumably to avoid clashes between users on the shared host)
    port = 4000 + random.randint(1, 999)

    spark = (
        SparkSession.builder
        .master("spark://masternode2:7077")
        .config("spark.driver.extraJavaOptions", f"-Dderby.system.home=/tmp/{user}/spark/")
        .config("spark.dynamicAllocation.enabled", "false")
        .config("spark.executor.instances", str(executor_instances))
        .config("spark.executor.cores", str(executor_cores))
        .config("spark.cores.max", str(cores))
        .config("spark.executor.memory", _spark_memory_size(worker_memory))
        .config("spark.driver.memory", _spark_memory_size(master_memory))
        .config("spark.driver.maxResultSize", "0")
        .config("spark.sql.shuffle.partitions", str(partitions))
        .config("spark.ui.port", str(port))
        .appName(user + " (jupyter)")
        .getOrCreate()
    )
    sc = SparkContext.getOrCreate()

    display_spark()

    
def stop_spark():
    """Stop the Spark session if one is active, remove the `spark` and `sc` globals, and show the status.
    """

    global spark, sc

    session_is_active = 'spark' in globals() and 'sc' in globals()

    if session_is_active:
        spark.stop()
        del spark, sc

    # Always refresh the status panel, whether or not anything was stopped
    display_spark()


# Make css changes to improve spark output readability
#
# The rules below are joined into one <style> element and displayed in the notebook:
#   - pre: keep preformatted text on a single line instead of wrapping it
#   - table.dataframe td: keep the content of each table cell on a single line
#   - table.dataframe thead th:first-child / tbody th: hide the first header cell and the
#     row header cells (presumably the pandas index column - verify against rendered output)

html = [
    '<style>',
    'pre { white-space: pre !important; }',
    'table.dataframe td { white-space: nowrap !important; }',
    'table.dataframe thead th:first-child, table.dataframe tbody th { display: none; }',
    '</style>',
]
display(HTML(''.join(html)))

### Example notebook ###

The code below provides a template for how you would use a notebook to start spark, run some code, and then stop spark.

**Steps**

- Run `start_spark()` to start a spark session in the notebook (only change the default resources when advised to do so for an exercise or assignment).
- Write and run code interactively, creating additional cells as needed.
- Run `stop_spark()` before closing the notebook or kill your spark application by hand using the link in the [Spark UI](http://mathmadslinux2p.canterbury.ac.nz:8080/).

In [2]:
# Run this cell to start a spark session in this notebook (the default resources, spelled out explicitly)

start_spark(
    executor_instances=2,
    executor_cores=1,
    worker_memory=1,
    master_memory=1,
)

0,1
spark.dynamicAllocation.enabled,false
spark.driver.port,33569
spark.master,spark://masternode2:7077
spark.app.id,app-20240914192126-0945
spark.executor.id,driver
spark.sql.warehouse.dir,file:/users/home/dca129/Assignment1/spark-warehouse
spark.driver.memory,1g
spark.driver.host,mathmadslinux2p.canterbury.ac.nz
spark.ui.port,4145
spark.driver.extraJavaOptions,-Dderby.system.home=/tmp/dca129/spark/


In [4]:
# Write your imports here or insert cells below

from pyspark.sql import functions as F
from pyspark.sql.types import *

In [None]:
#Q1.a

In [5]:
# List the daily files stored in HDFS; `head` keeps only the first 10 lines of the listing
!hdfs dfs -ls /data/ghcnd/daily | head

Found 263 items
-rw-r--r--   8 jsw93 jsw93    1897634 2024-08-07 01:43 /data/ghcnd/daily/1750.csv.gz
-rw-r--r--   8 jsw93 jsw93       3358 2024-08-07 01:44 /data/ghcnd/daily/1763.csv.gz
-rw-r--r--   8 jsw93 jsw93       3327 2024-08-07 01:42 /data/ghcnd/daily/1764.csv.gz
-rw-r--r--   8 jsw93 jsw93       3335 2024-08-07 01:41 /data/ghcnd/daily/1765.csv.gz
-rw-r--r--   8 jsw93 jsw93       3344 2024-08-07 01:38 /data/ghcnd/daily/1766.csv.gz
-rw-r--r--   8 jsw93 jsw93       3356 2024-08-07 01:43 /data/ghcnd/daily/1767.csv.gz
-rw-r--r--   8 jsw93 jsw93       3325 2024-08-07 01:41 /data/ghcnd/daily/1768.csv.gz
-rw-r--r--   8 jsw93 jsw93       3418 2024-08-07 01:41 /data/ghcnd/daily/1769.csv.gz
-rw-r--r--   8 jsw93 jsw93       3357 2024-08-07 01:43 /data/ghcnd/daily/1770.csv.gz


In [6]:
# Same listing with human-readable sizes (-h); `tail` keeps only the last 10 lines
!hdfs dfs -ls -h /data/ghcnd/daily | tail

-rw-r--r--   8 jsw93 jsw93    156.8 M 2024-08-07 01:40 /data/ghcnd/daily/2015.csv.gz
-rw-r--r--   8 jsw93 jsw93    158.1 M 2024-08-07 01:42 /data/ghcnd/daily/2016.csv.gz
-rw-r--r--   8 jsw93 jsw93    157.7 M 2024-08-07 01:43 /data/ghcnd/daily/2017.csv.gz
-rw-r--r--   8 jsw93 jsw93    157.8 M 2024-08-07 01:41 /data/ghcnd/daily/2018.csv.gz
-rw-r--r--   8 jsw93 jsw93    156.7 M 2024-08-07 01:40 /data/ghcnd/daily/2019.csv.gz
-rw-r--r--   8 jsw93 jsw93    157.6 M 2024-08-07 01:43 /data/ghcnd/daily/2020.csv.gz
-rw-r--r--   8 jsw93 jsw93    160.3 M 2024-08-07 01:38 /data/ghcnd/daily/2021.csv.gz
-rw-r--r--   8 jsw93 jsw93    161.1 M 2024-08-07 01:42 /data/ghcnd/daily/2022.csv.gz
-rw-r--r--   8 jsw93 jsw93    160.6 M 2024-08-07 01:41 /data/ghcnd/daily/2023.csv.gz
-rw-r--r--   8 jsw93 jsw93     84.7 M 2024-08-07 01:42 /data/ghcnd/daily/2024.csv.gz


In [7]:
# Ask HDFS for its configured block size (dfs.blocksize), reported in bytes
!hdfs getconf -confKey "dfs.blocksize"

134217728


In [9]:
print(128*1024**2) # 128 MiB expressed in bytes (MiB -> KiB -> B), for comparison with dfs.blocksize

134217728


In [10]:
# Report the file and the individual blocks (-files -blocks) that make up the 2024 daily file
!hdfs fsck /data/ghcnd/daily/2024.csv.gz -files -blocks

Connecting to namenode via http://masternode2:9870/fsck?ugi=dca129&files=1&blocks=1&path=%2Fdata%2Fghcnd%2Fdaily%2F2024.csv.gz
FSCK started by dca129 (auth:SIMPLE) from /192.168.40.11 for path /data/ghcnd/daily/2024.csv.gz at Sat Sep 14 19:23:30 NZST 2024

/data/ghcnd/daily/2024.csv.gz 88831735 bytes, replicated: replication=8, 1 block(s):  OK
0. BP-700027894-132.181.129.68-1626517177804:blk_1074220563_479763 len=88831735 Live_repl=8


Status: HEALTHY
 Number of data-nodes:	32
 Number of racks:		1
 Total dirs:			0
 Total symlinks:		0

Replicated Blocks:
 Total size:	88831735 B
 Total files:	1
 Total blocks (validated):	1 (avg. block size 88831735 B)
 Minimally replicated blocks:	1 (100.0 %)
 Over-replicated blocks:	0 (0.0 %)
 Under-replicated blocks:	0 (0.0 %)
 Mis-replicated blocks:		0 (0.0 %)
 Default replication factor:	4
 Average block replication:	8.0
 Missing blocks:		0
 Corrupt blocks:		0
 Missing replicas:		0 (0.0 %)
 Blocks queued for replication:	0


In [6]:
# Report the file and the individual blocks (-files -blocks) that make up the 2023 daily file
!hdfs fsck /data/ghcnd/daily/2023.csv.gz -files -blocks

Connecting to namenode via http://masternode2:9870/fsck?ugi=ywa244&files=1&blocks=1&path=%2Fdata%2Fghcnd%2Fdaily%2F2023.csv.gz
FSCK started by ywa244 (auth:SIMPLE) from /192.168.40.11 for path /data/ghcnd/daily/2023.csv.gz at Mon Sep 09 12:07:53 NZST 2024

/data/ghcnd/daily/2023.csv.gz 168357302 bytes, replicated: replication=8, 2 block(s):  OK
0. BP-700027894-132.181.129.68-1626517177804:blk_1074220535_479735 len=134217728 Live_repl=8
1. BP-700027894-132.181.129.68-1626517177804:blk_1074220536_479736 len=34139574 Live_repl=8


Status: HEALTHY
 Number of data-nodes:	32
 Number of racks:		1
 Total dirs:			0
 Total symlinks:		0

Replicated Blocks:
 Total size:	168357302 B
 Total files:	1
 Total blocks (validated):	2 (avg. block size 84178651 B)
 Minimally replicated blocks:	2 (100.0 %)
 Over-replicated blocks:	0 (0.0 %)
 Under-replicated blocks:	0 (0.0 %)
 Mis-replicated blocks:		0 (0.0 %)
 Default replication factor:	4
 Average block replication:	8.0
 Missing blo

In [11]:
#Q1.b

In [12]:
# Load the 2023 daily file, then report its partition count and row count (one value per line)
daily_2023 = spark.read.csv(
    "hdfs:///data/ghcnd/daily/2023.csv.gz"
)

print(daily_2023.rdd.getNumPartitions(), daily_2023.count(), sep="\n")

1
37867272


In [13]:
# Load the 2024 daily file, then report its partition count and row count (one value per line)
daily_2024 = spark.read.csv(
    "hdfs:///data/ghcnd/daily/2024.csv.gz"
)

print(daily_2024.rdd.getNumPartitions(), daily_2024.count(), sep="\n")

1
19720790


In [14]:
#Q1.c

In [15]:
# Load the ten yearly files for 2014-2023 with a single glob, then count every row across them
data = spark.read.csv(
    "hdfs:///data/ghcnd/daily/"
    "{2014,2015,2016,2017,2018,2019,2020,2021,2022,2023}*.csv.gz"
)

total_observations = data.count()
print(total_observations)

370803270


In [16]:
# Number of partitions used for the 2014-2023 selection (10 in the recorded run, matching the ten years listed in the glob)
print(data.rdd.getNumPartitions())

10


In [17]:
#Q1.d

In [18]:
# Read every file in the daily directory and report how many partitions the result has.
# NOTE(review): the recorded run shows 105 partitions for the 263 listed items, so several
# files appear to share a partition - presumably the small early-year files are packed together.
daily_total = spark.read.csv("hdfs:///data/ghcnd/daily")
print(daily_total.rdd.getNumPartitions())

105


In [19]:
# Run this cell before closing the notebook or kill your spark application by hand using the link in the Spark UI
# (stop_spark() also deletes the `spark` and `sc` globals, so run start_spark() again before reusing them)

stop_spark()