### Spark notebook ###

This notebook will only work in a Jupyter session running on `mathmadslinux2p`.

You can start your own Jupyter session on `mathmadslinux2p` and open this notebook in Chrome on the MADS Windows server by

**Steps**

1. Login to the MADS Windows server using https://mathportal.canterbury.ac.nz/.
2. Download or copy this notebook to your home directory.
3. Open powershell and run `ssh mathmadslinux2p`.
4. Run `start_pyspark_notebook` or `/opt/anaconda3/bin/jupyter-notebook --ip 132.181.129.68 --port $((8000 + $((RANDOM % 999))))`.
5. Copy / paste the url provided in the shell window into Chrome on the MADS Windows server.
6. Open the notebook from the Jupyter root directory (which is your home directory).
7. Run `start_spark()` to start a spark session in the notebook.
8. Run `stop_spark()` before closing the notebook or kill your spark application by hand using the link in the Spark UI.

In [7]:
# Run this cell to import pyspark and to define start_spark() and stop_spark()

import findspark

findspark.init()

import getpass
import pandas
import pyspark
import random
import re

from IPython.display import display, HTML
from pyspark import SparkContext
from pyspark.sql import SparkSession


# Functions used below

def username():
    """Get username with any domain information removed.
    """

    return re.sub('@.*', '', getpass.getuser())


def dict_to_html(d):
    """Convert a Python dictionary into a two column table for display.
    """

    html = []

    html.append(f'<table width="100%" style="width:100%; font-family: monospace;">')
    for k, v in d.items():
        html.append(f'<tr><td style="text-align:left;">{k}</td><td>{v}</td></tr>')
    html.append(f'</table>')

    return ''.join(html)


def show_as_html(df, n=20):
    """Leverage existing pandas jupyter integration to show a spark dataframe as html.
    
    Args:
        n (int): number of rows to show (default: 20)
    """

    display(df.limit(n).toPandas())

    
def display_spark():
    """Display the status of the active Spark session if one is currently running.
    """
    
    if 'spark' in globals() and 'sc' in globals():

        name = sc.getConf().get("spark.app.name")
        
        html = [
            f'<p><b>Spark</b></p>',
            f'<p>The spark session is <b><span style="color:green">active</span></b>, look for <code>{name}</code> under the running applications section in the Spark UI.</p>',
            f'<ul>',
            f'<li><a href="http://mathmadslinux2p.canterbury.ac.nz:8080/" target="_blank">Spark UI</a></li>',
            f'<li><a href="{sc.uiWebUrl}" target="_blank">Spark Application UI</a></li>',
            f'</ul>',
            f'<p><b>Config</b></p>',
            dict_to_html(dict(sc.getConf().getAll())),
            f'<p><b>Notes</b></p>',
            f'<ul>',
            f'<li>The spark session <code>spark</code> and spark context <code>sc</code> global variables have been defined by <code>start_spark()</code>.</li>',
            f'<li>Please run <code>stop_spark()</code> before closing the notebook or restarting the kernel or kill <code>{name}</code> by hand using the link in the Spark UI.</li>',
            f'</ul>',
        ]
        display(HTML(''.join(html)))
        
    else:
        
        html = [
            f'<p><b>Spark</b></p>',
            f'<p>The spark session is <b><span style="color:red">stopped</span></b>, confirm that <code>{username() + " (jupyter)"}</code> is under the completed applications section in the Spark UI.</p>',
            f'<ul>',
            f'<li><a href="http://mathmadslinux2p.canterbury.ac.nz:8080/" target="_blank">Spark UI</a></li>',
            f'</ul>',
        ]
        display(HTML(''.join(html)))


# Functions to start and stop spark

def start_spark(executor_instances=2, executor_cores=1, worker_memory=1, master_memory=1):
    """Start a new Spark session and define globals for SparkSession (spark) and SparkContext (sc).
    
    Args:
        executor_instances (int): number of executors (default: 2)
        executor_cores (int): number of cores per executor (default: 1)
        worker_memory (float): worker memory (default: 1)
        master_memory (float): master memory (default: 1)
    """

    global spark
    global sc

    user = username()
    
    cores = executor_instances * executor_cores
    partitions = cores * 4
    port = 4000 + random.randint(1, 999)

    spark = (
        SparkSession.builder
        .master("spark://masternode2:7077")
        .config("spark.driver.extraJavaOptions", f"-Dderby.system.home=/tmp/{user}/spark/")
        .config("spark.dynamicAllocation.enabled", "false")
        .config("spark.executor.instances", str(executor_instances))
        .config("spark.executor.cores", str(executor_cores))
        .config("spark.cores.max", str(cores))
        .config("spark.executor.memory", f"{worker_memory}g")
        .config("spark.driver.memory", f"{master_memory}g")
        .config("spark.driver.maxResultSize", "0")
        .config("spark.sql.shuffle.partitions", str(partitions))
        .config("spark.ui.port", str(port))
        .appName(user + " (jupyter)")
        .getOrCreate()
    )
    sc = SparkContext.getOrCreate()
    
    display_spark()

    
def stop_spark():
    """Stop the active Spark session and delete globals for SparkSession (spark) and SparkContext (sc).
    """

    global spark
    global sc

    if 'spark' in globals() and 'sc' in globals():

        spark.stop()

        del spark
        del sc

    display_spark()


# Make css changes to improve spark output readability

html = [
    '<style>',
    'pre { white-space: pre !important; }',
    'table.dataframe td { white-space: nowrap !important; }',
    'table.dataframe thead th:first-child, table.dataframe tbody th { display: none; }',
    '</style>',
]
display(HTML(''.join(html)))

### Example notebook ###

The code below provides a template for how you would use a notebook to start spark, run some code, and then stop spark.

**Steps**

- Run `start_spark()` to start a spark session in the notebook (only change the default resources when advised to do so for an exercise or assignment)
- Write and run code interactively, creating additional cells as needed.
- Run `stop_spark()` before closing the notebook or kill your spark application by hand using the link in the [Spark UI](http://mathmadslinux2p.canterbury.ac.nz:8080/).

In [8]:
# Run this cell to start a spark session in this notebook

start_spark(executor_instances=4, executor_cores=2, worker_memory=4, master_memory=4)

0,1
spark.dynamicAllocation.enabled,false
spark.executor.instances,4
spark.ui.port,4464
spark.driver.port,44935
spark.driver.memory,4g
spark.app.id,app-20240428125345-0193
spark.executor.memory,4g
spark.sql.warehouse.dir,file:/users/home/dcp31/assignment_1/spark-warehouse
spark.master,spark://masternode2:7077
spark.app.name,dcp31 (jupyter)


In [9]:
# Write your imports here or insert cells below

from pyspark.sql import functions as F
from pyspark.sql.types import *

## Read in saved dataframes

In [10]:
stations = spark.read.option("header", "true").csv("hdfs:///user/dcp31/assignment_1/stations1")
countries = spark.read.option("header", "true").csv("hdfs:///user/dcp31/assignment_1/countries1")
states = spark.read.option("header", "true").csv("hdfs:///user/dcp31/assignment_1/states1")
inventory = spark.read.option("header", "true").csv("hdfs:///user/dcp31/assignment_1/inventory1")
daily = spark.read.option("header", "true").csv("hdfs:///user/dcp31/assignment_1/daily1")


stations.cache()
countries.cache()
states.cache()
inventory.cache()
daily.cache()

show_as_html(stations, 5)
show_as_html(countries, 5)
show_as_html(states, 5)
show_as_html(inventory, 5)
show_as_html(daily, 5)

Unnamed: 0,station_id,state_code,country_code,latitude,longitude,elevation,name,gsn_flag,hcn_crn_flag,wmo_id,country_name,state_name,first_year_active,last_year_active,no_unique_elements,unique_elements,core_element_count,other_element_count,PRCP_only
0,AE000041196,,AE,25.333,55.517,34.0,SHARJAH INTER. AIRP,GSN,,41196.0,United Arab Emirates,,1944,2024,4,"TMAX,TMIN,PRCP,TAVG",3,1,
1,AEM00041218,,AE,24.262,55.609,264.9,AL AIN INTL,,,41218.0,United Arab Emirates,,1994,2024,4,"TMAX,TMIN,PRCP,TAVG",3,1,
2,AGE00147715,,AG,35.42,8.1197,863.0,TEBESSA,,,,Algeria,,1879,1938,3,"TMAX,TMIN,PRCP",3,0,
3,AGE00147794,,AG,36.78,5.1,225.0,BEJAIA-CAP CARBON,,,,Algeria,,1926,1938,2,"TMAX,TMIN",2,0,
4,AGM00060402,,AG,36.712,5.07,6.1,SOUMMAM,,,60402.0,Algeria,,1973,2024,5,"TMAX,TMIN,����,PRCP,SNWD",4,1,


Unnamed: 0,country_name,country_code,station_count
0,Antigua and Barbuda,AC,2
1,United Arab Emirates,AE,4
2,Afghanistan,AF,4
3,Algeria,AG,82
4,Azerbaijan,AJ,66


Unnamed: 0,state_name,state_code,station_count
0,ALBERTA,AB,1445
1,ALASKA,AK,1040
2,ALABAMA,AL,1101
3,ARKANSAS,AR,937
4,AMERICAN SAMOA,AS,21


Unnamed: 0,station_id,latitude,longitude,element,first_year,last_year
0,ACW00011604,17.1167,-61.7833,TMAX,1949,1949
1,ACW00011604,17.1167,-61.7833,TMIN,1949,1949
2,ACW00011604,17.1167,-61.7833,PRCP,1949,1949
3,ACW00011604,17.1167,-61.7833,SNOW,1949,1949
4,ACW00011604,17.1167,-61.7833,SNWD,1949,1949


Unnamed: 0,station_id,date,element,value,m_flag,q_flag,s_flag,time
0,AE000041196,20230101,TMAX,252,,,S,
1,AE000041196,20230101,TMIN,149,,,S,
2,AE000041196,20230101,PRCP,0,D,,S,
3,AE000041196,20230101,TAVG,207,H,,S,
4,AEM00041194,20230101,TMAX,255,,,S,


## Analysis Question 1
### Q1 (a)

In [5]:
# determine number of stations overall, and per memberships

print(f"Total:", stations.count())

distinct_stations = stations.select("station_id").distinct()
print(f"Total distinct:", distinct_stations.count()) # double checking       

stations_2024 = stations.filter(F.col("last_year_active") == "2024")
print(f"2024:", stations_2024.count())

stations_GSN = stations.filter(F.col("gsn_flag") == "GSN")
print(f"GSN:", stations_GSN.count())

stations_HCN = stations.filter(F.col("hcn_crn_flag") == "HCN")
print(f"HCN:", stations_GSN.count())

stations_HCN = stations.filter(F.col("hcn_crn_flag") == "CRN")
print(f"CRN:", stations_GSN.count())


Total: 125983
Total distinct: 125983
2024: 31837
GSN: 991
HCN: 991
CRN: 991


In [6]:
# determine number of stations having two memberships

GSN_HCN = stations.filter((F.col("gsn_flag") == "GSN") & (F.col("hcn_crn_flag") == "HCN")).count()
print(f"GSN & HCN:", GSN_HCN)

GSN_CRN = stations.filter((F.col("gsn_flag") == "GSN") & (F.col("hcn_crn_flag") == "CRN")).count()
print(f"GSN & CRN:", GSN_CRN)

# a station cannot be in both HCN and CRN, it's one or the other or neither

GSN & HCN: 15
GSN & CRN: 0


### Q1 (b)

In [7]:
# determine number of stations per country 
stations_per_country = (
    stations.groupBy("country_name")
    .agg(F.countDistinct(F.col("station_id")).alias("station_count"))
    .orderBy(F.asc("station_count"))
)

show_as_html(stations_per_country)

Unnamed: 0,country_name,station_count
0,Mayotte [France],1
1,Liberia,1
2,Maldives,1
3,Suriname,1
4,Western Sahara,1
5,Niue [New Zealand],1
6,Saint Pierre and Miquelon [France],1
7,Belgium,1
8,Qatar,1
9,Pitcairn Islands [United Kingdom],1


In [8]:
# how many countries only have one station?
single_station_countries = stations_per_country.filter(F.col("station_count") == '1')
print(single_station_countries.count())
show_as_html(single_station_countries, 41)

41


Unnamed: 0,country_name,station_count
0,Cayman Islands [United Kingdom],1
1,Seychelles,1
2,Norfolk Island [Australia],1
3,"Gambia, The",1
4,Rwanda,1
5,Bahrain,1
6,Iraq,1
7,Belgium,1
8,Niue [New Zealand],1
9,Suriname,1


In [9]:
# add stations oer country to stations table
countries = countries.join(stations_per_country, "country_name", "left")
show_as_html(countries, 10)

Unnamed: 0,country_name,country_code,station_count,station_count.1
0,Antigua and Barbuda,AC,2,2
1,United Arab Emirates,AE,4,4
2,Afghanistan,AF,4,4
3,Algeria,AG,82,82
4,Azerbaijan,AJ,66,66
5,Albania,AL,3,3
6,Armenia,AM,53,53
7,Angola,AO,6,6
8,American Samoa [United States],AQ,21,21
9,Argentina,AR,101,101


In [10]:
# stations per state

stations_per_state = (
    stations.groupBy("state_name")
    .agg(F.countDistinct(F.col("station_id")).alias("station_count"))
    .orderBy(F.asc("station_count"))
)

show_as_html(stations_per_state)

Unnamed: 0,state_name,station_count
0,U.S. MINOR OUTLYING ISLANDS,11
1,NORTHERN MARIANA ISLANDS,11
2,PALAU,12
3,DISTRICT OF COLUMBIA,18
4,AMERICAN SAMOA,21
5,MARSHALL ISLANDS,21
6,GUAM,29
7,MICRONESIA,38
8,VIRGIN ISLANDS,71
9,PRINCE EDWARD ISLAND,94


In [11]:
# join stations per state to stations table
states = states.join(stations_per_state, "state_name", "left")

show_as_html(states, 74)

Unnamed: 0,state_name,state_code,station_count,station_count.1
0,ALBERTA,AB,1445,1445.0
1,ALASKA,AK,1040,1040.0
2,ALABAMA,AL,1101,1101.0
3,ARKANSAS,AR,937,937.0
4,AMERICAN SAMOA,AS,21,21.0
5,ARIZONA,AZ,1655,1655.0
6,BRITISH COLUMBIA,BC,1713,1713.0
7,CALIFORNIA,CA,3080,3080.0
8,COLORADO,CO,4640,4640.0
9,CONNECTICUT,CT,417,417.0


In [None]:
# save to outputs

'''
data_path1 = f"hdfs:///user/dcp31/assignment_1/countries1"
data_path2 = f"hdfs:///user/dcp31/assignment_1/states1"


(
    countries.write
    .option("compression", "gzip")
    .option("header", "true")
    .mode("overwrite")
    .csv(data_path1)
)

(
    states.write
    .option("compression", "gzip")
    .option("header", "true")
    .mode("overwrite")
    .csv(data_path2)
)

'''

### Q1 (c)

In [13]:
# determine number of stations in southern hemisphere
southern_hemisphere = stations.filter(F.col("latitude") < 0)
northern_hemisphere = stations.filter(F.col("latitude") > 0)
#show_as_html(southern_hemisphere, 10)
print(southern_hemisphere.count())
print(northern_hemisphere.count()) # wanted to check that this added up to total number of stations given small SH result.

25316
100587


In [14]:
# how many territories of the US?

# filter country names that have the territory identifier
# because there is one } rather than ] I used contains and then filtered out US itself
US_territories = stations.filter(F.col("country_name").contains("United States") & (F.col("country_name") != "United States"))

# number of stations per territory
US_territories = US_territories.groupBy("country_name").agg(F.count("country_name").alias("count"))

show_as_html(US_territories)

US_territories.agg(F.sum("count")).show()

Unnamed: 0,country_name,count
0,Northern Mariana Islands [United States],11
1,Virgin Islands [United States],71
2,Palmyra Atoll [United States],3
3,Midway Islands [United States},3
4,Guam [United States],29
5,American Samoa [United States],21
6,Puerto Rico [United States],243
7,Wake Island [United States],1
8,Johnston Atoll [United States],4


+----------+
|sum(count)|
+----------+
|       386|
+----------+



In [15]:
#from help session 1 - this misses the } therefore is 8 rather than 9

results = (
    countries
    .where(F.col("country_name").contains("[United States]"))
)

results.count()



8

## Analysis Question 2
### Q2 (a)

In [23]:
# create function to caluclate distance between two stations
# based on code from https://gist.github.com/pavlov99/bd265be244f8a84e291e96c5656ceb5c

import math

def distance_calculator(latitude_1, longitude_1, latitude_2, longitude_2):
    '''Calculate the distance (in kms using Haversine) between two sets of latitude/longitude coordinates

    Args:
        latitude_1 (float): latitude of country 1
        longitude_1 (float): longitude of country 1
        latitude_2 (float): latitude of country 2
        longitude_2 (float): longitude of country 2
    
    Returns:
        distance (float): distance between coordinates in km
    '''
    
    earth_radius = 6371.0
    
    lat1, lng1, lat2, lng2 = map(float, [latitude_1, longitude_1, latitude_2, longitude_2])
        
    lat1, lng1, lat2, lng2 = map(math.radians, [lat1, lng1, lat2, lng2])

    lat_diff = lat2 - lat1
    lng_diff = lng2 - lng1
    
    d = math.sin(lat_diff * 0.5) ** 2 + math.cos(lat1) * math.cos(lat2) * math.sin(lng_diff * 0.5) ** 2
    distance = 2 * earth_radius * math.asin(math.sqrt(d))
    
    return round(distance, 2)

distance_udf = F.udf(distance_calculator, DoubleType())

In [21]:
# create two small dfs to test function
stations_1 = (
    stations.select(F.col("station_id").alias("station_id_1"),
                    F.col("latitude").alias("latitude_1"),
                    F.col("longitude").alias("longitude_1"),
                    F.col("country_name").alias("country_name_1"))
    .limit(4)
)

show_as_html(stations_1)

stations_2 = (
    stations.select(F.col("station_id").alias("station_id_2"),
                    F.col("latitude").alias("latitude_2"),
                    F.col("longitude").alias("longitude_2"), 
                    F.col("country_name").alias("country_name_2"))
    .limit(4)
)

show_as_html(stations_2)

cross_joined_df = stations_1.crossJoin(stations_2)
show_as_html(cross_joined_df)
# I don't understand why it shows Afghanistan int he joined table?? 
# Works as expected in next cell though? But some of the values different??

Unnamed: 0,station_id_1,latitude_1,longitude_1,country_name_1
0,AE000041196,25.333,55.517,United Arab Emirates
1,AEM00041218,24.262,55.609,United Arab Emirates
2,AGE00147715,35.42,8.1197,Algeria
3,AGE00147794,36.78,5.1,Algeria


Unnamed: 0,station_id_2,latitude_2,longitude_2,country_name_2
0,AE000041196,25.333,55.517,United Arab Emirates
1,AEM00041218,24.262,55.609,United Arab Emirates
2,AGE00147715,35.42,8.1197,Algeria
3,AGE00147794,36.78,5.1,Algeria


Unnamed: 0,station_id_1,latitude_1,longitude_1,country_name_1,station_id_2,latitude_2,longitude_2,country_name_2
0,AEM00041194,25.255,55.364,United Arab Emirates,AF000040930,35.317,69.017,Afghanistan
1,AEM00041194,25.255,55.364,United Arab Emirates,AG000060680,22.8,5.4331,Algeria
2,AEM00041194,25.255,55.364,United Arab Emirates,AGE00147704,36.97,7.79,Algeria
3,AEM00041194,25.255,55.364,United Arab Emirates,AGE00147712,36.17,1.34,Algeria
4,AGE00147705,36.78,3.07,Algeria,AF000040930,35.317,69.017,Afghanistan
5,AGE00147705,36.78,3.07,Algeria,AG000060680,22.8,5.4331,Algeria
6,AGE00147705,36.78,3.07,Algeria,AGE00147704,36.97,7.79,Algeria
7,AGE00147705,36.78,3.07,Algeria,AGE00147712,36.17,1.34,Algeria
8,AGM00060369,36.767,3.1,Algeria,AF000040930,35.317,69.017,Afghanistan
9,AGM00060369,36.767,3.1,Algeria,AG000060680,22.8,5.4331,Algeria


In [24]:
distance_test = cross_joined_df.select(
    "station_id_1",
    "country_name_1",
    "latitude_1",
    "longitude_1",
    "station_id_2",
    "country_name_2",
    "latitude_2",
    "longitude_2",
    distance_udf("latitude_1", "longitude_1", "latitude_2", "longitude_2").alias("Distance (km)")
)

distance_test.show()

+------------+--------------------+----------+-----------+------------+--------------------+----------+-----------+-------------+
|station_id_1|      country_name_1|latitude_1|longitude_1|station_id_2|      country_name_2|latitude_2|longitude_2|Distance (km)|
+------------+--------------------+----------+-----------+------------+--------------------+----------+-----------+-------------+
| AEM00041194|United Arab Emirates|    25.255|     55.364| AE000041196|United Arab Emirates|    25.333|     55.517|        17.66|
| AEM00041194|United Arab Emirates|    25.255|     55.364| AEM00041218|United Arab Emirates|    24.262|     55.609|       113.15|
| AEM00041194|United Arab Emirates|    25.255|     55.364| AGE00147715|             Algeria|     35.42|     8.1197|      4627.39|
| AEM00041194|United Arab Emirates|    25.255|     55.364| AGE00147794|             Algeria|     36.78|        5.1|      4900.12|
| AGE00147705|             Algeria|     36.78|       3.07| AE000041196|United Arab Emirate

### Q3 (b)

In [19]:
# number of NZ stations
NZ = stations.filter(F.col("country_name") == "New Zealand")
NZ.count()

15

In [20]:
# create two identical tables of NZ stations but with different var names for cross join
NZ_stations_1 = (
    stations.filter(F.col("country_name") == "New Zealand")
    .select(F.col("station_id").alias("station_id_1"), 
            F.col("name").alias("name_1"), 
            F.col("latitude").alias("latitude_1"), 
            F.col("longitude").alias("longitude_1"))
)

show_as_html(NZ_stations_1)

NZ_stations_2 = (
    stations.filter(F.col("country_name") == "New Zealand")
    .select(F.col("station_id").alias("station_id_2"), 
            F.col("name").alias("name_2"), 
            F.col("latitude").alias("latitude_2"), 
            F.col("longitude").alias("longitude_2"))
)

show_as_html(NZ_stations_2)

Unnamed: 0,station_id_1,name_1,latitude_1,longitude_1
0,NZ000093417,PARAPARAUMU AWS,-40.9,174.983
1,NZM00093781,CHRISTCHURCH INTL,-43.489,172.532
2,NZ000939450,CAMPBELL ISLAND AWS,-52.55,169.167
3,NZM00093929,ENDERBY ISLAND AWS,-50.483,166.3
4,NZ000933090,NEW PLYMOUTH AWS,-39.017,174.183
5,NZ000093844,INVERCARGILL AIRPOR,-46.417,168.333
6,NZ000093994,RAOUL ISL/KERMADEC,-29.25,-177.917
7,NZ000937470,TARA HILLS,-44.517,169.9
8,NZ000939870,CHATHAM ISLANDS AWS,-43.95,-176.567
9,NZ000093292,GISBORNE AERODROME,-38.65,177.983


Unnamed: 0,station_id_2,name_2,latitude_2,longitude_2
0,NZ000093417,PARAPARAUMU AWS,-40.9,174.983
1,NZM00093781,CHRISTCHURCH INTL,-43.489,172.532
2,NZ000939450,CAMPBELL ISLAND AWS,-52.55,169.167
3,NZM00093929,ENDERBY ISLAND AWS,-50.483,166.3
4,NZ000933090,NEW PLYMOUTH AWS,-39.017,174.183
5,NZ000093844,INVERCARGILL AIRPOR,-46.417,168.333
6,NZ000093994,RAOUL ISL/KERMADEC,-29.25,-177.917
7,NZ000937470,TARA HILLS,-44.517,169.9
8,NZ000939870,CHATHAM ISLANDS AWS,-43.95,-176.567
9,NZ000093292,GISBORNE AERODROME,-38.65,177.983


In [21]:
# cross join the two NZ tables created above, while creating a var that collects the station 1 and station 2 ids to remove 
# duplicated rows (rows where station 1 is station x and station 2 is station y, that matches rows where station 1 is station y
# and station 2 is station x). Then drops that new var. Based on code found at: 
# https://stackoverflow.com/questions/72426994/pyspark-cross-join-excluding-symmetric-results

NZ_stations_CJ = (
    NZ_stations_1.crossJoin(NZ_stations_2)
    .withColumn('filter', 
                F.array_sort(F.array('station_id_1','station_id_2')))
    .dropDuplicates(['filter'])
    .drop('filter')
    .orderBy(F.asc("name_1"))
)

show_as_html(NZ_stations_CJ)
NZ_stations_CJ.count()

Unnamed: 0,station_id_1,name_1,latitude_1,longitude_1,station_id_2,name_2,latitude_2,longitude_2
0,NZM00093110,AUCKLAND AERO AWS,-37.0,174.8,NZ000093417,PARAPARAUMU AWS,-40.9,174.983
1,NZM00093110,AUCKLAND AERO AWS,-37.0,174.8,NZM00093678,KAIKOURA,-42.417,173.7
2,NZM00093110,AUCKLAND AERO AWS,-37.0,174.8,NZ000936150,HOKITIKA AERODROME,-42.717,170.983
3,NZM00093110,AUCKLAND AERO AWS,-37.0,174.8,NZ000093994,RAOUL ISL/KERMADEC,-29.25,-177.917
4,NZM00093110,AUCKLAND AERO AWS,-37.0,174.8,NZM00093781,CHRISTCHURCH INTL,-43.489,172.532
5,NZM00093110,AUCKLAND AERO AWS,-37.0,174.8,NZM00093439,WELLINGTON AERO AWS,-41.333,174.8
6,NZM00093110,AUCKLAND AERO AWS,-37.0,174.8,NZM00093929,ENDERBY ISLAND AWS,-50.483,166.3
7,NZM00093110,AUCKLAND AERO AWS,-37.0,174.8,NZ000093844,INVERCARGILL AIRPOR,-46.417,168.333
8,NZM00093110,AUCKLAND AERO AWS,-37.0,174.8,NZ000093292,GISBORNE AERODROME,-38.65,177.983
9,NZM00093110,AUCKLAND AERO AWS,-37.0,174.8,NZM00093110,AUCKLAND AERO AWS,-37.0,174.8


120

In [22]:
# to remove rows where the station is paired with itself 
NZ_stations_CJ = NZ_stations_CJ.filter(NZ_stations_1.station_id_1 != NZ_stations_2.station_id_2)

NZ_stations_CJ.show()
NZ_stations_CJ.count()

+------------+-------------------+----------+-----------+------------+-------------------+----------+-----------+
|station_id_1|             name_1|latitude_1|longitude_1|station_id_2|             name_2|latitude_2|longitude_2|
+------------+-------------------+----------+-----------+------------+-------------------+----------+-----------+
| NZM00093110|  AUCKLAND AERO AWS|     -37.0|      174.8| NZ000936150| HOKITIKA AERODROME|   -42.717|    170.983|
| NZM00093110|  AUCKLAND AERO AWS|     -37.0|      174.8| NZM00093929| ENDERBY ISLAND AWS|   -50.483|      166.3|
| NZM00093110|  AUCKLAND AERO AWS|     -37.0|      174.8| NZ000939450|CAMPBELL ISLAND AWS|    -52.55|    169.167|
| NZM00093110|  AUCKLAND AERO AWS|     -37.0|      174.8| NZM00093781|  CHRISTCHURCH INTL|   -43.489|    172.532|
| NZM00093110|  AUCKLAND AERO AWS|     -37.0|      174.8| NZ000093844|INVERCARGILL AIRPOR|   -46.417|    168.333|
| NZM00093110|  AUCKLAND AERO AWS|     -37.0|      174.8| NZ000093994| RAOUL ISL/KERMADE

105

In [23]:
# calculate the distances between the NZ stations
NZ_distances = NZ_stations_CJ.select(
    "station_id_1",
    "name_1",
    "latitude_1",
    "longitude_1",
    "station_id_2",
    "name_2",
    "latitude_2",
    "longitude_2",
    distance_udf("latitude_1", "longitude_1", "latitude_2", "longitude_2").alias("Distance (km)")
)

show_as_html(NZ_distances)

Unnamed: 0,station_id_1,name_1,latitude_1,longitude_1,station_id_2,name_2,latitude_2,longitude_2,Distance (km)
0,NZM00093110,AUCKLAND AERO AWS,-37.0,174.8,NZ000093417,PARAPARAUMU AWS,-40.9,174.983,433.95
1,NZM00093110,AUCKLAND AERO AWS,-37.0,174.8,NZM00093781,CHRISTCHURCH INTL,-43.489,172.532,746.69
2,NZM00093110,AUCKLAND AERO AWS,-37.0,174.8,NZM00093929,ENDERBY ISLAND AWS,-50.483,166.3,1644.84
3,NZM00093110,AUCKLAND AERO AWS,-37.0,174.8,NZ000093994,RAOUL ISL/KERMADEC,-29.25,-177.917,1095.82
4,NZM00093110,AUCKLAND AERO AWS,-37.0,174.8,NZM00093678,KAIKOURA,-42.417,173.7,609.63
5,NZM00093110,AUCKLAND AERO AWS,-37.0,174.8,NZ000936150,HOKITIKA AERODROME,-42.717,170.983,714.13
6,NZM00093110,AUCKLAND AERO AWS,-37.0,174.8,NZ000093844,INVERCARGILL AIRPOR,-46.417,168.333,1175.72
7,NZM00093110,AUCKLAND AERO AWS,-37.0,174.8,NZ000093292,GISBORNE AERODROME,-38.65,177.983,334.36
8,NZ000939450,CAMPBELL ISLAND AWS,-52.55,169.167,NZM00093110,AUCKLAND AERO AWS,-37.0,174.8,1783.96
9,NZ000939450,CAMPBELL ISLAND AWS,-52.55,169.167,NZ000939870,CHATHAM ISLANDS AWS,-43.95,-176.567,1420.22


In [24]:
# show in desc order, swapped to asc also
sorted_NZ = NZ_distances.orderBy(F.desc("Distance (km)"))

In [25]:
show_as_html(sorted_NZ)

Unnamed: 0,station_id_1,name_1,latitude_1,longitude_1,station_id_2,name_2,latitude_2,longitude_2,Distance (km)
0,NZ000939450,CAMPBELL ISLAND AWS,-52.55,169.167,NZ000093994,RAOUL ISL/KERMADEC,-29.25,-177.917,2799.18
1,NZ000093994,RAOUL ISL/KERMADEC,-29.25,-177.917,NZM00093929,ENDERBY ISLAND AWS,-50.483,166.3,2705.42
2,NZ000093844,INVERCARGILL AIRPOR,-46.417,168.333,NZ000093994,RAOUL ISL/KERMADEC,-29.25,-177.917,2251.34
3,NZ000093994,RAOUL ISL/KERMADEC,-29.25,-177.917,NZ000937470,TARA HILLS,-44.517,169.9,2008.89
4,NZ000939450,CAMPBELL ISLAND AWS,-52.55,169.167,NZ000093012,KAITAIA,-35.1,173.267,1967.22
5,NZ000093012,KAITAIA,-35.1,173.267,NZM00093929,ENDERBY ISLAND AWS,-50.483,166.3,1800.52
6,NZ000093994,RAOUL ISL/KERMADEC,-29.25,-177.917,NZM00093781,CHRISTCHURCH INTL,-43.489,172.532,1796.56
7,NZ000936150,HOKITIKA AERODROME,-42.717,170.983,NZ000093994,RAOUL ISL/KERMADEC,-29.25,-177.917,1796.36
8,NZ000939450,CAMPBELL ISLAND AWS,-52.55,169.167,NZM00093110,AUCKLAND AERO AWS,-37.0,174.8,1783.96
9,NZ000939450,CAMPBELL ISLAND AWS,-52.55,169.167,NZ000093292,GISBORNE AERODROME,-38.65,177.983,1687.99


In [25]:
# Run this cell before closing the notebook or kill your spark application by hand using the link in the Spark UI

stop_spark()