In [1]:
import pyspark.sql.functions as f
from pyspark.sql import SparkSession
# Start (or reuse) a Spark session running locally with the 'local[*]' master
spark = (
    SparkSession.builder
    .master('local[*]')
    .getOrCreate()
)

In [2]:
# Load the postcode CSV, keep the (Postcode, Region) pairs and collect them
# to the driver as a plain dict keyed by Postcode.
postcodes_map = (
    spark.read.csv('data/uk-postcode.csv', header=True)
    .select('Postcode', 'Region')
    .rdd
    .collectAsMap()
)

# Wrap the dict in a broadcast variable so the UDF can read it via `.value`
postcodes = spark.sparkContext.broadcast(postcodes_map)

In [3]:
# Define a function to look up the region for a postcode in the broadcast variable
@f.udf
def postcode_region(postcode):
  """Return the region for a postcode, or 'Unknown' when it cannot be resolved.

  The lookup key is the part of the postcode before the first space
  (after trimming surrounding whitespace), looked up in the broadcast
  `postcodes` dict.

  Args:
    postcode: The postcode string from the input row, or None when the
      cell is null.

  Returns:
    The mapped region, or 'Unknown' if the postcode is null or its prefix
    is not present in the lookup.
  """
  # Spark passes null cells to Python UDFs as None; calling .strip() on None
  # would raise AttributeError and fail the whole job, so bail out early.
  if postcode is None:
    return 'Unknown'
  prefix = postcode.strip().split(' ')[0]
  # Single dict lookup with a default instead of `in` followed by `[]`.
  return postcodes.value.get(prefix, 'Unknown')

In [4]:
# Count makerspaces per region: map each postcode to its region with the UDF,
# group on the result and list the regions with the most makerspaces first.
makerspaces = spark.read.csv('data/uk-makerspaces-identifiable-data.csv', header=True)
(
    makerspaces
    .select('Postcode')
    .withColumn('Region', postcode_region(f.col('Postcode')))
    .groupBy('Region')
    .count()
    .orderBy(f.desc('count'))
    .show()
)

+--------------------+-----+
|              Region|count|
+--------------------+-----+
|          Manchester|    3|
|             Glasgow|    3|
|       Tower Hamlets|    3|
|             Cardiff|    3|
|           Liverpool|    2|
|           Southwark|    2|
|             Bristol|    2|
|             Belfast|    2|
|           Lancaster|    2|
|            Aberdeen|    2|
|             Lambeth|    2|
|   Brighton and Hove|    2|
|              Camden|    2|
|           Sheffield|    2|
|              Oxford|    2|
|               Leeds|    2|
|    Scottish Borders|    1|
|Cheshire West and...|    1|
|           Cambridge|    1|
|          Wandsworth|    1|
+--------------------+-----+
only showing top 20 rows

