In [0]:
# ETL Pipeline: Joining Geographic Data (Countries, Regions, Sub-Regions)
#
# The string literal below is never assigned or printed; it only serves as an
# in-notebook diagram of the pipeline. Each box names a DataFrame and lists its
# columns, and each arrow is labelled with the join condition that links
# df_countries to one of the two lookup DataFrames.

"""
             ┌──────────────────────────────┐
             │         df_regions            │
             │  (load and select)            │
             │  id        |   region         │
             └────────────┴─────────────────┘
                       ▲
                       │ (df_countries.region_id = df_regions.id)
                       │
┌─────────────────────────────┐
│        df_countries          │
│   (load and select)          │
│  country_id   |   country    │
│  region_id    |   sub_region_id
│  population   |   area_km2   │
└───────────────┴─────────────┘
                       │
                       │ (df_countries.sub_region_id = df_sub_regions.id)
                       ▼
             ┌─────────────────────────────┐
             │       df_sub_regions         │
             │  id        |   sub_region    │
             └────────────┴────────────────┘
"""

# Print a short banner announcing the cells that implement the diagram.
print("The following cells execute the scheme given above")

The following steps are applied:

- load the country data
- define an explicit schema (no schema inference is needed, and every column gets a well-defined type)
- select only the useful columns
- rename some columns for clarity
- display the first 5 rows


In [0]:
# Load country data

from pyspark.sql.functions import col
from pyspark.sql.types import (
    StructType,
    StructField,
    StringType,
    IntegerType
)

# Explicit schema for the countries CSV. Every field is declared nullable;
# identifiers and numeric measures are integers, everything else is a string.
schema_countries = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ("country_id", IntegerType()),
        ("name", StringType()),
        ("nationality", StringType()),
        ("country_code", StringType()),
        ("iso_alpha2", StringType()),
        ("capital", StringType()),
        ("population", IntegerType()),
        ("area_km2", IntegerType()),
        ("region_id", IntegerType()),
        ("sub_region_id", IntegerType()),
    )
])

# Read the CSV (first line is a header) with the declared schema, then keep
# only the columns used by the later join, exposing `name` as `country`.
df_countries = (
    spark.read
         .csv(
             "/Volumes/customer_orders/default/countries/countries_population.csv",
             schema=schema_countries,
             header=True
         )
         .select(
             col("country_id"),
             col("name").alias("country"),
             col("region_id"),
             col("sub_region_id"),
             col("population"),
             col("area_km2")
         )
)

# Preview the first five rows.
df_countries.show(n=5)


In [0]:
# Load the regions lookup table

# Two nullable fields: an integer key and the region's name.
schema_regions = (
    StructType()
        .add("id", IntegerType(), True)
        .add("name", StringType(), True)
)

# Read the CSV (with header) using the declared schema and expose
# the `name` column as `region`.
df_regions = (
    spark.read
         .schema(schema_regions)
         .option("header", True)
         .csv("/Volumes/customer_orders/default/countries/country_regions.csv")
         .select(
             col("id"),
             col("name").alias("region")
         )
)

# Preview the first five rows.
df_regions.show(n=5)


In [0]:
# Load the sub-regions lookup table

# Two nullable fields: an integer key and the sub-region's name.
schema_sub_regions = (
    StructType()
        .add("id", IntegerType(), True)
        .add("name", StringType(), True)
)

# Read the CSV (with header) using the declared schema and expose
# the `name` column as `sub_region`.
df_sub_regions = (
    spark.read
         .schema(schema_sub_regions)
         .option("header", True)
         .csv("/Volumes/customer_orders/default/countries/country_sub_regions.csv")
         .select(
             col("id"),
             col("name").alias("sub_region")
         )
)

# Preview using show()'s default row limit.
df_sub_regions.show()


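 The main cell of the notebook joins the three DataFrames on the `region_id` and `sub_region_id` keys. Both joins are left joins, so every country is kept even when it has no matching region or sub-region.
 Method chaining (the `.` operator) lets us express the joins and the column selections as a single PySpark pipeline.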

In [0]:
# Enrich every country with the names of its region and sub-region.
# Both joins are left joins, so a country without a matching lookup row
# is still kept (its region / sub_region is then null).

df_joined = (
    df_countries
        # Step 1: attach the region name via region_id -> df_regions.id.
        .join(
            df_regions,
            on=df_countries["region_id"] == df_regions["id"],
            how="left"
        )
        # Drop the region keys; sub_region_id is still needed for step 2.
        .select([
            "country_id",
            "country",
            "region",
            "population",
            "area_km2",
            "sub_region_id"
        ])
        # Step 2: attach the sub-region name via sub_region_id -> df_sub_regions.id.
        .join(
            df_sub_regions,
            on=df_countries["sub_region_id"] == df_sub_regions["id"],
            how="left"
        )
        # Final column layout: names instead of lookup keys.
        .select([
            "country_id",
            "country",
            "region",
            "sub_region",
            "population",
            "area_km2"
        ])
)

# Preview the first five rows of the joined result.
df_joined.show(n=5)
