# World Trade: Queries with Spark's SQL API

In [11]:
# Initialization and configuration
import findspark
findspark.init()

from pyspark import SparkConf, SparkContext, SQLContext
from pyspark.sql import SparkSession

# Spark
from pyspark.sql.types import *
import pyspark.sql.functions as F
from pyspark.sql.window import Window

## Initialize and configure Spark

http://localhost:4040/

In [2]:
# Configuration
conf = SparkConf().setAppName("WorldTrade").setMaster("local[4]")
conf.set("spark.driver.maxResultSize", "2g")
conf.set("spark.driver.memory", "2g")
conf.set("spark.executor.memory", "2g") 
conf.set("spark.executor.pyspark.memory", "2g")

# Initialization
sc = SparkContext(conf=conf)
sqlc = SQLContext(sc)
spark = SparkSession(sc)

## Load data

* Facts: flows
* Dimensions: reporters, partners, products

## Flows

*TO BE UPDATED*
```python

flows = spark.read.parquet("data/parquet/full2018.parquet")

flows = flows.withColumn("YEAR", flows["PERIOD"].substr(1, 4).cast(IntegerType()))
flows = flows \
    .select("REPORTER", "REPORTER_ISO", "PARTNER", "PARTNER_ISO", "TRADE_TYPE", "PRODUCT_NC", "FLOW", "YEAR", "VALUE_IN_EUROS") \
    .filter(flows["YEAR"] == 2018) \
    .filter(flows["PRODUCT_NC"] != "TOTAL") \
    .groupBy("REPORTER", "REPORTER_ISO", "PARTNER", "PARTNER_ISO", "TRADE_TYPE", "PRODUCT_NC", "FLOW", "YEAR").agg({"VALUE_IN_EUROS": "sum"}) \
    .groupBy("REPORTER", "REPORTER_ISO", "PARTNER", "PARTNER_ISO", "TRADE_TYPE", "PRODUCT_NC", "YEAR").pivot("FLOW", [1, 2]).sum("sum(VALUE_IN_EUROS)") \
    .withColumnRenamed("1", "IMPORTS").withColumnRenamed("2", "EXPORTS") \
    .cache()
```

What's going on here:
1. Read from parquet files
2. Select relevnat columns
3. Generate column "YEAR"; probably better to use date functions
4. Aggregate values for all months over each year as well as all other columns
5. Pivot: Generate columns "1" and "2" from "sum(VALUE_IN_EUROS)" and the categories of "FLOW"
6. Rename "1" and "2" to "IMPORTS and "EXPORTS

In [61]:
flows = spark.read.parquet("data/parquet/full2018.parquet")

flows = flows.withColumn("YEAR", flows["PERIOD"].substr(1, 4).cast(IntegerType()))
flows = flows \
    .select("REPORTER", "REPORTER_ISO", "PARTNER", "PARTNER_ISO", "TRADE_TYPE", "PRODUCT_NC", "FLOW", "YEAR", "VALUE_IN_EUROS") \
    .filter(flows["YEAR"] == 2018) \
    .filter(flows["PRODUCT_NC"] != "TOTAL") \
    .groupBy("REPORTER", "REPORTER_ISO", "PARTNER", "PARTNER_ISO", "TRADE_TYPE", "PRODUCT_NC", "FLOW", "YEAR").agg({"VALUE_IN_EUROS": "sum"}) \
    .groupBy("REPORTER", "REPORTER_ISO", "PARTNER", "PARTNER_ISO", "TRADE_TYPE", "PRODUCT_NC", "YEAR").pivot("FLOW", [1, 2]).sum("sum(VALUE_IN_EUROS)") \
    .withColumnRenamed("1", "IMPORTS").withColumnRenamed("2", "EXPORTS") \
    .cache()

In [62]:
flows.printSchema()

root
 |-- REPORTER: integer (nullable = true)
 |-- REPORTER_ISO: string (nullable = true)
 |-- PARTNER: integer (nullable = true)
 |-- PARTNER_ISO: string (nullable = true)
 |-- TRADE_TYPE: string (nullable = true)
 |-- PRODUCT_NC: string (nullable = true)
 |-- YEAR: integer (nullable = true)
 |-- IMPORTS: long (nullable = true)
 |-- EXPORTS: long (nullable = true)



In [63]:
flows.rdd.getNumPartitions()

200

In [64]:
flows.count()

6787984

In [65]:
flows.show(10)

+--------+------------+-------+-----------+----------+----------+----+-------+-------+
|REPORTER|REPORTER_ISO|PARTNER|PARTNER_ISO|TRADE_TYPE|PRODUCT_NC|YEAR|IMPORTS|EXPORTS|
+--------+------------+-------+-----------+----------+----------+----+-------+-------+
|       1|          FR|      6|         GB|         I|  68159100|2018| 451553|    923|
|       1|          FR|     17|         BE|         I|  44199010|2018| 565042|  12173|
|       1|          FR|     30|         SE|         I|  96132000|2018|   null|   7134|
|       1|          FR|     39|         CH|         E|  15159060|2018|   8608| 109757|
|       1|          FR|     60|         PL|         I|  84804900|2018| 303859|   null|
|       1|          FR|    212|         TN|         E|  39261000|2018|2472112| 388639|
|       1|          FR|    346|         KE|         E|  84818063|2018|   null|  90489|
|       1|          FR|    664|         IN|         E|  85051910|2018|  49750|  11527|
|       1|          FR|    706|         SG|

In [78]:
flows.select("REPORTER").distinct().count()

28

In [79]:
flows.select("PARTNER").distinct().count()

246

In [80]:
flows.select("PRODUCT_NC").distinct().count()

9965

### Reporters

In [81]:
schema = StructType([
    StructField("REPORTER", IntegerType(), True),
    StructField("START_DATE", StringType(), True),
    StructField("END_DATE", StringType(), True),
    StructField("REPORTER_NAME", StringType(), True),
    StructField("_c1", StringType(), True),
    StructField("_c2", StringType(), True)
])

reporters = spark.read.csv("data/REPORTERS.txt", sep="\t", header=False, schema=schema)
reporters = reporters \
    .select("REPORTER", "REPORTER_NAME") \
    .withColumn("REPORTER_NAME", F.trim(reporters["REPORTER_NAME"])) \
    .cache()   

In [82]:
reporters.count()

29

In [83]:
reporters.show(10)

+--------+-------------+
|REPORTER|REPORTER_NAME|
+--------+-------------+
|       1|       France|
|       2|  Belg.-Luxbg|
|       3|  Netherlands|
|       4|   Fr Germany|
|       5|        Italy|
|       6| Utd. Kingdom|
|       7|      Ireland|
|       8|      Denmark|
|       9|       Greece|
|      10|     Portugal|
+--------+-------------+
only showing top 10 rows



### Partners

* `partners.filter(partners["END_DATE"] == "31/12/2500")`: Filters out countries or territories that don't exist as legal entities anymore

In [84]:
schema = StructType([
    StructField("PARTNER", IntegerType(), True),
    StructField("START_DATE", StringType(), True),
    StructField("END_DATE", StringType(), True),
    StructField("PARTNER_NAME", StringType(), True),
    StructField("_c1", StringType(), True),
    StructField("_c2", StringType(), True)
])

partners = spark.read.csv("data/PARTNERS.txt", sep="\t", header=False, schema=schema)
partners = partners.filter(partners["END_DATE"] == "31/12/2500") \
    .withColumn("PARTNER_NAME", F.trim(partners["PARTNER_NAME"])) \
    .select("PARTNER", "PARTNER_NAME") \
    .cache()

In [85]:
partners.printSchema()

root
 |-- PARTNER: integer (nullable = true)
 |-- PARTNER_NAME: string (nullable = true)



In [86]:
partners.count()

249

In [87]:
partners.show(10)

+-------+--------------+
|PARTNER|  PARTNER_NAME|
+-------+--------------+
|      1|        France|
|      3|   Netherlands|
|      4|       Germany|
|      5|         Italy|
|      6|United Kingdom|
|      7|       Ireland|
|      8|       Denmark|
|      9|        Greece|
|     10|      Portugal|
|     11|         Spain|
+-------+--------------+
only showing top 10 rows



### Product codes (Combined Nomenclature)

* https://ec.europa.eu/taxation_customs/business/calculation-customs-duties/what-is-common-customs-tariff/combined-nomenclature_en

* `.withColumn("HS2", products["PRODUCT_NC"].substr(1, 2))`: Matches all 8-digit categories with their broader 2-digit categories (Harmonized System Chapters)

In [88]:
schema = StructType([
    StructField("PRODUCT_NC", StringType(), True),
    StructField("START_DATE", StringType(), True),
    StructField("END_DATE", StringType(), True),
    StructField("_c1", StringType(), True),
    StructField("PRODUCT_NC_NAME", StringType(), True),
    StructField("_c2", StringType(), True),
    StructField("_c3", StringType(), True)   
])

products = spark.read.csv("data/CN.txt", sep="\t", header=False, schema=schema)
products = products \
    .select("PRODUCT_NC", "PRODUCT_NC_NAME") \
    .withColumn("PRODUCT_NC_NAME", F.trim(products["PRODUCT_NC_NAME"])) \
    .withColumn("HS2", products["PRODUCT_NC"].substr(1, 2)) \
    .cache()

In [89]:
products.printSchema()

root
 |-- PRODUCT_NC: string (nullable = true)
 |-- PRODUCT_NC_NAME: string (nullable = true)
 |-- HS2: string (nullable = true)



In [90]:
products.count()

31154

In [91]:
products.show(5)

+----------+--------------------+---+
|PRODUCT_NC|     PRODUCT_NC_NAME|HS2|
+----------+--------------------+---+
|        01|        LIVE ANIMALS| 01|
|      0101|LIVE HORSES, ASSE...| 01|
|    010110|PURE-BRED BREEDIN...| 01|
|  01011010|PURE-BRED BREEDIN...| 01|
|  01011090|PURE-BRED BREEDIN...| 01|
+----------+--------------------+---+
only showing top 5 rows



## Query examples (SQL)

### Register views

In [103]:
flows.createOrReplaceTempView("flows")
reporters.createOrReplaceTempView("reporters")
partners.createOrReplaceTempView("partners")
products.createOrReplaceTempView("products")

### Total German exports, imports, and trade balance in 2018 in bn

In [104]:
query = """
    select
        sum(EXPORTS/1e9) as EXPORTS_BN,
        sum(IMPORTS/1e9) as IMPORTS_BN,
        (sum(EXPORTS) - sum(IMPORTS))/1e9 as BALANCE_BN 
    from flows 
    where 
        REPORTER_ISO = 'DE' and
        YEAR = 2018 
"""
sqlc.sql(query).show()

+-----------+-----------+----------+
| EXPORTS_BN| IMPORTS_BN|BALANCE_BN|
+-----------+-----------+----------+
|1320.840158|1088.541680|232.295922|
+-----------+-----------+----------+



### Example: Top 10 German export destinations in 2018

In [105]:
query = """
    select
        PARTNER_ISO,
        PARTNER_NAME,
        sum(EXPORTS/1e9) as EXPORTS_BN
    from flows 
    join partners on partners.PARTNER = flows.PARTNER
    where 
        REPORTER_ISO = 'DE' and
        YEAR = 2018 
    group by
        PARTNER_ISO,
        PARTNER_NAME
    order by EXPORTS_BN desc
"""
sqlc.sql(query).show(10)

+-----------+--------------------+----------+
|PARTNER_ISO|        PARTNER_NAME|EXPORTS_BN|
+-----------+--------------------+----------+
|         US|United States of ...|114.480834|
|         FR|              France|105.137964|
|         CN|               China| 93.680048|
|         NL|         Netherlands| 84.295506|
|         GB|      United Kingdom| 81.773771|
|         IT|               Italy| 69.716666|
|         AT|             Austria| 63.726222|
|         PL|              Poland| 63.290483|
|         CH|         Switzerland| 53.626724|
|         BE|             Belgium| 44.238112|
+-----------+--------------------+----------+
only showing top 10 rows



### Top 10 countries with the highest trade deficit with Germany in 2018 in bn

In [106]:
query = """
    select
        PARTNER_ISO,
        PARTNER_NAME,
        sum(EXPORTS/1e9) as EXPORTS_BN,
        sum(IMPORTS/1e9) as IMPORTS_BN,
        (sum(EXPORTS) - sum(IMPORTS))/1e9 as BALANCE_BN 
    from flows 
    join partners on partners.PARTNER = flows.PARTNER
    where 
        REPORTER_ISO = 'DE' and
        YEAR = 2018 
    group by
        PARTNER_ISO,
        PARTNER_NAME
    order by BALANCE_BN desc
"""
sqlc.sql(query).show(20)

+-----------+--------------------+----------+----------+----------+
|PARTNER_ISO|        PARTNER_NAME|EXPORTS_BN|IMPORTS_BN|BALANCE_BN|
+-----------+--------------------+----------+----------+----------+
|         US|United States of ...|114.480834| 48.559429| 65.921338|
|         GB|      United Kingdom| 81.773771| 40.509202| 41.264567|
|         FR|              France|105.137964| 69.173274| 35.964740|
|         CN|               China| 93.680048| 75.466817| 18.213171|
|         AT|             Austria| 63.726222| 45.955059| 17.771104|
|         ES|               Spain| 44.191083| 32.897243| 11.293842|
|         CH|         Switzerland| 53.626724| 42.857159| 10.769528|
|         SE|              Sweden| 26.219577| 15.966888| 10.252658|
|         IT|               Italy| 69.716666| 60.399467|  9.317248|
|         KR| Korea (Republic of)| 17.774181|  9.191087|  8.583074|
|         AU|           Australia| 10.090758|  1.718844|  8.371927|
|         MX|              Mexico| 13.986662|  6

### Example: EU exports, imports, trade balance

In [107]:
query = """
    select
        sum(EXPORTS/1e9) as EXPORTS_BN,
        sum(IMPORTS/1e9) as IMPORTS_BN,
        (sum(EXPORTS) - sum(IMPORTS))/1e9 as BALANCE_BN 
    from flows 
    where 
        TRADE_TYPE = 'E' and
        YEAR = 2018 
"""
sqlc.sql(query).show()

+-----------+-----------+----------+
| EXPORTS_BN| IMPORTS_BN|BALANCE_BN|
+-----------+-----------+----------+
|1956.318791|1979.205124|-22.875961|
+-----------+-----------+----------+



### Example with ranks: Partner by reporter, ranked by exports

In [108]:
# TBD

### Exports from Argentina to Germany (CN8)

In [122]:
query = """
    select
        flows.PRODUCT_NC,
        PRODUCT_NC_NAME,
        sum(IMPORTS/1e6) as IMPORTS_MN
    from flows
    join products on products.PRODUCT_NC = flows.PRODUCT_NC
    where 
        YEAR = 2018 and
        REPORTER_ISO = 'DE' and
        PARTNER_ISO = 'AR'
    group by
        flows.PRODUCT_NC,
        PRODUCT_NC_NAME
    order by IMPORTS_MN desc
"""
sqlc.sql(query).show(10)

+----------+--------------------+----------+
|PRODUCT_NC|     PRODUCT_NC_NAME|IMPORTS_MN|
+----------+--------------------+----------+
|  26030000|COPPER ORES AND C...|198.439966|
|  02013000|FRESH OR CHILLED ...|194.604795|
|  29371900|POLYPEPTIDE HORMO...| 85.198660|
|  51052900|WOOL, COMBED (EXC...| 46.657958|
|  23040000|OILCAKE AND OTHER...| 34.440129|
|  04090000|       NATURAL HONEY| 34.256382|
|  87082990|PARTS AND ACCESSO...| 28.150679|
|  71069100|SILVER, INCL. SIL...| 27.794690|
|  23080040|ACORNS AND HORSE-...| 21.072345|
|  38231990|FATTY ACIDS, INDU...| 19.457899|
+----------+--------------------+----------+
only showing top 10 rows



### Exports from Argentina to Germany (HS2)

In [137]:
query = """
    select
        PX1.HS2,
        PX2.PRODUCT_NC_NAME,
        sum(IMPORTS/1e6) as IMPORTS_MN
    from flows as FX
    join products as PX1 on PX1.PRODUCT_NC = FX.PRODUCT_NC
    join products as PX2 on PX1.HS2 = PX2.PRODUCT_NC

    where 
        YEAR = 2018 and
        REPORTER_ISO = 'DE' and
        PARTNER_ISO = 'AR'
    group by
        PX1.HS2,
        PX2.PRODUCT_NC_NAME

    order by IMPORTS_MN desc
"""
sqlc.sql(query).show(10, truncate=True)

+---+--------------------+----------+
|HS2|     PRODUCT_NC_NAME|IMPORTS_MN|
+---+--------------------+----------+
| 26|  ORES, SLAG AND ASH|217.186164|
| 02|MEAT AND EDIBLE M...|198.467026|
| 87|VEHICLES OTHER TH...| 96.472690|
| 29|   ORGANIC CHEMICALS| 87.133348|
| 23|RESIDUES AND WAST...| 55.514820|
| 51|WOOL, FINE OR COA...| 48.437369|
| 22|BEVERAGES, SPIRIT...| 39.100959|
| 04|DAIRY PRODUCE; BI...| 34.256823|
| 84|NUCLEAR REACTORS,...| 34.110421|
| 71|NATURAL OR CULTUR...| 29.720306|
+---+--------------------+----------+
only showing top 10 rows



### Exports from Argentina to the EU >= 80 mn

In [None]:
# TBD