# World Trade: Queries

In [1]:
# Initialization and configuration
import findspark
findspark.init()

from pyspark import SparkConf, SparkContext, SQLContext
from pyspark.sql import SparkSession

# Spark
from pyspark.sql.types import *
from pyspark.sql.functions import *

## Initialize and configure Spark

Spark web UI (available while the Spark session is running): http://localhost:4040/

In [2]:
# Configuration: local Spark master with 6 worker threads and 2 GB memory limits
conf = (
    SparkConf()
    .setAppName("WorldTrade")
    .setMaster("local[6]")
    .setAll([
        ("spark.driver.maxResultSize", "2g"),
        ("spark.driver.memory", "2g"),
        ("spark.executor.memory", "2g"),
        ("spark.executor.pyspark.memory", "2g"),
    ])
)

# Initialization
# getOrCreate() reuses an already running context/session, so re-running this
# cell no longer fails with "Cannot run multiple SparkContexts at once".
# Note: a context that is already running keeps the settings it was started with.
sc = SparkContext.getOrCreate(conf=conf)
spark = SparkSession.builder.config(conf=conf).getOrCreate()
# Bind the legacy SQLContext to the same session, so temp views registered via
# DataFrames of `spark` are visible to sqlc.sql(...).
sqlc = SQLContext(sc, spark)

## Read reporter, partner, product names (= dimensions)

In [4]:
# Reporters: lookup table REPORTER (numeric code) -> REPORTER_NAME

# Tab-separated file without header; only the first column is numeric.
schema = StructType(
    [StructField("REPORTER", IntegerType(), True)]
    + [
        StructField(field, StringType(), True)
        for field in ("START_DATE", "END_DATE", "REPORTER_NAME", "_c1", "_c2")
    ]
)

# Referring to columns with col() instead of reporters[...] lets the whole
# pipeline be written as a single chain.
reporters = (
    spark.read.csv("data/metadata/REPORTERS.txt", sep="\t", header=False, schema=schema)
    .select("REPORTER", trim(col("REPORTER_NAME")).alias("REPORTER_NAME"))
    .cache()
)

In [5]:
# Print up to 30 rows of the reporter lookup table
reporters.show(30)

+--------+--------------+
|REPORTER| REPORTER_NAME|
+--------+--------------+
|       1|        France|
|       2|   Belg.-Luxbg|
|       3|   Netherlands|
|       4|    Fr Germany|
|       5|         Italy|
|       6|  Utd. Kingdom|
|       7|       Ireland|
|       8|       Denmark|
|       9|        Greece|
|      10|      Portugal|
|      11|         Spain|
|      17|       Belgium|
|      18|    Luxembourg|
|      30|        Sweden|
|      32|       Finland|
|      38|       Austria|
|      46|         Malta|
|      53|       Estonia|
|      54|        Latvia|
|      55|     Lithuania|
|      60|        Poland|
|      61|Czech Republic|
|      63|      Slovakia|
|      64|       Hungary|
|      66|       Romania|
|      68|      Bulgaria|
|      91|      Slovenia|
|      92|       Croatia|
|     600|        Cyprus|
+--------+--------------+



In [9]:
# Partners: lookup table PARTNER (numeric code) -> PARTNER_NAME

# Tab-separated file without header; only the first column is numeric.
schema = StructType(
    [StructField("PARTNER", IntegerType(), True)]
    + [
        StructField(field, StringType(), True)
        for field in ("START_DATE", "END_DATE", "PARTNER_NAME", "_c1", "_c2")
    ]
)

# Referring to columns with col() instead of partners[...] lets the whole
# pipeline be written as a single chain.
partners = (
    spark.read.csv("data/metadata/PARTNERS.txt", sep="\t", header=False, schema=schema)
    # filter out territories that don't legally exist anymore
    .where(col("END_DATE") == "31/12/2500")
    .withColumn("PARTNER_NAME", trim(col("PARTNER_NAME")))
    .cache()
)

In [11]:
# Number of partner entries that remain after the END_DATE filter
partners.count()

249

In [10]:
# Preview the first five partner rows
partners.show(5)

+-------+----------+----------+--------------+----------+----------+
|PARTNER|START_DATE|  END_DATE|  PARTNER_NAME|       _c1|       _c2|
+-------+----------+----------+--------------+----------+----------+
|      1|01/01/1997|31/12/2500|        France|01/01/1997|31/12/2500|
|      3|01/01/1976|31/12/2500|   Netherlands|01/01/1976|31/12/2500|
|      4|01/01/1991|31/12/2500|       Germany|01/01/1991|31/12/2500|
|      5|01/01/1994|31/12/2500|         Italy|01/01/1994|31/12/2500|
|      6|01/01/1976|31/12/2500|United Kingdom|01/01/1976|31/12/2500|
+-------+----------+----------+--------------+----------+----------+
only showing top 5 rows



In [13]:
# Products: lookup table PRODUCT_NC (CN code) -> PRODUCT_NAME, plus the
# first two characters of the code as PRODUCT_NC_2D

# Tab-separated file without header; every column is read as a string.
schema = StructType([
    StructField(field, StringType(), True)
    for field in ("PRODUCT_NC", "START_DATE", "END_DATE", "_c1", "PRODUCT_NAME", "_c2", "_c3")
])

# Referring to columns with col() instead of products[...] lets the whole
# pipeline be written as a single chain.
products = (
    spark.read.csv("data/metadata/CN.txt", sep="\t", header=False, schema=schema)
    .select(
        "PRODUCT_NC",
        trim(col("PRODUCT_NAME")).alias("PRODUCT_NAME"),
        col("PRODUCT_NC").substr(1, 2).alias("PRODUCT_NC_2D"),
    )
    .cache()
)

In [14]:
# Number of rows in the product lookup table
products.count()

31154

In [15]:
# Preview the first five product rows
products.show(5)

+----------+--------------------+-------------+
|PRODUCT_NC|        PRODUCT_NAME|PRODUCT_NC_2D|
+----------+--------------------+-------------+
|        01|        LIVE ANIMALS|           01|
|      0101|LIVE HORSES, ASSE...|           01|
|    010110|PURE-BRED BREEDIN...|           01|
|  01011010|PURE-BRED BREEDIN...|           01|
|  01011090|PURE-BRED BREEDIN...|           01|
+----------+--------------------+-------------+
only showing top 5 rows



## Query examples

### Load and cache all data for just one year (for the sake of speed)


### Read trade flow data

Code:  *TO BE UPDATED!*
```python

flows = spark.read.parquet("data_parquet/full2018.parquet") \
    .select("REPORTER", "REPORTER_ISO", "PARTNER", "PARTNER_ISO", "TRADE_TYPE", "PRODUCT_NC", "FLOW", "PERIOD", "VALUE_IN_EUROS")
flows = flows.withColumn("YEAR", flows.PERIOD.substr(1, 4).cast(IntegerType())) \
    .groupBy("REPORTER", "REPORTER_ISO", "PARTNER", "PARTNER_ISO", "TRADE_TYPE", "PRODUCT_NC", "FLOW", "YEAR").agg({"VALUE_IN_EUROS": "sum"}) \
    .groupBy("REPORTER", "REPORTER_ISO", "PARTNER", "PARTNER_ISO", "TRADE_TYPE", "PRODUCT_NC", "YEAR") \
    .pivot("FLOW", [1, 2]).sum("sum(VALUE_IN_EUROS)") \
    .withColumnRenamed("1", "IMPORTS").withColumnRenamed("2", "EXPORTS")


```

What's going on here:
1. Read from parquet files
2. Select relevant columns
3. Generate column "YEAR"; probably better to use date functions
4. Aggregate values for all months over each year as well as all other columns
5. Pivot: Generate columns "1" and "2" from "sum(VALUE_IN_EUROS)" and the categories of "FLOW"
6. Rename "1" and "2" to "IMPORTS" and "EXPORTS"



In [17]:
# Inspect the column names and types of the raw 2018 flow data
spark.read.parquet("data/parquet/full2018.parquet").printSchema()

root
 |-- REPORTER: integer (nullable = true)
 |-- REPORTER_ISO: string (nullable = true)
 |-- PARTNER: integer (nullable = true)
 |-- PARTNER_ISO: string (nullable = true)
 |-- TRADE_TYPE: string (nullable = true)
 |-- PRODUCT_NC: string (nullable = true)
 |-- PRODUCT_SITC: string (nullable = true)
 |-- PRODUCT_CPA2002: string (nullable = true)
 |-- PRODUCT_CPA2008: string (nullable = true)
 |-- PRODUCT_CPA2_1: string (nullable = true)
 |-- PRODUCT_BEC: string (nullable = true)
 |-- PRODUCT_SECTION: string (nullable = true)
 |-- FLOW: integer (nullable = true)
 |-- STAT_REGIME: integer (nullable = true)
 |-- SUPP_UNIT: string (nullable = true)
 |-- PERIOD: string (nullable = true)
 |-- VALUE_IN_EUROS: long (nullable = true)
 |-- QUANTITY_IN_KG: long (nullable = true)
 |-- SUP_QUANTITY: integer (nullable = true)



In [18]:
# Load the 2018 trade flows and reshape them to one row per
# (reporter, partner, trade type, product, year) with IMPORTS / EXPORTS columns.
#
# Columns are referenced with col(), so the whole pipeline is a single chain,
# and the monthly values are summed directly inside the pivot: one
# groupBy().pivot().sum() replaces the former "aggregate, then pivot the
# aggregate" pair and yields the same sums.
flows = (
    spark.read.parquet("data/parquet/full2018.parquet")
    # Drop the "TOTAL" product rows; presumably they are aggregates over all
    # products and would double-count the detailed rows -- TODO confirm.
    .filter(col("PRODUCT_NC") != "TOTAL")
    # The first four characters of PERIOD are used as the year.
    .withColumn("YEAR", col("PERIOD").substr(1, 4).cast(IntegerType()))
    .groupBy("REPORTER", "REPORTER_ISO", "PARTNER", "PARTNER_ISO", "TRADE_TYPE", "PRODUCT_NC", "YEAR")
    # FLOW codes 1 and 2 become the columns "1" and "2" ...
    .pivot("FLOW", [1, 2])
    .sum("VALUE_IN_EUROS")
    # ... which are then given readable names.
    .withColumnRenamed("1", "IMPORTS")
    .withColumnRenamed("2", "EXPORTS")
    .cache()
)

In [19]:
# Check the schema of the aggregated and pivoted flows
flows.printSchema()

root
 |-- REPORTER: integer (nullable = true)
 |-- REPORTER_ISO: string (nullable = true)
 |-- PARTNER: integer (nullable = true)
 |-- PARTNER_ISO: string (nullable = true)
 |-- TRADE_TYPE: string (nullable = true)
 |-- PRODUCT_NC: string (nullable = true)
 |-- YEAR: integer (nullable = true)
 |-- IMPORTS: long (nullable = true)
 |-- EXPORTS: long (nullable = true)



In [20]:
# Number of partitions of the aggregated DataFrame
flows.rdd.getNumPartitions()

200

In [21]:
# Row count; count() is an action, so it also materialises the cached data
# if that has not happened yet
flows.count()

6744086

In [22]:
# Preview the first five aggregated flow rows
flows.show(5)

+--------+------------+-------+-----------+----------+----------+----+-------+-------+
|REPORTER|REPORTER_ISO|PARTNER|PARTNER_ISO|TRADE_TYPE|PRODUCT_NC|YEAR|IMPORTS|EXPORTS|
+--------+------------+-------+-----------+----------+----------+----+-------+-------+
|       1|          FR|      6|         GB|         I|  68159100|2018| 451553|    923|
|       1|          FR|     17|         BE|         I|  44199010|2018| 565042|  12173|
|       1|          FR|     30|         SE|         I|  96132000|2018|   null|   7134|
|       1|          FR|     39|         CH|         E|  15159060|2018|   8608| 109757|
|       1|          FR|     60|         PL|         I|  84804900|2018| 303859|   null|
+--------+------------+-------+-----------+----------+----------+----+-------+-------+
only showing top 5 rows



In [23]:
# Number of distinct reporter codes present in the flows
flows.select("REPORTER").distinct().count()

28

In [24]:
# Number of distinct partner codes present in the flows
flows.select("PARTNER").distinct().count()

246

In [25]:
# Number of distinct product codes present in the flows
flows.select("PRODUCT_NC").distinct().count()

9964

### Example: Total German exports in 2018

In [142]:
# Total German exports in 2018, in millions of euros, with the reporter name
# attached from the (small, hence broadcast) reporter lookup table.
query = (
    flows
    .where((col("REPORTER_ISO") == "DE") & (col("YEAR") == 2018))
    .groupBy("REPORTER", "REPORTER_ISO", "YEAR")
    .agg({"EXPORTS": "sum"})
    # EUR -> million EUR, rounded to three decimals
    .withColumn("EXPORTS_MN", round(col("sum(EXPORTS)") / 1e6, 3))
    .join(broadcast(reporters), "REPORTER", how="left")
    .select("REPORTER_ISO", "REPORTER_NAME", "YEAR", "EXPORTS_MN")
)

query.orderBy(col("EXPORTS_MN").desc()).show(10, truncate=False)

+------------+-------------+----+-----------+
|REPORTER_ISO|REPORTER_NAME|YEAR|EXPORTS_MN |
+------------+-------------+----+-----------+
|DE          |Fr Germany   |2018|1320837.948|
+------------+-------------+----+-----------+



### Example: German exports, imports, trade balance (total & by country)

In [None]:
# TBD

### Example: EU exports, imports, trade balance

In [None]:
# TBD

### Example: Top 10 German export destinations in 2018

In [140]:
# German exports in 2018 per partner country, in millions of euros, with
# reporter and partner names attached from the broadcast lookup tables.
query = (
    flows
    .where((col("REPORTER_ISO") == "DE") & (col("YEAR") == 2018))
    .groupBy("REPORTER", "REPORTER_ISO", "PARTNER", "PARTNER_ISO", "YEAR")
    .agg({"EXPORTS": "sum"})
    # EUR -> million EUR, rounded to three decimals
    .withColumn("EXPORTS_MN", round(col("sum(EXPORTS)") / 1e6, 3))
    .join(broadcast(reporters), "REPORTER", how="left")
    .join(broadcast(partners), "PARTNER", how="left")
    .select("REPORTER_ISO", "REPORTER_NAME", "PARTNER_ISO", "PARTNER_NAME", "YEAR", "EXPORTS_MN")
)

# Largest destinations first
query.orderBy(col("EXPORTS_MN").desc()).show(10, truncate=False)

+------------+-------------+-----------+------------------------+----+----------+
|REPORTER_ISO|REPORTER_NAME|PARTNER_ISO|PARTNER_NAME            |YEAR|EXPORTS_MN|
+------------+-------------+-----------+------------------------+----+----------+
|DE          |Fr Germany   |US         |United States of America|2018|114480.812|
|DE          |Fr Germany   |FR         |France                  |2018|105138.035|
|DE          |Fr Germany   |CN         |China                   |2018|93680.025 |
|DE          |Fr Germany   |NL         |Netherlands             |2018|84295.499 |
|DE          |Fr Germany   |GB         |United Kingdom          |2018|81773.801 |
|DE          |Fr Germany   |IT         |Italy                   |2018|69716.726 |
|DE          |Fr Germany   |AT         |Austria                 |2018|63726.179 |
|DE          |Fr Germany   |PL         |Poland                  |2018|63290.487 |
|DE          |Fr Germany   |CH         |Switzerland             |2018|53626.716 |
|DE          |Fr

### Partner by reporter, ranked by exports

In [157]:
from pyspark.sql.window import Window
import pyspark.sql.functions as F

In [170]:
# Exports per (reporter, partner) pair in 2018, in millions of euros, ranked
# within each reporter (RANK 1 = largest export destination), with names attached.
query = (
    flows
    .where(col("YEAR") == 2018)
    .groupBy("REPORTER", "REPORTER_ISO", "PARTNER", "PARTNER_ISO")
    .agg({"EXPORTS": "sum"})
    # EUR -> million EUR, rounded to three decimals
    .withColumn("EXPORTS_MN", round(col("sum(EXPORTS)") / 1e6, 3))
    # The rank is computed on the rounded value, separately per reporter
    .withColumn(
        "RANK",
        F.rank().over(
            Window.partitionBy("REPORTER_ISO").orderBy(col("EXPORTS_MN").desc())
        ),
    )
    .join(broadcast(reporters), "REPORTER", how="left")
    .join(broadcast(partners), "PARTNER", how="left")
    .select("REPORTER_ISO", "REPORTER", "REPORTER_NAME", "PARTNER_ISO", "PARTNER", "PARTNER_NAME", "RANK", "EXPORTS_MN")
)

# Show the ranking for Denmark only
query.where(col("REPORTER_ISO") == "DK").orderBy("RANK").show(50)

+------------+--------+-------------+-----------+-------+--------------------+----+----------+
|REPORTER_ISO|REPORTER|REPORTER_NAME|PARTNER_ISO|PARTNER|        PARTNER_NAME|RANK|EXPORTS_MN|
+------------+--------+-------------+-----------+-------+--------------------+----+----------+
|          DK|       8|      Denmark|         DE|      4|             Germany|   1| 13729.969|
|          DK|       8|      Denmark|         SE|     30|              Sweden|   2|  9669.942|
|          DK|       8|      Denmark|         QZ|    979|Countries and ter...|   3|  9203.426|
|          DK|       8|      Denmark|         GB|      6|      United Kingdom|   4|  5888.923|
|          DK|       8|      Denmark|         QY|    978|Countries and ter...|   5|  5356.036|
|          DK|       8|      Denmark|         NO|     28|              Norway|   6|  5164.612|
|          DK|       8|      Denmark|         NL|      3|         Netherlands|   7|  4121.748|
|          DK|       8|      Denmark|         US| 

In [167]:
# Same ranking as above, using only the ISO codes (no name lookups):
# exports per (reporter, partner) pair in 2018, ranked within each reporter.
query = (
    flows
    .where(col("YEAR") == 2018)
    .groupBy("REPORTER_ISO", "PARTNER_ISO")
    .agg({"EXPORTS": "sum"})
    # EUR -> million EUR, rounded to three decimals
    .select(
        "REPORTER_ISO",
        "PARTNER_ISO",
        round(col("sum(EXPORTS)") / 1e6, 3).alias("EXPORTS_MN"),
    )
    .withColumn(
        "RANK",
        F.rank().over(
            Window.partitionBy("REPORTER_ISO").orderBy(col("EXPORTS_MN").desc())
        ),
    )
)

query.orderBy("REPORTER_ISO", "RANK").show(50)

+------------+-----------+----------+----+
|REPORTER_ISO|PARTNER_ISO|EXPORTS_MN|RANK|
+------------+-----------+----------+----+
|          AT|         DE| 45962.192|   1|
|          AT|         US|  9929.259|   2|
|          AT|         IT|  9818.516|   3|
|          AT|         SK|  7357.566|   4|
|          AT|         CH|  7140.668|   5|
|          AT|         FR|  6536.109|   6|
|          AT|         CZ|  5689.183|   7|
|          AT|         HU|  5193.854|   8|
|          AT|         PL|  5088.566|   9|
|          AT|         CN|  4246.576|  10|
|          AT|         GB|  4225.856|  11|
|          AT|         SI|  3119.872|  12|
|          AT|         NL|   3019.41|  13|
|          AT|         ES|  2682.828|  14|
|          AT|         RO|  2663.557|  15|
|          AT|         BE|   2255.95|  16|
|          AT|         RU|   2108.85|  17|
|          AT|         SE|  1753.033|  18|
|          AT|         JP|   1472.87|  19|
|          AT|         HR|   1339.12|  20|
|          

### Example: Top 10 German exports (8-digit categories)

In [156]:
# Top 10 German exports in 2018 by 8-digit CN product code, in millions of euros.
#
# The product lookup is reduced to one row per code before joining: if a code
# is listed more than once in `products` (e.g. once per validity period --
# TODO confirm against CN.txt), the join would otherwise repeat that product
# in the result. Which of several names is kept is then arbitrary.
# The former join to `reporters` was dropped because none of its columns
# appear in the output.
query = (
    flows
    .where((col("REPORTER_ISO") == "DE") & (col("YEAR") == 2018))
    .groupBy("REPORTER", "REPORTER_ISO", "PRODUCT_NC", "YEAR")
    .agg({"EXPORTS": "sum"})
    # EUR -> million EUR, rounded to three decimals
    .withColumn("EXPORTS_MN", round(col("sum(EXPORTS)") / 1e6, 3))
    .join(
        broadcast(
            products.select("PRODUCT_NC", "PRODUCT_NAME").dropDuplicates(["PRODUCT_NC"])
        ),
        "PRODUCT_NC",
        how="left",
    )
    .select("PRODUCT_NC", "PRODUCT_NAME", "EXPORTS_MN")
)

query.orderBy(col("EXPORTS_MN").desc()).show(10, truncate=False)

+----------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------+
|PRODUCT_NC|PRODUCT_NAME                                                                                                                                                                                                                                                                                                                                                                                                                          

### Example: Top 10 German exports (2-digit categories)

In [155]:
# Top 10 German exports in 2018 by 2-digit CN chapter, in millions of euros.
#
# The chapter code is taken from the flow's own product code (first two
# characters) instead of joining `products` before the aggregation: any
# product code listed more than once in the lookup table would duplicate the
# matching flow rows and inflate the sums. As a side effect, flows whose code
# is missing from `products` are now counted under their chapter rather than
# under a null chapter.
# Chapter names come from the lookup rows whose full code equals the 2-digit
# chapter code, reduced to one row per chapter.
# The former join to `reporters` was dropped because none of its columns
# appear in the output.
query = (
    flows
    .where((col("REPORTER_ISO") == "DE") & (col("YEAR") == 2018))
    .withColumn("PRODUCT_NC_2D", col("PRODUCT_NC").substr(1, 2))
    .groupBy("REPORTER", "REPORTER_ISO", "PRODUCT_NC_2D", "YEAR")
    .agg({"EXPORTS": "sum"})
    # EUR -> million EUR, rounded to three decimals
    .withColumn("EXPORTS_MN", round(col("sum(EXPORTS)") / 1e6, 3))
    .join(
        broadcast(
            products
            .select(col("PRODUCT_NC").alias("PRODUCT_NC_2D"), "PRODUCT_NAME")
            .dropDuplicates(["PRODUCT_NC_2D"])
        ),
        "PRODUCT_NC_2D",
        how="left",
    )
    .select("PRODUCT_NC_2D", "PRODUCT_NAME", "EXPORTS_MN")
)

query.orderBy(col("EXPORTS_MN").desc()).show(10, truncate=False)

+-------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------+
|PRODUCT_NC_2D|PRODUCT_NAME                                                                                                                                                                           |EXPORTS_MN|
+-------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------+
|87           |VEHICLES OTHER THAN RAILWAY OR TRAMWAY ROLLING-STOCK, AND PARTS AND ACCESSORIES THEREOF                                                                                                |373834.398|
|84           |NUCLEAR REACTORS, BOILERS, MACHINERY AND MECHANICAL APPLIANCES; PARTS THEREOF                                                                

# vvv TBD vvv

### Example Export from Denmark to the Netherlands >= X mn

In [65]:
# Danish exports in 2018 per partner country, in millions of euros.
# NOTE: no partner filter and no minimum-value threshold are applied here yet
# (this section is marked TBD).
query = (
    flows
    .where((col("REPORTER_ISO") == "DK") & (col("YEAR") == 2018))
    .groupBy("PARTNER", "PARTNER_ISO", "YEAR")
    .agg({"EXPORTS": "sum"})
    # Keep only the ISO code, the year and the scaled value (EUR -> million EUR)
    .select(
        "PARTNER_ISO",
        "YEAR",
        round(col("sum(EXPORTS)") / 1e6, 3).alias("EXPORTS"),
    )
)

query.orderBy(col("EXPORTS").desc()).show(10)

+-----------+----+---------+
|PARTNER_ISO|YEAR|  EXPORTS|
+-----------+----+---------+
|         DE|2018|28277.164|
|         SE|2018|20296.471|
|         GB|2018|12264.675|
|         US|2018| 11498.81|
|         NO|2018|10817.759|
|         QZ|2018| 9203.426|
|         NL|2018| 8537.012|
|         FR|2018| 6525.947|
|         CN|2018| 6459.382|
|         PL|2018| 5367.782|
+-----------+----+---------+
only showing top 10 rows



### Example: Top 10 exports from Argentina to Germany

In [79]:
# German imports from Argentina in 2018 per product code
# (i.e. Argentina's exports to Germany as reported by Germany).
# The raw sum in euros is kept next to the value in millions of euros.
query = (
    flows
    .where(
        (col("REPORTER_ISO") == "DE")
        & (col("PARTNER_ISO") == "AR")
        & (col("YEAR") == 2018)
    )
    .groupBy("PRODUCT_NC")
    .agg({"IMPORTS": "sum"})
    # EUR -> million EUR, rounded to three decimals
    .withColumn("IMPORTS", round(col("sum(IMPORTS)") / 1e6, 3))
)

query.orderBy(col("IMPORTS").desc()).show(10)

+----------+------------+-------+
|PRODUCT_NC|sum(IMPORTS)|IMPORTS|
+----------+------------+-------+
|     TOTAL|   983441306|983.441|
|  26030000|   198439966| 198.44|
|  02013000|   194604795|194.605|
|  29371900|    85198660| 85.199|
|  51052900|    46657958| 46.658|
|  23040000|    34440129|  34.44|
|  04090000|    34256382| 34.256|
|  87082990|    28150679| 28.151|
|  71069100|    27794690| 27.795|
|  23080040|    21072345| 21.072|
+----------+------------+-------+
only showing top 10 rows



In [75]:
# Print the schema of `flows` again for reference
flows.printSchema()

root
 |-- REPORTER: integer (nullable = true)
 |-- REPORTER_ISO: string (nullable = true)
 |-- PARTNER: integer (nullable = true)
 |-- PARTNER_ISO: string (nullable = true)
 |-- TRADE_TYPE: string (nullable = true)
 |-- PRODUCT_NC: string (nullable = true)
 |-- YEAR: integer (nullable = true)
 |-- IMPORTS: long (nullable = true)
 |-- EXPORTS: long (nullable = true)



### Example: Trade balance of all EU countries

In [80]:
# TBD

### Example: All exports from the EU

In [81]:
# TBD

### Example: Imports from Argentina to the EU by year

In [None]:
# TBD

## SparkSQL

In [66]:
# Register `flows` as the temporary view "trade" so it can be queried with SQL
flows.createOrReplaceTempView("trade")

### Example: Total exports of Germany in 2018

In [72]:
# Total German exports in 2018 (in euros), computed with SQL on the "trade" view
sqlc.sql("select sum(EXPORTS) from trade where REPORTER_ISO = 'DE' and YEAR = 2018").show()

# NOTE(review): the recorded result is wrong -- it is exactly twice the total
# obtained with the DataFrame API for the same question (1320837.948 mn EUR).
# Presumably it was produced before the "TOTAL" product rows were filtered out
# of `flows`; re-run this cell on the current `flows` to confirm.

+-------------+
| sum(EXPORTS)|
+-------------+
|2641675895152|
+-------------+



In [73]:
# Convert the sum printed by the SQL query (euros) to billions of euros
2641675895152/1e9

2641.675895152