# World Trade: Convert data to parquet

In [2]:
# Initialization and configuration
import findspark
findspark.init()

from pyspark import SparkConf, SparkContext, SQLContext
from pyspark.sql import SparkSession

# Spark
from pyspark.sql.types import *

# Python
import os

## Initialize and configure Spark

Spark web UI (by default, available while the Spark context is running): http://localhost:4040/

In [3]:
# Configuration: local master with 4 worker threads; result-size, driver,
# executor and PySpark worker memory are each set to 2g.
conf = SparkConf().setAppName("WorldTrade").setMaster("local[4]")
conf.set("spark.driver.maxResultSize", "2g")
conf.set("spark.driver.memory", "2g")
conf.set("spark.executor.memory", "2g")
conf.set("spark.executor.pyspark.memory", "2g")

# Initialization
# getOrCreate() reuses a context/session that is already running in this
# kernel, so re-running this cell no longer fails with
# "Cannot run multiple SparkContexts at once".
# NOTE(review): when an existing context is reused, the settings above are
# presumably not re-applied -- restart the kernel after changing them.
sc = SparkContext.getOrCreate(conf=conf)
sqlc = SQLContext(sc)  # kept so the name `sqlc` stays available to later cells
spark = SparkSession.builder.config(conf=conf).getOrCreate()

## Flow data: Convert .dat files to parquet files

In [4]:
# Define schema
# Each entry is (column name, Spark type class), listed in schema order.
column_spec = [
    ("REPORTER", IntegerType),
    ("REPORTER_ISO", StringType),
    ("PARTNER", IntegerType),
    ("PARTNER_ISO", StringType),
    ("TRADE_TYPE", StringType),
    ("PRODUCT_NC", StringType),
    ("PRODUCT_SITC", StringType),
    ("PRODUCT_CPA2002", StringType),
    ("PRODUCT_CPA2008", StringType),
    ("PRODUCT_CPA2_1", StringType),
    ("PRODUCT_BEC", StringType),
    ("PRODUCT_SECTION", StringType),
    ("FLOW", IntegerType),
    ("STAT_REGIME", IntegerType),
    ("SUPP_UNIT", StringType),
    ("PERIOD", StringType),
    ("VALUE_IN_EUROS", LongType),
    ("QUANTITY_IN_KG", LongType),
    ("SUP_QUANTITY", IntegerType),
]

# Every column is declared nullable (third StructField argument is True).
fields = [
    StructField(column_name, type_cls(), True)
    for column_name, type_cls in column_spec
]

schema = StructType(fields)

In [6]:
# Folders with monthly .dat files: show the entries found under data/dat
# (the conversion cell further down iterates over this same directory).
os.listdir("data/dat")

['2009',
 '2010',
 '2011',
 '2012',
 '2013',
 '2014',
 '2015',
 '2016',
 '2017',
 '2018']

In [None]:
# Loop over all folders for each year and convert files
dat_root = "data/dat"

# Keep only non-hidden sub-directories, in sorted order. os.listdir() returns
# every entry in arbitrary order; a stray file or hidden entry (.DS_Store,
# .ipynb_checkpoints, ...) would build a source glob that matches nothing and
# abort the conversion midway.
year_folders = sorted(
    entry
    for entry in os.listdir(dat_root)
    if not entry.startswith(".") and os.path.isdir(os.path.join(dat_root, entry))
)

for year in year_folders:
    # All .dat files of one year are read as a single DataFrame.
    source = "data/dat/" + year + "/*.dat"
    # NOTE(review): this yields data/parquet/full<year>.parquet (no separator
    # between "full" and the year) -- confirm the naming is intended.
    destination = "data/parquet/full" + year + ".parquet"
    # Files are comma-separated with a header row; the explicit schema is
    # applied instead of inferring column types.
    data = spark.read.csv(source, sep=",", header=True, schema=schema)
    # coalesce(4) caps the output at 4 partitions; existing output for the
    # year is overwritten, so the cell can be re-run.
    data.coalesce(4).write.parquet(destination, mode="overwrite")
    print(year, ": conversion to parquet complete")