# Convert data to parquet

Script for converting the downloaded .dat files to .parquet files.

Spark web UI (reachable while the Spark session below is running): http://localhost:4040/

## Imports

In [1]:
# Initialization and configuration
import findspark
findspark.init()

from pyspark import SparkConf, SparkContext, SQLContext
from pyspark.sql import SparkSession

# Spark
from pyspark.sql.types import *

# Python
import os
import numpy as np

## Initialize and configure Spark

In [2]:
# Configuration: local master "local[4]" with 2 GB limits for driver, executor and results
conf = (
    SparkConf()
    .setAppName("WorldTrade")
    .setMaster("local[4]")
    .set("spark.driver.maxResultSize", "2g")
    .set("spark.driver.memory", "2g")
    .set("spark.executor.memory", "2g")
    .set("spark.executor.pyspark.memory", "2g")
)

# Initialization
# getOrCreate() reuses a context/session that is already alive in this kernel, so
# re-running this cell no longer fails with "Cannot run multiple SparkContexts at once".
# NOTE(review): when an existing context is reused, the settings above are presumably
# not re-applied to it; restart the kernel after changing the configuration.
sc = SparkContext.getOrCreate(conf=conf)
sqlc = SQLContext(sc)
spark = SparkSession.builder.config(conf=conf).getOrCreate()

## Convert .dat files to parquet files

In [3]:
# Define schema

# (column name, Spark type class) pairs, listed in schema order
column_types = [
    ("REPORTER", IntegerType),
    ("REPORTER_ISO", StringType),
    ("PARTNER", IntegerType),
    ("PARTNER_ISO", StringType),
    ("TRADE_TYPE", StringType),
    ("PRODUCT_NC", StringType),
    ("PRODUCT_SITC", StringType),
    ("PRODUCT_CPA2002", StringType),
    ("PRODUCT_CPA2008", StringType),
    ("PRODUCT_CPA2_1", StringType),
    ("PRODUCT_BEC", StringType),
    ("PRODUCT_SECTION", StringType),
    ("FLOW", IntegerType),
    ("STAT_REGIME", IntegerType),
    ("SUPP_UNIT", StringType),
    ("PERIOD", StringType),
    ("VALUE_IN_EUROS", LongType),
    ("QUANTITY_IN_KG", LongType),
    ("SUP_QUANTITY", IntegerType),
]

# Every column is declared nullable (third StructField argument is True)
fields = [
    StructField(column_name, spark_type(), True)
    for column_name, spark_type in column_types
]

schema = StructType(fields)

In [4]:
# Folders with monthly .dat files
# (sorted: os.listdir returns entries in arbitrary order)
sorted(os.listdir("data/dat"))

['2001',
 '2002',
 '2003',
 '2004',
 '2005',
 '2006',
 '2007',
 '2008',
 '2009',
 '2010',
 '2011',
 '2012',
 '2013',
 '2014',
 '2015',
 '2016',
 '2017',
 '2018',
 '2019']

In [5]:
# Loop over all folders for each year and convert files.
# - sorted() gives a deterministic processing order (os.listdir order is arbitrary).
# - Plain files and hidden entries inside data/dat are skipped: they contain no
#   monthly .dat files, and globbing them would make spark.read.csv fail.
for year in sorted(os.listdir("data/dat")):
    if year.startswith(".") or not os.path.isdir(os.path.join("data/dat", year)):
        continue

    # All monthly .dat files of one year are read together ...
    source = "data/dat/" + year + "/*.dat"
    # ... and written to one parquet dataset per year
    destination = "data/parquet/full" + year + ".parquet"

    # Comma-separated input with a header row, parsed with the explicit schema
    data = spark.read.csv(source, sep=",", header=True, schema=schema)

    # Reduce to 4 partitions before writing; an existing dataset is overwritten
    data.coalesce(4).write.parquet(destination, mode="overwrite")
    print(year, ": conversion to parquet complete")

2001 : conversion to parquet complete
2002 : conversion to parquet complete
2003 : conversion to parquet complete
2004 : conversion to parquet complete
2005 : conversion to parquet complete
2006 : conversion to parquet complete
2007 : conversion to parquet complete
2008 : conversion to parquet complete
2009 : conversion to parquet complete
2010 : conversion to parquet complete
2011 : conversion to parquet complete
2012 : conversion to parquet complete
2013 : conversion to parquet complete
2014 : conversion to parquet complete
2015 : conversion to parquet complete
2016 : conversion to parquet complete
2017 : conversion to parquet complete
2018 : conversion to parquet complete
2019 : conversion to parquet complete


In [6]:
# List of parquet files
# Sorted because os.listdir gives no ordering guarantee; only entries ending in
# ".parquet" are kept so stray files are not handed to spark.read.parquet later.
parquet_files = sorted(
    entry for entry in os.listdir("data/parquet") if entry.endswith(".parquet")
)

parquet_files

['full2001.parquet',
 'full2002.parquet',
 'full2003.parquet',
 'full2004.parquet',
 'full2005.parquet',
 'full2006.parquet',
 'full2007.parquet',
 'full2008.parquet',
 'full2009.parquet',
 'full2010.parquet',
 'full2011.parquet',
 'full2012.parquet',
 'full2013.parquet',
 'full2014.parquet',
 'full2015.parquet',
 'full2016.parquet',
 'full2017.parquet',
 'full2018.parquet',
 'full2019.parquet']

In [7]:
# Collect the schema of every parquet dataset, in the same order as parquet_files
schemas = [
    spark.read.parquet("data/parquet/" + file_name).schema
    for file_name in parquet_files
]

In [8]:
# Check if schemas for all files are identical.
# Entry [row, col] of the matrix is True when the schemas of files `row` and `col` are equal.
n_files = len(parquet_files)
schemas_identical = np.zeros([n_files, n_files], dtype=bool)
for row, col in np.ndindex(n_files, n_files):
    schemas_identical[row][col] = schemas[row] == schemas[col]

# Should return True
np.all(schemas_identical)

True