## **PYSPARK MEMORY MANAGEMENT**

In [1]:
import os
import time

import pandas as pd

from pyspark import SparkConf
from pyspark.sql import SparkSession

# Application settings: name shown in the Spark UI, and a local master that
# uses as many worker threads as there are logical cores ('local[*]').
APP_NAME = 'pyspark_python'
MASTER = 'local[*]'

# Build the configuration in a single chained expression, then create (or
# reuse) the session and keep a handle on its underlying SparkContext.
conf = SparkConf().setAppName(APP_NAME).setMaster(MASTER)
spark = SparkSession.builder.config(conf=conf).getOrCreate()
sc = spark.sparkContext

In [2]:
import datetime

# Load the 2015 flight summary: the first CSV line supplies the column names
# and Spark infers each column's type from the data.
flightData2015 = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .csv("../data/2015-summary.csv")
)

In [29]:
# Preview the data; these are show()'s defaults, spelled out explicitly:
# the first 20 rows, with long string values truncated.
flightData2015.show(n=20, truncate=True)

+--------------------+-------------------+-----+
|   DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+--------------------+-------------------+-----+
|       United States|            Romania|   15|
|       United States|            Croatia|    1|
|       United States|            Ireland|  344|
|               Egypt|      United States|   15|
|       United States|              India|   62|
|       United States|          Singapore|    1|
|       United States|            Grenada|   62|
|          Costa Rica|      United States|  588|
|             Senegal|      United States|   40|
|             Moldova|      United States|    1|
|       United States|       Sint Maarten|  325|
|       United States|   Marshall Islands|   39|
|              Guyana|      United States|   64|
|               Malta|      United States|    1|
|            Anguilla|      United States|   41|
|             Bolivia|      United States|   30|
|       United States|           Paraguay|    6|
|             Algeri

In [3]:
# Anti-pattern demo: pull the whole DataFrame onto the driver and pickle it.
import pickle

# open() does not create missing parent directories, so make sure the output
# folder exists before the timed section starts.
os.makedirs('data', exist_ok=True)

# Measure with a monotonic, high-resolution clock; differences between
# wall-clock datetimes can be skewed by system clock adjustments.
timestart = time.perf_counter()

## Inefficient code ----------------------------------------------------------
# collect() is an action that returns every row to the driver as a Python list
# of Row objects, so the full dataset has to fit in driver memory.
result = flightData2015.collect()
# NOTE: the payload is a binary pickle despite the .txt extension; only
# unpickle files you trust.
with open('data/result.txt', 'wb') as filehandle:
    pickle.dump(result, filehandle)
## -----------------------------------------------------------------------------

# Elapsed time in seconds, rounded to two decimals
timeend = time.perf_counter()
timedelta = round(timeend - timestart, 2)
print("Time require to run the model: " + str(timedelta) + " segundos")

Time require to run the model: 0.15 segundos


In [36]:
# Time a single-partition CSV write, using a monotonic clock (seconds).
timestart = time.perf_counter()

## better code -------------------------------------------------
# repartition(1) shuffles all rows into exactly one partition, so a single
# Spark task writes the data instead of the rows being materialized as a
# Python list on the driver.
# Spark treats "data/test.csv" as an output directory containing part files,
# and mode("overwrite") replaces whatever is already at that path.
flightData2015.repartition(1).write.mode("overwrite").csv("data/test.csv")
## -----------------------------------------------------------------------

# Elapsed time in seconds, rounded to two decimals
timeend = time.perf_counter()
timedelta = round(timeend - timestart, 2)
print("Time require to run the model: " + str(timedelta) + " segundos")

Time require to run the model: 0.15 segundos


In [37]:
# Time a single-partition CSV write, using a monotonic clock (seconds).
timestart = time.perf_counter()

## better code -------------------------------------------------
# coalesce(1) merges the existing partitions into one without a full shuffle,
# so a single Spark task writes the data. A drastic coalesce like this can
# also force the upstream computation onto that single task; repartition(1)
# adds a shuffle step that avoids this.
# NOTE: this writes to the same "data/test.csv" directory as the
# repartition(1) cell, and mode("overwrite") replaces that earlier output.
flightData2015.coalesce(1).write.mode("overwrite").csv("data/test.csv")
## -----------------------------------------------------------------------

# Elapsed time in seconds, rounded to two decimals
timeend = time.perf_counter()
timedelta = round(timeend - timestart, 2)
print("Time require to run the model: " + str(timedelta) + " segundos")

Time require to run the model: 0.14 segundos


## References

* WHY YOUR SPARK APPS ARE SLOW OR FAILING: PART I MEMORY MANAGEMENT [Here](https://unraveldata.com/common-reasons-spark-applications-slow-fail-part-1/)
* WHY YOUR SPARK APPS ARE SLOW OR FAILING: PART II DATA SKEW AND GARBAGE COLLECTION [Here](https://unraveldata.com/common-failures-slowdowns-part-ii/)