In [8]:
from pyspark.sql import SparkSession

# Build (or reuse) a local Spark session that uses every available core.
# NOTE(review): with master "local[*]" the work presumably runs inside the
# driver JVM, so "spark.executor.memory" may have no effect here - confirm
# whether "spark.driver.memory" was the intended setting.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("Reading from sockets")
    .config("spark.executor.memory", "512M")
    .getOrCreate()
)

# Last expression of the cell: renders the session summary.
spark

In [9]:
# Load the sales CSV (about 752 MB, ~7.2M records) using an explicit DDL schema
# string; the first line of the file is treated as a header.
_schema = "transacted_at string, trx_id string, retailer_id string, description string, amount double, city_id string"

df = (
    spark.read
    .format("csv")
    .schema(_schema)
    .option("header", True)
    .load("/home/jovyan/data/new_sales.csv")
)

In [10]:
# Nothing is cached yet, so every use of the DataFrame scans the file again and
# re-applies its transformations.
#
# The filter is bound to a new name instead of overwriting `df`: the following
# cells cache `df` and expect it to hold the full dataset. Reassigning `df`
# here would turn that cache into a partial one (only rows with amount > 300)
# whenever the notebook is run top to bottom.
df_filtered = df.where("amount > 300")

In [7]:
# cache() only marks the DataFrame for caching; the data is actually cached by
# the first action that scans the file, such as a write or a count.
# In the Spark UI (Storage tab) this DataFrame was shown with the
# "Disk Memory Deserialized" storage level: part of the data was held in
# memory and the rest on disk.
df.cache()  # default storage level: MEMORY_AND_DISK
df.count()

7202569

In [8]:
# Once the data is cached, Spark answers queries on `df` from the cache instead
# of re-reading the file, which is much faster.
df.filter("amount > 300").show()

+--------------------+----------+-----------+--------------------+-------+----------+
|       transacted_at|    trx_id|retailer_id|         description| amount|   city_id|
+--------------------+----------+-----------+--------------------+-------+----------+
|2017-11-24T19:00:...|1734117022|  847200066|Wal-Mart  ppd id:...|1737.26|1646415505|
|2017-11-24T19:00:...|1734117030| 1953761884|Home Depot     pp...|  384.5| 287177635|
|2017-11-24T19:00:...|1734117153|  847200066|unkn        Kings...|2907.57|1483931123|
|2017-11-24T19:00:...|1734117241|  486576507|              iTunes|2912.67|1663872965|
|2017-11-24T19:00:...|2076947146|  511877722|unkn     ccd id: ...|1915.35|1698762556|
|2017-11-24T19:00:...|2076947113| 1996661856|AutoZone  arc id:...| 1523.6|1759612211|
|2017-11-24T19:00:...|2076946994| 1898522855|Target    ppd id:...|2589.93|2074005445|
|2017-11-24T19:00:...|2076946121|  562903918|unkn    ccd id: 5...| 315.86|1773943669|
|2017-11-24T19:00:...|2076946063| 1070485878|Amazon.co

In [9]:
# Drop this DataFrame from the cache. blocking=False is the default, spelled
# out here: the call does not wait for the cached blocks to be deleted.
df.unpersist(blocking=False)

DataFrame[transacted_at: string, trx_id: string, retailer_id: string, description: string, amount: double, city_id: string]

In [3]:
# Spark keeps the lineage of DataFrames, so it recognises that `df` and
# `df_cached` describe the same data and uses the cached data to answer a
# query written against `df`.
#
# A cache should be built so that it serves as many queries as possible. If
# only the rows with amount > 300 were cached (a partial cache), a query that
# needs other amounts would have to scan the file again.
df_cached = df.cache()
df_cached.count()  # action that triggers the caching; its result is not displayed
df.filter("amount > 300").show()

+--------------------+----------+-----------+--------------------+-------+----------+
|       transacted_at|    trx_id|retailer_id|         description| amount|   city_id|
+--------------------+----------+-----------+--------------------+-------+----------+
|2017-11-24T19:00:...|1734117022|  847200066|Wal-Mart  ppd id:...|1737.26|1646415505|
|2017-11-24T19:00:...|1734117030| 1953761884|Home Depot     pp...|  384.5| 287177635|
|2017-11-24T19:00:...|1734117153|  847200066|unkn        Kings...|2907.57|1483931123|
|2017-11-24T19:00:...|1734117241|  486576507|              iTunes|2912.67|1663872965|
|2017-11-24T19:00:...|2076947146|  511877722|unkn     ccd id: ...|1915.35|1698762556|
|2017-11-24T19:00:...|2076947113| 1996661856|AutoZone  arc id:...| 1523.6|1759612211|
|2017-11-24T19:00:...|2076946994| 1898522855|Target    ppd id:...|2589.93|2074005445|
|2017-11-24T19:00:...|2076946121|  562903918|unkn    ccd id: 5...| 315.86|1773943669|
|2017-11-24T19:00:...|2076946063| 1070485878|Amazon.co

In [15]:
# Release the cached data held through `df_cached` (blocking=False is the default).
df_cached.unpersist(blocking=False)



DataFrame[transacted_at: string, trx_id: string, retailer_id: string, description: string, amount: double, city_id: string]

In [11]:
# Storage levels: cache() always uses the default level (MEMORY_AND_DISK, data
# kept deserialized), whereas persist() lets you choose the level explicitly,
# e.g. MEMORY_ONLY, MEMORY_AND_DISK, DISK_ONLY, MEMORY_ONLY_2, MEMORY_AND_DISK_2.
# NOTE(review): MEMORY_ONLY_SER and MEMORY_AND_DISK_SER are levels of the
# Scala/Java API - confirm in the pyspark.StorageLevel documentation which
# levels the installed PySpark version exposes.
#
# With MEMORY_ONLY, data that does not fit into memory does NOT cause an error:
# the partitions that do not fit are simply not cached and are recomputed on
# the fly each time they are needed.
import pyspark

df_persist = df.persist(pyspark.StorageLevel.MEMORY_ONLY)
# persist() is lazy, so an action is needed to trigger the caching; a write to
# the "noop" format is used for that here.
df_persist.write.format("noop").mode("overwrite").save()


In [14]:
# Clean the cache through the session catalog (session-wide), as opposed to
# calling unpersist() on one DataFrame at a time.
spark.catalog.clearCache()