# PySpark Scaling Demo

## CIML Summer Institute

#### UC San Diego

--- 

## Setup

In [None]:
# Initialize Spark

import pyspark
from pyspark.sql import SparkSession, Row

# Build the Spark configuration for a local run.
# 'local[N]' runs Spark on this machine with N worker threads: edit N and
# re-run the notebook to compare execution times.
conf = (
    pyspark.SparkConf()
    .set('spark.master', 'local[1]')
    .set('spark.app.name', 'Spark Demo')
)

# Reuse an already-running session if there is one, otherwise start a new one.
spark = SparkSession.builder.config(conf=conf).getOrCreate()

# Report the installed PySpark version.
print(pyspark.version.__version__)

In [None]:
# Show the default parallelism of the active SparkContext.
# NOTE(review): with a local[N] master this is expected to equal N — confirm after changing N.
print(spark.sparkContext.defaultParallelism)

#### Record execution start time

In [None]:
import time
# Wall-clock timestamp (seconds) taken before any data processing; a later
# cell subtracts it from time.time() to report the total elapsed time.
start_time = time.time()

## Read data

In [None]:
# Read data into Spark DataFrame 
# Data source: https://jmcauley.ucsd.edu/data/amazon/ 

from os.path import expanduser

# The dataset is expected under the current user's home directory, in ~/data/.
HOME = expanduser("~")
data_path = f"{HOME}/data/"
dataFileName = f"{data_path}BookReviews_5M.txt"

# Load the text file as a DataFrame; later cells query its 'value' column.
textDF = spark.read.text(dataFileName)

## Process data

#### Count number of rows

In [None]:
%%time

# count() is a Spark action: it forces the input to be read so the rows can
# be counted, so the time reported for this cell includes that work.
textDF.count()

#### Show first few rows

In [None]:
# Print a preview of the first rows using show()'s default settings.
textDF.show()

#### Filter for reviews containing the string 'science' (case-sensitive substring match)

In [None]:
%%time

from pyspark.sql.functions import col

# Keep only rows whose 'value' text contains the substring 'science'.
# filter() is a lazy transformation: this line only defines the query, so
# the time reported for this cell does not include scanning the data. The
# scan happens when an action such as show() or count() runs.
filteredDF = textDF.filter(col('value').contains('science'))

In [None]:
# Preview the matching reviews; by default show() prints truncated values.
# Uncomment the next line (and comment out the last one) to show each
# review in full, untruncated:
# filteredDF.show(truncate=False)
filteredDF.show()

In [None]:
%%time 

# count() is an action: it runs the filter over the whole dataset and counts
# the matching rows, so this cell's timing reflects the real filtering cost.
filteredDF.count()

#### Print elapsed time (in seconds) since execution start

In [None]:
# Elapsed wall-clock time, in seconds, since start_time was recorded.
print(time.time() - start_time)

## Stop Spark Session

In [None]:
# Stop the session's underlying SparkContext now that the demo is finished.
spark.stop()