# Setup Spark session

## Variables

Define important configuration variables here. 
They are used later for setting up the Spark session.
Also define the URL to the data.


In [1]:
# How many Spark executors (worker instances) this session requests
EXECUTOR_INSTANCES = 2

# Memory granted to each executor
EXECUTOR_MEM = "6g"

# S3 location of the event data. Swap the active line to move between
# the big data set and the mini data set.
# EVENT_DATA_URL = "s3a://udacity/sparkify/sparkify_event_data.json"
EVENT_DATA_URL = "s3a://udacity-dsnd/sparkify/mini_sparkify_event_data.json"

## Install S3 Jars

To access S3, two additional JAR files are required.
A simple shell script first checks whether they already exist.
If the JAR files are missing, they are downloaded from the central Maven repository.

In [2]:
# Shell escape: runs the helper script located next to this notebook.
# NOTE(review): the script presumably stores the JARs where "spark.jars" expects them - confirm.
!./install-s3-jars.sh

DOWNLOADING necessary jar files for accessing S3 buckets for Spark 3.3.2


## S3 credentials

For accessing the S3 bucket, credentials are needed.  
To avoid committing the secret access key to the Git repository in plain text,  
it is stored in encrypted form; the following two helper functions encrypt and decrypt it:  
<sup>(Note: The ".seed.txt" file contains the master password for en-/decryption. It is not checked into the Git repository, see .gitignore)</sup>

In [3]:
from cryptography.fernet import Fernet
import base64

def _seed_cipher():
    """
    builds the Fernet cipher from the seed (master-password) stored in the file ".seed.txt"

    The seed is repeated and cut to 32 characters, because a Fernet key
    has to be 32 bytes long (handed over base64-encoded).

    Output: a Fernet instance

    Raises: ValueError if ".seed.txt" is empty or contains only whitespace
    """
    with open('.seed.txt') as f:
        seed = f.read().strip()
    if not seed:
        # an empty seed would produce an empty key and an unspecific Fernet error
        raise ValueError('seed file ".seed.txt" is empty')
    key_material = (seed * 32)[:32].encode('ascii')
    # Fernet expects the url-safe base64 alphabet; the decoded key bytes
    # are the same as with the standard alphabet
    return Fernet(base64.urlsafe_b64encode(key_material))


def decrypt(encrypted_text):
    """
    decrypts an encrypted text. The seed (master-password) for decryption is read from the file ".seed.txt"

    Input: encrypted_text (a Fernet token as string)

    Output: the decrypted text. If the text was not encrypted with the same seed,
            an exception is raised.

    Raises: ValueError if the seed file is empty
    """
    cipher = _seed_cipher()
    token = encrypted_text.encode('ascii')
    # UTF-8 is a superset of ASCII, texts encrypted as ASCII decode unchanged
    return cipher.decrypt(token).decode('utf-8')


def encrypt(plain_text):
    """
    encrypts a given text. The seed (master-password) for encryption is read from the file ".seed.txt"

    Input: plain_text (may contain non-ASCII characters, it is encoded as UTF-8)

    Output: the encrypted text (a Fernet token as string)

    Raises: ValueError if the seed file is empty
    """
    cipher = _seed_cipher()
    token = cipher.encrypt(plain_text.encode('utf-8'))
    return token.decode('ascii')


# Technical account that only has read permission on the S3 buckets.
# The access key id is kept in plain text; the secret key is kept as an
# encrypted token and decrypted when this cell runs (reads ".seed.txt").
AWS_ACCESS_KEY_ID = 'V6ge1JcQpvyYGJjb'
AWS_SECRET_ACCESS_KEY = decrypt(
    'gAAAAABkDFI6865LaVJVgtTYo0aMx9-JTPbTo6cwOUjg5eNNPsZhBDoHbRZ8xuXQT0ImNfvqcecZuoJd1VzYQEpBaxyCnKvosii8O1KeqoL2NwKdKtL_AUfT4eW4dvJVP--VjEvc0gB4'
)


## Jupyter Host IP for communication
When the Jupyter notebook runs in a Kubernetes cluster together with Spark, 
the worker nodes need to know the IP address of the submitter in order to communicate with it, 
because the local hostname (POD) is not resolvable (at least in my environment).

In [4]:
import socket

# Resolve this host's own name to an IP address; the address is handed to
# Spark later because the hostname itself is not resolvable for the workers.
OWN_IP = socket.gethostbyname(socket.gethostname())

## Create Spark Session

Create a new Spark Session "Sparkify". The configuration is specific for my cluster.


In [5]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession

# Application name of the Spark session to create
APP_NAME = "Sparkify"

# URL of the Spark master the session connects to
SPARK_MASTER = "spark://bit-spark-master-svc.spark.svc.cluster.local:7077"

# Endpoint of the S3 storage (MinIO is S3 compatible)
S3_HOST = "minio-api-service.minio.svc"

print(f'### SETUP SPARK SESSION "{APP_NAME}"')
spark = (
    SparkSession.builder
    .master(SPARK_MASTER)
    # additional JARs needed for the s3a:// file system
    .config("spark.jars","/home/jovyan/jars/aws-java-sdk-bundle-1.11.1026.jar,/home/jovyan/jars/hadoop-aws-3.3.2.jar")
    # the workers reach the driver via its IP address
    .config("spark.driver.host", OWN_IP)
    # S3 access: endpoint, plain (non-SSL) connection, credentials, path-style URLs
    .config("spark.hadoop.fs.s3a.endpoint", S3_HOST)
    .config("spark.hadoop.fs.s3a.connection.ssl.enabled", "false")
    .config("spark.hadoop.fs.s3a.access.key", AWS_ACCESS_KEY_ID)
    .config("spark.hadoop.fs.s3a.secret.key", AWS_SECRET_ACCESS_KEY)
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    # executor sizing, taken from the variables defined at the top
    .config("spark.executor.instances", EXECUTOR_INSTANCES)
    .config("spark.executor.memory", EXECUTOR_MEM)
    .appName(APP_NAME)
    .getOrCreate()
)
print(f"Spark version: {spark.version}")

# Limit the Spark log output to level WARN
sc = spark.sparkContext
sc.setLogLevel("WARN")


### SETUP SPARK SESSION "Sparkify"
Spark version: 3.3.2


## Load Data 

A DataFrame `df` is loaded from the given S3 URL.

In [6]:
# Read the JSON event data from S3 into a DataFrame
print(f"### LOAD DATA {EVENT_DATA_URL}")
df = spark.read.json(EVENT_DATA_URL)
print("finished loading data")

# Ask Spark to keep the DataFrame persisted for the steps that follow
print("### PERSIST")
df = df.persist()

### LOAD DATA s3a://udacity-dsnd/sparkify/mini_sparkify_event_data.json
finished loading data
### PERSIST


DataFrame[artist: string, auth: string, firstName: string, gender: string, itemInSession: bigint, lastName: string, length: double, level: string, location: string, method: string, page: string, registration: bigint, sessionId: bigint, song: string, status: bigint, ts: bigint, userAgent: string, userId: string]

# Action

Spark session is started, data is accessible via `df`.
Anything can be done now   :-)
...

# Stop Session

To free up allocated resources, the session has to be stopped.


In [7]:
# Stop the session so that the resources allocated for it are freed
print("### STOP SPARK SESSION")
spark.stop()

### STOP SPARK SESSION
