# <center> Introduction to Spark In-memory Computing via Python PySpark </center>

In [None]:
# Run the helper script that launches the Spark cluster (script contents are
# not part of this notebook). The next cell reads the master host from
# $SPARK_HOME/conf/master — presumably written by this script; TODO confirm.
!bash launch_spark_cluster.sh

In [None]:
import sys
import os
import pyspark

# Locations of the Spark installation, its configuration directory, and the
# Python interpreter that PySpark should use.
env_spark_home=os.path.join(os.environ['HOME'],"software","spark-2.4.5-bin-hadoop2.7")
env_spark_conf_dir=os.path.join(env_spark_home,"conf")
env_pyspark_python=os.path.join("/software","anaconda3","5.1.0","bin","python")

# Export the paths so that Spark picks them up from the environment.
os.environ['SPARK_HOME'] = env_spark_home
os.environ['SPARK_CONF_DIR'] = env_spark_conf_dir
os.environ['PYSPARK_PYTHON'] = env_pyspark_python

# The first line of conf/master holds the host name of the Spark master.
# Use a context manager so the file handle is closed even if reading fails.
with open(os.path.join(env_spark_conf_dir,"master")) as fp:
    node_list = fp.readlines()

# Fail early with a readable message instead of an IndexError (empty file) or
# a malformed "spark://:7077" URL (blank first line).
if not node_list or not node_list[0].strip():
    raise RuntimeError(
        "No Spark master host found in "
        + os.path.join(env_spark_conf_dir,"master")
        + "; was the cluster launched?"
    )

# `pyspark` is already imported at the top of this cell; the second import
# that used to sit here was redundant and has been dropped.
conf = pyspark.SparkConf()
conf.setMaster("spark://" + node_list[0].strip() + ":7077")
conf.setAppName('big-data-workshop')
# Resource requests for the driver and the executors.
conf.set("spark.driver.memory","5g")
conf.set("spark.executor.instances", "3")
conf.set("spark.executor.memory","13g")
conf.set("spark.executor.cores","8")

sc = pyspark.SparkContext(conf=conf)

print(sc)

### Airline On-Time Performance Data

**Spark SQL**
- Spark module for structured data processing
- provides more information about the structure of both the data and the computation being performed for additional optimization
- execute SQL queries written using either a basic SQL syntax or HiveQL

**DataFrame**
- a distributed collection of data organized into named columns
- conceptually equivalent to a table in a relational database or a data frame in R/Python, but with richer optimizations under the hood
- can be constructed from a wide array of sources such as: structured data files, tables in Hive, external databases, or existing RDDs

In [None]:
# Entry point for DataFrame / SQL functionality on top of the SparkContext.
sqlContext = pyspark.SQLContext(sc)

# Configure a CSV reader: the first line of each file is a header, and column
# types are inferred from the data.
csv_reader = (
    sqlContext.read
    .format("com.databricks.spark.csv")
    .options(header="true", inferschema="true")
)

# cache() only marks the DataFrame for caching; the data is materialized by
# the first action that runs on it.
df = csv_reader.load("/zfs/citi/airlines/data/").cache()
df.printSchema()

In [None]:
%%time
# First action on `df` since it was marked with cache(): this count has to
# read the source files, so its timing includes loading the data.
df.count()

In [None]:
%%time
# Same count again: compare the timing with the previous cell to see the
# effect of caching `df`.
df.count()

In [None]:
import pandas as pd
# Bring ten rows back to the driver and show them transposed (one source
# column per output row), which is easier to read for a wide table.
preview_rows = df.take(10)
pd.DataFrame(preview_rows, columns=df.columns).T

In [None]:
# Show the column names and inferred types again, as a reference for the
# discussion of problematic columns that follows.
df.printSchema()

## Problems?

- Columns with NA (too many NA?): TailNum, CancellationCode, CarrierDelay, WeatherDelay, NASDelay, SecurityDelay, LateAircraftDelay. 
  - Any meaningful data?
- Mismatched types: ArrDelay and DepDelay contain numbers but were inferred as type string. Why?

In [None]:
# Count how many different values the TailNum column takes.
df_NA = df.select("TailNum").distinct()
df_NA.count()

In [None]:
# Count how many different values the WeatherDelay column takes.
df_WeatherDelay = df.select("WeatherDelay").distinct()
df_WeatherDelay.count()

In [None]:
# Bring the distinct WeatherDelay values back to the driver to inspect them.
df_WeatherDelay.collect()

### Challenge

- Study CarrierDelay, WeatherDelay, NASDelay, and SecurityDelay
- Provide some insights into these columns

## Remove NAs and convert to integers

In [None]:
# Distinct values of each delay column, kept for later inspection.
df_ArrDelay = df.select("ArrDelay").distinct()
df_DepDelay = df.select("DepDelay").distinct()

# Print the number of distinct values: arrival delay first, then departure delay.
for distinct_values in (df_ArrDelay, df_DepDelay):
    print(distinct_values.count())

In [None]:
# Collect the distinct ArrDelay values on the driver and display them.
# Note: despite the `pd_` prefix, collect() returns a plain Python list of
# Row objects, not a pandas object.
pd_arrDelay = df_ArrDelay.collect()
pd_arrDelay

There is the problem: "NA"
Can we drop it and recast?

In [None]:
# Keep only rows where BOTH delay columns hold a real value.
# The two "not NA" conditions must be combined with `&`: with `|`, a row with
# "NA" in only one of the columns would be kept and would turn into a null
# once the column is cast to integer.
df_clean = df.filter((~df.ArrDelay.like("NA")) & (~df.DepDelay.like("NA"))).cache()

In [None]:
%%time
# First action on `df_clean`: runs the filter and reports how many rows remain.
df_clean.count()

Recast string to int

In [None]:
from pyspark.sql.types import IntegerType

# Replace each delay column with an integer-typed version of itself,
# ArrDelay first and then DepDelay.
for delay_col in ("ArrDelay", "DepDelay"):
    df_clean = df_clean.withColumn(delay_col, df_clean[delay_col].cast(IntegerType()))

# Confirm that both columns now show up as integers.
df_clean.printSchema()

Big calculation ...

In [None]:
%%time
numeric_features = [t[0] for t in df_clean.dtypes if t[1] == 'int']
print(numeric_features)
df_clean.select(numeric_features).describe().toPandas().transpose()

Before running any regressions, we want a first impression of how the variables relate to each other: a scatter matrix. 
However, we cannot plot the whole dataset, because the plotted data must first be moved back to the driver; instead, we plot a small random sample. 

Look for sample under https://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrame

In [None]:
# Restrict to the integer columns, then pull a tiny random sample
# (fraction 0.00001, fixed seed 3) back to the driver as a pandas DataFrame.
numeric_columns_df = df_clean.select(numeric_features)
numeric_data = numeric_columns_df.sample(fraction=0.00001, seed=3).toPandas()

In [None]:
# Summary statistics of the sampled rows (computed locally by pandas).
numeric_data.describe()

In [None]:
# Grid of pairwise scatter plots for the sampled numeric columns.
axs = pd.plotting.scatter_matrix(numeric_data, figsize=(11, 11))
n = len(numeric_data.columns)

# Walk the left-most column and the bottom row of the grid in step: these are
# the axes whose labels are tidied below.
for left_ax, bottom_ax in zip(axs[:n, 0], axs[n - 1, :n]):
    # Horizontal, right-aligned y labels; no y tick marks.
    left_ax.yaxis.label.set_rotation(0)
    left_ax.yaxis.label.set_ha('right')
    left_ax.set_yticks(())
    # Vertical x labels; no x tick marks.
    bottom_ax.xaxis.label.set_rotation(90)
    bottom_ax.set_xticks(())

What are your observations?

In [None]:
%%time
for c in numeric_features:
    print("Correlation to departure delay for ", c, df_clean.stat.corr('DepDelay',c))

In [None]:
from pyspark.ml.feature import VectorAssembler

# Pack the three calendar columns into a single vector column 'features'.
vectorAssembler = VectorAssembler(
    inputCols=['Month','DayofMonth','DayOfWeek'],
    outputCol='features',
)
assembled_df = vectorAssembler.transform(df_clean)

# Keep only the feature vector and the label (departure delay).
v_df = assembled_df.select(['features','DepDelay'])
v_df.show(10)

In [None]:
# Split into 70% training / 30% test data. A fixed seed is passed so that
# re-running the notebook uses the same seed for the split, keeping the
# metrics reported below comparable between runs.
SPLIT_SEED = 42
splits = v_df.randomSplit([0.7, 0.3], seed=SPLIT_SEED)
train_df = splits[0]
test_df = splits[1]

In [None]:
from pyspark.ml.regression import LinearRegression
# Linear regression of DepDelay on the assembled feature vector, with
# elastic-net regularization.
lr = LinearRegression(
    featuresCol='features',
    labelCol='DepDelay',
    maxIter=10,
    regParam=0.3,
    elasticNetParam=0.8,
)
lr_model = lr.fit(train_df)

# Report the fitted parameters.
for label, value in (
    ("Coefficients: ", lr_model.coefficients),
    ("Intercept: ", lr_model.intercept),
):
    print(label + str(value))

In [None]:
# Fit statistics reported by the fitted model's summary object: root mean
# squared error and R^2.
trainingSummary = lr_model.summary
print("RMSE: %f" % trainingSummary.rootMeanSquaredError)
print("r2: %f" % trainingSummary.r2)

In [None]:
# Summary statistics of the training set, to put the RMSE above in
# perspective against the spread of DepDelay.
train_df.describe().show()

In [None]:
# Apply the fitted model to the held-out test set and show the predicted
# value next to the actual DepDelay and the input features.
lr_predictions = lr_model.transform(test_df)
lr_predictions.select("prediction","DepDelay","features").show()

In [None]:
from pyspark.ml.evaluation import RegressionEvaluator
# Score the test-set predictions against the actual DepDelay using R^2.
lr_evaluator = RegressionEvaluator(
    predictionCol="prediction",
    labelCol="DepDelay",
    metricName="r2",
)
test_r2 = lr_evaluator.evaluate(lr_predictions)
print("R Squared (R2) on test data = %g" % test_r2)

In [None]:
# Run the helper script that shuts the Spark cluster down (script contents are
# not part of this notebook).
!bash stop_spark_cluster.sh