### Transformation and Action

In [2]:
from pyspark.sql.functions import (lit,col,concat,split,expr)
import os
from pyspark.sql import functions as F
import json

from pyspark.sql.types import StructType,StructField,StringType

import time

In [3]:
# start time
t0 = int(round(time.time() * 1000))

from IPython.display import Markdown, display
def printmd(string):
    display(Markdown('# <span style="color:red">'+string+'</span>'))

if ('sc' in locals() or 'sc' in globals()):
    printmd('<<<<<!!!!! It seems that you are running in a IBM Watson Studio Apache Spark Notebook. Please run it in an IBM Watson Studio Default Runtime (without Apache Spark) !!!!!>>>>>')

!pip install pyspark==2.4.5

try:
    from pyspark import SparkContext, SparkConf
    from pyspark.sql import SparkSession
except ImportError as e:
    printmd('<<<<<!!!!! Please restart your kernel after installing Apache Spark !!!!!>>>>>')
    
sc = SparkContext.getOrCreate(SparkConf().setMaster("local[*]"))

spark = SparkSession \
    .builder \
    .getOrCreate()

print("Apache Spark session created.")




current_dir = os.path.dirname("")
relative_path = "../04 Laziness/data/NCHS_-_Teen_Birth_Rates_for_Age_Group_15-19_in_the_United_States_by_County.csv"
absolute_file_path = os.path.join(current_dir, relative_path)

absolute_file_path

mode=""

#t0 = int(round(time.time() * 1000))

t1 = int(round(time.time() * 1000))

print("1. Creating a session ........... {}".format(t1 - t0))

# Step 2 - Reads a CSV file with header, stores it in a dataframe
df = spark.read.csv(header=True, inferSchema=True,path=absolute_file_path)

initalDf = df
t2 = int(round(time.time() * 1000))
print("2. Loading initial dataset ...... {}".format(t2 - t1))

# Step 3 - Build a bigger dataset
for x in range(60):
    df = df.union(initalDf)

t3 = int(round(time.time() * 1000))
print("3. Building full dataset ........ {}".format(t3 - t2))

# Step 4 - Cleanup. preparation
df = df.withColumnRenamed("Lower Confidence Limit", "lcl") \
       .withColumnRenamed("Upper Confidence Limit", "ucl")

t4 = int(round(time.time() * 1000))
print("4. Clean-up ..................... {}".format(t4 - t3))

# Step 5 - Transformation
if mode.lower != "noop":
    df =  df.withColumn("avg", expr("(lcl+ucl)/2")) \
            .withColumn("lcl2", col("lcl")) \
            .withColumn("ucl2", col("ucl"))
    if mode.lower == "full":
        df = df.drop("avg","lcl2","ucl2")


t5 = int(round(time.time() * 1000))
print("5. Transformations  ............. {}".format(t5 - t4))

# Step 6 - Action
df.collect()
t6 = int(round(time.time() * 1000))
print("6. Final action ................. {}".format(t6 - t5))

print("")
print("# of records .................... {}".format(df.count))

Apache Spark session created.
1. Creating a session ........... 25554
2. Loading initial dataset ...... 40086
3. Building full dataset ........ 5474
4. Clean-up ..................... 102
5. Transformations  ............. 1799
6. Final action ................. 240446

# of records .................... <bound method DataFrame.count of DataFrame[Year: int, State: string, County: string, State FIPS Code: int, County FIPS Code: int, Combined FIPS Code: int, Birth Rate: double, lcl: double, ucl: double, avg: double, lcl2: double, ucl2: double]>


In [4]:
df.count

<bound method DataFrame.count of DataFrame[Year: int, State: string, County: string, State FIPS Code: int, County FIPS Code: int, Combined FIPS Code: int, Birth Rate: double, lcl: double, ucl: double, avg: double, lcl2: double, ucl2: double]>

In [5]:
df.schema

StructType(List(StructField(Year,IntegerType,true),StructField(State,StringType,true),StructField(County,StringType,true),StructField(State FIPS Code,IntegerType,true),StructField(County FIPS Code,IntegerType,true),StructField(Combined FIPS Code,IntegerType,true),StructField(Birth Rate,DoubleType,true),StructField(lcl,DoubleType,true),StructField(ucl,DoubleType,true),StructField(avg,DoubleType,true),StructField(lcl2,DoubleType,true),StructField(ucl2,DoubleType,true)))

In [6]:
df.printSchema()

root
 |-- Year: integer (nullable = true)
 |-- State: string (nullable = true)
 |-- County: string (nullable = true)
 |-- State FIPS Code: integer (nullable = true)
 |-- County FIPS Code: integer (nullable = true)
 |-- Combined FIPS Code: integer (nullable = true)
 |-- Birth Rate: double (nullable = true)
 |-- lcl: double (nullable = true)
 |-- ucl: double (nullable = true)
 |-- avg: double (nullable = true)
 |-- lcl2: double (nullable = true)
 |-- ucl2: double (nullable = true)

