# Creating a SPARK Session 

In [115]:
from pyspark.sql import SparkSession

spark = (SparkSession
         .builder
         .appName("My Application")
         .config("spark.ui.port", "4050")
         .getOrCreate()
         )

# Load the dataset

In [116]:

dataset_path = "s3://data-engg-suman/dataset/orders.csv"

# Load the data (Read is an ACTION)
ordersdf = (spark
            .read
            .option("header", True)       # for headers (1st line of the dataset)
            .option("inferSchema", True)  # for datatypes
            .csv(dataset_path)
            )

# Explore the dataset

In [117]:
# Print the first 10 lines 
ordersdf.show(10)

+--------+-------------------+-----------------+---------------+
|order_id|         order_date|order_customer_id|   order_status|
+--------+-------------------+-----------------+---------------+
|       1|2013-07-25 00:00:00|            11599|         CLOSED|
|       2|2013-07-25 00:00:00|              256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:00|            12111|       COMPLETE|
|       4|2013-07-25 00:00:00|             8827|         CLOSED|
|       5|2013-07-25 00:00:00|            11318|       COMPLETE|
|       6|2013-07-25 00:00:00|             7130|       COMPLETE|
|       7|2013-07-25 00:00:00|             4530|       COMPLETE|
|       8|2013-07-25 00:00:00|             2911|     PROCESSING|
|       9|2013-07-25 00:00:00|             5657|PENDING_PAYMENT|
|      10|2013-07-25 00:00:00|             5648|PENDING_PAYMENT|
+--------+-------------------+-----------------+---------------+
only showing top 10 rows



In [118]:
# Print the schema 
ordersdf.printSchema()

root
 |-- order_id: integer (nullable = true)
 |-- order_date: timestamp (nullable = true)
 |-- order_customer_id: integer (nullable = true)
 |-- order_status: string (nullable = true)



In [119]:
groupedOrderdf = (ordersdf.where("order_customer_id > 1000")              # This is NOT an action, like filer in RDD 
                          .select("order_id", "order_customer_id")        # This is NOT an action, like map in RDD
                          .groupBy("order_customer_id")                   # This is NOT an action, like groupByKey/reduceByKey in RDD
                          .count()                                        # This is NOT an action, like count in RDD 
                 )

In [120]:
groupedOrderdf.show()  # This is an action, by default shows 20 rows

+-----------------+-----+
|order_customer_id|count|
+-----------------+-----+
|             1088|    4|
|            12046|    4|
|             6357|    7|
|             4101|    5|
|             9465|    7|
|             2122|    2|
|            10206|    4|
|            10817|    6|
|             3918|    7|
|            11141|    7|
|             1591|    6|
|            10623|    8|
|             2366|    4|
|             4519|    3|
|            11317|    7|
|             1342|    3|
|             8638|    8|
|             9852|    2|
|            10362|    6|
|             7880|    5|
+-----------------+-----+
only showing top 20 rows



In [121]:
groupedOrderdf.show(5) 

+-----------------+-----+
|order_customer_id|count|
+-----------------+-----+
|             1088|    4|
|            12046|    4|
|             6357|    7|
|             4101|    5|
|             9465|    7|
+-----------------+-----+
only showing top 5 rows



In [122]:
spark.stop()

# Problem 1 

## Step 1 : Create a Spark Session

In [123]:
from pyspark import SparkConf
from pyspark.sql import SparkSession

my_conf = SparkConf()
my_conf.set("spark.app.name", "Problem 1")
my_conf.set("spark.ui.port", "4050")
spark = SparkSession \
            .builder \
            .config(conf=my_conf) \
            .getOrCreate()

## Step 2 : Set the logging level to `error`

In [124]:
spark.sparkContext.setLogLevel("ERROR")

## Step 3 : Using the standard dataframe reader API load the file and create a dataframe

In [125]:
from pyspark.sql.types import StructField, StructType, IntegerType, StringType, FloatType

window_data_schema = StructType(
    [
        StructField('country', StringType()),
        StructField('weeknum', IntegerType()),
        StructField('numinvoices', IntegerType()),
        StructField('totalquantity', IntegerType()), 
        StructField('invoicevalue', FloatType())  
    ]
)

# Read the data from the CSV 
DATASET_PATH = "s3://data-engg-suman/dataset/windowdata.csv"

windows_data_df = spark \
                    .read \
                    .format('csv') \
                    .option('header', True) \
                    .schema(window_data_schema) \
                    .option('path', DATASET_PATH) \
                    .load()


In [126]:
windows_data_df.show()

+--------------+-------+-----------+-------------+------------+
|       country|weeknum|numinvoices|totalquantity|invoicevalue|
+--------------+-------+-----------+-------------+------------+
|       Germany|     48|         11|         1795|     3309.75|
|     Lithuania|     48|          3|          622|     1598.06|
|       Germany|     49|         12|         1852|     4521.39|
|       Bahrain|     51|          1|           54|      205.74|
|       Iceland|     49|          1|          319|      711.79|
|         India|     51|          5|           95|      276.84|
|     Australia|     50|          2|          133|      387.95|
|         Italy|     49|          1|           -2|       -17.0|
|         India|     49|          5|         1280|      3284.1|
|         Spain|     50|          2|          400|     1049.01|
|United Kingdom|     51|        200|        28782|    75103.46|
|        Norway|     49|          1|         1730|     1867.98|
|United Kingdom|     48|        478|    

## Step 4 : Use the standard dataframe `writer` api to save it in `parquet` format. 
While saving make sure data is stored where we should have a folder for each country, weeknum (combination)

In [127]:

import os 

OUTPUT_PATH = "s3://data-engg-suman/processed_data"
OUTPUT_FILE_PATH = os.path.join(OUTPUT_PATH, 'windows_data')

windows_data_df \
    .write \
    .partitionBy('country', 'weeknum') \
    .parquet(path=OUTPUT_FILE_PATH, mode='overwrite')


                                                                                

In [128]:
windows_data_parquat_df = spark \
                            .read \
                            .format('parquet') \
                            .option('path', OUTPUT_FILE_PATH) \
                            .load()

In [129]:
windows_data_parquat_df.show()

                                                                                

+-----------+-------------+------------+---------------+-------+
|numinvoices|totalquantity|invoicevalue|        country|weeknum|
+-----------+-------------+------------+---------------+-------+
|          1|          107|      358.25|      Australia|     48|
|          1|          196|      320.08|          Japan|     48|
|         15|         1973|     5065.79|        Germany|     50|
|          3|          622|     1598.06|      Lithuania|     48|
|          4|          726|     1844.67|       Portugal|     49|
|          2|          133|      387.95|      Australia|     50|
|          2|            3|      257.04|        Austria|     50|
|          1|          164|       427.8|          Italy|     48|
|          2|         3897|     7384.99|          Japan|     49|
|          1|         1254|       892.8|        Finland|     50|
|          1|           -2|       -17.0|          Italy|     49|
|          4|         1299|     2808.16|         France|     48|
|          1|          21

                                                                                

In [130]:
windows_data_df.show()

+--------------+-------+-----------+-------------+------------+
|       country|weeknum|numinvoices|totalquantity|invoicevalue|
+--------------+-------+-----------+-------------+------------+
|       Germany|     48|         11|         1795|     3309.75|
|     Lithuania|     48|          3|          622|     1598.06|
|       Germany|     49|         12|         1852|     4521.39|
|       Bahrain|     51|          1|           54|      205.74|
|       Iceland|     49|          1|          319|      711.79|
|         India|     51|          5|           95|      276.84|
|     Australia|     50|          2|          133|      387.95|
|         Italy|     49|          1|           -2|       -17.0|
|         India|     49|          5|         1280|      3284.1|
|         Spain|     50|          2|          400|     1049.01|
|United Kingdom|     51|        200|        28782|    75103.46|
|        Norway|     49|          1|         1730|     1867.98|
|United Kingdom|     48|        478|    

## Step 5: Also use the dataframe write api to save the data in Avro format. 
While saving make sure data is stored where we should have a folder for each country.

In [131]:
# Ending the previous Spark Session 
spark.stop()

In [132]:
from pyspark import SparkConf
from pyspark.sql import SparkSession

my_conf = SparkConf()
my_conf.set("spark.app.name", "Problem 1 (Avro)")
my_conf.set("spark.ui.port", "4050")
my_conf.set("spark.jars", "s3://data-engg-suman/bin/spark-avro_2.11-2.4.4.jar")
spark = SparkSession \
            .builder \
            .config(conf=my_conf) \
            .getOrCreate()

In [133]:
# First load the data in the DF 

from pyspark.sql.types import StructField, StructType, IntegerType, StringType, FloatType

window_data_schema = StructType(
    [
        StructField('country', StringType()),
        StructField('weeknum', IntegerType()),
        StructField('numinvoices', IntegerType()),
        StructField('totalquantity', IntegerType()), 
        StructField('invoicevalue', FloatType())  
    ]
)

# Read the data from the CSV 
DATASET_PATH = "s3://data-engg-suman/dataset/windowdata.csv"

windows_data_df = spark \
                    .read \
                    .format('csv') \
                    .option('header', True) \
                    .schema(window_data_schema) \
                    .option('path', DATASET_PATH) \
                    .load()

In [134]:
# Now writing the data back in AVRO format 

import os 

OUTPUT_PATH = "s3://data-engg-suman/processed_data"
OUTPUT_FILE_PATH = os.path.join(OUTPUT_PATH, 'windows_data_avro')
    
windows_data_df \
    .write \
    .format("avro") \
    .mode('Overwrite') \
    .option('path', OUTPUT_FILE_PATH) \
    .save() 


AnalysisException:  Failed to find data source: avro. Avro is built-in but external data source module since Spark 2.4. Please deploy the application as per the deployment section of "Apache Avro Data Source Guide".        

In [135]:
import pyspark
pyspark.__version__

'3.3.0+amzn.0'

In [136]:
spark.stop()

# Problem 2 

## Step 1: Create spark session

In [137]:
from pyspark import SparkConf
from pyspark.sql import SparkSession

my_conf = SparkConf()
my_conf.set("spark.app.name", "Problem 2")
my_conf.set("spark.ui.port", "4050")
spark = SparkSession \
            .builder \
            .config(conf=my_conf) \
            .getOrCreate()

## Step 2: Set the logging level to error

In [138]:
spark.sparkContext.setLogLevel("ERROR")

## Step 3: Load the data file windowdata.csv as a rdd

In [139]:
# Read the data from the CSV 
DATASET_PATH = "s3://data-engg-suman/dataset/windowdata.csv"

rdd = spark.sparkContext.textFile(DATASET_PATH)



In [140]:
rdd.take(1)

['Spain,49,1,67,174.72']

## Step 4: Create a dataframe from this RDD by defining case class

In [141]:
from pyspark.sql.types import StructField, StructType, IntegerType, StringType, FloatType

window_data_schema = StructType(
    [
        StructField('country', StringType()),
        StructField('weeknum', IntegerType()),
        StructField('numinvoices', IntegerType()),
        StructField('totalquantity', IntegerType()), 
        StructField('invoicevalue', FloatType())  
    ]
)

In [142]:
def parser(line):
    _ = line.split(',')
    return str(_[0]), int(_[1]), int(_[2]), int(_[3]), float(_[4])

In [143]:
rdd.map(parser).take(1)

[('Spain', 49, 1, 67, 174.72)]

In [144]:
final_df = rdd.map(parser).toDF(window_data_schema)

In [145]:
type(final_df)

pyspark.sql.dataframe.DataFrame

In [146]:
final_df.show()

+--------------+-------+-----------+-------------+------------+
|       country|weeknum|numinvoices|totalquantity|invoicevalue|
+--------------+-------+-----------+-------------+------------+
|         Spain|     49|          1|           67|      174.72|
|       Germany|     48|         11|         1795|     3309.75|
|     Lithuania|     48|          3|          622|     1598.06|
|       Germany|     49|         12|         1852|     4521.39|
|       Bahrain|     51|          1|           54|      205.74|
|       Iceland|     49|          1|          319|      711.79|
|         India|     51|          5|           95|      276.84|
|     Australia|     50|          2|          133|      387.95|
|         Italy|     49|          1|           -2|       -17.0|
|         India|     49|          5|         1280|      3284.1|
|         Spain|     50|          2|          400|     1049.01|
|United Kingdom|     51|        200|        28782|    75103.46|
|        Norway|     49|          1|    

## Step 5: Save this dataframe in JSON format in 8 files.

In [147]:
DATASET_PATH = "s3://data-engg-suman/processed_data/windowdata.json"


In [148]:
final_df.write \
    .format('json') \
    .mode('Overwrite') \
    .option('path', DATASET_PATH) \
    .save() 

In [149]:
spark.stop() 