In [17]:
!pip install pyspark



In [18]:
import pyspark
pyspark.__version__

'3.5.1'

In [1]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
        .master('local') \
        .appName('c4') \
        .getOrCreate()


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


24/05/31 22:10:19 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [20]:
!head data/bakery.csv

ProductName,OriginalPrice,ApplicablePrice,Type,PercentageDiscount,Category
Millbakers English Muffins 300g,132,132.00,FOOD,0,bakery
Millbakers Queen Cupcakes 260g,99,99.00,FOOD,0,bakery
Sweet Moment Lemon &amp; Poppy Muffin 6&#39;s 300g,180,180.00,FOOD,0,bakery
Sweet Moment Chocchip Muffin 6&#39;s300g,180,180.00,FOOD,0,bakery
Millbakers Queen Cupcakes 200g,84,84.00,FOOD,0,bakery
Millbakers Family Madeira Cake 750g,236,236.00,FOOD,0,bakery
Festive Milky White Bread 800G,127,127.00,FOOD,0,bakery
Millbakers Standard Madeira Cake 500g,165,165.00,FOOD,0,bakery
Joy Super Bakers Queen Cake 350g (12 Pieces),154,154.00,FOOD,0,bakery


In [21]:
!wc -l data/bakery.csv

253 data/bakery.csv


In [22]:
# create a spark dataframe by reading the same file
df = spark.read.csv('data/bakery.csv')

In [23]:
df.show()

+--------------------+-------------+---------------+----+------------------+--------+
|                 _c0|          _c1|            _c2| _c3|               _c4|     _c5|
+--------------------+-------------+---------------+----+------------------+--------+
|         ProductName|OriginalPrice|ApplicablePrice|Type|PercentageDiscount|Category|
|Millbakers Englis...|          132|         132.00|FOOD|                 0|  bakery|
|Millbakers Queen ...|           99|          99.00|FOOD|                 0|  bakery|
|Sweet Moment Lemo...|          180|         180.00|FOOD|                 0|  bakery|
|Sweet Moment Choc...|          180|         180.00|FOOD|                 0|  bakery|
|Millbakers Queen ...|           84|          84.00|FOOD|                 0|  bakery|
|Millbakers Family...|          236|         236.00|FOOD|                 0|  bakery|
|Festive Milky Whi...|          127|         127.00|FOOD|                 0|  bakery|
|Millbakers Standa...|          165|         165.00|FO

In [24]:
# We forgot the header
df = spark.read \
     .option('header', 'true') \
     .csv('data/bakery.csv')

df.show()

+--------------------+-------------+---------------+----+------------------+--------+
|         ProductName|OriginalPrice|ApplicablePrice|Type|PercentageDiscount|Category|
+--------------------+-------------+---------------+----+------------------+--------+
|Millbakers Englis...|          132|         132.00|FOOD|                 0|  bakery|
|Millbakers Queen ...|           99|          99.00|FOOD|                 0|  bakery|
|Sweet Moment Lemo...|          180|         180.00|FOOD|                 0|  bakery|
|Sweet Moment Choc...|          180|         180.00|FOOD|                 0|  bakery|
|Millbakers Queen ...|           84|          84.00|FOOD|                 0|  bakery|
|Millbakers Family...|          236|         236.00|FOOD|                 0|  bakery|
|Festive Milky Whi...|          127|         127.00|FOOD|                 0|  bakery|
|Millbakers Standa...|          165|         165.00|FOOD|                 0|  bakery|
|Joy Super Bakers ...|          154|         154.00|FO

In [25]:
# Check if the schema is correct (Spark always infers a csv file's schema as StringType for all the columns)
df.schema

StructType([StructField('ProductName', StringType(), True), StructField('OriginalPrice', StringType(), True), StructField('ApplicablePrice', StringType(), True), StructField('Type', StringType(), True), StructField('PercentageDiscount', StringType(), True), StructField('Category', StringType(), True)])

In [26]:
# Define our own schema using pandas
!pip install pandas

import pandas as pd

df_pandas = pd.read_csv('data/bakery.csv')



In [28]:
df_pandas.dtypes

ProductName            object
OriginalPrice           int64
ApplicablePrice       float64
Type                   object
PercentageDiscount      int64
Category               object
dtype: object

In [29]:
# use the pandas data frame to create a spark schema
spark.createDataFrame(df_pandas).schema

StructType([StructField('ProductName', StringType(), True), StructField('OriginalPrice', LongType(), True), StructField('ApplicablePrice', DoubleType(), True), StructField('Type', StringType(), True), StructField('PercentageDiscount', LongType(), True), StructField('Category', StringType(), True)])

In [30]:
from pyspark.sql import types

schema = types.StructType([
            types.StructField('ProductName', types.StringType(), True), 
            types.StructField('OriginalPrice', types.LongType(), True), 
            types.StructField('ApplicablePrice', types.LongType(), True), 
            types.StructField('Type', types.StringType(), True), 
            types.StructField('PercentageDiscount', types.LongType(), True), 
            types.StructField('Category', types.StringType(), True)
        ])

In [32]:
# Reread the data with the schema
df = spark.read \
     .option('header', 'true') \
     .schema(schema) \
     .csv('data/bakery.csv')

df.schema

StructType([StructField('ProductName', StringType(), True), StructField('OriginalPrice', LongType(), True), StructField('ApplicablePrice', LongType(), True), StructField('Type', StringType(), True), StructField('PercentageDiscount', LongType(), True), StructField('Category', StringType(), True)])

In [35]:
# write to a parquet file mode append to include all
df.write.parquet('data/pq/all_foods')

In [7]:
df = spark.read.parquet('data/pq/all_foods')
df.show()

+--------------------+-------------+---------------+----+------------------+---------+
|         ProductName|OriginalPrice|ApplicablePrice|Type|PercentageDiscount| Category|
+--------------------+-------------+---------------+----+------------------+---------+
|Red Bull Energy D...|          920|           null|FOOD|                15|beverages|
|Organic India Ori...|          799|           null|FOOD|                15|beverages|
|Quencher Life Pre...|          299|           null|FOOD|                 0|beverages|
|Mayers Natural Sp...|          103|           null|FOOD|                 0|beverages|
|Carrefour Mineral...|          495|           null|FOOD|                 0|beverages|
|Pick N Peel Orang...|          292|           null|FOOD|                13|beverages|
|Kericho Gold Pure...|          335|           null|FOOD|                 0|beverages|
|Quencher Life Pre...|          514|           null|FOOD|                 0|beverages|
|   Coca Cola Soda 2L|          190|       

Now we perform the sql queries on the full data