In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

### File Reading

In [0]:
dbutils.fs.ls("/Volumes/workspace/pyspark/filedata")

[FileInfo(path='dbfs:/Volumes/workspace/pyspark/filedata/BigMart Sales.csv', name='BigMart Sales.csv', size=869537, modificationTime=1754407798000)]

In [0]:
df = spark.read.format("csv").option("header", "true") \
                              .option("inferSchema", "true") \
                              .load("/Volumes/workspace/pyspark/filedata/BigMart Sales.csv")

In [0]:
df_json = spark.read.format("json").option("header", "true") \
                                .option("inferSchema", "true") \
                                .option("multiline", "false") \
                                .load("/Volumes/workspace/pyspark/filedata/drivers.json")

### Schema Definition

In [0]:
df.printSchema()

root
 |-- Item_Identifier: string (nullable = true)
 |-- Item_Weight: double (nullable = true)
 |-- Item_Fat_Content: string (nullable = true)
 |-- Item_Visibility: double (nullable = true)
 |-- Item_Type: string (nullable = true)
 |-- Item_MRP: double (nullable = true)
 |-- Outlet_Identifier: string (nullable = true)
 |-- Outlet_Establishment_Year: integer (nullable = true)
 |-- Outlet_Size: string (nullable = true)
 |-- Outlet_Location_Type: string (nullable = true)
 |-- Outlet_Type: string (nullable = true)
 |-- Item_Outlet_Sales: double (nullable = true)



In [0]:
ddl = """
    Item_Identifier string, 
    Item_Weight string, 
    Item_Fat_Content string, 
    Item_Visibility double, 
    Item_Type string, 
    Item_MRP double, 
    Outlet_Identifier string, 
    Outlet_Establishment_Year integer, 
    Outlet_Size string, 
    Outlet_Location_Type string, 
    Outlet_Type string, 
    Item_Outlet_Sales double
"""
df = spark.read.format("csv").option("header", "true") \
                              .option("inferSchema", "true") \
                              .schema(ddl) \
                              .load("/Volumes/workspace/pyspark/filedata/BigMart Sales.csv")

#### StructType

In [0]:
struct_ddl = StructType([
    StructField("Item_Identifier", StringType(), True),
    StructField("Item_Weight", StringType(), True), 
    StructField("Item_Fat_Content", StringType(), True), 
    StructField("Item_Visibility", DoubleType(), True), 
    StructField("Item_Type", StringType(), True), 
    StructField("Item_MRP", DoubleType(), True), 
    StructField("Outlet_Identifier", StringType(), True), 
    StructField("Outlet_Establishment_Year", IntegerType(), True), 
    StructField("Outlet_Size", StringType(), True), 
    StructField("Outlet_Location_Type", StringType(), True), 
    StructField("Outlet_Type", StringType(), True), 
    StructField("Item_Outlet_Sales", DoubleType(), True)  
])
df = spark.read.format("csv").option("header", "true") \
                              .option("inferSchema", "true") \
                              .schema(struct_ddl) \
                              .load("/Volumes/workspace/pyspark/filedata/BigMart Sales.csv")

### Transformation

#### select()

In [0]:
df.select(col("Item_Identifier"), col("Item_Type")).limit(5).display()

Item_Identifier,Item_Type
FDA15,Dairy
DRC01,Soft Drinks
FDN15,Meat
FDX07,Fruits and Vegetables
NCD19,Household


#### alias()

In [0]:
df.select(col("Item_Identifier").alias("Item_ID")).limit(5).display()

Item_ID
FDA15
DRC01
FDN15
FDX07
NCD19


#### filter()

In [0]:
df.filter(col("Item_Type") == "Household").limit(5).display()

Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052
NCB30,14.6,Low Fat,0.025698134,Household,196.5084,OUT035,2004,Small,Tier 2,Supermarket Type1,1587.2672
NCD06,13.0,Low Fat,0.099887103,Household,45.906,OUT017,2007,,Tier 2,Supermarket Type1,838.908
NCP18,12.15,Low Fat,0.028760013,Household,151.4708,OUT017,2007,,Tier 2,Supermarket Type1,4815.0656
NCX54,9.195,Low Fat,0.048157338,Household,106.1622,OUT045,2002,,Tier 2,Supermarket Type1,2117.244


In [0]:
df.filter((col("Outlet_Size").isNull()) & (col("Item_Type").isin("Household", "Diary"))).limit(5).display()

Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
NCD06,13.0,Low Fat,0.099887103,Household,45.906,OUT017,2007,,Tier 2,Supermarket Type1,838.908
NCP18,12.15,Low Fat,0.028760013,Household,151.4708,OUT017,2007,,Tier 2,Supermarket Type1,4815.0656
NCX54,9.195,Low Fat,0.048157338,Household,106.1622,OUT045,2002,,Tier 2,Supermarket Type1,2117.244
NCF19,13.0,Low Fat,0.035307322,Household,47.5034,OUT017,2007,,Tier 2,Supermarket Type1,680.4476
NCP30,20.5,Low Fat,0.032835147,Household,40.2822,OUT045,2002,,Tier 2,Supermarket Type1,707.0796


#### withColumnRenamed()

In [0]:
df.withColumnRenamed("Item_Identifier", "Item_ID").limit(5).display()

Item_ID,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
FDA15,9.3,Low Fat,0.016047301,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
DRC01,5.92,Regular,0.019278216,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
FDN15,17.5,Low Fat,0.016760075,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


#### withColumn()

In [0]:
df.withColumn("flag", lit("new")).limit(5).display()

Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales,flag
FDA15,9.3,Low Fat,0.016047301,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138,new
DRC01,5.92,Regular,0.019278216,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228,new
FDN15,17.5,Low Fat,0.016760075,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27,new
FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38,new
NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052,new


In [0]:
df.withColumn("Sales_per_Weight", col("Item_Outlet_Sales") / col("Item_Weight")).limit(5).display()

Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales,Sales_per_Weight
FDA15,9.3,Low Fat,0.016047301,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138,401.6277419354838
DRC01,5.92,Regular,0.019278216,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228,74.9025
FDN15,17.5,Low Fat,0.016760075,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27,119.844
FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38,38.14479166666667
NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052,111.3891601343785


In [0]:
df.withColumn("Item_Type", regexp_replace(col("Item_Type"), "Household", "HH")) \
    .withColumn("Item_Type", regexp_replace(col("Item_Type"), "Soft Drinks", "SD")) \
    .limit(5).display()

Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
FDA15,9.3,Low Fat,0.016047301,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
DRC01,5.92,Regular,0.019278216,SD,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
FDN15,17.5,Low Fat,0.016760075,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
NCD19,8.93,Low Fat,0.0,HH,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


#### cast()

In [0]:
df.withColumn("Item_MRP", col("Item_MRP").cast(StringType())).limit(5).display()

Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
FDA15,9.3,Low Fat,0.016047301,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
DRC01,5.92,Regular,0.019278216,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
FDN15,17.5,Low Fat,0.016760075,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


#### sort()

In [0]:
df.sort(col("Item_Weight").desc()).limit(10).display()

Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
NCJ54,9.895,Low Fat,0.060017128,Household,230.8642,OUT013,1987,High,Tier 3,Supermarket Type1,4414.9198
FDR13,9.895,Regular,0.028696932,Canned,117.0492,OUT013,1987,High,Tier 3,Supermarket Type1,810.9444
DRD49,9.895,Low Fat,0.167799329,Soft Drinks,239.4564,OUT035,2004,Small,Tier 2,Supermarket Type1,5243.8408
FDR13,9.895,Regular,0.028837829,Canned,117.8492,OUT018,2009,Medium,Tier 3,Supermarket Type2,1506.0396
DRD49,9.895,Low Fat,0.168780385,Soft Drinks,236.8564,OUT017,2007,,Tier 2,Supermarket Type1,4767.128
DRD49,9.895,LF,0.167831064,Soft Drinks,237.4564,OUT046,1997,Small,Tier 1,Supermarket Type1,715.0692
NCJ54,9.895,LF,0.060067115,Household,230.6642,OUT046,1997,Small,Tier 1,Supermarket Type1,4647.284
DRD49,9.895,Low Fat,0.16817143,Soft Drinks,237.7564,OUT045,2002,,Tier 2,Supermarket Type1,3813.7024
FDT16,9.895,Regular,0.048761046,Frozen Foods,260.5278,OUT045,2002,,Tier 2,Supermarket Type1,8851.1452
FDT16,9.895,Regular,0.048662357,Frozen Foods,261.7278,OUT046,1997,Small,Tier 1,Supermarket Type1,4685.9004


In [0]:
df.sort(["Item_Fat_Content", "Item_Visibility"], ascending=[0, 1]).limit(10).display()

Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
FDT07,5.82,reg,0.0,Fruits and Vegetables,256.633,OUT049,1999,Medium,Tier 1,Supermarket Type1,2050.664
FDS31,13.1,reg,0.0,Fruits and Vegetables,178.9318,OUT049,1999,Medium,Tier 1,Supermarket Type1,3067.3406
FDE41,,reg,0.0,Frozen Foods,83.7566,OUT019,1985,Small,Tier 1,Grocery Store,253.6698
FDM58,,reg,0.0,Snack Foods,112.2544,OUT027,1985,Medium,Tier 3,Supermarket Type3,3914.904
FDB23,19.2,reg,0.005583951,Starchy Foods,226.9062,OUT013,1987,High,Tier 3,Supermarket Type1,4514.124
FDG35,,reg,0.007006883,Starchy Foods,173.5738,OUT027,1985,Medium,Tier 3,Supermarket Type3,1216.4166
FDT39,6.26,reg,0.009883257,Meat,149.8366,OUT049,1999,Medium,Tier 1,Supermarket Type1,1964.7758
FDF45,18.2,reg,0.012229464,Fruits and Vegetables,56.8904,OUT045,2002,,Tier 2,Supermarket Type1,1406.1696
FDO10,13.65,reg,0.0127517,Snack Foods,55.5588,OUT046,1997,Small,Tier 1,Supermarket Type1,1603.2464
FDB58,10.5,reg,0.013485235,Snack Foods,140.0154,OUT013,1987,High,Tier 3,Supermarket Type1,3119.9388


#### limit()

In [0]:
df.limit(3).display()

Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
FDA15,9.3,Low Fat,0.016047301,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
DRC01,5.92,Regular,0.019278216,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
FDN15,17.5,Low Fat,0.016760075,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
