In [1]:
# set env
import os
os.environ['SPARK_HOME'] = "/home/cloud_user/apps/spark/current"
os.environ['PYSPARK_DRIVER_PYTHON'] = 'jupyter'
os.environ['PYSPARK_DRIVER_PYTHON_OPTS'] = 'lab'
os.environ['PYSPARK_PYTHON'] = 'python'

In [2]:
# Import PySpark
from pyspark.sql import SparkSession

In [3]:
# Create SparkSession
spark = SparkSession.builder \
    .appName("dataframe-operations") \
    .getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/01/26 03:59:01 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
data_file_path = "./data/products.csv"
df = spark.read.csv(data_file_path, header=True, inferSchema=True)

                                                                                

In [6]:
# Display the schema
df.printSchema()

# get sample record
df.show(1)

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- category: string (nullable = true)
 |-- quantity: integer (nullable = true)
 |-- price: double (nullable = true)



                                                                                

+---+---------+-----------+--------+------+
| id|     name|   category|quantity| price|
+---+---------+-----------+--------+------+
|  1|iPhone 12|Electronics|      10|899.99|
+---+---------+-----------+--------+------+
only showing top 1 row



## Choose Specific Columns

In [7]:
# Select specific Columns
selected_columns = df.select("id","name","price")
print("Selected Columns:")
selected_columns.show(2)

Selected Columns:


                                                                                

+---+---------------+------+
| id|           name| price|
+---+---------------+------+
|  1|      iPhone 12|899.99|
|  2|Nike Air max 90|119.99|
+---+---------------+------+
only showing top 2 rows



## Filter Rows

In [8]:
# only fetch records where quantity > 20
filtered_data = df.filter(df.quantity > 20)
print(f"Greater than 20: {filtered_data.count()}")
filtered_data.show()

                                                                                

Greater than 20: 4


                                                                                

+---+--------------------+--------+--------+------+
| id|                name|category|quantity| price|
+---+--------------------+--------+--------+------+
|  2|     Nike Air max 90|Clothing|      25|119.99|
|  4|    The Great Gatsby|   Books|      50| 12.99|
|  5|L'Oreal Paris Mas...|  Beauty|     100|  9.99|
|  6|            Yoga Mat|  Sports|      30| 29.99|
+---+--------------------+--------+--------+------+



## Grouping and Aggregation

In [10]:
# GroupBy and Aggregate
grouped_data = df.groupBy("category").agg({"quantity":"sum", "price":"avg"})
print("Grouped and Aggregated data:")
grouped_data.show()

Grouped and Aggregated data:


                                                                                

+---------------+-------------+-----------------+
|       category|sum(quantity)|       avg(price)|
+---------------+-------------+-----------------+
|         Sports|           30|            29.99|
|    Electronics|           22|799.9900000000001|
|       Clothing|           40|            84.99|
|          Books|           50|            12.99|
|Home Appliances|            8|           349.99|
|         Beauty|          100|             9.99|
|           Toys|           12|          275.485|
+---------------+-------------+-----------------+



## Join Dataframe

In [11]:
# Join based on columns - create a new dataframe (to represent other data) and join on id
df2 = df.select("id", "category").limit(10)
joined_data = df.join(df2, "id", "inner")
print("Joined Data:")
joined_data.show()

Joined Data:


                                                                                

+---+--------------------+---------------+--------+------+---------------+
| id|                name|       category|quantity| price|       category|
+---+--------------------+---------------+--------+------+---------------+
|  1|           iPhone 12|    Electronics|      10|899.99|    Electronics|
|  2|     Nike Air max 90|       Clothing|      25|119.99|       Clothing|
|  3|KitchenAid Stand ...|Home Appliances|       5|299.99|Home Appliances|
|  4|    The Great Gatsby|          Books|      50| 12.99|          Books|
|  5|L'Oreal Paris Mas...|         Beauty|     100|  9.99|         Beauty|
|  6|            Yoga Mat|         Sports|      30| 29.99|         Sports|
|  7| Samsung 4k Smart TV|    Electronics|       8|799.99|    Electronics|
|  8|        Levi's Jeans|       Clothing|      15| 49.99|       Clothing|
|  9|Dyson Vacuum Cleaner|Home Appliances|       3|399.99|Home Appliances|
| 10|  Google Pixel 8 Pro|    Electronics|       4|699.99|    Electronics|
+---+--------------------

## Sorting Data

In [12]:
# Sort by a column
sorted_data = df.orderBy("price")
print("Sorted Data by price:")
sorted_data.show()

Sorted Data by price:


[Stage 15:>                                                         (0 + 1) / 1]

+---+--------------------+---------------+--------+------+
| id|                name|       category|quantity| price|
+---+--------------------+---------------+--------+------+
|  5|L'Oreal Paris Mas...|         Beauty|     100|  9.99|
|  4|    The Great Gatsby|          Books|      50| 12.99|
|  6|            Yoga Mat|         Sports|      30| 29.99|
|  8|        Levi's Jeans|       Clothing|      15| 49.99|
| 11|    Darth Vader Lego|           Toys|      10| 49.99|
|  2|     Nike Air max 90|       Clothing|      25|119.99|
|  3|KitchenAid Stand ...|Home Appliances|       5|299.99|
|  9|Dyson Vacuum Cleaner|Home Appliances|       3|399.99|
| 12|       Colosium Lego|           Toys|       2|500.98|
| 10|  Google Pixel 8 Pro|    Electronics|       4|699.99|
|  7| Samsung 4k Smart TV|    Electronics|       8|799.99|
|  1|           iPhone 12|    Electronics|      10|899.99|
+---+--------------------+---------------+--------+------+



                                                                                

In [13]:
# Sort by a column Desc
from pyspark.sql.functions import col, desc
sorted_data = df.orderBy(col("price").desc(), col("id").desc())
print("Sorted Data by Price Desc")
sorted_data.show()

Sorted Data by Price Desc


                                                                                

+---+--------------------+---------------+--------+------+
| id|                name|       category|quantity| price|
+---+--------------------+---------------+--------+------+
|  1|           iPhone 12|    Electronics|      10|899.99|
|  7| Samsung 4k Smart TV|    Electronics|       8|799.99|
| 10|  Google Pixel 8 Pro|    Electronics|       4|699.99|
| 12|       Colosium Lego|           Toys|       2|500.98|
|  9|Dyson Vacuum Cleaner|Home Appliances|       3|399.99|
|  3|KitchenAid Stand ...|Home Appliances|       5|299.99|
|  2|     Nike Air max 90|       Clothing|      25|119.99|
| 11|    Darth Vader Lego|           Toys|      10| 49.99|
|  8|        Levi's Jeans|       Clothing|      15| 49.99|
|  6|            Yoga Mat|         Sports|      30| 29.99|
|  4|    The Great Gatsby|          Books|      50| 12.99|
|  5|L'Oreal Paris Mas...|         Beauty|     100|  9.99|
+---+--------------------+---------------+--------+------+



## Get Unique Rows

In [14]:
# Get distinct 
distinct_categories = df.select("category").distinct()
print("Distinct categories:")
distinct_categories.show()

Distinct categories:


[Stage 19:>                                                         (0 + 1) / 1]

+---------------+
|       category|
+---------------+
|         Sports|
|    Electronics|
|       Clothing|
|          Books|
|Home Appliances|
|         Beauty|
|           Toys|
+---------------+



                                                                                

## Dropping Columns

In [15]:
# Drop a column
dropped_columns = df.drop("quantity","category")
print("Dropped Columns")
dropped_columns.show()

Dropped Columns


                                                                                

+---+--------------------+------+
| id|                name| price|
+---+--------------------+------+
|  1|           iPhone 12|899.99|
|  2|     Nike Air max 90|119.99|
|  3|KitchenAid Stand ...|299.99|
|  4|    The Great Gatsby| 12.99|
|  5|L'Oreal Paris Mas...|  9.99|
|  6|            Yoga Mat| 29.99|
|  7| Samsung 4k Smart TV|799.99|
|  8|        Levi's Jeans| 49.99|
|  9|Dyson Vacuum Cleaner|399.99|
| 10|  Google Pixel 8 Pro|699.99|
| 11|    Darth Vader Lego| 49.99|
| 12|       Colosium Lego|500.98|
+---+--------------------+------+



## WithColumn - Add new calculated columns

In [18]:
df_with_new_column = df.withColumn("revenue", df.quantity * df.price)
from pyspark.sql.functions import asc
df_sorted_new = df_with_new_column.orderBy(col("revenue").desc(), col("category").asc())
print("DataFrame with Revenue Column")
df_sorted_new.show()

DataFrame with Revenue Column


                                                                                

+---+--------------------+---------------+--------+------+------------------+
| id|                name|       category|quantity| price|           revenue|
+---+--------------------+---------------+--------+------+------------------+
|  1|           iPhone 12|    Electronics|      10|899.99|            8999.9|
|  7| Samsung 4k Smart TV|    Electronics|       8|799.99|           6399.92|
|  2|     Nike Air max 90|       Clothing|      25|119.99|           2999.75|
| 10|  Google Pixel 8 Pro|    Electronics|       4|699.99|           2799.96|
|  3|KitchenAid Stand ...|Home Appliances|       5|299.99|           1499.95|
|  9|Dyson Vacuum Cleaner|Home Appliances|       3|399.99|           1199.97|
| 12|       Colosium Lego|           Toys|       2|500.98|           1001.96|
|  5|L'Oreal Paris Mas...|         Beauty|     100|  9.99|             999.0|
|  6|            Yoga Mat|         Sports|      30| 29.99| 899.6999999999999|
|  8|        Levi's Jeans|       Clothing|      15| 49.99|      

## Rename Columns

In [19]:
# Rename a column to improve understanding
df_with_alias = df.withColumnRenamed("price","product_price")
print("Dataframe with Aliased column product_price")
df_with_alias.show()

Dataframe with Aliased column product_price
+---+--------------------+---------------+--------+-------------+
| id|                name|       category|quantity|product_price|
+---+--------------------+---------------+--------+-------------+
|  1|           iPhone 12|    Electronics|      10|       899.99|
|  2|     Nike Air max 90|       Clothing|      25|       119.99|
|  3|KitchenAid Stand ...|Home Appliances|       5|       299.99|
|  4|    The Great Gatsby|          Books|      50|        12.99|
|  5|L'Oreal Paris Mas...|         Beauty|     100|         9.99|
|  6|            Yoga Mat|         Sports|      30|        29.99|
|  7| Samsung 4k Smart TV|    Electronics|       8|       799.99|
|  8|        Levi's Jeans|       Clothing|      15|        49.99|
|  9|Dyson Vacuum Cleaner|Home Appliances|       3|       399.99|
| 10|  Google Pixel 8 Pro|    Electronics|       4|       699.99|
| 11|    Darth Vader Lego|           Toys|      10|        49.99|
| 12|       Colosium Lego|      

# End the Session

In [20]:
# end the spark session as a best practice.
spark.stop()