<a href="https://colab.research.google.com/github/chinnuanna123/spark/blob/main/adv_pyspark_Day1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pyspark



In [12]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, lit

In [3]:
# Start a Spark session named 'Practise', or reuse the one already active in
# this kernel (getOrCreate), so re-running the cell is harmless.
spark = (
    SparkSession.builder
    .appName('Practise')
    .getOrCreate()
)

In [4]:
# Sample product catalogue: one tuple per product, six fields each:
# an id, a name, a category, a float price, an integer stock count and a
# 'YYYY-MM-DD HH:MM:SS' timestamp kept as a plain string.
data = [
    ("P001", "Laptop",     "Electronics", 75000.0,  120, "2025-03-10 09:00:00"),
    ("P002", "Smartphone", "Electronics", 25000.0,  200, "2025-03-10 09:15:00"),
    ("P003", "Headphones", "Accessories",  1500.0,  500, "2025-03-10 09:30:00"),
    ("P004", "Chair",      "Furniture",    4500.0,   80, "2025-03-10 09:45:00"),
    ("P005", "Notebook",   "Stationery",     50.0, 1000, "2025-03-10 10:00:00"),
]

In [5]:
# Column names for the product DataFrame, one per tuple field, in order.
columns = [
    "product_id",
    "product_name",
    "category",
    "price",
    "quantity_in_stock",
    "date_added",
]

In [6]:
# Build the product DataFrame from the in-memory rows. Only column names are
# supplied, so Spark infers each column's type from the Python values
# (date_added therefore stays a string, not a timestamp).
df = spark.createDataFrame(data, schema=columns)

In [7]:
# Print the DataFrame as a text table to confirm the rows loaded as expected.
df.show()

+----------+------------+-----------+-------+-----------------+-------------------+
|product_id|product_name|   category|  price|quantity_in_stock|         date_added|
+----------+------------+-----------+-------+-----------------+-------------------+
|      P001|      Laptop|Electronics|75000.0|              120|2025-03-10 09:00:00|
|      P002|  Smartphone|Electronics|25000.0|              200|2025-03-10 09:15:00|
|      P003|  Headphones|Accessories| 1500.0|              500|2025-03-10 09:30:00|
|      P004|       Chair|  Furniture| 4500.0|               80|2025-03-10 09:45:00|
|      P005|    Notebook| Stationery|   50.0|             1000|2025-03-10 10:00:00|
+----------+------------+-----------+-------+-----------------+-------------------+



Selecting Specific Columns

In [10]:
# Project the DataFrame down to just the product identifier and name.
(
    df
    .select("product_id", "product_name")
    .show()
)

+----------+------------+
|product_id|product_name|
+----------+------------+
|      P001|      Laptop|
|      P002|  Smartphone|
|      P003|  Headphones|
|      P004|       Chair|
|      P005|    Notebook|
+----------+------------+



Filtering Rows

In [16]:
# Keep only the rows whose category is exactly 'Electronics'.
(
    df
    .filter(col('category') == 'Electronics')
    .show()
)

+----------+------------+-----------+-------+-----------------+-------------------+
|product_id|product_name|   category|  price|quantity_in_stock|         date_added|
+----------+------------+-----------+-------+-----------------+-------------------+
|      P001|      Laptop|Electronics|75000.0|              120|2025-03-10 09:00:00|
|      P002|  Smartphone|Electronics|25000.0|              200|2025-03-10 09:15:00|
+----------+------------+-----------+-------+-----------------+-------------------+



Using where()

In [17]:
# Same selection written with where(), which PySpark documents as an alias
# for filter(); the result matches the filter() version.
(
    df
    .where(col('category') == 'Electronics')
    .show()
)

+----------+------------+-----------+-------+-----------------+-------------------+
|product_id|product_name|   category|  price|quantity_in_stock|         date_added|
+----------+------------+-----------+-------+-----------------+-------------------+
|      P001|      Laptop|Electronics|75000.0|              120|2025-03-10 09:00:00|
|      P002|  Smartphone|Electronics|25000.0|              200|2025-03-10 09:15:00|
+----------+------------+-----------+-------+-----------------+-------------------+



In [18]:
# Products priced strictly above 50000.
(
    df
    .where(col('price') > 50000)
    .show()
)

+----------+------------+-----------+-------+-----------------+-------------------+
|product_id|product_name|   category|  price|quantity_in_stock|         date_added|
+----------+------------+-----------+-------+-----------------+-------------------+
|      P001|      Laptop|Electronics|75000.0|              120|2025-03-10 09:00:00|
+----------+------------+-----------+-------+-----------------+-------------------+



In [20]:
# Electronics products with more than 130 units in stock.
# Each comparison is wrapped in its own parentheses because Python's `&`
# binds more tightly than `==` and `>`.
(
    df
    .filter(
        (col('category') == 'Electronics')
        & (col('quantity_in_stock') > 130)
    )
    .show()
)

+----------+------------+-----------+-------+-----------------+-------------------+
|product_id|product_name|   category|  price|quantity_in_stock|         date_added|
+----------+------------+-----------+-------+-----------------+-------------------+
|      P002|  Smartphone|Electronics|25000.0|              200|2025-03-10 09:15:00|
+----------+------------+-----------+-------+-----------------+-------------------+



In [26]:
# List the category of every product with more than 130 units in stock.
# Filter first, then project: the predicate reads quantity_in_stock, so it is
# applied while that column is still part of the DataFrame instead of after
# select() has narrowed the result to `category` alone.
(
    df
    .where(col('quantity_in_stock') > 130)
    .select("category")
    .show()
)

+-----------+
|   category|
+-----------+
|Electronics|
|Accessories|
| Stationery|
+-----------+



In [29]:
# Total number of rows in the DataFrame.
df.count()

5

In [32]:
# Count the Electronics rows, referring to the column as a DataFrame attribute
# (df.category).
(
    df
    .filter(df.category == 'Electronics')
    .count()
)

2

In [35]:
# Count the Electronics rows again, this time referring to the column through
# the col() function; the result is the same as with attribute access.
(
    df
    .filter(col('category') == 'Electronics')
    .count()
)

2

Collecting Data Into a List

In [36]:
# Materialise every row as a Python list of Row objects.
# Acceptable for this 5-row sample; on a large DataFrame this pulls the whole
# dataset into local memory, so prefer take(n) or limit(n) there.
df.collect()

[Row(product_id='P001', product_name='Laptop', category='Electronics', price=75000.0, quantity_in_stock=120, date_added='2025-03-10 09:00:00'),
 Row(product_id='P002', product_name='Smartphone', category='Electronics', price=25000.0, quantity_in_stock=200, date_added='2025-03-10 09:15:00'),
 Row(product_id='P003', product_name='Headphones', category='Accessories', price=1500.0, quantity_in_stock=500, date_added='2025-03-10 09:30:00'),
 Row(product_id='P004', product_name='Chair', category='Furniture', price=4500.0, quantity_in_stock=80, date_added='2025-03-10 09:45:00'),
 Row(product_id='P005', product_name='Notebook', category='Stationery', price=50.0, quantity_in_stock=1000, date_added='2025-03-10 10:00:00')]

Using withColumn() to Add or Modify Columns

In [37]:
# Add a derived column holding 90% of the price (a 10% discount).
# withColumn() returns a new DataFrame; `df` is not reassigned here.
(
    df
    .withColumn('discount_price', col('price') * 0.9)
    .show()
)

+----------+------------+-----------+-------+-----------------+-------------------+--------------+
|product_id|product_name|   category|  price|quantity_in_stock|         date_added|discount_price|
+----------+------------+-----------+-------+-----------------+-------------------+--------------+
|      P001|      Laptop|Electronics|75000.0|              120|2025-03-10 09:00:00|       67500.0|
|      P002|  Smartphone|Electronics|25000.0|              200|2025-03-10 09:15:00|       22500.0|
|      P003|  Headphones|Accessories| 1500.0|              500|2025-03-10 09:30:00|        1350.0|
|      P004|       Chair|  Furniture| 4500.0|               80|2025-03-10 09:45:00|        4050.0|
|      P005|    Notebook| Stationery|   50.0|             1000|2025-03-10 10:00:00|          45.0|
+----------+------------+-----------+-------+-----------------+-------------------+--------------+



In [38]:
# Compute the discounted price (90% of price), then show it next to the
# product name only.
(
    df
    .withColumn('discount_price', col('price') * 0.9)
    .select('product_name', 'discount_price')
    .show()
)

+------------+--------------+
|product_name|discount_price|
+------------+--------------+
|      Laptop|       67500.0|
|  Smartphone|       22500.0|
|  Headphones|        1350.0|
|       Chair|        4050.0|
|    Notebook|          45.0|
+------------+--------------+



In [40]:
# Show the stock counts held in `df` on their own.
(
    df
    .select('quantity_in_stock')
    .show()
)

+-----------------+
|quantity_in_stock|
+-----------------+
|              120|
|              200|
|              500|
|               80|
|             1000|
+-----------------+



In [39]:
# Reusing an existing column name makes withColumn() replace that column:
# every stock count is shown increased by 10. The result is only displayed,
# not assigned, so `df` itself keeps the original quantities.
(
    df
    .withColumn('quantity_in_stock', col('quantity_in_stock') + 10)
    .show()
)

+----------+------------+-----------+-------+-----------------+-------------------+
|product_id|product_name|   category|  price|quantity_in_stock|         date_added|
+----------+------------+-----------+-------+-----------------+-------------------+
|      P001|      Laptop|Electronics|75000.0|              130|2025-03-10 09:00:00|
|      P002|  Smartphone|Electronics|25000.0|              210|2025-03-10 09:15:00|
|      P003|  Headphones|Accessories| 1500.0|              510|2025-03-10 09:30:00|
|      P004|       Chair|  Furniture| 4500.0|               90|2025-03-10 09:45:00|
|      P005|    Notebook| Stationery|   50.0|             1010|2025-03-10 10:00:00|
+----------+------------+-----------+-------+-----------------+-------------------+



Using drop() to Remove Columns

In [41]:
# Show the DataFrame without the date_added column.
(
    df
    .drop('date_added')
    .show()
)

+----------+------------+-----------+-------+-----------------+
|product_id|product_name|   category|  price|quantity_in_stock|
+----------+------------+-----------+-------+-----------------+
|      P001|      Laptop|Electronics|75000.0|              120|
|      P002|  Smartphone|Electronics|25000.0|              200|
|      P003|  Headphones|Accessories| 1500.0|              500|
|      P004|       Chair|  Furniture| 4500.0|               80|
|      P005|    Notebook| Stationery|   50.0|             1000|
+----------+------------+-----------+-------+-----------------+



Dropping Multiple Columns

In [42]:
# drop() accepts several column names at once: remove both the product name
# and the category in a single call.
(
    df
    .drop('product_name', 'category')
    .show()
)

+----------+-------+-----------------+-------------------+
|product_id|  price|quantity_in_stock|         date_added|
+----------+-------+-----------------+-------------------+
|      P001|75000.0|              120|2025-03-10 09:00:00|
|      P002|25000.0|              200|2025-03-10 09:15:00|
|      P003| 1500.0|              500|2025-03-10 09:30:00|
|      P004| 4500.0|               80|2025-03-10 09:45:00|
|      P005|   50.0|             1000|2025-03-10 10:00:00|
+----------+-------+-----------------+-------------------+



Combining withColumn() & drop()

In [43]:
# Add the inventory value per product (price x units in stock), then remove
# the category column from the displayed result.
(
    df
    .withColumn("total_value", col("price") * col("quantity_in_stock"))
    .drop("category")
    .show()
)



+----------+------------+-------+-----------------+-------------------+-----------+
|product_id|product_name|  price|quantity_in_stock|         date_added|total_value|
+----------+------------+-------+-----------------+-------------------+-----------+
|      P001|      Laptop|75000.0|              120|2025-03-10 09:00:00|  9000000.0|
|      P002|  Smartphone|25000.0|              200|2025-03-10 09:15:00|  5000000.0|
|      P003|  Headphones| 1500.0|              500|2025-03-10 09:30:00|   750000.0|
|      P004|       Chair| 4500.0|               80|2025-03-10 09:45:00|   360000.0|
|      P005|    Notebook|   50.0|             1000|2025-03-10 10:00:00|    50000.0|
+----------+------------+-------+-----------------+-------------------+-----------+

