Columnar operations

In [2]:
from pyspark.sql import SparkSession
# Build (or reuse) the SparkSession for this notebook, labelled "Day20".
spark = (
    SparkSession.builder
    .appName("Day20")
    .getOrCreate()
)

In [3]:
# Sample rows, one tuple per user: (user_id, city, amount).
data = [
    ("U001", "Delhi", 450),
    ("U002", "Mumbai", 620),
    ("U003", "Bangalore", 300),
    ("U004", "Delhi", 700),
]
columns = ["user_id", "city", "amount"]

# Only column names are supplied; Spark infers each column's type from the values.
df = spark.createDataFrame(data, schema=columns)
df.show()

+-------+---------+------+
|user_id|     city|amount|
+-------+---------+------+
|   U001|    Delhi|   450|
|   U002|   Mumbai|   620|
|   U003|Bangalore|   300|
|   U004|    Delhi|   700|
+-------+---------+------+



In [5]:
from pyspark.sql.functions import col
# Derive a new column holding the amount multiplied by 1.18.
# The product is a floating-point value, so show() can print binary rounding
# noise (e.g. 731.5999999999999 for 620 * 1.18).
amount_with_tax = col("amount") * 1.18
df = df.withColumn("amount_with_tax", amount_with_tax)
df.show()

+-------+---------+------+-----------------+
|user_id|     city|amount|  amount_with_tax|
+-------+---------+------+-----------------+
|   U001|    Delhi|   450|            531.0|
|   U002|   Mumbai|   620|731.5999999999999|
|   U003|Bangalore|   300|            354.0|
|   U004|    Delhi|   700|            826.0|
+-------+---------+------+-----------------+



In [6]:
# Rename the city "Delhi" to "New Delhi".
# `subset` limits the replacement to the `city` column. Without it,
# DataFrame.replace is applied to every string column, so a "Delhi" value
# appearing in another string column (e.g. `user_id`) would be rewritten too.
df = df.replace("Delhi", "New Delhi", subset=["city"])
df.show()

+-------+---------+------+-----------------+
|user_id|     city|amount|  amount_with_tax|
+-------+---------+------+-----------------+
|   U001|New Delhi|   450|            531.0|
|   U002|   Mumbai|   620|731.5999999999999|
|   U003|Bangalore|   300|            354.0|
|   U004|New Delhi|   700|            826.0|
+-------+---------+------+-----------------+



In [7]:
from pyspark.sql.functions import when
# Label each row by amount: "High" when amount > 500, "Low" for everything else.
# Rows where the comparison is not true (including a null amount) fall through
# to the otherwise() branch.
is_high = col("amount") > 500
amount_category = when(is_high, "High").otherwise("Low")
df = df.withColumn("amount_category", amount_category)
df.show()

+-------+---------+------+-----------------+---------------+
|user_id|     city|amount|  amount_with_tax|amount_category|
+-------+---------+------+-----------------+---------------+
|   U001|New Delhi|   450|            531.0|            Low|
|   U002|   Mumbai|   620|731.5999999999999|           High|
|   U003|Bangalore|   300|            354.0|            Low|
|   U004|New Delhi|   700|            826.0|           High|
+-------+---------+------+-----------------+---------------+



PARQUET AND ORC


In [2]:
from pyspark.sql import SparkSession
# Build (or reuse) the SparkSession for this notebook, labelled "parquet".
spark = (
    SparkSession.builder
    .appName("parquet")
    .getOrCreate()
)

In [3]:
# Sample orders, one tuple per order: (order_id, city, product, price, order_date).
# order_date is kept as a plain "YYYY-MM-DD" string here, not a date type.
data = [
    ("ORD001", "Delhi", "Laptop", 45000, "2024-01-05"),
    ("ORD002", "Mumbai", "Mobile", 32000, "2024-01-06"),
    ("ORD003", "Bangalore", "Tablet", 30000, "2024-01-07"),
    ("ORD004", "Delhi", "Laptop", 55000, "2024-01-08"),
    ("ORD005", "Mumbai", "Tablet", 34000, "2024-01-09"),
]
columns = ["order_id", "city", "product", "price", "order_date"]

# Only column names are supplied; Spark infers each column's type from the values.
df = spark.createDataFrame(data, schema=columns)
df.show()

+--------+---------+-------+-----+----------+
|order_id|     city|product|price|order_date|
+--------+---------+-------+-----+----------+
|  ORD001|    Delhi| Laptop|45000|2024-01-05|
|  ORD002|   Mumbai| Mobile|32000|2024-01-06|
|  ORD003|Bangalore| Tablet|30000|2024-01-07|
|  ORD004|    Delhi| Laptop|55000|2024-01-08|
|  ORD005|   Mumbai| Tablet|34000|2024-01-09|
+--------+---------+-------+-----+----------+



In [4]:
# Persist the orders as Parquet; "overwrite" replaces any data already at the path.
df.write.parquet("data/parquet/orders", mode="overwrite")

In [5]:
# Load the Parquet dataset back into a DataFrame and display it.
# Rows may come back in a different order than they were written.
parquet_reader = spark.read.format("parquet")
df_parquet = parquet_reader.load("data/parquet/orders")
df_parquet.show()

+--------+---------+-------+-----+----------+
|order_id|     city|product|price|order_date|
+--------+---------+-------+-----+----------+
|  ORD003|Bangalore| Tablet|30000|2024-01-07|
|  ORD004|    Delhi| Laptop|55000|2024-01-08|
|  ORD005|   Mumbai| Tablet|34000|2024-01-09|
|  ORD001|    Delhi| Laptop|45000|2024-01-05|
|  ORD002|   Mumbai| Mobile|32000|2024-01-06|
+--------+---------+-------+-----+----------+



In [7]:
# Persist the same orders as ORC; "overwrite" replaces any data already at the path.
df.write.orc("data/orc/orders", mode="overwrite")

In [8]:
# Load the ORC dataset back into a DataFrame and display it.
# Rows may come back in a different order than they were written.
orc_reader = spark.read.format("orc")
df_orc = orc_reader.load("data/orc/orders")
df_orc.show()

+--------+---------+-------+-----+----------+
|order_id|     city|product|price|order_date|
+--------+---------+-------+-----+----------+
|  ORD003|Bangalore| Tablet|30000|2024-01-07|
|  ORD004|    Delhi| Laptop|55000|2024-01-08|
|  ORD005|   Mumbai| Tablet|34000|2024-01-09|
|  ORD001|    Delhi| Laptop|45000|2024-01-05|
|  ORD002|   Mumbai| Mobile|32000|2024-01-06|
+--------+---------+-------+-----+----------+

