In [1]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('Read a CSV').getOrCreate()

In [2]:
data = """id,name,city,age,salary
1,Arjun,Hyderabad,25,45000
2,Meera,Chennai,32,52000
3,Rajesh,Bangalore,29,61000
4,Priya,Delhi,22,38000
5,Sanjay,Mumbai,35,72000
"""

with open("employees.csv","w") as f:
    f.write(data)


In [3]:
df = spark.read \
.option("header","true") \
.option("inferSchema","true") \
.csv("employees.csv")

df.show()
df.printSchema()

+---+------+---------+---+------+
| id|  name|     city|age|salary|
+---+------+---------+---+------+
|  1| Arjun|Hyderabad| 25| 45000|
|  2| Meera|  Chennai| 32| 52000|
|  3|Rajesh|Bangalore| 29| 61000|
|  4| Priya|    Delhi| 22| 38000|
|  5|Sanjay|   Mumbai| 35| 72000|
+---+------+---------+---+------+

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- city: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- salary: integer (nullable = true)



In [4]:

json_data = [
  {
    "id": 1,
    "name": "Arjun",
    "city": "Hyderabad",
    "age": 25,
    "salary": 45000
  },
  {
    "id": 2,
    "name": "Meera",
    "city": "Chennai",
    "age": 32,
    "salary": 52000
  },
  {
    "id": 3,
    "name": "Rajesh",
    "city": "Bangalore",
    "age": 29,
    "salary": 61000
  },
  {
    "id": 4,
    "name": "Priya",
       "city": "Delhi",
    "age": 22,
    "salary": 38000
  },
  {
    "id": 5,
    "name": "Sanjay",
    "city": "Mumbai",
    "age": 35,
    "salary": 72000
  }]


In [5]:
df_json = spark.createDataFrame(json_data)
df.write.mode("overwrite").json("employees.json")
df_json.show()
df_json.printSchema()

+---+---------+---+------+------+
|age|     city| id|  name|salary|
+---+---------+---+------+------+
| 25|Hyderabad|  1| Arjun| 45000|
| 32|  Chennai|  2| Meera| 52000|
| 29|Bangalore|  3|Rajesh| 61000|
| 22|    Delhi|  4| Priya| 38000|
| 35|   Mumbai|  5|Sanjay| 72000|
+---+---------+---+------+------+

root
 |-- age: long (nullable = true)
 |-- city: string (nullable = true)
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- salary: long (nullable = true)



In [6]:
df.write.mode("overwrite").parquet("employees.parquet")
df_parquet = spark.read.parquet("employees.parquet")
df_parquet.show()

+---+------+---------+---+------+
| id|  name|     city|age|salary|
+---+------+---------+---+------+
|  1| Arjun|Hyderabad| 25| 45000|
|  2| Meera|  Chennai| 32| 52000|
|  3|Rajesh|Bangalore| 29| 61000|
|  4| Priya|    Delhi| 22| 38000|
|  5|Sanjay|   Mumbai| 35| 72000|
+---+------+---------+---+------+



In [7]:
data = [
    ("O001","Amit","Hyderabad","Spice Hub","Indian",450,35,"UPI","Delivered"),
    ("O002","Neha","Bangalore","Pizza Town","Italian",650,40,"Card","Delivered"),
    ("O003","Rahul","Delhi","Burger Zone","American",520,30,"Cash","Delivered"),
    ("O004","Pooja","Mumbai","Sushi Bar","Japanese",1200,55,"UPI","Cancelled"),
    ("O005","Arjun","Chennai","Curry Leaf","Indian",380,28,"UPI","Delivered"),
    ("O006","Sneha","Hyderabad","Pasta Street","Italian",700,45,"Card","Delivered"),
    ("O007","Karan","Delhi","Taco Bell","Mexican",540,33,"UPI","Delivered"),
    ("O008","Riya","Bangalore","Dragon Bowl","Chinese",600,38,"Wallet","Delivered"),
    ("O009","Vikas","Mumbai","BBQ Nation","Indian",1500,60,"Card","Delivered"),
    ("O010","Anjali","Chennai","Burger Zone","American",480,32,"Cash","Delivered"),
    ("O011","Farhan","Delhi","Biryani House","Indian",520,36,"UPI","Delivered"),
    ("O012","Megha","Hyderabad","Sushi Bar","Japanese",1100,58,"Card","Cancelled"),
    ("O013","Suresh","Bangalore","Curry Leaf","Indian",420,29,"UPI","Delivered"),
    ("O014","Divya","Mumbai","Pizza Town","Italian",780,42,"Wallet","Delivered"),
    ("O015","Nikhil","Delhi","Pasta Street","Italian",690,47,"UPI","Delivered"),
    ("O016","Kavya","Chennai","Dragon Bowl","Chinese",560,34,"UPI","Delivered"),
    ("O017","Rohit","Hyderabad","BBQ Nation","Indian",1400,62,"Card","Delivered"),
    ("O018","Simran","Bangalore","Burger Zone","American",510,31,"Cash","Delivered"),
    ("O019","Ayesha","Mumbai","Taco Bell","Mexican",570,35,"UPI","Delivered"),
    ("O020","Manish","Delhi","Curry Leaf","Indian",390,27,"Wallet","Delivered"),
    ("O021","Priya","Hyderabad","Pizza Town","Italian",720,41,"Card","Delivered"),
    ("O022","Yash","Chennai","Sushi Bar","Japanese",1150,57,"UPI","Delivered"),
    ("O023","Naina","Bangalore","Pasta Street","Italian",680,44,"UPI","Delivered"),
    ("O024","Sameer","Mumbai","Dragon Bowl","Chinese",610,39,"Wallet","Delivered"),
    ("O025","Ritika","Delhi","Burger Zone","American",500,30,"Cash","Delivered"),
    ("O026","Gopal","Hyderabad","Curry Leaf","Indian",410,28,"UPI","Delivered"),
    ("O027","Tina","Bangalore","Pizza Town","Italian",760,43,"Card","Delivered"),
    ("O028","Irfan","Mumbai","BBQ Nation","Indian",1550,65,"Card","Delivered"),
    ("O029","Sahil","Chennai","Taco Bell","Mexican",590,37,"UPI","Delivered"),
    ("O030","Lavanya","Delhi","Dragon Bowl","Chinese",630,40,"Wallet","Delivered"),
    ("O031","Deepak","Hyderabad","Burger Zone","American",520,33,"Cash","Delivered"),
    ("O032","Shweta","Bangalore","Curry Leaf","Indian",450,31,"UPI","Delivered"),
    ("O033","Aman","Mumbai","Pizza Town","Italian",810,46,"Card","Delivered"),
    ("O034","Rekha","Chennai","Pasta Street","Italian",700,45,"UPI","Delivered"),
    ("O035","Zubin","Delhi","BBQ Nation","Indian",1480,63,"Card","Delivered"),
    ("O036","Pallavi","Hyderabad","Dragon Bowl","Chinese",580,36,"Wallet","Delivered"),
    ("O037","Naveen","Bangalore","Taco Bell","Mexican",560,34,"UPI","Delivered"),
    ("O038","Sonia","Mumbai","Sushi Bar","Japanese",1180,59,"Card","Delivered"),
    ("O039","Harish","Chennai","Burger Zone","American",490,29,"Cash","Delivered"),
    ("O040","Kriti","Delhi","Curry Leaf","Indian",420,26,"UPI","Delivered")
]

columns = [
    "order_id","customer_name","city","restaurant","cuisine",
    "order_amount","delivery_time_minutes","payment_mode","order_status"
]



1

In [8]:
df = spark.createDataFrame(data, columns)
df.write.mode("overwrite").csv("orders_csv.csv")

2

In [9]:
df = spark.createDataFrame(data, columns)
df.write.mode("overwrite").option("header", "true").csv("orders_csv.csv")
df.show()
df.printSchema()

+--------+-------------+---------+-------------+--------+------------+---------------------+------------+------------+
|order_id|customer_name|     city|   restaurant| cuisine|order_amount|delivery_time_minutes|payment_mode|order_status|
+--------+-------------+---------+-------------+--------+------------+---------------------+------------+------------+
|    O001|         Amit|Hyderabad|    Spice Hub|  Indian|         450|                   35|         UPI|   Delivered|
|    O002|         Neha|Bangalore|   Pizza Town| Italian|         650|                   40|        Card|   Delivered|
|    O003|        Rahul|    Delhi|  Burger Zone|American|         520|                   30|        Cash|   Delivered|
|    O004|        Pooja|   Mumbai|    Sushi Bar|Japanese|        1200|                   55|         UPI|   Cancelled|
|    O005|        Arjun|  Chennai|   Curry Leaf|  Indian|         380|                   28|         UPI|   Delivered|
|    O006|        Sneha|Hyderabad| Pasta Street|

In [10]:
spend = df.filter(df.order_amount > 700).show()

+--------+-------------+---------+----------+--------+------------+---------------------+------------+------------+
|order_id|customer_name|     city|restaurant| cuisine|order_amount|delivery_time_minutes|payment_mode|order_status|
+--------+-------------+---------+----------+--------+------------+---------------------+------------+------------+
|    O004|        Pooja|   Mumbai| Sushi Bar|Japanese|        1200|                   55|         UPI|   Cancelled|
|    O009|        Vikas|   Mumbai|BBQ Nation|  Indian|        1500|                   60|        Card|   Delivered|
|    O012|        Megha|Hyderabad| Sushi Bar|Japanese|        1100|                   58|        Card|   Cancelled|
|    O014|        Divya|   Mumbai|Pizza Town| Italian|         780|                   42|      Wallet|   Delivered|
|    O017|        Rohit|Hyderabad|BBQ Nation|  Indian|        1400|                   62|        Card|   Delivered|
|    O021|        Priya|Hyderabad|Pizza Town| Italian|         720|     

3

In [11]:
spec = df.select("order_id","city","cuisine","order_amount")
spec.show()

+--------+---------+--------+------------+
|order_id|     city| cuisine|order_amount|
+--------+---------+--------+------------+
|    O001|Hyderabad|  Indian|         450|
|    O002|Bangalore| Italian|         650|
|    O003|    Delhi|American|         520|
|    O004|   Mumbai|Japanese|        1200|
|    O005|  Chennai|  Indian|         380|
|    O006|Hyderabad| Italian|         700|
|    O007|    Delhi| Mexican|         540|
|    O008|Bangalore| Chinese|         600|
|    O009|   Mumbai|  Indian|        1500|
|    O010|  Chennai|American|         480|
|    O011|    Delhi|  Indian|         520|
|    O012|Hyderabad|Japanese|        1100|
|    O013|Bangalore|  Indian|         420|
|    O014|   Mumbai| Italian|         780|
|    O015|    Delhi| Italian|         690|
|    O016|  Chennai| Chinese|         560|
|    O017|Hyderabad|  Indian|        1400|
|    O018|Bangalore|American|         510|
|    O019|   Mumbai| Mexican|         570|
|    O020|    Delhi|  Indian|         390|
+--------+-

4

In [12]:
from pyspark.sql.functions import lit, when, col
df.sort(col("delivery_time_minutes").desc()).show(truncate=False)
df.write.mode("overwrite").option("header", "true").csv("orders_csv.csv")

+--------+-------------+---------+------------+--------+------------+---------------------+------------+------------+
|order_id|customer_name|city     |restaurant  |cuisine |order_amount|delivery_time_minutes|payment_mode|order_status|
+--------+-------------+---------+------------+--------+------------+---------------------+------------+------------+
|O028    |Irfan        |Mumbai   |BBQ Nation  |Indian  |1550        |65                   |Card        |Delivered   |
|O035    |Zubin        |Delhi    |BBQ Nation  |Indian  |1480        |63                   |Card        |Delivered   |
|O017    |Rohit        |Hyderabad|BBQ Nation  |Indian  |1400        |62                   |Card        |Delivered   |
|O009    |Vikas        |Mumbai   |BBQ Nation  |Indian  |1500        |60                   |Card        |Delivered   |
|O038    |Sonia        |Mumbai   |Sushi Bar   |Japanese|1180        |59                   |Card        |Delivered   |
|O012    |Megha        |Hyderabad|Sushi Bar   |Japanese|

5

In [13]:
deli = df.filter(df.order_status == "Delivered").show()
df.write.mode("overwrite").option("header", "true").csv("delivered.csv")

+--------+-------------+---------+-------------+--------+------------+---------------------+------------+------------+
|order_id|customer_name|     city|   restaurant| cuisine|order_amount|delivery_time_minutes|payment_mode|order_status|
+--------+-------------+---------+-------------+--------+------------+---------------------+------------+------------+
|    O001|         Amit|Hyderabad|    Spice Hub|  Indian|         450|                   35|         UPI|   Delivered|
|    O002|         Neha|Bangalore|   Pizza Town| Italian|         650|                   40|        Card|   Delivered|
|    O003|        Rahul|    Delhi|  Burger Zone|American|         520|                   30|        Cash|   Delivered|
|    O005|        Arjun|  Chennai|   Curry Leaf|  Indian|         380|                   28|         UPI|   Delivered|
|    O006|        Sneha|Hyderabad| Pasta Street| Italian|         700|                   45|        Card|   Delivered|
|    O007|        Karan|    Delhi|    Taco Bell|

6

In [14]:
city = df.filter((df.city == "Mumbai") & (df.payment_mode == "Card")).show()

+--------+-------------+------+----------+--------+------------+---------------------+------------+------------+
|order_id|customer_name|  city|restaurant| cuisine|order_amount|delivery_time_minutes|payment_mode|order_status|
+--------+-------------+------+----------+--------+------------+---------------------+------------+------------+
|    O009|        Vikas|Mumbai|BBQ Nation|  Indian|        1500|                   60|        Card|   Delivered|
|    O028|        Irfan|Mumbai|BBQ Nation|  Indian|        1550|                   65|        Card|   Delivered|
|    O033|         Aman|Mumbai|Pizza Town| Italian|         810|                   46|        Card|   Delivered|
|    O038|        Sonia|Mumbai| Sushi Bar|Japanese|        1180|                   59|        Card|   Delivered|
+--------+-------------+------+----------+--------+------------+---------------------+------------+------------+



7

In [15]:
df = df.withColumn("delivery_category", when(df.delivery_time_minutes > 45, "Late").otherwise("OnTime"))
df.show()

+--------+-------------+---------+-------------+--------+------------+---------------------+------------+------------+-----------------+
|order_id|customer_name|     city|   restaurant| cuisine|order_amount|delivery_time_minutes|payment_mode|order_status|delivery_category|
+--------+-------------+---------+-------------+--------+------------+---------------------+------------+------------+-----------------+
|    O001|         Amit|Hyderabad|    Spice Hub|  Indian|         450|                   35|         UPI|   Delivered|           OnTime|
|    O002|         Neha|Bangalore|   Pizza Town| Italian|         650|                   40|        Card|   Delivered|           OnTime|
|    O003|        Rahul|    Delhi|  Burger Zone|American|         520|                   30|        Cash|   Delivered|           OnTime|
|    O004|        Pooja|   Mumbai|    Sushi Bar|Japanese|        1200|                   55|         UPI|   Cancelled|             Late|
|    O005|        Arjun|  Chennai|   Curr

8

In [20]:
df_json = spark.createDataFrame(data, columns)
df.write.mode("overwrite").json("orders_csv.json")
df_json.show()
df_json.printSchema()

+--------+-------------+---------+-------------+--------+------------+---------------------+------------+------------+
|order_id|customer_name|     city|   restaurant| cuisine|order_amount|delivery_time_minutes|payment_mode|order_status|
+--------+-------------+---------+-------------+--------+------------+---------------------+------------+------------+
|    O001|         Amit|Hyderabad|    Spice Hub|  Indian|         450|                   35|         UPI|   Delivered|
|    O002|         Neha|Bangalore|   Pizza Town| Italian|         650|                   40|        Card|   Delivered|
|    O003|        Rahul|    Delhi|  Burger Zone|American|         520|                   30|        Cash|   Delivered|
|    O004|        Pooja|   Mumbai|    Sushi Bar|Japanese|        1200|                   55|         UPI|   Cancelled|
|    O005|        Arjun|  Chennai|   Curry Leaf|  Indian|         380|                   28|         UPI|   Delivered|
|    O006|        Sneha|Hyderabad| Pasta Street|

9

In [21]:
df.write.mode("overwrite").parquet("orders_csv.parquet")
df_parquet = spark.read.parquet("orders_csv.parquet")
df_parquet.show()

+--------+-------------+---------+-------------+--------+------------+---------------------+------------+------------+-----------------+
|order_id|customer_name|     city|   restaurant| cuisine|order_amount|delivery_time_minutes|payment_mode|order_status|delivery_category|
+--------+-------------+---------+-------------+--------+------------+---------------------+------------+------------+-----------------+
|    O001|         Amit|Hyderabad|    Spice Hub|  Indian|         450|                   35|         UPI|   Delivered|           OnTime|
|    O002|         Neha|Bangalore|   Pizza Town| Italian|         650|                   40|        Card|   Delivered|           OnTime|
|    O003|        Rahul|    Delhi|  Burger Zone|American|         520|                   30|        Cash|   Delivered|           OnTime|
|    O004|        Pooja|   Mumbai|    Sushi Bar|Japanese|        1200|                   55|         UPI|   Cancelled|             Late|
|    O005|        Arjun|  Chennai|   Curr

In [22]:
food = df_parquet.filter((df_parquet.cuisine == "Indian") & (df_parquet.order_amount > 500)).show()

+--------+-------------+---------+-------------+-------+------------+---------------------+------------+------------+-----------------+
|order_id|customer_name|     city|   restaurant|cuisine|order_amount|delivery_time_minutes|payment_mode|order_status|delivery_category|
+--------+-------------+---------+-------------+-------+------------+---------------------+------------+------------+-----------------+
|    O009|        Vikas|   Mumbai|   BBQ Nation| Indian|        1500|                   60|        Card|   Delivered|             Late|
|    O011|       Farhan|    Delhi|Biryani House| Indian|         520|                   36|         UPI|   Delivered|           OnTime|
|    O017|        Rohit|Hyderabad|   BBQ Nation| Indian|        1400|                   62|        Card|   Delivered|             Late|
|    O028|        Irfan|   Mumbai|   BBQ Nation| Indian|        1550|                   65|        Card|   Delivered|             Late|
|    O035|        Zubin|    Delhi|   BBQ Nation|

11

In [28]:
top = df.sort(col("order_amount").desc()).limit(10)
top.show()

df.write.mode("overwrite").parquet("top_10_orders.parquet")

+--------+-------------+---------+----------+--------+------------+---------------------+------------+------------+-----------------+
|order_id|customer_name|     city|restaurant| cuisine|order_amount|delivery_time_minutes|payment_mode|order_status|delivery_category|
+--------+-------------+---------+----------+--------+------------+---------------------+------------+------------+-----------------+
|    O028|        Irfan|   Mumbai|BBQ Nation|  Indian|        1550|                   65|        Card|   Delivered|             Late|
|    O009|        Vikas|   Mumbai|BBQ Nation|  Indian|        1500|                   60|        Card|   Delivered|             Late|
|    O035|        Zubin|    Delhi|BBQ Nation|  Indian|        1480|                   63|        Card|   Delivered|             Late|
|    O017|        Rohit|Hyderabad|BBQ Nation|  Indian|        1400|                   62|        Card|   Delivered|             Late|
|    O004|        Pooja|   Mumbai| Sushi Bar|Japanese|        

13

In [40]:
df.write.mode("overwrite").parquet("x")

In [41]:
df_json.write.mode("overwrite").parquet("y")

14

In [43]:

df = spark.read.parquet("x")

# Optional: ensure a single CSV file (coalesce to 1 partition)
df_single = df.coalesce(1)

# Write CSV with pipe delimiter and header
df_single.write.mode("overwrite") \
    .option("header", "true") \
    .option("delimiter", "|")


<pyspark.sql.readwriter.DataFrameWriter at 0x7f80fc1e5d30>

15

In [29]:
from pyspark.sql.functions import count,sum
cuzine = df_parquet.groupBy("cuisine").agg(sum("order_amount").alias("Total"))
cuzine.show()

+--------+-----+
| cuisine|Total|
+--------+-----+
| Mexican| 2260|
|  Indian| 9370|
| Chinese| 2980|
|Japanese| 4630|
| Italian| 6490|
|American| 3020|
+--------+-----+



16

In [34]:
cuzine = df_parquet.groupBy("city").agg(count("order_amount").alias("Total"))
cuzine.show()

+---------+-----+
|     city|Total|
+---------+-----+
|Bangalore|    8|
|  Chennai|    7|
|   Mumbai|    8|
|    Delhi|    9|
|Hyderabad|    8|
+---------+-----+



17

In [35]:
cuzine = df_parquet.groupBy("payment_mode").agg(count("payment_mode").alias("Total")).orderBy("Total",ascending=False)
cuzine.show()

+------------+-----+
|payment_mode|Total|
+------------+-----+
|         UPI|   17|
|        Card|   11|
|        Cash|    6|
|      Wallet|    6|
+------------+-----+



C1

In [36]:
repartitioned_df = df.repartition(4)
repartitioned_df.write.mode("overwrite").parquet("repartitioned_orders.parquet")

In [None]:
df_read_repartitioned = spark.read.parquet("repartitioned_orders.parquet")
df_read_repartitioned.show()

C2

In [37]:
from pyspark.sql.functions import count, sum

report_df = df.groupBy("city").agg(
    count("order_id").alias("total_orders"),
    sum("order_amount").alias("total_revenue")
)

report_df.show()


+---------+------------+-------------+
|     city|total_orders|total_revenue|
+---------+------------+-------------+
|Bangalore|           8|         4630|
|  Chennai|           7|         4350|
|   Mumbai|           8|         8200|
|    Delhi|           9|         5690|
|Hyderabad|           8|         5880|
+---------+------------+-------------+



In [38]:
report_df.write.mode("overwrite").parquet("city_report.parquet")