<a href="https://colab.research.google.com/github/codingniket/Python-Training/blob/main/16-12-2025/RDD.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from pyspark.sql import SparkSession

# Configure the application name first, then reuse the active session if one
# already exists (otherwise a new one is started).
session_builder = SparkSession.builder.appName('RDD')
spark = session_builder.getOrCreate()

In [None]:
# Employee records as CSV text: a header row followed by ten data rows.
# Every row, including the last one, ends with a newline.
data = (
    'id,name,city,age,salary\n'
    '1,Arjun,Hyderabad,25,45000\n'
    '2,Meera,Chennai,32,52000\n'
    '3,Rajesh,Bangalore,29,61000\n'
    '4,Priya,Delhi,22,38000\n'
    '5,Sanjay,Mumbai,35,72000\n'
    '6,Kavya,Hyderabad,28,48000\n'
    '7,Imran,Delhi,31,53000\n'
    '8,Divya,Chennai,27,45000\n'
    '9,Anil,Bangalore,40,85000\n'
    '10,Ritu,Mumbai,23,39000\n'
)

In [None]:
# Persist the CSV text so Spark can read it back as a text file.
# The encoding is pinned to UTF-8 because Spark's textFile() decodes lines as
# UTF-8, whereas open() would otherwise use the platform's default encoding.
with open('employee.csv', 'w', encoding='utf-8') as f:
    f.write(data)


In [None]:
# Load the file as an RDD of raw text lines (the header line is still included).
rdd = spark.sparkContext.textFile(name='employee.csv')

In [None]:
# Preview the first five lines without pulling the whole RDD to the driver.
rdd.take(num=5)

['id,name,city,age,salary',
 '1,Arjun,Hyderabad,25,45000',
 '2,Meera,Chennai,32,52000',
 '3,Rajesh,Bangalore,29,61000',
 '4,Priya,Delhi,22,38000']

In [None]:
# The first line of the file holds the column names.
header = rdd.first()

# Keep every line that differs from the header text, leaving only data rows.
data_rdd = rdd.filter(lambda line: line != header)
data_rdd.collect()

['1,Arjun,Hyderabad,25,45000',
 '2,Meera,Chennai,32,52000',
 '3,Rajesh,Bangalore,29,61000',
 '4,Priya,Delhi,22,38000',
 '5,Sanjay,Mumbai,35,72000',
 '6,Kavya,Hyderabad,28,48000',
 '7,Imran,Delhi,31,53000',
 '8,Divya,Chennai,27,45000',
 '9,Anil,Bangalore,40,85000',
 '10,Ritu,Mumbai,23,39000']

In [None]:
# Turn each CSV line into a list of string fields:
# [id, name, city, age, salary]
split_rdd = data_rdd.map(lambda line: line.split(','))
split_rdd.take(3)

[['1', 'Arjun', 'Hyderabad', '25', '45000'],
 ['2', 'Meera', 'Chennai', '32', '52000'],
 ['3', 'Rajesh', 'Bangalore', '29', '61000']]

In [11]:
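# Note: take(n) returns only the first n elements, while collect() returns every
# element of the RDD to the driver, so collect() is only suitable for small RDDs.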

In [13]:
# Build (city, salary) pairs; field 2 is the city and field 4 the salary,
# which is converted from text to int so it can be summed later.
city_salary_rdd = split_rdd.map(
    lambda fields: (fields[2], int(fields[4]))
)
city_salary_rdd.collect()

[('Hyderabad', 45000),
 ('Chennai', 52000),
 ('Bangalore', 61000),
 ('Delhi', 38000),
 ('Mumbai', 72000),
 ('Hyderabad', 48000),
 ('Delhi', 53000),
 ('Chennai', 45000),
 ('Bangalore', 85000),
 ('Mumbai', 39000)]

In [14]:
# Add up the salaries of all employees that share the same city key.
total_salary_per_city = city_salary_rdd.reduceByKey(
    lambda running_total, salary: running_total + salary
)
total_salary_per_city.collect()

[('Hyderabad', 93000),
 ('Delhi', 91000),
 ('Mumbai', 111000),
 ('Chennai', 97000),
 ('Bangalore', 146000)]

In [15]:
def _larger_total(left, right):
    """Return whichever (city, total) pair has the strictly larger total.

    When the totals are equal, `right` is returned.
    """
    return left if left[1] > right[1] else right


# Reduce the per-city totals down to the single pair with the largest total.
highest_salary = total_salary_per_city.reduce(_larger_total)
highest_salary

('Bangalore', 146000)

In [30]:
# Call records as CSV text: a header row followed by twenty data rows.
# Every row, including the last one, ends with a newline.
data = (
    'call_id,caller,receiver,city,call_type,duration_seconds,cost\n'
    'C001,Amit,Rahul,Hyderabad,Local,180,2.5\n'
    'C002,Neha,Arjun,Bangalore,STD,320,6.0\n'
    'C003,Rahul,Pooja,Delhi,Local,60,1.0\n'
    'C004,Pooja,Neha,Mumbai,ISD,900,25.0\n'
    'C005,Arjun,Amit,Chennai,STD,400,7.5\n'
    'C006,Sneha,Karan,Hyderabad,Local,240,3.0\n'
    'C007,Karan,Sneha,Delhi,Local,120,2.0\n'
    'C008,Riya,Vikas,Bangalore,STD,360,6.5\n'
    'C009,Vikas,Riya,Mumbai,ISD,1100,30.0\n'
    'C010,Anjali,Sanjay,Chennai,Local,90,1.5\n'
    'C011,Farhan,Ayesha,Delhi,STD,420,7.0\n'
    'C012,Ayesha,Farhan,Hyderabad,ISD,950,28.0\n'
    'C013,Suresh,Divya,Bangalore,Local,150,2.0\n'
    'C014,Divya,Suresh,Mumbai,STD,380,6.8\n'
    'C015,Nikhil,Priya,Delhi,Local,200,2.8\n'
    'C016,Priya,Nikhil,Chennai,STD,410,7.2\n'
    'C017,Rohit,Kavya,Hyderabad,Local,170,2.3\n'
    'C018,Kavya,Rohit,Bangalore,Local,140,2.1\n'
    'C019,Manish,Tina,Mumbai,ISD,1000,27.0\n'
    'C020,Tina,Manish,Delhi,STD,350,6.2\n'
)



In [31]:
# Persist the CSV text so Spark can read it back as a text file.
# The encoding is pinned to UTF-8 because Spark's textFile() decodes lines as
# UTF-8, whereas open() would otherwise use the platform's default encoding.
with open('call_records.csv', 'w', encoding='utf-8') as f:
    f.write(data)


1

In [32]:
# Load the call records as an RDD of raw text lines (header included) and
# preview the first five of them.
rdd = spark.sparkContext.textFile(name='call_records.csv')
rdd.take(5)

['call_id,caller,receiver,city,call_type,duration_seconds,cost',
 'C001,Amit,Rahul,Hyderabad,Local,180,2.5',
 'C002,Neha,Arjun,Bangalore,STD,320,6.0',
 'C003,Rahul,Pooja,Delhi,Local,60,1.0',
 'C004,Pooja,Neha,Mumbai,ISD,900,25.0']

2

In [34]:
# The first line of the file holds the column names.
head = rdd.first()

# Keep every line that differs from the header text, leaving only call rows.
data_rdd = rdd.filter(lambda line: line != head)
data_rdd.collect()

['C001,Amit,Rahul,Hyderabad,Local,180,2.5',
 'C002,Neha,Arjun,Bangalore,STD,320,6.0',
 'C003,Rahul,Pooja,Delhi,Local,60,1.0',
 'C004,Pooja,Neha,Mumbai,ISD,900,25.0',
 'C005,Arjun,Amit,Chennai,STD,400,7.5',
 'C006,Sneha,Karan,Hyderabad,Local,240,3.0',
 'C007,Karan,Sneha,Delhi,Local,120,2.0',
 'C008,Riya,Vikas,Bangalore,STD,360,6.5',
 'C009,Vikas,Riya,Mumbai,ISD,1100,30.0',
 'C010,Anjali,Sanjay,Chennai,Local,90,1.5',
 'C011,Farhan,Ayesha,Delhi,STD,420,7.0',
 'C012,Ayesha,Farhan,Hyderabad,ISD,950,28.0',
 'C013,Suresh,Divya,Bangalore,Local,150,2.0',
 'C014,Divya,Suresh,Mumbai,STD,380,6.8',
 'C015,Nikhil,Priya,Delhi,Local,200,2.8',
 'C016,Priya,Nikhil,Chennai,STD,410,7.2',
 'C017,Rohit,Kavya,Hyderabad,Local,170,2.3',
 'C018,Kavya,Rohit,Bangalore,Local,140,2.1',
 'C019,Manish,Tina,Mumbai,ISD,1000,27.0',
 'C020,Tina,Manish,Delhi,STD,350,6.2']

3

In [35]:
# Turn each CSV line into a list of string fields:
# [call_id, caller, receiver, city, call_type, duration_seconds, cost]
split_rdd = data_rdd.map(lambda line: line.split(','))
split_rdd.take(3)

[['C001', 'Amit', 'Rahul', 'Hyderabad', 'Local', '180', '2.5'],
 ['C002', 'Neha', 'Arjun', 'Bangalore', 'STD', '320', '6.0'],
 ['C003', 'Rahul', 'Pooja', 'Delhi', 'Local', '60', '1.0']]

4

In [37]:
# Pair every call with its city: (city, cost). Field 3 is the city and field 6
# the cost, converted from text to float so it can be summed.
total_cost_per_city = split_rdd.map(lambda fields: (fields[3], float(fields[6])))

# Sum the cost of all calls that share a city. The intermediate pair RDD is
# deliberately not collected here: only the last expression of a cell is
# displayed, so an extra collect() would run a Spark job and discard its result.
total_per_city = total_cost_per_city.reduceByKey(lambda a, b: a + b)
total_per_city.collect()

[('Hyderabad', 35.8),
 ('Delhi', 19.0),
 ('Mumbai', 88.8),
 ('Bangalore', 16.6),
 ('Chennai', 16.2)]

5

In [38]:
# Reduce the per-city totals to the single (city, total) pair with the largest
# total; when two totals are equal the right-hand pair is kept.
highest = total_per_city.reduce(
    lambda left, right: left if left[1] > right[1] else right
)
highest

('Mumbai', 88.8)

6

In [40]:
# Pair every call with its type: (call_type, duration_seconds). Field 4 is the
# call type and field 5 the duration, converted from text to int.
total_duration = split_rdd.map(lambda fields: (fields[4], int(fields[5])))

# Sum the durations per call type. The intermediate pair RDD is deliberately
# not collected here: only the last expression of a cell is displayed, so an
# extra collect() would run a Spark job and discard its result.
bygroup = total_duration.reduceByKey(lambda a, b: a + b)
bygroup.collect()

[('Local', 1350), ('STD', 2640), ('ISD', 3950)]

7

In [42]:
# Emit a (city, 1) pair for each call so that summing the ones counts the calls.
calls_per_city = split_rdd.map(lambda fields: (fields[3], 1))

# Add up the ones per city to get the number of calls made from each city.
calls_per_city_count = calls_per_city.reduceByKey(
    lambda count_so_far, one: count_so_far + one
)
calls_per_city_count.collect()

[('Hyderabad', 4),
 ('Delhi', 5),
 ('Mumbai', 4),
 ('Bangalore', 4),
 ('Chennai', 3)]

8

In [44]:
# Bring both per-city aggregates to the driver as lists of (city, value) pairs.
total_per_city_collected = total_per_city.collect()
calls_per_city_count_collected = calls_per_city_count.collect()

# Index them by city for direct lookup.
total_cost_dict = dict(total_per_city_collected)
calls_count_dict = dict(calls_per_city_count_collected)

# Average cost = total cost / number of calls, for every city present in both
# lookups.
avg_call_cost_per_city = {}
for city, city_total in total_cost_dict.items():
    if city in calls_count_dict:
        avg_call_cost_per_city[city] = city_total / calls_count_dict[city]

print("Average call cost per city:")
for city in avg_call_cost_per_city:
    avg_cost = avg_call_cost_per_city[city]
    print(f"{city}: {avg_cost:.2f}")

Average call cost per city:
Hyderabad: 8.95
Delhi: 3.80
Mumbai: 22.20
Bangalore: 4.15
Chennai: 5.40


9

In [48]:
# Keep only the calls whose cost (field 6) is strictly greater than 20.
high_value = split_rdd.filter(lambda call: 20 < float(call[6]))
high_value.collect()

[['C004', 'Pooja', 'Neha', 'Mumbai', 'ISD', '900', '25.0'],
 ['C009', 'Vikas', 'Riya', 'Mumbai', 'ISD', '1100', '30.0'],
 ['C012', 'Ayesha', 'Farhan', 'Hyderabad', 'ISD', '950', '28.0'],
 ['C019', 'Manish', 'Tina', 'Mumbai', 'ISD', '1000', '27.0']]

10

In [51]:
# Keep only the calls whose type (field 4) is 'ISD'.
isd_call = split_rdd.filter(lambda fields: fields[4] == 'ISD')

# Count those calls per city (field 3) by summing a 1 for each call.
# The filtered RDD is deliberately not collected here: only the last expression
# of a cell is displayed, so an extra collect() would run a Spark job and
# discard its result.
isd_map_count = isd_call.map(lambda fields: (fields[3], 1))
isd_call_city = isd_map_count.reduceByKey(lambda a, b: a + b)
isd_call_city.collect()

[('Mumbai', 3), ('Hyderabad', 1)]

11

In [52]:
# Find the call record with the greatest duration (field 5, compared as int);
# when two durations are equal the right-hand record is kept.
longest = split_rdd.reduce(
    lambda left, right: left if int(left[5]) > int(right[5]) else right
)
longest

['C009', 'Vikas', 'Riya', 'Mumbai', 'ISD', '1100', '30.0']

12

In [58]:
# Total revenue generated by each caller.
# Step 1: pair every call with its caller: (caller, cost). Field 1 is the
# caller and field 6 the cost, converted from text to float.
revenue_per_call = split_rdd.map(lambda fields: (fields[1], float(fields[6])))

# Step 2: sum the costs per caller. Without this step a caller who appears in
# several records would be listed once per call instead of once with a total.
total_revenue_per_caller = revenue_per_call.reduceByKey(lambda a, b: a + b)
total_revenue_per_caller.collect()

[('Amit', 2.5),
 ('Neha', 6.0),
 ('Rahul', 1.0),
 ('Pooja', 25.0),
 ('Arjun', 7.5),
 ('Sneha', 3.0),
 ('Karan', 2.0),
 ('Riya', 6.5),
 ('Vikas', 30.0),
 ('Anjali', 1.5),
 ('Farhan', 7.0),
 ('Ayesha', 28.0),
 ('Suresh', 2.0),
 ('Divya', 6.8),
 ('Nikhil', 2.8),
 ('Priya', 7.2),
 ('Rohit', 2.3),
 ('Kavya', 2.1),
 ('Manish', 27.0),
 ('Tina', 6.2)]

13

In [60]:
# Flag calls that are both expensive (cost, field 6, above 25) and long
# (duration, field 5, above 900 seconds). The logical `and` is used rather
# than the bitwise `&`: it states the intent and short-circuits, so the
# duration is only parsed when the cost test passes.
suspicious_calls = split_rdd.filter(
    lambda fields: float(fields[6]) > 25 and int(fields[5]) > 900
)
suspicious_calls.collect()

[['C009', 'Vikas', 'Riya', 'Mumbai', 'ISD', '1100', '30.0'],
 ['C012', 'Ayesha', 'Farhan', 'Hyderabad', 'ISD', '950', '28.0'],
 ['C019', 'Manish', 'Tina', 'Mumbai', 'ISD', '1000', '27.0']]