In [0]:
from pyspark import SparkConf, SparkContext
conf = SparkConf().setAppName("Student data project")
sc = SparkContext.getOrCreate(conf)

In [0]:
file = sc.textFile("/FileStore/tables/StudentData.csv")
header = file.first()

In [0]:
file.take(5)

Out[3]: ['age,gender,name,course,roll,marks,email',
 '28,Female,Hubert Oliveras,DB,02984,59,Annika Hoffman_Naoma Fritts@OOP.com',
 '29,Female,Toshiko Hillyard,Cloud,12899,62,Margene Moores_Marylee Capasso@DB.com',
 '28,Male,Celeste Lollis,PF,21267,45,Jeannetta Golden_Jenna Montague@DSA.com',
 '29,Female,Elenore Choy,DB,32877,29,Billi Clore_Mitzi Seldon@DB.com']

### Show the number of student in the file

In [0]:
rdd = file.filter(lambda x : x != header).map( lambda x : x.split(",") )
rdd.count()

Out[4]: 1000

### Show total marks achieved by Female and Male students

In [0]:
rdd = file.filter( lambda x : x != header ).map( lambda x : ( x.split(",")[1], int(x.split(",")[-2]) ) )

In [0]:
male = rdd.reduceByKey( lambda x,y : x+y ).filter( lambda x : x[0] == "Male" )
male.collect()

Out[6]: [('Male', 30461)]

In [0]:
female = rdd.reduceByKey( lambda x,y : x+y ).filter( lambda x : x[0] == "Female" )
female.collect()

Out[7]: [('Female', 29636)]

### Show the total number of students that have passed and failed. 50+ marks are required to pass the course

In [0]:
rdd = file.filter( lambda x : x != header ).map( lambda x : int(x.split(",")[-2]) )

In [0]:
rdd1 = rdd.map( lambda x : "yes" if x > 50 else "no" )
passed = rdd1.filter( lambda x : x == "yes" )
failed = rdd1.filter( lambda x : x == "no" )

In [0]:
passed.count()

Out[10]: 630

In [0]:
failed.count()

Out[11]: 370

### Show the total number of students enrolled per course

In [0]:
rdd = file.filter( lambda x : x != header ).map( lambda x : ( x.split(",")[3] , 1 ) )

In [0]:
rdd1 = rdd.reduceByKey( lambda x,y : x+y )
rdd1.collect()

Out[13]: [('DB', 157),
 ('Cloud', 192),
 ('PF', 166),
 ('MVC', 157),
 ('OOP', 152),
 ('DSA', 176)]

### Show the total marks that students have achieved per course

In [0]:
rdd = file.filter( lambda x : x != header ).map( lambda x : ( x.split(",")[3] , int(x.split(",")[-2]) ) )

In [0]:
rdd1 = rdd.reduceByKey( lambda x,y : x+y )
rdd1.collect()

Out[15]: [('DB', 9270),
 ('Cloud', 11443),
 ('PF', 9933),
 ('MVC', 9585),
 ('OOP', 8916),
 ('DSA', 10950)]

### Show the average marks that students have achieved per course

In [0]:
rdd = file.filter( lambda x : x != header ).map( lambda x : ( x.split(",")[3] , ( int(x.split(",")[-2]) , 1 ) ) )

In [0]:
rdd1 = rdd.reduceByKey( lambda x,y : ( x[0]+y[0] , x[1]+y[1] ) )

In [0]:
rdd2 = rdd1.map( lambda x : ( x[0] , x[1][0]/x[1][1] ) )
rdd2.collect()

Out[18]: [('DB', 59.044585987261144),
 ('Cloud', 59.598958333333336),
 ('PF', 59.83734939759036),
 ('MVC', 61.05095541401274),
 ('OOP', 58.6578947368421),
 ('DSA', 62.21590909090909)]

### Show the minimum and maximum marks achieved per course

In [0]:
rdd = file.filter( lambda x : x != header ).map( lambda x : ( x.split(",")[3], x.split(",")[-2] ) )

In [0]:
min = rdd.reduceByKey( lambda x,y : x if x < y else y )
min.collect()

Out[20]: [('DB', '20'),
 ('Cloud', '20'),
 ('PF', '20'),
 ('MVC', '22'),
 ('OOP', '20'),
 ('DSA', '20')]

In [0]:
max = rdd.reduceByKey( lambda x,y : x if x > y else y )
max.collect()

Out[21]: [('DB', '98'),
 ('Cloud', '99'),
 ('PF', '99'),
 ('MVC', '99'),
 ('OOP', '99'),
 ('DSA', '99')]

### Show the average age of male and female students

In [0]:
rdd = file.filter( lambda x : x != header ).map( lambda x : ( x.split(",")[1], ( int(x.split(",")[0]), 1) ) )

In [0]:
rdd1 = rdd.reduceByKey( lambda x,y : ( x[0]+y[0] , x[1]+y[1] ) )

In [0]:
rdd2 = rdd1.map( lambda x : ( x[0] , x[1][0]/x[1][1] ) )

In [0]:
male = rdd2.filter( lambda x : x[0] == "Male" )
male.collect()

Out[25]: [('Male', 28.52304609218437)]

In [0]:
female = rdd2.filter( lambda x : x[0] == "Female" )
female.collect()

Out[26]: [('Female', 28.489021956087825)]