In [1]:
import os
import sys

os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

import findspark
findspark.init()
findspark.find()
import pyspark

from pyspark.sql import SparkSession
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext

conf = pyspark.SparkConf().setAppName('appName').setMaster('local')
sc = pyspark.SparkContext(conf=conf)
spark = SparkSession(sc)
sqlcontext=SQLContext(sc)

In [2]:
examDF=spark.read.csv("dataset.csv",header=True,inferSchema=True)

In [3]:
examDF.show()
examDF.columns

+-------+-----------------------------+---------+
|Student|% achieved in the assignments|Pass exam|
+-------+-----------------------------+---------+
|  Alice|                          70%|      Yes|
|    Bob|                          36%|       No|
|  Carol|                          95%|      Yes|
|    Dan|                          63%|      Yes|
|    Eve|                          43%|       No|
|  Frank|                          84%|      Yes|
|  Grace|                          54%|      Yes|
|  Heidi|                          15%|       No|
|   Ivan|                          21%|       No|
|   Judy|                          91%|      Yes|
|Mallory|                          34%|       No|
|   null|                         null|     null|
+-------+-----------------------------+---------+



['Student', '% achieved in the assignments', 'Pass exam']

In [4]:
examDF.printSchema()

root
 |-- Student: string (nullable = true)
 |-- % achieved in the assignments: string (nullable = true)
 |-- Pass exam: string (nullable = true)



In [5]:
examDF=examDF.withColumnRenamed("% achieved in the assignments","%Achieved")
examDF.columns

['Student', '%Achieved', 'Pass exam']

In [6]:
from pyspark.sql.types import *
from pyspark.sql.functions import *
def p2n(x): #Percentage to integer // removing percentage sign
    return int(x.strip('%'))
p2nudf=udf(lambda x: p2n(x),IntegerType())


In [7]:
examDF=examDF.withColumn("%Achieved",p2nudf(examDF["%Achieved"]))

In [8]:
examDF.show()

PythonException: 
  An exception was thrown from the Python worker. Please see the stack trace below.
Traceback (most recent call last):
  File "C:\Program Files\spark-3.1.1-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\worker.py", line 604, in main
  File "C:\Program Files\spark-3.1.1-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\worker.py", line 596, in process
  File "C:\Program Files\spark-3.1.1-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\serializers.py", line 211, in dump_stream
    self.serializer.dump_stream(self._batched(iterator), stream)
  File "C:\Program Files\spark-3.1.1-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\serializers.py", line 132, in dump_stream
    for obj in iterator:
  File "C:\Program Files\spark-3.1.1-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\serializers.py", line 200, in _batched
    for item in iterator:
  File "C:\Program Files\spark-3.1.1-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\worker.py", line 450, in mapper
  File "C:\Program Files\spark-3.1.1-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\worker.py", line 450, in <genexpr>
  File "C:\Program Files\spark-3.1.1-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\worker.py", line 85, in <lambda>
  File "C:\Program Files\spark-3.1.1-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\util.py", line 73, in wrapper
    return f(*args, **kwargs)
  File "<ipython-input-6-f5b5e08cc6c1>", line 5, in <lambda>
  File "<ipython-input-6-f5b5e08cc6c1>", line 4, in p2n
AttributeError: 'NoneType' object has no attribute 'strip'


In [None]:
examDF.printSchema()

### Calculating mean,std and variance 

In [None]:
# Mean YES
examDF.filter(examDF["Pass exam"]=="Yes").groupBy("Pass exam").agg({"%Achieved":"mean"}).show()

In [None]:
# STD Yes
examDF.filter(examDF["Pass exam"]=="Yes").groupBy("Pass exam").agg({"%Achieved":"std"}).show()

In [None]:
# Variance Yes
examDF.filter(examDF["Pass exam"]=="Yes").groupBy("Pass exam").agg({"%Achieved":"variance"}).show()

In [None]:
# Mean NO
examDF.filter(examDF["Pass exam"]=="No").groupBy("Pass exam").agg({"%Achieved":"mean"}).show()

In [None]:
# STD no
examDF.filter(examDF["Pass exam"]=="No").groupBy("Pass exam").agg({"%Achieved":"std"}).show()

In [None]:
# Variance No
examDF.filter(examDF["Pass exam"]=="No").groupBy("Pass exam").agg({"%Achieved":"variance"}).show()

### Creating a Naive bayes model to predict whether a student with specific marks will pass the exam or not

In [None]:
examDF.show()
examDF.printSchema()

In [None]:
from pyspark.ml.feature import StringIndexer

In [None]:
str_indexer=StringIndexer().setInputCol("Pass exam").setOutputCol("label")

In [None]:
indexed_examDF=str_indexer.fit(examDF).transform(examDF)

In [None]:
indexed_examDF.show()
indexed_examDF.printSchema()

In [None]:
from pyspark.ml.feature import VectorAssembler

In [None]:
va=VectorAssembler().setInputCols(["%Achieved"]).setOutputCol("features")

In [None]:
training_DF=va.transform(indexed_examDF)

In [None]:
training_DF.show()

In [None]:
train,test=training_DF.randomSplit([0.7,0.3],1234)

In [None]:
from pyspark.ml.linalg import *


In [None]:
#Applying Naive bayes classification
from pyspark.ml.classification import LogisticRegression, NaiveBayes

In [None]:
lr=LogisticRegression(featuresCol="features",labelCol="label")

In [None]:
# training the data
model=lr.fit(training_DF)
#pred=model.transform(test)


In [None]:
model.predictProbability(DenseVector([49.0]))

In [None]:
#evaluation
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [None]:
evaluator=MulticlassClassificationEvaluator()

In [None]:
evaluator.evaluate(pred)

In [None]:
pred.show()

In [None]:
from pyspark.ml.linalg import *
model.predictProbability(DenseVector([48.0]))