# **Graduate Admission Prediction with PySpark**

# TASK 1 : Install Dependencies & Run a SparkSession


In [None]:
#install pyspark
! pip install pyspark



In [None]:
# Build the Spark session used by every later cell (application name: 'spark_app')
from pyspark.sql import SparkSession
session_builder = SparkSession.builder.appName('spark_app')
spark = session_builder.getOrCreate()

# TASK 2 : Clone & Explore dataset

In [None]:
#clone the dataset
! git clone https://github.com/education454/admission_dataset

fatal: destination path 'admission_dataset' already exists and is not an empty directory.


In [None]:
# Check that the clone succeeded: list the repository directory, which should
# contain the admissions CSV file loaded in the next cell
! ls admission_dataset 

Admission_Predict_Ver1.1.csv


In [None]:
# Load the admissions CSV into a Spark DataFrame.
# The path is relative to the working directory the repository was cloned into,
# so the notebook does not depend on a Colab-specific '/content' location.
# header=True takes the column names from the first row; inferSchema=True makes
# Spark infer the column types from the data.
df = spark.read.csv('admission_dataset/Admission_Predict_Ver1.1.csv', header=True, inferSchema=True)

In [None]:
# Preview the first rows of the freshly loaded DataFrame
df.show()

+---------+---------+-----------+-----------------+---+---+----+--------+---------------+
|Serial No|GRE Score|TOEFL Score|University Rating|SOP|LOR|CGPA|Research|Chance of Admit|
+---------+---------+-----------+-----------------+---+---+----+--------+---------------+
|        1|      337|        118|                4|4.5|4.5|9.65|       1|           0.92|
|        2|      324|        107|                4|4.0|4.5|8.87|       1|           0.76|
|        3|      316|        104|                3|3.0|3.5| 8.0|       1|           0.72|
|        4|      322|        110|                3|3.5|2.5|8.67|       1|            0.8|
|        5|      314|        103|                2|2.0|3.0|8.21|       0|           0.65|
|        6|      330|        115|                5|4.5|3.0|9.34|       1|            0.9|
|        7|      321|        109|                3|3.0|4.0| 8.2|       1|           0.75|
|        8|      308|        101|                2|3.0|4.0| 7.9|       0|           0.68|
|        9

In [None]:
# Report the DataFrame shape as a (rows, columns) tuple
n_rows = df.count()
n_cols = len(df.columns)
print((n_rows, n_cols))

(500, 9)


In [None]:
# Print each column's name, inferred type and nullability
df.printSchema()

root
 |-- Serial No: integer (nullable = true)
 |-- GRE Score: integer (nullable = true)
 |-- TOEFL Score: integer (nullable = true)
 |-- University Rating: integer (nullable = true)
 |-- SOP: double (nullable = true)
 |-- LOR: double (nullable = true)
 |-- CGPA: double (nullable = true)
 |-- Research: integer (nullable = true)
 |-- Chance of Admit: double (nullable = true)



In [None]:
# Compute summary statistics (count, mean, stddev, ...) for every column and display them
summary_stats = df.describe()
summary_stats.show()

+-------+-----------------+------------------+-----------------+-----------------+------------------+------------------+------------------+------------------+-------------------+
|summary|        Serial No|         GRE Score|      TOEFL Score|University Rating|               SOP|               LOR|              CGPA|          Research|    Chance of Admit|
+-------+-----------------+------------------+-----------------+-----------------+------------------+------------------+------------------+------------------+-------------------+
|  count|              500|               500|              500|              500|               500|               500|               500|               500|                500|
|   mean|            250.5|           316.472|          107.192|            3.114|             3.374|             3.484| 8.576440000000003|              0.56| 0.7217399999999996|
| stddev|144.4818327679989|11.295148372354712|6.081867659564538|1.143511800759815|0.9910036207566072|0.92

# TASK 3 : Data Cleaning

In [None]:
# Drop 'Serial No': it is only a running row number (1, 2, 3, ...) and carries
# no information about the chance of admission
df = df.drop('Serial No')

In [None]:
# Preview the DataFrame again to confirm 'Serial No' is gone
df.show()

+---------+-----------+-----------------+---+---+----+--------+---------------+
|GRE Score|TOEFL Score|University Rating|SOP|LOR|CGPA|Research|Chance of Admit|
+---------+-----------+-----------------+---+---+----+--------+---------------+
|      337|        118|                4|4.5|4.5|9.65|       1|           0.92|
|      324|        107|                4|4.0|4.5|8.87|       1|           0.76|
|      316|        104|                3|3.0|3.5| 8.0|       1|           0.72|
|      322|        110|                3|3.5|2.5|8.67|       1|            0.8|
|      314|        103|                2|2.0|3.0|8.21|       0|           0.65|
|      330|        115|                5|4.5|3.0|9.34|       1|            0.9|
|      321|        109|                3|3.0|4.0| 8.2|       1|           0.75|
|      308|        101|                2|3.0|4.0| 7.9|       0|           0.68|
|      302|        102|                1|2.0|1.5| 8.0|       0|            0.5|
|      323|        108|                3

In [None]:
# Count the rows holding a null in each column; all zeros means nothing to impute
for column_name in df.columns:
  rows_with_null = df.filter(df[column_name].isNull())
  print(column_name + ":", rows_with_null.count())

GRE Score: 0
TOEFL Score: 0
University Rating: 0
SOP: 0
LOR: 0
CGPA: 0
Research: 0
Chance of Admit: 0


# TASK 4 : Correlation Analysis & Feature Selection

In [None]:
# Correlation of every column with the target column 'Chance of Admit'
# (the target itself is included, so its own line reports 1.0)
target_column = 'Chance of Admit'
report_line = 'Correlation to chance of admit for {} column is {}.'
for column_name in df.columns:
  correlation = df.stat.corr(column_name, target_column)
  print(report_line.format(column_name, correlation))

Correlation to chance of admit for GRE Score column is 0.8103506354632601.
Correlation to chance of admit for TOEFL Score column is 0.7922276143050825.
Correlation to chance of admit for University Rating column is 0.6901323687886894.
Correlation to chance of admit for SOP column is 0.6841365241316723.
Correlation to chance of admit for LOR column is 0.645364513528011.
Correlation to chance of admit for CGPA column is 0.882412574904574.
Correlation to chance of admit for Research column is 0.5458710294711379.
Correlation to chance of admit for Chance of Admit column is 1.0.


In [None]:
# Feature selection: keep the three columns most correlated with the target
# and pack them into a single vector column named 'features'
from pyspark.ml.feature import VectorAssembler
selected_columns = ['GRE Score', 'TOEFL Score', 'CGPA']
assembler = VectorAssembler(inputCols=selected_columns, outputCol='features')

In [None]:
# Apply the assembler to add the 'features' vector column to the DataFrame,
# then display the result
output_data = assembler.transform(df)
output_data.show()

Exception ignored in: <function JavaWrapper.__del__ at 0x7ff59e543e60>
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/pyspark/ml/wrapper.py", line 39, in __del__
    if SparkContext._active_spark_context and self._java_obj is not None:
AttributeError: 'RegressionEvaluator' object has no attribute '_java_obj'


+---------+-----------+-----------------+---+---+----+--------+---------------+------------------+
|GRE Score|TOEFL Score|University Rating|SOP|LOR|CGPA|Research|Chance of Admit|          features|
+---------+-----------+-----------------+---+---+----+--------+---------------+------------------+
|      337|        118|                4|4.5|4.5|9.65|       1|           0.92|[337.0,118.0,9.65]|
|      324|        107|                4|4.0|4.5|8.87|       1|           0.76|[324.0,107.0,8.87]|
|      316|        104|                3|3.0|3.5| 8.0|       1|           0.72| [316.0,104.0,8.0]|
|      322|        110|                3|3.5|2.5|8.67|       1|            0.8|[322.0,110.0,8.67]|
|      314|        103|                2|2.0|3.0|8.21|       0|           0.65|[314.0,103.0,8.21]|
|      330|        115|                5|4.5|3.0|9.34|       1|            0.9|[330.0,115.0,9.34]|
|      321|        109|                3|3.0|4.0| 8.2|       1|           0.75| [321.0,109.0,8.2]|
|      308

# TASK 5 : Build the Linear Regression Model

In [None]:
# Import the estimator and keep only what the model needs:
# the assembled feature vector and the label column
from pyspark.ml.regression import LinearRegression
model_columns = ['features', 'Chance of Admit']
final_data = output_data.select(model_columns)

In [None]:
# Confirm the modelling DataFrame holds just the feature vector and the label
final_data.printSchema()

root
 |-- features: vector (nullable = true)
 |-- Chance of Admit: double (nullable = true)



In [None]:
# Split the data roughly 70% / 30% into training and test sets.
# A fixed seed makes the split, and therefore every metric reported below,
# reproducible across kernel restarts.
train, test = final_data.randomSplit([0.7, 0.3], seed=42)

In [None]:
# Configure the linear regression estimator (`models`) and fit it on the
# training split to obtain the trained model (`model`)
estimator_params = {'featuresCol': 'features', 'labelCol': 'Chance of Admit'}
models = LinearRegression(**estimator_params)
model = models.fit(train)

In [None]:
# Show the learned parameters: one coefficient per feature, plus the intercept
learned_parameters = (
    ('coefficients:', model.coefficients),
    ('intercept:', model.intercept),
)
for label, value in learned_parameters:
    print(label, value)

coefficients: [0.002812303047931625,0.0027355376407363114,0.13889142744916452]
intercept: -1.6511667588615981


In [None]:
# Fetch the summary object attached to the fitted model; the next cell reads
# its error metrics (RMSE and r2)
summary = model.summary

In [None]:
# Print the root mean squared error (RMSE) and the r2 score taken from the
# model summary
print('RMSE', summary.rootMeanSquaredError)
print('r2 score', summary.r2)

RSME 0.06216764571408949
r2 score 0.8052063974543022


# TASK 6 : Evaluate & Save the Model

In [None]:
# Score the held-out test split with the trained model
predictions = model.transform(test)

In [None]:
# Display the predictions next to the true 'Chance of Admit' values
predictions.show()

+------------------+---------------+------------------+
|          features|Chance of Admit|        prediction|
+------------------+---------------+------------------+
|  [293.0,97.0,7.8]|           0.64|0.5215383194372738|
|[296.0,101.0,7.68]|            0.6|0.5242504078501138|
| [297.0,99.0,7.81]|           0.54|0.5396475211849643|
|  [298.0,99.0,7.6]|           0.46|0.5132926244685712|
|[298.0,100.0,7.95]|           0.58|0.5646401617165149|
|[298.0,101.0,7.86]|           0.54|0.5548754708868266|
|[298.0,105.0,8.54]|           0.69|0.6602637921152035|
|[299.0,100.0,7.42]|           0.42|0.4938400082163896|
|[300.0,104.0,8.16]|           0.71|0.6103741181396483|
| [301.0,99.0,8.22]|           0.64|0.6078422186308485|
|[301.0,104.0,7.89]|           0.68|0.5756857357763054|
|[301.0,104.0,8.12]|           0.68|0.6076307640896135|
|[302.0,101.0,7.96]|           0.46|0.5800138258234695|
| [302.0,102.0,8.0]|            0.5|0.5883050205621727|
| [303.0,99.0,7.66]|           0.36|0.5356876253

In [None]:
# Evaluate on the test split: r2 between the predicted and the true label
from pyspark.ml.evaluation import RegressionEvaluator
evaluator = RegressionEvaluator(
    metricName='r2',
    labelCol='Chance of Admit',
    predictionCol='prediction',
)
test_r2 = evaluator.evaluate(predictions)
print('r2 on the test data', test_r2)

r2 on the test data 0.8009695017958209


In [None]:
# Persist the trained model to the 'admission_model' directory.
# overwrite() replaces a copy left by an earlier run; a plain model.save()
# raises when the target path already exists, which breaks re-running the notebook.
model.write().overwrite().save("admission_model")

In [None]:
# Load the saved model back from disk; note that this rebinds `model` to the
# loaded copy
from pyspark.ml.regression import LinearRegressionModel
model = LinearRegressionModel.load("admission_model")