# TASK 1 : Install Dependencies & Run a SparkSession


In [None]:
#install pyspark
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.2.0.tar.gz (281.3 MB)
[K     |████████████████████████████████| 281.3 MB 33 kB/s 
[?25hCollecting py4j==0.10.9.2
  Downloading py4j-0.10.9.2-py2.py3-none-any.whl (198 kB)
[K     |████████████████████████████████| 198 kB 49.9 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.2.0-py2.py3-none-any.whl size=281805912 sha256=2165bdd4d1cc2ad4837f45f0eacdc314ce8d779de26d574706f754b26bc4fef3
  Stored in directory: /root/.cache/pip/wheels/0b/de/d2/9be5d59d7331c6c2a7c1b6d1a4f463ce107332b1ecd4e80718
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.2 pyspark-3.2.0


In [None]:
# Start a Spark session named "spark", or reuse the one that is already running.
from pyspark.sql import SparkSession

session_builder = SparkSession.builder.appName("spark")
spark = session_builder.getOrCreate()

# Leave the session as the last expression so the notebook renders it.
spark

# TASK 2 : Clone & Explore dataset

In [None]:
#clone the dataset
! git clone https://github.com/education454/admission_dataset

Cloning into 'admission_dataset'...
remote: Enumerating objects: 3, done.[K
remote: Counting objects: 100% (3/3), done.[K
remote: Compressing objects: 100% (3/3), done.[K
remote: Total 3 (delta 0), reused 0 (delta 0), pack-reused 0[K
Unpacking objects: 100% (3/3), done.


In [None]:
# Check that the clone produced the dataset directory and list the files in it
!ls admission_dataset

Admission_Predict_Ver1.1.csv


In [None]:
import os

# Show the current working directory (the repository was cloned relative to it)
os.getcwd()

'/content'

In [None]:
# Create a Spark DataFrame from the cloned CSV file.
# The path is derived from the working directory the repository was cloned
# into rather than a hard-coded "/content/..." prefix, so the cell also runs
# outside Colab. It is made absolute so the result does not depend on how Spark
# resolves relative paths.
import os

csv_path = os.path.abspath(
    os.path.join("admission_dataset", "Admission_Predict_Ver1.1.csv")
)

# header=True    -> take the column names from the first row
# inferSchema=True -> let Spark detect the column types from the data
df = spark.read.csv(csv_path, header=True, inferSchema=True)

In [None]:
# Display the first 5 rows of the DataFrame
df.show(5)

+---------+---------+-----------+-----------------+---+---+----+--------+---------------+
|Serial No|GRE Score|TOEFL Score|University Rating|SOP|LOR|CGPA|Research|Chance of Admit|
+---------+---------+-----------+-----------------+---+---+----+--------+---------------+
|        1|      337|        118|                4|4.5|4.5|9.65|       1|           0.92|
|        2|      324|        107|                4|4.0|4.5|8.87|       1|           0.76|
|        3|      316|        104|                3|3.0|3.5| 8.0|       1|           0.72|
|        4|      322|        110|                3|3.5|2.5|8.67|       1|            0.8|
|        5|      314|        103|                2|2.0|3.0|8.21|       0|           0.65|
+---------+---------+-----------+-----------------+---+---+----+--------+---------------+
only showing top 5 rows



In [None]:
# Report the DataFrame dimensions as a (rows, columns) tuple
row_count = df.count()
column_count = len(df.columns)
print((row_count, column_count))

(500, 9)


In [None]:
# Print the column names and the types inferred while reading the CSV
df.printSchema()

root
 |-- Serial No: integer (nullable = true)
 |-- GRE Score: integer (nullable = true)
 |-- TOEFL Score: integer (nullable = true)
 |-- University Rating: integer (nullable = true)
 |-- SOP: double (nullable = true)
 |-- LOR: double (nullable = true)
 |-- CGPA: double (nullable = true)
 |-- Research: integer (nullable = true)
 |-- Chance of Admit: double (nullable = true)



In [None]:
# Display summary statistics (count, mean, stddev, ...) for every column
df.describe().show()

+-------+-----------------+------------------+-----------------+-----------------+------------------+------------------+------------------+------------------+-------------------+
|summary|        Serial No|         GRE Score|      TOEFL Score|University Rating|               SOP|               LOR|              CGPA|          Research|    Chance of Admit|
+-------+-----------------+------------------+-----------------+-----------------+------------------+------------------+------------------+------------------+-------------------+
|  count|              500|               500|              500|              500|               500|               500|               500|               500|                500|
|   mean|            250.5|           316.472|          107.192|            3.114|             3.374|             3.484| 8.576440000000003|              0.56| 0.7217399999999996|
| stddev|144.4818327679989|11.295148372354712|6.081867659564538|1.143511800759815|0.9910036207566072|0.92

# TASK 3 : Data Cleaning

In [None]:
# Drop the "Serial No" column and rebind df to the reduced DataFrame.
# NOTE(review): the column looks like a running row index (1, 2, 3, ... in the
# preview) with no predictive meaning - confirm against the dataset description.

df = df.drop("Serial No")

In [None]:
# Display the DataFrame again to confirm "Serial No" is gone
df.show()

+---------+-----------+-----------------+---+---+----+--------+---------------+
|GRE Score|TOEFL Score|University Rating|SOP|LOR|CGPA|Research|Chance of Admit|
+---------+-----------+-----------------+---+---+----+--------+---------------+
|      337|        118|                4|4.5|4.5|9.65|       1|           0.92|
|      324|        107|                4|4.0|4.5|8.87|       1|           0.76|
|      316|        104|                3|3.0|3.5| 8.0|       1|           0.72|
|      322|        110|                3|3.5|2.5|8.67|       1|            0.8|
|      314|        103|                2|2.0|3.0|8.21|       0|           0.65|
|      330|        115|                5|4.5|3.0|9.34|       1|            0.9|
|      321|        109|                3|3.0|4.0| 8.2|       1|           0.75|
|      308|        101|                2|3.0|4.0| 7.9|       0|           0.68|
|      302|        102|                1|2.0|1.5| 8.0|       0|            0.5|
|      323|        108|                3

In [None]:
# Count the rows with a missing value, one column at a time

for column_name in df.columns:
  rows_with_null = df.filter(df[column_name].isNull())
  print(column_name, ":", rows_with_null.count())

GRE Score : 0
TOEFL Score : 0
University Rating : 0
SOP : 0
LOR : 0
CGPA : 0
Research : 0
Chance of Admit : 0


# TASK 4 : Correlation Analysis & Feature Selection

In [None]:
# Correlation of every column (the target itself included) with the target
target_column = "Chance of Admit"
for column_name in df.columns:
  correlation = round(df.stat.corr(target_column, column_name), 2)
  print("Correlation to chance of admit for {} is {} " .format(column_name, correlation))

Correlation to chance of admit for GRE Score is 0.81 
Correlation to chance of admit for TOEFL Score is 0.79 
Correlation to chance of admit for University Rating is 0.69 
Correlation to chance of admit for SOP is 0.68 
Correlation to chance of admit for LOR is 0.65 
Correlation to chance of admit for CGPA is 0.88 
Correlation to chance of admit for Research is 0.55 
Correlation to chance of admit for Chance of Admit is 1.0 


In [None]:
# Feature selection: keep the three columns that showed the highest correlation
# with "Chance of Admit" in the analysis above and pack them into one vector.

from pyspark.ml.feature import VectorAssembler

feature_columns = ["GRE Score", "TOEFL Score", "CGPA"]
assembler = VectorAssembler(
    inputCols=feature_columns,
    outputCol="features",
)

In [None]:
# Run the assembler over df, which adds the "features" vector column,
# then display the resulting DataFrame

output_data = assembler.transform(df)
output_data.show()

+---------+-----------+-----------------+---+---+----+--------+---------------+------------------+
|GRE Score|TOEFL Score|University Rating|SOP|LOR|CGPA|Research|Chance of Admit|          features|
+---------+-----------+-----------------+---+---+----+--------+---------------+------------------+
|      337|        118|                4|4.5|4.5|9.65|       1|           0.92|[337.0,118.0,9.65]|
|      324|        107|                4|4.0|4.5|8.87|       1|           0.76|[324.0,107.0,8.87]|
|      316|        104|                3|3.0|3.5| 8.0|       1|           0.72| [316.0,104.0,8.0]|
|      322|        110|                3|3.5|2.5|8.67|       1|            0.8|[322.0,110.0,8.67]|
|      314|        103|                2|2.0|3.0|8.21|       0|           0.65|[314.0,103.0,8.21]|
|      330|        115|                5|4.5|3.0|9.34|       1|            0.9|[330.0,115.0,9.34]|
|      321|        109|                3|3.0|4.0| 8.2|       1|           0.75| [321.0,109.0,8.2]|
|      308

# TASK 5 : Build the Linear Regression Model

In [None]:
# Import the estimator and reduce the data to the two columns the model needs:
# the assembled feature vector and the label.

from pyspark.ml.regression import LinearRegression

model_columns = ("features", "Chance of Admit")
final_data = output_data.select(*model_columns)

In [None]:
# Print the schema of final_data to confirm only "features" and the label remain

final_data.printSchema()

root
 |-- features: vector (nullable = true)
 |-- Chance of Admit: double (nullable = true)



In [None]:
# Split the dataset into a training (70%) and a testing (30%) set.
# A fixed seed makes the split reproducible, so the coefficients and metrics
# computed from it no longer change on every run of the notebook.

SPLIT_SEED = 42
train, test = final_data.randomSplit([0.7, 0.3], seed=SPLIT_SEED)

In [None]:
# Build the estimator (`models`), then fit it on the training split to obtain
# the trained model (`model`).

models = LinearRegression(
    labelCol="Chance of Admit",
    featuresCol="features",
)
model = models.fit(train)

In [None]:
# Print the fitted coefficients and the intercept.
# NOTE(review): presumably one coefficient per assembled feature, in inputCols
# order (GRE Score, TOEFL Score, CGPA) - verify.
print("Coefficients: ", model.coefficients)
print("Intercept: ", model.intercept)

Coefficients:  [0.0029362489240756084,0.0031225226548558696,0.13456493107421272]
Intercept:  -1.6944506547748606


In [None]:
# Keep a handle on the fitted model's summary object; the RMSE and R2 printed
# in the next cell are read from it

summary= model.summary

In [None]:
# Read RMSE and R2 from the model summary and print them

fit_rmse = summary.rootMeanSquaredError
fit_r2 = summary.r2
print("RMSE: ", fit_rmse)
print("R2: ", fit_r2)


RMSE:  0.0632043430659309
R2:  0.7935533892431867


# TASK 6 : Evaluate & Save the Model

In [None]:
# Apply the fitted model to the test split; the result carries a "prediction"
# column next to the features and the true label

predictions = model.transform(test)

In [None]:
# Display the first 20 rows of predictions alongside the true label
predictions.show(20)

+------------------+---------------+-------------------+
|          features|Chance of Admit|         prediction|
+------------------+---------------+-------------------+
|  [293.0,97.0,7.8]|           0.64| 0.5183614398791712|
| [294.0,95.0,7.64]|           0.49| 0.4935222545216609|
|  [295.0,93.0,7.2]|           0.46|0.43100488846337126|
| [296.0,95.0,7.54]|           0.44|0.48593825926239087|
| [296.0,99.0,8.03]|           0.61| 0.5643651661081786|
| [297.0,99.0,7.81]|           0.54| 0.5376971301959277|
|[297.0,101.0,7.67]|           0.57| 0.5251030851552494|
| [298.0,99.0,7.46]|           0.53| 0.4935356532440289|
|  [298.0,99.0,7.6]|           0.46| 0.5123747435944186|
|[298.0,101.0,7.69]|           0.53| 0.5307306327008094|
|[298.0,105.0,8.54]|           0.69| 0.6576009147333137|
| [299.0,97.0,7.66]|           0.38| 0.5171398430732352|
|[299.0,100.0,7.88]|           0.51| 0.5561116958741297|
|[299.0,100.0,7.88]|           0.68| 0.5561116958741297|
| [299.0,106.0,8.4]|           

In [None]:
from pyspark.ml import evaluation
# Evaluate the model on the test-set predictions.
# Both R2 and RMSE are reported so the test results can be compared directly
# with the RMSE / R2 pair printed for the fitted model earlier.

from pyspark.ml.evaluation import RegressionEvaluator

evaluator = RegressionEvaluator(predictionCol="prediction", labelCol="Chance of Admit" , metricName="r2")
print("R2 on the test data is ", evaluator.evaluate(predictions))

# Passing a param map overrides metricName for this one call only, so the same
# evaluator (still configured for r2) can be reused for the second metric.
print("RMSE on the test data is ", evaluator.evaluate(predictions, {evaluator.metricName: "rmse"}))

R2 on the test data is  0.8241453759038979


In [None]:
# Save the model to the "ChanceOfAdmit_linear_model" directory.
# write().overwrite() replaces a directory left behind by an earlier run; a
# plain save() raises when the target path already exists, which made this
# cell fail the second time it was executed.

model.write().overwrite().save("ChanceOfAdmit_linear_model")

In [None]:
# Load the model back from the "ChanceOfAdmit_linear_model" directory.
# Note: this rebinds `model`, replacing the in-memory fitted model with the
# loaded one.

from pyspark.ml.regression import LinearRegressionModel
model= LinearRegressionModel.load("ChanceOfAdmit_linear_model")