# TASK 1 : Install Dependencies & Run a SparkSession


In [None]:
#install pyspark
! pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.4.0.tar.gz (310.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.8/310.8 MB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.4.0-py2.py3-none-any.whl size=311317130 sha256=764790a6326a57a3441833dfcd80a0be349307decb69e00d44bad1582edd0c24
  Stored in directory: /root/.cache/pip/wheels/7b/1b/4b/3363a1d04368e7ff0d408e57ff57966fcdf00583774e761327
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.4.0


In [None]:
#create a sparksession object
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("spark").getOrCreate()

# TASK 2 : Load & Explore dataset

In [None]:
#check the presence of dataset
!ls

Admission_Predict_Ver1.1.csv  sample_data


In [None]:
#create a spark dataframe
data = spark.read.csv("Admission_Predict_Ver1.1.csv", header=True, inferSchema=True)

In [None]:
#display dataframe, top 5
data.show(5)

+---------+---------+-----------+-----------------+---+---+----+--------+---------------+
|Serial No|GRE Score|TOEFL Score|University Rating|SOP|LOR|CGPA|Research|Chance of Admit|
+---------+---------+-----------+-----------------+---+---+----+--------+---------------+
|        1|      337|        118|                4|4.5|4.5|9.65|       1|           0.92|
|        2|      324|        107|                4|4.0|4.5|8.87|       1|           0.76|
|        3|      316|        104|                3|3.0|3.5| 8.0|       1|           0.72|
|        4|      322|        110|                3|3.5|2.5|8.67|       1|            0.8|
|        5|      314|        103|                2|2.0|3.0|8.21|       0|           0.65|
+---------+---------+-----------+-----------------+---+---+----+--------+---------------+
only showing top 5 rows



In [None]:
#display the no.of rows & columns
print((data.count(), len(data.columns)))

(500, 9)


In [9]:
#print schema 
data.printSchema()


root
 |-- Serial No: integer (nullable = true)
 |-- GRE Score: integer (nullable = true)
 |-- TOEFL Score: integer (nullable = true)
 |-- University Rating: integer (nullable = true)
 |-- SOP: double (nullable = true)
 |-- LOR: double (nullable = true)
 |-- CGPA: double (nullable = true)
 |-- Research: integer (nullable = true)
 |-- Chance of Admit: double (nullable = true)



In [10]:
#get the summary statistics
data.describe().toPandas().transpose()


Unnamed: 0,0,1,2,3,4
summary,count,mean,stddev,min,max
Serial No,500,250.5,144.4818327679989,1,500
GRE Score,500,316.472,11.295148372354712,290,340
TOEFL Score,500,107.192,6.081867659564538,92,120
University Rating,500,3.114,1.143511800759815,1,5
SOP,500,3.374,0.9910036207566072,1.0,5.0
LOR,500,3.484,0.9254495738978191,1.0,5.0
CGPA,500,8.576440000000003,0.6048128003332054,6.8,9.92
Research,500,0.56,0.4968840786090358,0,1
Chance of Admit,500,0.7217399999999996,0.14114040395030228,0.34,0.97


# TASK 3 : Data Cleaning

In [19]:
#drop the unnecessary column
data = data.drop('Research')


In [20]:
#validate the unnecessary column is dropped
data.columns

['Serial No',
 'GRE Score',
 'TOEFL Score',
 'University Rating',
 'SOP',
 'LOR',
 'CGPA',
 'Chance of Admit']

In [18]:
#check for null values
for i in data.columns:
  print(i+":", data[data[i].isNull()].count())

Serial No: 0
GRE Score: 0
TOEFL Score: 0
University Rating: 0
SOP: 0
LOR: 0
CGPA: 0
Research: 0
Chance of Admit: 0


# TASK 4 : Correlation Analysis & Feature Selection

In [11]:
# correlation analysis
for col in data.columns:
  print('Correlation to chance of admin col for {} is {}'.format(col, data.stat.corr('Chance of Admit', col)))

Correlation to chance of admin col for Serial No is 0.00850504936113174
Correlation to chance of admin col for GRE Score is 0.8103506354632598
Correlation to chance of admin col for TOEFL Score is 0.7922276143050823
Correlation to chance of admin col for University Rating is 0.6901323687886892
Correlation to chance of admin col for SOP is 0.6841365241316723
Correlation to chance of admin col for LOR is 0.6453645135280112
Correlation to chance of admin col for CGPA is 0.882412574904574
Correlation to chance of admin col for Research is 0.5458710294711379
Correlation to chance of admin col for Chance of Admit is 1.0


In [21]:
# feature selection; pick 3 features based on their class correlation
from pyspark.ml.feature import VectorAssembler
assembler = VectorAssembler(inputCols = ['GRE Score','TOEFL Score','CGPA'],outputCol = 'features')

In [30]:
#display the resultant dataframe
output_data = assembler.transform(data)
output_data.show()

+---------+---------+-----------+-----------------+---+---+----+---------------+------------------+
|Serial No|GRE Score|TOEFL Score|University Rating|SOP|LOR|CGPA|Chance of Admit|          features|
+---------+---------+-----------+-----------------+---+---+----+---------------+------------------+
|        1|      337|        118|                4|4.5|4.5|9.65|           0.92|[337.0,118.0,9.65]|
|        2|      324|        107|                4|4.0|4.5|8.87|           0.76|[324.0,107.0,8.87]|
|        3|      316|        104|                3|3.0|3.5| 8.0|           0.72| [316.0,104.0,8.0]|
|        4|      322|        110|                3|3.5|2.5|8.67|            0.8|[322.0,110.0,8.67]|
|        5|      314|        103|                2|2.0|3.0|8.21|           0.65|[314.0,103.0,8.21]|
|        6|      330|        115|                5|4.5|3.0|9.34|            0.9|[330.0,115.0,9.34]|
|        7|      321|        109|                3|3.0|4.0| 8.2|           0.75| [321.0,109.0,8.2]|


# TASK 5 : Build the Linear Regression Model

In [31]:
#import Linearregression and create final_data DF ie select subset of data, 'features' and 'Chance of Admit'
from pyspark.ml.regression import LinearRegression
final_data = output_data.select('features','Chance of Admit')

In [25]:
#print schema of final data
final_data.printSchema()

root
 |-- features: vector (nullable = true)
 |-- Chance of Admit: double (nullable = true)



In [32]:
#split the dataset into training and testing set with a seed value of 123
train, test = final_data.randomSplit([0.8, 0.2], seed=123)


In [33]:
train.show()

+------------------+---------------+
|          features|Chance of Admit|
+------------------+---------------+
|[290.0,100.0,7.56]|           0.47|
|[290.0,104.0,7.46]|           0.45|
| [294.0,93.0,7.36]|           0.46|
| [294.0,95.0,7.64]|           0.49|
|  [295.0,93.0,7.2]|           0.46|
| [295.0,99.0,7.57]|           0.37|
| [295.0,99.0,7.65]|           0.57|
|[295.0,101.0,7.86]|           0.69|
| [296.0,95.0,7.54]|           0.44|
|  [296.0,97.0,7.8]|           0.49|
|[296.0,101.0,7.68]|            0.6|
| [297.0,96.0,7.43]|           0.34|
| [297.0,96.0,7.89]|           0.43|
| [297.0,98.0,7.67]|           0.59|
| [297.0,100.0,7.9]|           0.52|
|[297.0,101.0,7.67]|           0.57|
| [298.0,92.0,7.88]|           0.51|
| [298.0,97.0,7.21]|           0.45|
| [298.0,98.0,8.03]|           0.34|
| [298.0,99.0,7.46]|           0.53|
+------------------+---------------+
only showing top 20 rows



In [34]:
#build & train the LinearRegression model
models = LinearRegression(featuresCol = 'features',labelCol = 'Chance of Admit')
model = models.fit(train)

In [35]:
#get coefficients & intercept  (y = mx + c) i.e. model specific information
print("coefficients:",model.coefficients)
print("intercept:",model.intercept)

coefficients: [0.0022656083234338467,0.003943604439786189,0.1357809197519976]
intercept: -1.5834317832688576


In [36]:
#get summary of the model
summary = model.summary

In [37]:
#print the mae, rmse & r2 score
print('MAE', summary.meanAbsoluteError)
print('RMSE',summary.rootMeanSquaredError)
print('r2 score', summary.r2)

MAE 0.04530729939838914
RMSE 0.06334892921559741
r2 score 0.7964903541530333


# TASK 6 : Evaluate & Save the Model

In [40]:
#transform on the test data
predictions = model.transform(test)



In [41]:
#display the predictions
predictions.show()

+------------------+---------------+-------------------+
|          features|Chance of Admit|         prediction|
+------------------+---------------+-------------------+
|  [293.0,97.0,7.8]|           0.64| 0.5220122602221011|
| [295.0,96.0,7.34]|           0.47| 0.4601406493432638|
| [296.0,99.0,7.28]|           0.47|0.46609021580093635|
| [296.0,99.0,8.03]|           0.61| 0.5679259056149346|
| [297.0,99.0,7.81]|           0.54|  0.540319711592929|
|  [298.0,98.0,7.5]|           0.44|0.49654963035345756|
|[298.0,101.0,7.86]|           0.54| 0.5572615747835352|
|[298.0,105.0,8.54]|           0.69| 0.6653670179740383|
| [299.0,97.0,7.66]|           0.38| 0.5165965813974245|
|[299.0,100.0,7.42]|           0.42| 0.4958399739763035|
|[299.0,100.0,7.88]|           0.51| 0.5582991970622226|
|[299.0,100.0,8.02]|           0.63| 0.5773085258275021|
| [299.0,106.0,8.4]|           0.64| 0.6525669019719784|
|[300.0,100.0,8.66]|           0.64| 0.6664739227922147|
| [301.0,98.0,8.03]|           

In [43]:
#evaluate the model for rmse, mae, r2 metrics
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.evaluation import RegressionEvaluator
evaluator1 = RegressionEvaluator(predictionCol = 'prediction',labelCol = 'Chance of Admit',metricName = 'r2')
evaluator2 = RegressionEvaluator(predictionCol = 'prediction',labelCol = 'Chance of Admit',metricName = 'rmse')
evaluator3 = RegressionEvaluator(predictionCol = 'prediction',labelCol = 'Chance of Admit',metricName = 'mae')

print('r2 on the test data',evaluator1.evaluate(predictions))
print('rmse on the test data',evaluator2.evaluate(predictions))
print('mae on the test data',evaluator3.evaluate(predictions))



r2 on the test data 0.8329088873418462
rmse on the test data 0.05853544314069586
mae on the test data 0.04280128115315109


In [44]:
#save the model
model.save('model')


In [45]:
#load the model
from pyspark.ml.regression import LinearRegressionModel
model = LinearRegressionModel.load('model')

# TASK 7 : Create a Random Forest Regressor and compare its results with the Linear Regression model


In [49]:
#import RandomForestRegressor and fit the data to the model
from pyspark.ml.regression import RandomForestRegressor

#build & train the model
regressor = RandomForestRegressor(featuresCol='features', labelCol='Chance of Admit')


In [57]:
modelr = regressor.fit(train)


In [58]:
#get model's number of Trees
print (modelr.featureImportances)


(3,[0,1,2],[0.26179640171419194,0.26470046959588495,0.47350312868992317])


In [60]:
#transform on the test data
pred = modelr.transform(test) #cvModel obj will automatically pick the bestModel
pred.select('Chance of Admit', 'prediction').show()

+---------------+-------------------+
|Chance of Admit|         prediction|
+---------------+-------------------+
|           0.64|0.49425567423613775|
|           0.47|0.44684923767752716|
|           0.47|    0.4802772067906|
|           0.61| 0.5012079800988192|
|           0.54| 0.5233818665149336|
|           0.44| 0.4920378488659159|
|           0.54| 0.5859917737393376|
|           0.69| 0.6407372240631993|
|           0.38| 0.5062267637947271|
|           0.42| 0.5116799095138997|
|           0.51| 0.5722134583358723|
|           0.63| 0.5842205671915427|
|           0.64| 0.6427771735581488|
|           0.64| 0.6615500595814312|
|           0.67| 0.5691984481073135|
|           0.68| 0.6319872167432464|
|           0.42| 0.5675635436370446|
|           0.38| 0.5699383124663113|
|           0.52| 0.5729310759116915|
|           0.64| 0.6381175647305585|
+---------------+-------------------+
only showing top 20 rows



In [62]:
#display the predictions
pred = cvModel.transform(test) #cvModel obj will automatically pick the bestModel
pred.select('Chance of Admit', 'prediction').show()

+---------------+-------------------+
|Chance of Admit|         prediction|
+---------------+-------------------+
|           0.64|0.49425567423613775|
|           0.47|0.44684923767752716|
|           0.47|    0.4802772067906|
|           0.61| 0.5012079800988192|
|           0.54| 0.5233818665149336|
|           0.44| 0.4920378488659159|
|           0.54| 0.5859917737393376|
|           0.69| 0.6407372240631993|
|           0.38| 0.5062267637947271|
|           0.42| 0.5116799095138997|
|           0.51| 0.5722134583358723|
|           0.63| 0.5842205671915427|
|           0.64| 0.6427771735581488|
|           0.64| 0.6615500595814312|
|           0.67| 0.5691984481073135|
|           0.68| 0.6319872167432464|
|           0.42| 0.5675635436370446|
|           0.38| 0.5699383124663113|
|           0.52| 0.5729310759116915|
|           0.64| 0.6381175647305585|
+---------------+-------------------+
only showing top 20 rows



In [65]:
#evaluate the model for rmse, mae and r2 metrics
from pyspark.ml.evaluation import RegressionEvaluator
eval = RegressionEvaluator(labelCol='Chance of Admit')
rmse = eval.evaluate(pred)
mse = eval.evaluate(pred, {eval.metricName: 'mse'})
mae = eval.evaluate(pred, {eval.metricName: 'mae'})
r2 = eval.evaluate(pred, {eval.metricName: 'r2'})

print("RMSE: %.3f" %rmse)
print("MSE: %.3f" %mse)
print("MAE: %.3f" %mae) # mean abs error: avg error +ve, -ve expected in predictions
print("r2: %.3f" %r2)

RMSE: 0.062
MSE: 0.004
MAE: 0.045
r2: 0.815
