In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from pyspark.sql import SparkSession

In [2]:
#dataset = pd.read_csv('Admission_Predict.csv')
spark = SparkSession.builder.getOrCreate()
#dataset_ps = spark.createDataFrame(dataset)
df=spark.read.csv('Admission_Predict.csv',inferSchema=True,header=True)
df.show(10)

+----------+---------+-----------+-----------------+---+----+----+--------+----+
|Serial No.|GRE Score|TOEFL Score|University Rating|SOP|LOR |CGPA|Research|  cA|
+----------+---------+-----------+-----------------+---+----+----+--------+----+
|         1|      337|        118|                4|4.5| 4.5|9.65|       1|0.92|
|         2|      324|        107|                4|4.0| 4.5|8.87|       1|0.76|
|         3|      316|        104|                3|3.0| 3.5| 8.0|       1|0.72|
|         4|      322|        110|                3|3.5| 2.5|8.67|       1| 0.8|
|         5|      314|        103|                2|2.0| 3.0|8.21|       0|0.65|
|         6|      330|        115|                5|4.5| 3.0|9.34|       1| 0.9|
|         7|      321|        109|                3|3.0| 4.0| 8.2|       1|0.75|
|         8|      308|        101|                2|3.0| 4.0| 7.9|       0|0.68|
|         9|      302|        102|                1|2.0| 1.5| 8.0|       0| 0.5|
|        10|      323|      

#  Prediction 

In [4]:
d1 = pd.DataFrame([[337,118,4,4.5,4.6,9.65,1,0.8]] , columns =['GRE Score','TOEFL Score','University Rating','SOP','LOR ','CGPA','Research','cA'])
sp = spark.createDataFrame(d1)
d1

Unnamed: 0,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,cA
0,337,118,4,4.5,4.6,9.65,1,0.8


In [21]:
df.show(5,False)

+----------+---------+-----------+-----------------+---+----+----+--------+----+
|Serial No.|GRE Score|TOEFL Score|University Rating|SOP|LOR |CGPA|Research|cA  |
+----------+---------+-----------+-----------------+---+----+----+--------+----+
|1         |337      |118        |4                |4.5|4.5 |9.65|1       |0.92|
|2         |324      |107        |4                |4.0|4.5 |8.87|1       |0.76|
|3         |316      |104        |3                |3.0|3.5 |8.0 |1       |0.72|
|4         |322      |110        |3                |3.5|2.5 |8.67|1       |0.8 |
|5         |314      |103        |2                |2.0|3.0 |8.21|0       |0.65|
+----------+---------+-----------+-----------------+---+----+----+--------+----+
only showing top 5 rows



#  Vectorizing Streams of Data Columns 

In [6]:
from pyspark.ml.linalg import Vector
from pyspark.ml.feature import VectorAssembler


In [7]:
vc = VectorAssembler(inputCols = ['GRE Score','TOEFL Score','University Rating','SOP','LOR ','CGPA','Research'] 
                     , outputCol = 'features')
features_df = vc.transform(df)
print(features_df)
sp1=vc.transform(sp)
print(sp1)

DataFrame[Serial No.: int, GRE Score: int, TOEFL Score: int, University Rating: int, SOP: double, LOR : double, CGPA: double, Research: int, cA: double, features: vector]
DataFrame[GRE Score: bigint, TOEFL Score: bigint, University Rating: bigint, SOP: double, LOR : double, CGPA: double, Research: bigint, cA: double, features: vector]


#  Selecting Features and the prediction columns 



In [8]:
model_df  = features_df.select('features','cA') 
sp1_mod = features_df.select('features','cA')
print(sp1_mod)
print(model_df)

DataFrame[features: vector, cA: double]
DataFrame[features: vector, cA: double]


In [9]:
train_df , test_df = model_df.randomSplit([0.7,0.3])
print("test:--")
print(test_df)
print("train:--")
print(train_df)
print("model:--")
print(model_df)


test:--
DataFrame[features: vector, cA: double]
train:--
DataFrame[features: vector, cA: double]
model:--
DataFrame[features: vector, cA: double]


#  Random Forest Regression

In [10]:
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator

In [11]:
dt = RandomForestRegressor(labelCol = 'cA' , featuresCol ='features')
dt_model = dt.fit(model_df)
dt_predictions = dt_model.transform(sp1_mod)

dt_predictions.select(['prediction']).show(1,False)

+------------------+
|prediction        |
+------------------+
|0.9340921821774056|
+------------------+
only showing top 1 row



#  Linear Regression 


Output follows Gaussian Distribution


In [16]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

In [24]:
lr = LinearRegression(labelCol = 'cA' )
lr_model = lr.fit(train_df)
lr_predictions = lr_model.transform(sp1_mod)
lr_predictions.select(['prediction']).show(1,False)

+-----------------+
|prediction       |
+-----------------+
|0.955110737114637|
+-----------------+
only showing top 1 row



In [25]:
lr_predictions = lr_model.transform(test_df)
lr_predictions.show(10,False)

+----------------------------------+----+-------------------+
|features                          |cA  |prediction         |
+----------------------------------+----+-------------------+
|[294.0,93.0,1.0,1.5,2.0,7.36,0.0] |0.46|0.45185440159084966|
|[295.0,96.0,2.0,1.5,2.0,7.34,0.0] |0.47|0.46441277021823146|
|[296.0,95.0,2.0,3.0,2.0,7.54,1.0] |0.44|0.5014180043943184 |
|[296.0,99.0,2.0,3.0,3.5,7.28,0.0] |0.47|0.4897996985108175 |
|[298.0,92.0,1.0,2.0,2.0,7.88,0.0] |0.51|0.5173059907846305 |
|[298.0,98.0,2.0,4.0,3.0,8.03,0.0] |0.34|0.5688787713670138 |
|[298.0,101.0,2.0,1.5,2.0,7.86,0.0]|0.54|0.5525889952399594 |
|[299.0,96.0,2.0,1.5,2.0,7.86,0.0] |0.54|0.5364313606818898 |
|[300.0,99.0,1.0,1.0,2.5,8.01,0.0] |0.58|0.577413586849211  |
|[301.0,98.0,1.0,2.0,3.0,8.03,1.0] |0.67|0.6045552546456838 |
+----------------------------------+----+-------------------+
only showing top 10 rows



In [28]:
train_res = lr_model.evaluate(train_df).predictions
test = lr_model.evaluate(test_df).predictions
spcase = lr_model.evaluate(sp1_mod).predictions

In [29]:
train_res.show(10,False)
test.show(10,False)
spcase.show(1,False)

+----------------------------------+----+------------------+
|features                          |cA  |prediction        |
+----------------------------------+----+------------------+
|[290.0,100.0,1.0,1.5,2.0,7.56,0.0]|0.47|0.496688774986483 |
|[290.0,104.0,4.0,2.0,2.5,7.46,0.0]|0.45|0.5148618166000156|
|[293.0,97.0,2.0,2.0,4.0,7.8,1.0]  |0.64|0.5845742871098834|
|[294.0,95.0,1.0,1.5,1.5,7.64,0.0] |0.49|0.4846116764044359|
|[295.0,93.0,1.0,2.0,2.0,7.2,0.0]  |0.46|0.4295814877438666|
|[295.0,99.0,2.0,2.5,3.0,7.65,0.0] |0.57|0.5288555619622342|
|[295.0,101.0,2.0,2.5,2.0,7.86,0.0]|0.69|0.5424715944315557|
|[296.0,97.0,2.0,1.5,2.0,7.8,0.0]  |0.49|0.5282532826703872|
|[296.0,99.0,2.0,2.5,2.5,8.03,0.0] |0.61|0.5687751866038249|
|[296.0,101.0,1.0,2.5,3.0,7.68,0.0]|0.6 |0.5377145102388599|
+----------------------------------+----+------------------+
only showing top 10 rows

+----------------------------------+----+-------------------+
|features                          |cA  |prediction       

# Decision Tree Regression

In [26]:
from pyspark.ml.regression import DecisionTreeRegressor
#from pyspark.ml.evaluation import RegressionEvaluator

In [27]:
dtr = DecisionTreeRegressor(labelCol = 'cA' , featuresCol ='features')
dtr_model = dtr.fit(model_df)
dtr_predictions = dtr_model.transform(sp1_mod)

dtr_predictions.select(['prediction']).show(1,False)

+------------------+
|prediction        |
+------------------+
|0.9278947368421054|
+------------------+
only showing top 1 row



# Generalized  Linear Regression

follows exponential family type

Family ----- Response Type ------ Suported
Gaussian     Continious           Identity , Log , Inverse
Binomial     Binary               logit  , probit , ClogLog


In [30]:
from pyspark.ml.regression import GeneralizedLinearRegression

In [35]:
glr = GeneralizedLinearRegression(labelCol = 'cA' , featuresCol ='features' , family ="gaussian")
glr_model = glr.fit(model_df)
glr_predictions = glr_model.transform(sp1_mod)

glr_predictions.select(['prediction']).show(1,False)

+------------------+
|prediction        |
+------------------+
|0.9514585639019955|
+------------------+
only showing top 1 row



In [36]:
train_res = glr_model.evaluate(train_df).predictions
test = glr_model.evaluate(test_df).predictions
spcase = glr_model.evaluate(sp1_mod).predictions
train_res.show(10,False)
test.show(10,False)
spcase.show(1,False)

+----------------------------------+----+-------------------+
|features                          |cA  |prediction         |
+----------------------------------+----+-------------------+
|[290.0,100.0,1.0,1.5,2.0,7.56,0.0]|0.47|0.4810219874562627 |
|[290.0,104.0,4.0,2.0,2.5,7.46,0.0]|0.45|0.5074803029108064 |
|[293.0,97.0,2.0,2.0,4.0,7.8,1.0]  |0.64|0.579316395146293  |
|[294.0,95.0,1.0,1.5,1.5,7.64,0.0] |0.49|0.4717123422791931 |
|[295.0,93.0,1.0,2.0,2.0,7.2,0.0]  |0.46|0.42480121977535257|
|[295.0,99.0,2.0,2.5,3.0,7.65,0.0] |0.57|0.5222586357363646 |
|[295.0,101.0,2.0,2.5,2.0,7.86,0.0]|0.69|0.5307219472972107 |
|[296.0,97.0,2.0,1.5,2.0,7.8,0.0]  |0.49|0.5169498534297157 |
|[296.0,99.0,2.0,2.5,2.5,8.03,0.0] |0.61|0.5580164760100152 |
|[296.0,101.0,1.0,2.5,3.0,7.68,0.0]|0.6 |0.5276867264142    |
+----------------------------------+----+-------------------+
only showing top 10 rows

+----------------------------------+----+-------------------+
|features                          |cA  |pre

#  Gradient Boosted Tree Regression

Gradient-Boosted Trees (GBTs) are ensembles of decision trees. GBTs iteratively train decision trees in order to minimize a loss function.
The spark.ml implementation supports GBTs for binary classification and for regression, using both continuous and categorical features.

In [12]:
from pyspark.ml.regression import GBTRegressor

In [14]:
gbtr = GBTRegressor(labelCol = 'cA' , featuresCol ='features')
gbtr_model = gbtr.fit(model_df)
gbtr_predictions = gbtr_model.transform(sp1_mod)

gbtr_predictions.select(['prediction']).show(1,False)

+------------------+
|prediction        |
+------------------+
|0.9267027715295116|
+------------------+
only showing top 1 row



# Isotonic regression

We implement a pool adjacent violators algorithm which uses an approach to parallelizing isotonic regression. The training input is a DataFrame which contains three columns label, features and weight. Additionally IsotonicRegression algorithm has one optional parameter called isotonic defaulting to true. This argument specifies if the isotonic regression is isotonic (monotonically increasing) or antitonic (monotonically decreasing).

Training returns an IsotonicRegressionModel that can be used to predict labels for both known and unknown features. The result of isotonic regression is treated as piecewise linear function. The rules for prediction therefore are:

If the prediction input exactly matches a training feature then associated prediction is returned. In case there are multiple predictions with the same feature then one of them is returned. Which one is undefined (same as java.util.Arrays.binarySearch).
If the prediction input is lower or higher than all training features then prediction with lowest or highest feature is returned respectively. In case there are multiple predictions with the same feature then the lowest or highest is returned respectively.
If the prediction input falls between two training features then prediction is treated as piecewise linear function and interpolated value is calculated from the predictions of the two closest features. In case there are multiple values with the same feature then the same rules as in previous point are used.

In [18]:
from pyspark.ml.regression import IsotonicRegression

In [19]:
afsr = IsotonicRegression(labelCol = 'cA' , featuresCol ='features')
afsr_model = afsr.fit(model_df)
afsr_predictions = afsr_model.transform(sp1_mod)

afsr_predictions.select(['prediction']).show(1,False)

+------------------+
|prediction        |
+------------------+
|0.9361111111111112|
+------------------+
only showing top 1 row



In [20]:
print("Boundaries in increasing order: %s\n" % str(afsr_model.boundaries))
print("Predictions associated with the boundaries: %s\n" % str(afsr_model.predictions))

Boundaries in increasing order: [290.0,290.0,293.0,298.0,298.0,298.0,299.0,299.0,299.0,299.0,300.0,300.0,300.0,304.0,304.0,307.0,307.0,308.0,308.0,311.0,311.0,312.0,312.0,312.0,316.0,316.0,316.0,317.0,317.0,319.0,319.0,319.0,319.0,320.0,320.0,320.0,320.0,325.0,325.0,325.0,327.0,327.0,327.0,329.0,329.0,329.0,329.0,329.0,330.0,330.0,330.0,331.0,331.0,334.0,334.0,334.0,339.0,339.0,340.0,340.0,340.0,340.0]

Predictions associated with the boundaries: [0.45,0.47,0.5005263157894738,0.5005263157894738,0.51,0.5119999999999999,0.5119999999999999,0.54,0.56,0.5766666666666667,0.5766666666666667,0.58,0.5800000000000001,0.5800000000000001,0.6245454545454545,0.6245454545454545,0.6571428571428573,0.6571428571428573,0.6573076923076923,0.6573076923076923,0.66,0.66,0.67,0.6790196078431372,0.6790196078431372,0.69,0.6950000000000001,0.6950000000000001,0.7172222222222222,0.7172222222222222,0.72,0.73,0.7371428571428572,0.7371428571428572,0.74,0.76,0.7880882352941176,0.7880882352941176,0.79,0.795454545454545