In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
import pandas as pd

In [2]:
# Non-nullable schema: a date column, four float price columns and a string
# "Name" column.
# NOTE(review): no visible cell passes schemaFields to a reader (the CSV load
# relies on inferSchema) -- confirm whether it is still needed.
schemaFields = StructType(
    [StructField("date", DateType(), False)]
    + [
        StructField(price_field, FloatType(), False)
        for price_field in ("open", "high", "low", "close")
    ]
    + [StructField("Name", StringType(), False)]
)

In [3]:
# Read the admissions CSV (Admission_Predict.csv) into a Spark DataFrame.
# File location and type
file_location = "/FileStore/tables/Admission_Predict.csv"
file_type = "csv"

# CSV options
infer_schema = "TRUE"
first_row_is_header = "TRUE"
delimiter = ","

# Reuse the active session when the platform already provides one, otherwise
# create it, so this cell also runs on a fresh kernel where `spark` is not
# pre-defined.
spark = SparkSession.builder.getOrCreate()

# The applied options are for CSV files. For other file types, these will be ignored.
df = (
    spark.read.format(file_type)
    .option("inferSchema", infer_schema)
    .option("header", first_row_is_header)
    .option("sep", delimiter)
    .load(file_location)
)


In [4]:
# Discard the "Serial No." column; it is not among the model inputs.
df = df.drop("Serial No.")

In [5]:
# Preview the first two rows after dropping the serial-number column.
df.show(n=2)

In [6]:
# Convert all the feature columns into a single vector column
from pyspark.ml.feature import VectorAssembler

In [7]:
# Remove the trailing space from the "LOR " column name.
df = df.withColumnRenamed("LOR ", "LOR")

In [8]:
# Remove the trailing space from the "Chance of Admit " column name.
df = df.withColumnRenamed("Chance of Admit ", "Chance of Admit")

In [9]:
# Assembler that packs the seven predictor columns into one vector column
# named "features".
vector_assemble = VectorAssembler(
    inputCols=[
        "GRE Score",
        "TOEFL Score",
        "University Rating",
        "SOP",
        "LOR",
        "CGPA",
        "Research",
    ],
    outputCol="features",
)

In [10]:
# Apply the assembler to df; its output column is 'features'.
assemble_df = vector_assemble.transform(df)

In [11]:
# Keep only the assembled feature vector and the label column.
assemble_df = assemble_df.select(["features", "Chance of Admit"])

In [12]:
# Min-max scaler that reads the "features" vector and writes "scaledFeatures"
# (output range left at the library default).
from pyspark.ml.feature import MinMaxScaler

scaler = MinMaxScaler(
    inputCol="features",
    outputCol="scaledFeatures",
)

In [13]:
# Fit the scaler on the assembled features.
# NOTE(review): this fit happens before the train/test split, so the fitted
# scaler has seen every row, including those later held out as test.
scaler_fit = scaler.fit(assemble_df)

In [14]:
# Apply the fitted scaler; its output column is 'scaledFeatures'.
scaled_df = scaler_fit.transform(assemble_df)

In [15]:
# Keep only the scaled feature vector and the label column.
scaled_df = scaled_df.select(["scaledFeatures", "Chance of Admit"])

In [16]:
# 70/30 random split. The seed is fixed so that re-running the notebook
# produces the same partition, keeping the fitted model and every metric
# derived from it reproducible.
split = scaled_df.randomSplit([0.7, 0.3], seed=42)

In [17]:
# The first part (weight 0.7) is the training set, the second (weight 0.3)
# the test set.
train, test = split

In [18]:
from pyspark.ml.regression import LinearRegression

In [19]:
# Display the column names of the training split (the earlier select kept
# 'scaledFeatures' and 'Chance of Admit').
train.columns

In [20]:
# Linear regression estimator: scaled feature vector in, "Chance of Admit"
# as the label.
lr = LinearRegression(
    featuresCol="scaledFeatures",
    labelCol="Chance of Admit",
)

In [21]:
# Fit the linear regression on the training split.
lr_model = lr.fit(train)

In [22]:
# Summary object exposed by the fitted model; r2adj and rootMeanSquaredError
# are read from it when the training metrics are printed.
training_summary = lr_model.summary

In [23]:
# Report the training-set fit. The value printed is r2adj, i.e. the *adjusted*
# R^2, so the label says so explicitly instead of presenting it as plain R^2.
print('Training adjusted R^2 : %s' % str(training_summary.r2adj))
print('Error : %s' % str(training_summary.rootMeanSquaredError))

In [24]:
# Score the held-out test split rather than the rows the model was fitted on.
# The training fit is already reported via lr_model.summary; predicting on
# `test` makes the metric computed from pred_df a measure of performance on
# unseen data instead of a repeat of the training score.
pred_df = lr_model.transform(test)

In [25]:
# Preview the first five prediction rows.
pred_df.show(n=5)

In [26]:
from pyspark.ml.evaluation import RegressionEvaluator

# Evaluator that compares the "prediction" column with the "Chance of Admit"
# label using the "r2" metric.
lr_evaluator = RegressionEvaluator(
    predictionCol="prediction",
    labelCol="Chance of Admit",
    metricName="r2",
)

In [27]:
# Compute the evaluator's metric over pred_df. Despite the _df suffix the
# result is treated as a single number (it is formatted with %g when printed).
evaluate_df = lr_evaluator.evaluate(pred_df)

In [28]:
# Print the score returned by the evaluator (configured with metricName "r2").
print("The evaluation %g" % (evaluate_df,))