In [2]:
import pyspark
from pyspark.sql import SparkSession
# SparkSession is the entry point to Spark and the gateway to its libraries.
# getOrCreate() reuses an already-running session when there is one, so re-running
# this cell does not start a second session.
#
# The application name is what identifies this job in the Spark UI and logs, so it is
# named after what this notebook actually models (student grades).
spark = SparkSession.builder.appName('student_grades_model').getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/04/06 09:42:08 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/04/06 09:42:09 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
23/04/06 09:42:09 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
23/04/06 09:42:09 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.
23/04/06 09:42:09 WARN Utils: Service 'SparkUI' could not bind on port 4043. Attempting port 4044.


In [4]:
# Read Student_Grades_Data.csv (uploaded in the previous step) into a DataFrame.
# header=True takes the column names from the first line of the file;
# inferSchema=True asks Spark to work out each column's data type from the values.
data = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('/config/workspace/Student_Grades_Data.csv')
)

In [5]:
#Inspect the data type that the inferSchema=True parameter assigned to each column
data.printSchema()

root
 |-- Time_to_Study: integer (nullable = true)
 |-- Grades: double (nullable = true)



In [6]:
#Display the first rows of the data (show() prints the top 20 rows when called without arguments)
data.show()

+-------------+------+
|Time_to_Study|Grades|
+-------------+------+
|            1|   1.5|
|            5|   2.7|
|            7|   3.1|
|            3|   2.1|
|            2|   1.8|
|            9|   3.9|
|            6|   2.9|
|           12|   4.5|
|           11|   4.3|
|            2|   1.8|
|            4|   2.4|
|            8|   3.5|
|           13|   4.8|
|            9|   3.9|
|           14|   5.0|
|           10|   4.1|
|            6|   2.9|
|           12|   4.5|
|            1|   1.5|
|            4|   2.4|
+-------------+------+
only showing top 20 rows



In [7]:
# Build the list of predictor columns by excluding the label column ("Grades") by name.
# Dropping "the last column" by position only works while the CSV keeps the label last;
# filtering on the name keeps the label out of the features whatever the column order is.
feature_cols = [col_name for col_name in data.columns if col_name != "Grades"]
from pyspark.ml.feature import VectorAssembler
# The assembler combines all predictor columns into a single vector column named "features"
vect_assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")

In [8]:
#Apply the assembler created above to add the "features" vector column to the data
data_w_features = vect_assembler.transform(data)

In [9]:
#Display the data with the additional column named features. In a multiple linear regression problem,
# all the independent variable values of a row would appear combined in this one vector
data_w_features.show()

+-------------+------+--------+
|Time_to_Study|Grades|features|
+-------------+------+--------+
|            1|   1.5|   [1.0]|
|            5|   2.7|   [5.0]|
|            7|   3.1|   [7.0]|
|            3|   2.1|   [3.0]|
|            2|   1.8|   [2.0]|
|            9|   3.9|   [9.0]|
|            6|   2.9|   [6.0]|
|           12|   4.5|  [12.0]|
|           11|   4.3|  [11.0]|
|            2|   1.8|   [2.0]|
|            4|   2.4|   [4.0]|
|            8|   3.5|   [8.0]|
|           13|   4.8|  [13.0]|
|            9|   3.9|   [9.0]|
|           14|   5.0|  [14.0]|
|           10|   4.1|  [10.0]|
|            6|   2.9|   [6.0]|
|           12|   4.5|  [12.0]|
|            1|   1.5|   [1.0]|
|            4|   2.4|   [4.0]|
+-------------+------+--------+
only showing top 20 rows



In [10]:
# Keep just the two columns the model needs: the feature vector and the label
model_columns = ["features", "Grades"]
finalized_data = data_w_features.select(model_columns)

# Print the reduced DataFrame to confirm its contents
finalized_data.show()

+--------+------+
|features|Grades|
+--------+------+
|   [1.0]|   1.5|
|   [5.0]|   2.7|
|   [7.0]|   3.1|
|   [3.0]|   2.1|
|   [2.0]|   1.8|
|   [9.0]|   3.9|
|   [6.0]|   2.9|
|  [12.0]|   4.5|
|  [11.0]|   4.3|
|   [2.0]|   1.8|
|   [4.0]|   2.4|
|   [8.0]|   3.5|
|  [13.0]|   4.8|
|   [9.0]|   3.9|
|  [14.0]|   5.0|
|  [10.0]|   4.1|
|   [6.0]|   2.9|
|  [12.0]|   4.5|
|   [1.0]|   1.5|
|   [4.0]|   2.4|
+--------+------+
only showing top 20 rows



In [11]:
# Split the data into training and test sets, with roughly 70% of the observations going to
# training and 30% to testing (rows are assigned at random, so the sizes are approximate).
# A fixed seed keeps the split -- and therefore every metric computed below -- the same
# each time the notebook is re-run.
train_dataset, test_dataset = finalized_data.randomSplit([0.7, 0.3], seed=42)

In [12]:
#Peek into the training data: summary statistics (count, mean, stddev, min, max) of the training split
train_dataset.describe().show()

+-------+------------------+
|summary|            Grades|
+-------+------------------+
|  count|                31|
|   mean|3.0709677419354837|
| stddev| 1.100058649462866|
|    min|               1.5|
|    max|               5.0|
+-------+------------------+



In [13]:
#Peek into test_dataset: summary statistics (count, mean, stddev, min, max) of the test split
test_dataset.describe().show()

+-------+------------------+
|summary|            Grades|
+-------+------------------+
|  count|                19|
|   mean|3.4684210526315793|
| stddev|1.0964856140207158|
|    min|               1.5|
|    max|               5.0|
+-------+------------------+



In [14]:
#Import Linear Regression class called LinearRegression
from pyspark.ml.regression import LinearRegression

In [15]:
#Create the Linear Regression object named LinReg, with "features" as the feature column and "Grades" as the label column
LinReg = LinearRegression(featuresCol="features", labelCol="Grades")

In [16]:
#Train the model on the training dataset using the fit() method
model = LinReg.fit(train_dataset)

23/04/06 09:43:59 WARN Instrumentation: [c97653d1] regParam is zero, which might cause numerical instability and overfitting.
23/04/06 09:43:59 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
23/04/06 09:43:59 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.ForeignLinkerBLAS
23/04/06 09:43:59 WARN InstanceBuilder$NativeLAPACK: Failed to load implementation from:dev.ludovic.netlib.lapack.JNILAPACK


In [17]:
#Evaluate the trained model on the test dataset using the evaluate() method.
#The result is kept in pred; its .predictions attribute (shown in the next cell) holds the predicted Grades
pred = model.evaluate(test_dataset)

In [18]:
#Show the predicted Grade values alongside the actual Grade values
pred.predictions.show()

+--------+------+------------------+
|features|Grades|        prediction|
+--------+------+------------------+
|   [1.0]|   1.5|1.5505491561746585|
|   [2.0]|   1.8|1.8245780873292259|
|   [2.0]|   1.8|1.8245780873292259|
|   [4.0]|   2.4|2.3726359496383607|
|   [5.0]|   2.7| 2.646664880792928|
|   [6.0]|   2.9|2.9206938119474954|
|   [7.0]|   3.1| 3.194722743102063|
|   [7.0]|   3.1| 3.194722743102063|
|   [8.0]|   3.5|3.4687516742566302|
|   [8.0]|   3.5|3.4687516742566302|
|   [8.0]|   3.5|3.4687516742566302|
|   [9.0]|   3.9|3.7427806054111974|
|  [10.0]|   4.1| 4.016809536565765|
|  [11.0]|   4.3| 4.290838467720333|
|  [12.0]|   4.5|   4.5648673988749|
|  [12.0]|   4.5|   4.5648673988749|
|  [13.0]|   4.8| 4.838896330029467|
|  [14.0]|   5.0| 5.112925261184035|
|  [14.0]|   5.0| 5.112925261184035|
+--------+------+------------------+





In [19]:
# Read the fitted coefficient vector from the model and report it
coefficient = model.coefficients
coefficient_message = "The coefficient of the model is : %a" % (coefficient,)
print(coefficient_message)

The coefficient of the model is : DenseVector([0.274])


In [20]:
# Read the fitted intercept from the model and report it
intercept = model.intercept
intercept_message = "The Intercept of the model is : %f" % (intercept,)
print(intercept_message)

The Intercept of the model is : 1.276520


In [21]:
# Score the test-set predictions with several regression metrics:
# Root Mean Square Error (RMSE), Mean Square Error (MSE), Mean Absolute Error (MAE) and R-Square
from pyspark.ml.evaluation import RegressionEvaluator
evaluation = RegressionEvaluator(labelCol="Grades", predictionCol="prediction")


def _score(metric_name):
    """Compute one evaluator metric, selected by name, over pred.predictions."""
    metric_override = {evaluation.metricName: metric_name}
    return evaluation.evaluate(pred.predictions, metric_override)


# Root Mean Square Error
rmse = _score("rmse")
print("RMSE: %.3f" % rmse)

# Mean Square Error
mse = _score("mse")
print("MSE: %.3f" % mse)

# Mean Absolute Error
mae = _score("mae")
print("MAE: %.3f" % mae)

# r2 - coefficient of determination
r2 = _score("r2")
print("r2: %.3f" % r2)

RMSE: 0.071
MSE: 0.005
MAE: 0.059
r2: 0.996


In [22]:
#Create an unlabeled dataset that contains only the features column of the test dataset (the Grades label is dropped)
unlabeled_dataset = test_dataset.select('features')

In [23]:
#Display the contents of unlabeled_dataset
unlabeled_dataset.show()

+--------+
|features|
+--------+
|   [1.0]|
|   [2.0]|
|   [2.0]|
|   [4.0]|
|   [5.0]|
|   [6.0]|
|   [7.0]|
|   [7.0]|
|   [8.0]|
|   [8.0]|
|   [8.0]|
|   [9.0]|
|  [10.0]|
|  [11.0]|
|  [12.0]|
|  [12.0]|
|  [13.0]|
|  [14.0]|
|  [14.0]|
+--------+



In [24]:
#Predict the model output for unlabeled data using the transform() method.
#NOTE: unlabeled_dataset is the test dataset with its label column removed, so these are the same rows
#that evaluate() already scored above -- this demonstrates the prediction API rather than scoring new data
new_predictions = model.transform(unlabeled_dataset)

In [25]:
#Display the features together with the newly added prediction column
new_predictions.show()

+--------+------------------+
|features|        prediction|
+--------+------------------+
|   [1.0]|1.5505491561746585|
|   [2.0]|1.8245780873292259|
|   [2.0]|1.8245780873292259|
|   [4.0]|2.3726359496383607|
|   [5.0]| 2.646664880792928|
|   [6.0]|2.9206938119474954|
|   [7.0]| 3.194722743102063|
|   [7.0]| 3.194722743102063|
|   [8.0]|3.4687516742566302|
|   [8.0]|3.4687516742566302|
|   [8.0]|3.4687516742566302|
|   [9.0]|3.7427806054111974|
|  [10.0]| 4.016809536565765|
|  [11.0]| 4.290838467720333|
|  [12.0]|   4.5648673988749|
|  [12.0]|   4.5648673988749|
|  [13.0]| 4.838896330029467|
|  [14.0]| 5.112925261184035|
|  [14.0]| 5.112925261184035|
+--------+------------------+

