In [2]:
pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488491 sha256=df53ff8343278105663f8f9da50de07c83b0907d700058d790e28ae9dafdedbd
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.1


In [3]:
from pyspark.sql import SparkSession

In [7]:
# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName('linear-regression')
    .getOrCreate()
)

In [8]:
# Display the active SparkSession object as the cell output.
spark

In [29]:
# Load the student-performance CSV: the first row supplies the column names
# and column types are inferred from the data.
df = spark.read.csv(
    '/content/Student_Performance.csv',
    header=True,
    inferSchema=True,
)

In [30]:
# Preview the first rows of the freshly loaded DataFrame.
df.show()

+-------------+---------------+--------------------------+-----------+--------------------------------+-----------------+
|Hours Studied|Previous Scores|Extracurricular Activities|Sleep Hours|Sample Question Papers Practiced|Performance Index|
+-------------+---------------+--------------------------+-----------+--------------------------------+-----------------+
|            7|             99|                       Yes|          9|                               1|             91.0|
|            4|             82|                        No|          4|                               2|             65.0|
|            8|             51|                       Yes|          7|                               2|             45.0|
|            5|             52|                       Yes|          5|                               2|             36.0|
|            7|             75|                        No|          8|                               5|             66.0|
|            3|         

In [31]:
from pyspark.ml.feature import StringIndexer

In [32]:
# Encode the Yes/No "Extracurricular Activities" column as a numeric index.
# StringIndexer's default ordering ("frequencyDesc") assigns indices by label
# frequency, so the mapping would depend on the class balance of whatever data
# is loaded. "alphabetAsc" pins it to No -> 0.0, Yes -> 1.0 regardless of the
# data, which keeps the sign of the fitted coefficient interpretable.
string = StringIndexer(
    inputCols=["Extracurricular Activities"],
    outputCols=["Extracurricular Activities_encoded"],
    stringOrderType="alphabetAsc",
)

In [33]:
# Fit the indexer on df and append the "Extracurricular Activities_encoded" column.
# NOTE(review): df is rebound to the transformed frame here, so this cell is
# probably not safe to re-run on its own -- re-run from the CSV load cell instead; confirm.
df=string.fit(df).transform(df)

In [34]:
# Preview df with the new encoded column next to the original text column.
df.show()

+-------------+---------------+--------------------------+-----------+--------------------------------+-----------------+----------------------------------+
|Hours Studied|Previous Scores|Extracurricular Activities|Sleep Hours|Sample Question Papers Practiced|Performance Index|Extracurricular Activities_encoded|
+-------------+---------------+--------------------------+-----------+--------------------------------+-----------------+----------------------------------+
|            7|             99|                       Yes|          9|                               1|             91.0|                               1.0|
|            4|             82|                        No|          4|                               2|             65.0|                               0.0|
|            8|             51|                       Yes|          7|                               2|             45.0|                               1.0|
|            5|             52|                       Yes|

In [37]:
# Drop the original text column; its numeric encoding remains in
# "Extracurricular Activities_encoded".
df=df.drop('Extracurricular Activities')

In [38]:
# Preview df after the text column has been removed.
df.show()

+-------------+---------------+-----------+--------------------------------+-----------------+----------------------------------+
|Hours Studied|Previous Scores|Sleep Hours|Sample Question Papers Practiced|Performance Index|Extracurricular Activities_encoded|
+-------------+---------------+-----------+--------------------------------+-----------------+----------------------------------+
|            7|             99|          9|                               1|             91.0|                               1.0|
|            4|             82|          4|                               2|             65.0|                               0.0|
|            8|             51|          7|                               2|             45.0|                               1.0|
|            5|             52|          5|                               2|             36.0|                               1.0|
|            7|             75|          8|                               5|             6

In [39]:
# Print each column's name, type and nullability.
df.printSchema()

root
 |-- Hours Studied: integer (nullable = true)
 |-- Previous Scores: integer (nullable = true)
 |-- Sleep Hours: integer (nullable = true)
 |-- Sample Question Papers Practiced: integer (nullable = true)
 |-- Performance Index: double (nullable = true)
 |-- Extracurricular Activities_encoded: double (nullable = false)



In [40]:
# List the column names of df; the feature names passed to VectorAssembler below come from this list.
df.columns

['Hours Studied',
 'Previous Scores',
 'Sleep Hours',
 'Sample Question Papers Practiced',
 'Performance Index',
 'Extracurricular Activities_encoded']

In [43]:
# Display df again before assembling the feature vector.
df.show()

+-------------+---------------+-----------+--------------------------------+-----------------+----------------------------------+
|Hours Studied|Previous Scores|Sleep Hours|Sample Question Papers Practiced|Performance Index|Extracurricular Activities_encoded|
+-------------+---------------+-----------+--------------------------------+-----------------+----------------------------------+
|            7|             99|          9|                               1|             91.0|                               1.0|
|            4|             82|          4|                               2|             65.0|                               0.0|
|            8|             51|          7|                               2|             45.0|                               1.0|
|            5|             52|          5|                               2|             36.0|                               1.0|
|            7|             75|          8|                               5|             6

Next, we combine the individual feature columns into a single vector column, which is the input format the Spark ML regressor expects.

In [42]:
from pyspark.ml.feature  import VectorAssembler

In [44]:
# Assemble the four numeric columns and the encoded activity flag into one
# vector column that the regressor will use as its features.
feature = VectorAssembler(
    inputCols=[
        "Hours Studied",
        "Previous Scores",
        "Sleep Hours",
        "Sample Question Papers Practiced",
        "Extracurricular Activities_encoded",
    ],
    outputCol='independant_features',
)

In [46]:
# Apply the assembler to df; the result, with the added 'independant_features'
# vector column, is kept in `output`.
output=feature.transform(df)

In [47]:
# Preview output, which now includes the assembled vector column.
output.show()

+-------------+---------------+-----------+--------------------------------+-----------------+----------------------------------+--------------------+
|Hours Studied|Previous Scores|Sleep Hours|Sample Question Papers Practiced|Performance Index|Extracurricular Activities_encoded|independant_features|
+-------------+---------------+-----------+--------------------------------+-----------------+----------------------------------+--------------------+
|            7|             99|          9|                               1|             91.0|                               1.0|[7.0,99.0,9.0,1.0...|
|            4|             82|          4|                               2|             65.0|                               0.0|[4.0,82.0,4.0,2.0...|
|            8|             51|          7|                               2|             45.0|                               1.0|[8.0,51.0,7.0,2.0...|
|            5|             52|          5|                               2|             36.0|

In [48]:
# Keep only what the model needs: the feature vector and the label column.
final = output.select('independant_features', 'Performance Index')

In [49]:
# Preview the two-column modelling frame (features vector + label).
final.show()

+--------------------+-----------------+
|independant_features|Performance Index|
+--------------------+-----------------+
|[7.0,99.0,9.0,1.0...|             91.0|
|[4.0,82.0,4.0,2.0...|             65.0|
|[8.0,51.0,7.0,2.0...|             45.0|
|[5.0,52.0,5.0,2.0...|             36.0|
|[7.0,75.0,8.0,5.0...|             66.0|
|[3.0,78.0,9.0,6.0...|             61.0|
|[7.0,73.0,5.0,6.0...|             63.0|
|[8.0,45.0,4.0,6.0...|             42.0|
|[5.0,77.0,8.0,2.0...|             61.0|
|[4.0,89.0,4.0,0.0...|             69.0|
|[8.0,91.0,4.0,5.0...|             84.0|
|[8.0,79.0,6.0,2.0...|             73.0|
|[3.0,47.0,9.0,2.0...|             27.0|
|[6.0,47.0,4.0,2.0...|             33.0|
|[5.0,79.0,7.0,8.0...|             68.0|
|[2.0,72.0,4.0,3.0...|             43.0|
|[8.0,73.0,8.0,4.0...|             67.0|
|[6.0,83.0,7.0,2.0...|             70.0|
|[2.0,54.0,4.0,9.0...|             30.0|
|[5.0,75.0,7.0,0.0...|             63.0|
+--------------------+-----------------+
only showing top

In [50]:
from pyspark.ml.regression import LinearRegression

# Split roughly 75% / 25% into training and held-out data. The seed fixes which
# rows land in each split, so the fitted coefficients and the error metrics
# further down are reproducible across runs instead of changing every time.
train_data, test_data = final.randomSplit([0.75, 0.25], seed=42)

In [51]:
# Preview the training split.
train_data.show()

+--------------------+-----------------+
|independant_features|Performance Index|
+--------------------+-----------------+
|[1.0,40.0,4.0,3.0...|             15.0|
|[1.0,40.0,4.0,3.0...|             13.0|
|[1.0,40.0,4.0,8.0...|             12.0|
|[1.0,40.0,5.0,6.0...|             13.0|
|[1.0,40.0,5.0,9.0...|             10.0|
|[1.0,40.0,5.0,9.0...|             14.0|
|[1.0,40.0,6.0,3.0...|             12.0|
|[1.0,40.0,6.0,5.0...|             11.0|
|[1.0,40.0,6.0,6.0...|             16.0|
|[1.0,40.0,7.0,4.0...|             11.0|
|[1.0,40.0,8.0,9.0...|             18.0|
|[1.0,40.0,9.0,2.0...|             11.0|
|[1.0,40.0,9.0,6.0...|             13.0|
|[1.0,40.0,9.0,6.0...|             14.0|
|[1.0,40.0,9.0,6.0...|             15.0|
|[1.0,40.0,9.0,7.0...|             16.0|
|[1.0,41.0,4.0,3.0...|             15.0|
|[1.0,41.0,5.0,0.0...|             14.0|
|[1.0,41.0,5.0,4.0...|             18.0|
|[1.0,41.0,5.0,5.0...|             14.0|
+--------------------+-----------------+
only showing top

In [52]:
# Configure the (not yet fitted) linear regression estimator: features come
# from the assembled vector column, the label is the performance index.
regressor = LinearRegression(
    featuresCol="independant_features",
    labelCol='Performance Index',
)

In [60]:
# Fit the linear regression on the training split.
# NOTE(review): `regressor` is rebound from the estimator to the fitted model,
# and the cells below read model attributes from it. Re-running this cell alone
# presumably fails because the name no longer refers to an estimator -- re-run
# the cell that constructs LinearRegression first; confirm.
regressor=regressor.fit(train_data)

In [61]:
# Intercept term of the fitted model.
regressor.intercept

-34.25749455401062

In [62]:
# Fitted coefficients, one per assembled feature -- presumably in the same
# order as the VectorAssembler inputCols; confirm.
regressor.coefficients

DenseVector([2.8489, 1.0191, 0.4955, 0.1996, 0.6205])

In [63]:
# Evaluate the fitted model on the held-out split. Despite its name,
# `prediction` is the evaluation result object: the cells below read both the
# per-row predictions and the error metrics from it.
prediction=regressor.evaluate(test_data)

In [64]:
# Show held-out rows alongside the model's `prediction` column.
prediction.predictions.show()

+--------------------+-----------------+------------------+
|independant_features|Performance Index|        prediction|
+--------------------+-----------------+------------------+
|[1.0,40.0,4.0,2.0...|             13.0|12.357645550813558|
|[1.0,40.0,6.0,0.0...|             15.0|12.328805615174616|
|[1.0,40.0,7.0,4.0...|             14.0|13.622755630390188|
|[1.0,40.0,7.0,6.0...|             12.0|14.642527875560425|
|[1.0,40.0,8.0,5.0...|             16.0|14.317842750268312|
|[1.0,40.0,8.0,8.0...|             15.0| 15.53723596055103|
|[1.0,41.0,4.0,3.0...|             12.0|12.955850769615402|
|[1.0,41.0,6.0,5.0...|             18.0|14.346025009371658|
|[1.0,41.0,7.0,3.0...|             12.0|14.442249233912342|
|[1.0,41.0,8.0,2.0...|             15.0|15.358624738510777|
|[1.0,41.0,8.0,8.0...|             15.0|16.556350529185664|
|[1.0,42.0,5.0,5.0...|             12.0|14.869673423240634|
|[1.0,42.0,5.0,6.0...|             17.0|15.689824703298392|
|[1.0,42.0,5.0,8.0...|             18.0|

In [65]:
# Mean absolute error of the model on test_data.
prediction.meanAbsoluteError

1.6337503237385156

In [66]:
# Mean squared error of the model on test_data.
prediction.meanSquaredError

4.198098214630204