In [1]:
from pyspark.sql import SparkSession
# Start the Spark session for this notebook, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName('lin_reg')
    .getOrCreate()
)

In [2]:
# Load the raw dataset: the first line is the header row, and column types
# are inferred from the data instead of defaulting to strings.
df = spark.read.csv(
    'Linear_regression_dataset.csv',
    header=True,
    inferSchema=True,
)

In [3]:
# Dataset shape as (number of rows, number of columns).
print((df.count(), len(df.columns)))

(1232, 6)


In [7]:
# Inspect the column names and the types chosen by inferSchema.
df.printSchema()

root
 |-- var_1: integer (nullable = true)
 |-- var_2: integer (nullable = true)
 |-- var_3: integer (nullable = true)
 |-- var_4: double (nullable = true)
 |-- var_5: double (nullable = true)
 |-- output: double (nullable = true)



In [9]:
# Summary statistics for every column; truncate=False prints the full
# values instead of cutting long cells short.
df.describe().show(n=5, truncate=False)

+-------+-----------------+-----------------+------------------+--------------------+--------------------+-------------------+
|summary|var_1            |var_2            |var_3             |var_4               |var_5               |output             |
+-------+-----------------+-----------------+------------------+--------------------+--------------------+-------------------+
|count  |1232             |1232             |1232              |1232                |1232                |1232               |
|mean   |715.0819805194806|715.0819805194806|80.90422077922078 |0.3263311688311693  |0.25927272727272715 |0.39734172077922014|
|stddev |91.5342940441652 |93.07993263118064|11.458139049993724|0.015012772334166148|0.012907228928000298|0.03326689862173776|
|min    |463              |472              |40                |0.277               |0.214               |0.301              |
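|max    |1009             |1103             |116               |0.373               |0.294               |0.491              |
+-------+-----------------+-----------------+------------------+--------------------+--------------------+-------------------+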

In [13]:
from pyspark.sql.functions import corr

In [14]:
# List the column names available in the raw DataFrame.
df.columns

['var_1', 'var_2', 'var_3', 'var_4', 'var_5', 'output']

In [12]:
# Check how strongly a single input (var_1) correlates with the target column;
# swap in another column name to compare the remaining inputs.
df.select(corr('var_1', 'output')).show()

+-------------------+
|corr(var_1, output)|
+-------------------+
| 0.9187399607627283|
+-------------------+



In [16]:
from pyspark.ml.linalg import Vector
from pyspark.ml.feature import VectorAssembler

### Feature Engineering
In this step we use Spark’s VectorAssembler to combine all input features
into a single vector column. Each row of that column holds the values of the
five input columns for that row, so instead of five separate input columns
the model receives one feature vector column.

In [17]:
# Assembler that packs the five input columns into one vector column named
# 'features', which is the column the regression model consumes.
vec_assmebler = VectorAssembler(
    inputCols=['var_1', 'var_2', 'var_3', 'var_4', 'var_5'],
    outputCol='features',
)

In [18]:
# Append the assembled 'features' vector column to the original DataFrame.
features_df=vec_assmebler.transform(df)

In [19]:
# Confirm that the new 'features' vector column is present.
features_df.printSchema()

root
 |-- var_1: integer (nullable = true)
 |-- var_2: integer (nullable = true)
 |-- var_3: integer (nullable = true)
 |-- var_4: double (nullable = true)
 |-- var_5: double (nullable = true)
 |-- output: double (nullable = true)
 |-- features: vector (nullable = true)



In [20]:
# Keep only what the model needs: the assembled feature vector and the target.
model_df = features_df.select('features', 'output')
model_df.show(n=5)

+--------------------+------+
|            features|output|
+--------------------+------+
|[734.0,688.0,81.0...| 0.418|
|[700.0,600.0,94.0...| 0.389|
|[712.0,705.0,93.0...| 0.417|
|[734.0,806.0,69.0...| 0.415|
|[613.0,759.0,61.0...| 0.378|
+--------------------+------+
only showing top 5 rows



In [21]:
# Size of the modelling DataFrame as (rows, columns): only the 'features'
# and 'output' columns remain after the select.
print((model_df.count(), len(model_df.columns)))

(1232, 2)


In [25]:
# Split the dataset 70/30 into training and test sets.
# A fixed seed keeps the split, and every metric computed from it, stable
# across kernel restarts; without it randomSplit picks a new random seed on
# each call, so row counts and scores drift between runs.
train_df, test_df = model_df.randomSplit([0.7, 0.3], seed=42)


In [26]:
# Training split size as (rows, columns).
print((train_df.count(), len(train_df.columns)))


(863, 2)


In [27]:
# Test split size as (rows, columns).
print((test_df.count(), len(test_df.columns)))


(369, 2)


In [29]:
# Summary statistics of the training split; only the numeric 'output' column
# is reported, the 'features' vector column is not summarised.
train_df.describe().show()

+-------+-------------------+
|summary|             output|
+-------+-------------------+
|  count|                863|
|   mean|0.39771610660486634|
| stddev|0.03219260204747708|
|    min|              0.301|
|    max|              0.484|
+-------+-------------------+



## Linear Regression Model

In [31]:
from pyspark.ml.regression import LinearRegression

# Linear regression estimator: reads inputs from the assembled 'features'
# column (the library default, spelled out here) and learns to predict 'output'.
lin_Reg = LinearRegression(
    featuresCol='features',
    labelCol='output',
)

In [32]:
# Fit the linear regression on the training split only.
lr_model = lin_Reg.fit(train_df)

In [33]:
# Intercept term of the fitted model.
lr_model.intercept

0.18995089194078849

In [34]:
# Fitted coefficients, one per entry of the feature vector
# (presumably in the assembler's inputCols order, var_1 ... var_5).
print(lr_model.coefficients)


[0.0003344410099700511,5.527001871329109e-05,0.0001805170331704408,-0.6179904124948249,0.44643098498049943]


In [35]:
# Evaluate the fitted model on the training data. Despite the variable name,
# this holds an evaluation summary (error metrics, r2) rather than a
# DataFrame of predictions.
training_predictions=lr_model.evaluate(train_df)


In [36]:
# Mean absolute error on the training split.
training_predictions.meanAbsoluteError

0.00961596797368784

In [37]:
# Mean squared error on the training split.
training_predictions.meanSquaredError

0.00014546814919634427

In [38]:
# R-squared on the training split.
training_predictions.r2


0.8594731597928396

In [39]:
# Evaluate the model on the held-out test split; the result is an evaluation
# summary (it exposes the residuals), not a DataFrame of predictions.
test_results=lr_model.evaluate(test_df)


In [40]:
# View the first 10 residuals from the test-set evaluation.
test_results.residuals.show(10)


+--------------------+
|           residuals|
+--------------------+
| 0.00759269164866222|
|0.005930201992758344|
|-0.00656600102650...|
|-0.01613231926126...|
|-0.00797621902423...|
|-0.00131341124228...|
|-0.01237081667652...|
|-0.01135442495989...|
|-0.00697147606415...|
|-2.11181120580161...|
+--------------------+
only showing top 10 rows

