In [1]:
from pyspark.sql import SparkSession
# Create the Spark session for this notebook (or reuse one that is already
# running) under the application name 'lin_reg'.
spark = (
    SparkSession.builder
    .appName('lin_reg')
    .getOrCreate()
)

In [2]:
# Load the dataset: the first row holds the column names, and Spark infers
# each column's type from the data.
df = spark.read.csv(
    'Linear_regression_dataset.csv',
    header=True,
    inferSchema=True,
)

In [3]:
# Report the dataset's shape as (number of rows, number of columns).
n_rows = df.count()
n_cols = len(df.columns)
print((n_rows, n_cols))

(1232, 6)


In [7]:
# Print the column names and the types Spark inferred for them.
df.printSchema()

root
 |-- var_1: integer (nullable = true)
 |-- var_2: integer (nullable = true)
 |-- var_3: integer (nullable = true)
 |-- var_4: double (nullable = true)
 |-- var_5: double (nullable = true)
 |-- output: double (nullable = true)



In [9]:
# Summary statistics (count, mean, stddev, min, max) for every column,
# shown without truncating the cell values.
summary_df = df.describe()
summary_df.show(5, truncate=False)

+-------+-----------------+-----------------+------------------+--------------------+--------------------+-------------------+
|summary|var_1            |var_2            |var_3             |var_4               |var_5               |output             |
+-------+-----------------+-----------------+------------------+--------------------+--------------------+-------------------+
|count  |1232             |1232             |1232              |1232                |1232                |1232               |
|mean   |715.0819805194806|715.0819805194806|80.90422077922078 |0.3263311688311693  |0.25927272727272715 |0.39734172077922014|
|stddev |91.5342940441652 |93.07993263118064|11.458139049993724|0.015012772334166148|0.012907228928000298|0.03326689862173776|
|min    |463              |472              |40                |0.277               |0.214               |0.301              |
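|max    |1009             |1103             |116               |0.373               |0.294               |0.491              |
+-------+-----------------+-----------------+------------------+--------------------+--------------------+-------------------+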

In [13]:
from pyspark.sql.functions import corr

In [14]:
# List the column names: five input variables followed by the target 'output'.
df.columns

['var_1', 'var_2', 'var_3', 'var_4', 'var_5', 'output']

In [12]:
# Check the correlation between the input column 'var_1' and the target 'output'.
var_1_output_corr = corr('var_1', 'output')
df.select(var_1_output_corr).show()

+-------------------+
|corr(var_1, output)|
+-------------------+
| 0.9187399607627283|
+-------------------+



In [16]:
from pyspark.ml.linalg import Vector
from pyspark.ml.feature import VectorAssembler

### Feature Engineering
In this step we combine all input features into a single vector column
using Spark’s VectorAssembler. For each row, the assembler packs that row's
input values into one vector. So, instead of five separate input columns, we
end up with a single feature vector column that holds all of them.

In [17]:
# The five input columns are merged into one vector column named 'features'.
input_columns = ['var_1', 'var_2', 'var_3', 'var_4', 'var_5']
vec_assmebler = VectorAssembler(
    inputCols=input_columns,
    outputCol='features',
)

In [18]:
# Apply the assembler to df; the result keeps the original columns and adds
# the assembled 'features' vector column.
features_df=vec_assmebler.transform(df)

In [19]:
# Print the schema to confirm the new 'features' vector column is present.
features_df.printSchema()

root
 |-- var_1: integer (nullable = true)
 |-- var_2: integer (nullable = true)
 |-- var_3: integer (nullable = true)
 |-- var_4: double (nullable = true)
 |-- var_5: double (nullable = true)
 |-- output: double (nullable = true)
 |-- features: vector (nullable = true)

