## Import Libraries

In [16]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression

## Spark Session

In [2]:
spark = SparkSession.builder.appName('LR').getOrCreate()

## Import Data

In [3]:
training = spark.read.csv('D:/pyspark/heroes3.csv',header=True,inferSchema=True)
training.show()

+------------+---+----------+--------+-------+
|        name|age|experience|  salary|company|
+------------+---+----------+--------+-------+
|    deadpool| 65|         2|10000000| marvel|
|   spiderman| 20|         7| 5000000| marvel|
|     captain|120|         3|20000000| marvel|
|      batman| 50|         2|10000000|     dc|
| black widow| 40|         6|15000000| marvel|
|    superman|150|        10| 5000000|     dc|
|womder woman| 35|         4|15000000|     dc|
|       wanda| 30|         3|10000000| marvel|
+------------+---+----------+--------+-------+



In [4]:
training.printSchema()

root
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- experience: integer (nullable = true)
 |-- salary: integer (nullable = true)
 |-- company: string (nullable = true)



In [5]:
training.columns

['name', 'age', 'experience', 'salary', 'company']

## Data Processing

In [9]:
## input cols
featureassembler = VectorAssembler(inputCols=['age','experience'],outputCol='independent features')

In [12]:
## transform input cols
output = featureassembler.transform(training)

In [13]:
output.show()

+------------+---+----------+--------+-------+--------------------+
|        name|age|experience|  salary|company|independent features|
+------------+---+----------+--------+-------+--------------------+
|    deadpool| 65|         2|10000000| marvel|          [65.0,2.0]|
|   spiderman| 20|         7| 5000000| marvel|          [20.0,7.0]|
|     captain|120|         3|20000000| marvel|         [120.0,3.0]|
|      batman| 50|         2|10000000|     dc|          [50.0,2.0]|
| black widow| 40|         6|15000000| marvel|          [40.0,6.0]|
|    superman|150|        10| 5000000|     dc|        [150.0,10.0]|
|womder woman| 35|         4|15000000|     dc|          [35.0,4.0]|
|       wanda| 30|         3|10000000| marvel|          [30.0,3.0]|
+------------+---+----------+--------+-------+--------------------+



In [14]:
finalized_data = output.select(['independent features','salary'])

In [15]:
finalized_data.show()

+--------------------+--------+
|independent features|  salary|
+--------------------+--------+
|          [65.0,2.0]|10000000|
|          [20.0,7.0]| 5000000|
|         [120.0,3.0]|20000000|
|          [50.0,2.0]|10000000|
|          [40.0,6.0]|15000000|
|        [150.0,10.0]| 5000000|
|          [35.0,4.0]|15000000|
|          [30.0,3.0]|10000000|
+--------------------+--------+



In [25]:
## train test split
train_data,test_data = finalized_data.randomSplit([0.75,0.25])

## Model Training

In [26]:
## train model
regressor = LinearRegression(featuresCol='independent features',labelCol='salary')
regressor = regressor.fit(train_data)

In [27]:
## coefficients
regressor.coefficients

DenseVector([37989.7457, -1418884.6309])

In [28]:
## intercept
regressor.intercept

16107932.174917322

## Model Prection

In [30]:
pred_results = regressor.evaluate(test_data)

In [32]:
pred_results.predictions.show()

+--------------------+--------+--------------------+
|independent features|  salary|          prediction|
+--------------------+--------+--------------------+
|          [65.0,2.0]|10000000|1.5739496386511981E7|
+--------------------+--------+--------------------+



## Model Evaluation

In [33]:
## MSE
pred_results.meanSquaredError

32941818770784.094

In [35]:
## RMSE
pred_results.rootMeanSquaredError

5739496.3865119815