In [1]:
from pyspark.sql import SparkSession
spark=SparkSession.builder.appName('Missing').getOrCreate()

In [2]:
## Read The dataset
training = spark.read.csv('test3.csv',header=True,inferSchema=True)

In [3]:
training.show()

+---------+---+----------+------+
|     name|age|experience|salary|
+---------+---+----------+------+
|    fazil| 21|        11| 10000|
|     umar| 30|        12| 20000|
|   fazila| 20|        32| 30000|
|jabarulla| 50|        40| 40000|
|   asathi| 40|        17| 50000|
|  mohamed| 30|        16| 60000|
|abuthahir| 60|        30| 70000|
|    vijay| 70|        30| 80000|
|    surya| 30|        10| 90000|
+---------+---+----------+------+



In [4]:
training.printSchema()

root
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- experience: integer (nullable = true)
 |-- salary: integer (nullable = true)



In [5]:
training.columns

['name', 'age', 'experience', 'salary']

In [6]:
from pyspark.ml.feature import VectorAssembler
featureassembler=VectorAssembler(inputCols=["age","experience"],outputCol="Independent Features")

In [7]:

output=featureassembler.transform(training)

In [8]:
output.show()

+---------+---+----------+------+--------------------+
|     name|age|experience|salary|Independent Features|
+---------+---+----------+------+--------------------+
|    fazil| 21|        11| 10000|         [21.0,11.0]|
|     umar| 30|        12| 20000|         [30.0,12.0]|
|   fazila| 20|        32| 30000|         [20.0,32.0]|
|jabarulla| 50|        40| 40000|         [50.0,40.0]|
|   asathi| 40|        17| 50000|         [40.0,17.0]|
|  mohamed| 30|        16| 60000|         [30.0,16.0]|
|abuthahir| 60|        30| 70000|         [60.0,30.0]|
|    vijay| 70|        30| 80000|         [70.0,30.0]|
|    surya| 30|        10| 90000|         [30.0,10.0]|
+---------+---+----------+------+--------------------+



In [9]:
output.columns

['name', 'age', 'experience', 'salary', 'Independent Features']

In [10]:
finalized_data=output.select("Independent Features","salary")

In [11]:

finalized_data.show()

+--------------------+------+
|Independent Features|salary|
+--------------------+------+
|         [21.0,11.0]| 10000|
|         [30.0,12.0]| 20000|
|         [20.0,32.0]| 30000|
|         [50.0,40.0]| 40000|
|         [40.0,17.0]| 50000|
|         [30.0,16.0]| 60000|
|         [60.0,30.0]| 70000|
|         [70.0,30.0]| 80000|
|         [30.0,10.0]| 90000|
+--------------------+------+



In [12]:
from pyspark.ml.regression import LinearRegression
##train test split
train_data,test_data=finalized_data.randomSplit([0.75,0.25])
regressor=LinearRegression(featuresCol='Independent Features', labelCol='salary')
regressor=regressor.fit(train_data)

In [13]:
### Coefficients
regressor.coefficients

DenseVector([1539.6286, -744.6608])

In [14]:
### Intercepts
regressor.intercept

-2465.875987224591

In [15]:
### Prediction
pred_results=regressor.evaluate(test_data)

In [16]:
pred_results.meanAbsoluteError,pred_results.meanSquaredError

(39613.03720405991, 1768301407.7636738)