In [58]:
from pyspark.sql import SparkSession
#session builder
spark = SparkSession.builder.appName('example').getOrCreate()

In [59]:
#read the dataset
data = spark.read.csv('test.csv', header = True, inferSchema = True)

In [60]:
data.show()

+------+---+----------+------+
|  Name|Age|Experience|Salary|
+------+---+----------+------+
|  Mary| 27|         2| 15000|
|George| 35|         7| 25000|
| Chloe| 50|        21| 40000|
|  Paul| 24|         3| 20000|
|  Jane| 21|         1| 10000|
| Steve| 31|        10| 30000|
|  Kate| 45|        20| 40000|
+------+---+----------+------+



In [61]:
data.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Experience: integer (nullable = true)
 |-- Salary: integer (nullable = true)



In [62]:
data.columns

['Name', 'Age', 'Experience', 'Salary']

In [63]:
from pyspark.ml.feature import VectorAssembler

fa = VectorAssembler(inputCols= ['Age','Experience'], outputCol='f_independent')

In [64]:
output = fa.transform(training)

In [65]:
output.show()

+------+---+----------+------+-------------+
|  Name|Age|Experience|Salary|f_independent|
+------+---+----------+------+-------------+
|  Mary| 27|         2| 15000|   [27.0,2.0]|
|George| 35|         7| 25000|   [35.0,7.0]|
| Chloe| 50|        21| 40000|  [50.0,21.0]|
|  Paul| 24|         3| 20000|   [24.0,3.0]|
|  Jane| 21|         1| 10000|   [21.0,1.0]|
| Steve| 31|        10| 30000|  [31.0,10.0]|
|  Kate| 45|        20| 40000|  [45.0,20.0]|
+------+---+----------+------+-------------+



In [66]:
output.columns

['Name', 'Age', 'Experience', 'Salary', 'f_independent']

In [67]:
final = output.select('f_independent','Salary')

In [68]:
final.show()

+-------------+------+
|f_independent|Salary|
+-------------+------+
|   [27.0,2.0]| 15000|
|   [35.0,7.0]| 25000|
|  [50.0,21.0]| 40000|
|   [24.0,3.0]| 20000|
|   [21.0,1.0]| 10000|
|  [31.0,10.0]| 30000|
|  [45.0,20.0]| 40000|
+-------------+------+



In [69]:
from pyspark.ml.regression import LinearRegression

#split data (final) in train and test
train_data, test_data = final.randomSplit([0.0,0.40])
#Linear Regression
lr = LinearRegression(featuresCol='f_independent', labelCol='Salary')
lr = lr.fit(train_data)

In [70]:
lr.coefficients

DenseVector([1613.8889, -225.0])

In [71]:
lr.intercept

-28125.0

In [72]:
#prediction

pred = lr.evaluate(test_data)

In [73]:
pred.predictions.show()

+-------------+------+------------------+
|f_independent|Salary|        prediction|
+-------------+------+------------------+
|   [21.0,1.0]| 10000| 5541.666666666664|
|   [24.0,3.0]| 20000| 9933.333333333336|
|  [31.0,10.0]| 30000|19655.555555555562|
|   [35.0,7.0]| 25000|26786.111111111117|
|  [50.0,21.0]| 40000| 47844.44444444447|
+-------------+------+------------------+



In [74]:
#MAE
pred.meanAbsoluteError 

6900.000000000005

In [75]:
#MSE
pred.meanSquaredError

58589509.259259306