In [1]:
from pyspark.sql import SparkSession

# Create the notebook's Spark session, or reuse the one already running
# under the same builder settings (getOrCreate).
spark = (
    SparkSession.builder
    .appName('SparkMl')
    .getOrCreate()
)

In [2]:
# Read the CSV with its first row as column names and let Spark infer the
# column types (the schema printed in the next cell shows the result).
csv_reader = (
    spark.read
    .option('header', True)
    .option('inferSchema', True)
)
train = csv_reader.csv('test1.csv')
train.show()

+---------+---+----------+------+
|     Name|age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
+---------+---+----------+------+



In [3]:
# Print each column's name, inferred type and nullability to confirm that
# the numeric columns were not read as strings.
train.printSchema()

root
 |-- Name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- Experience: integer (nullable = true)
 |-- Salary: integer (nullable = true)



In [4]:
from pyspark.ml.feature import VectorAssembler

# Predictor columns that get packed into a single vector column; the
# regression further down reads its features from that vector column.
predictor_cols = ['age', 'Experience']
features_col = 'independentFeatures'

featureAssembler = VectorAssembler(
    inputCols=predictor_cols,
    outputCol=features_col,
)


In [5]:
# Apply the assembler to the loaded frame: the result keeps the original
# columns and gains the 'independentFeatures' vector column.
output = featureAssembler.transform(train)
output.show()

+---------+---+----------+------+-------------------+
|     Name|age|Experience|Salary|independentFeatures|
+---------+---+----------+------+-------------------+
|    Krish| 31|        10| 30000|        [31.0,10.0]|
|Sudhanshu| 30|         8| 25000|         [30.0,8.0]|
|    Sunny| 29|         4| 20000|         [29.0,4.0]|
|     Paul| 24|         3| 20000|         [24.0,3.0]|
|   Harsha| 21|         1| 15000|         [21.0,1.0]|
|  Shubham| 23|         2| 18000|         [23.0,2.0]|
+---------+---+----------+------+-------------------+



In [6]:
from pyspark.ml.feature import StandardScaler

# Scaler that reads the assembled feature vector and writes its scaled
# version to a new column. All other scaler parameters are left at their
# library defaults.
Scalerizer = StandardScaler(
    inputCol="independentFeatures",
    outputCol="Scaled_features",
)

In [7]:
# Fit the scaler on the assembled frame, then apply it to that same frame.
# The scaled frame is only displayed here; the following cells keep working
# from the unscaled `output`.
scaler_model = Scalerizer.fit(output)
scaled_output = scaler_model.transform(output)
scaled_output.show()

+---------+---+----------+------+-------------------+--------------------+
|     Name|age|Experience|Salary|independentFeatures|     Scaled_features|
+---------+---+----------+------+-------------------+--------------------+
|    Krish| 31|        10| 30000|        [31.0,10.0]|[7.41748501646384...|
|Sudhanshu| 30|         8| 25000|         [30.0,8.0]|[7.17821130625533...|
|    Sunny| 29|         4| 20000|         [29.0,4.0]|[6.93893759604682...|
|     Paul| 24|         3| 20000|         [24.0,3.0]|[5.74256904500426...|
|   Harsha| 21|         1| 15000|         [21.0,1.0]|[5.02474791437873...|
|  Shubham| 23|         2| 18000|         [23.0,2.0]|[5.50329533479575...|
+---------+---+----------+------+-------------------+--------------------+



In [6]:
# Keep only what the regression needs: the feature vector and the label.
model_columns = ['independentFeatures', 'Salary']
finalData = output.select(*model_columns)
finalData.show()

+-------------------+------+
|independentFeatures|Salary|
+-------------------+------+
|        [31.0,10.0]| 30000|
|         [30.0,8.0]| 25000|
|         [29.0,4.0]| 20000|
|         [24.0,3.0]| 20000|
|         [21.0,1.0]| 15000|
|         [23.0,2.0]| 18000|
+-------------------+------+



In [7]:
from pyspark.ml.regression import LinearRegression

# Seed the random split so that "Restart Kernel -> Run All" reproduces the
# same train/test partition, and therefore the same coefficients and error
# metrics. Without a seed every run draws a different split.
# With this few rows the held-out split can be very small; if a split ever
# comes back empty, pick a different seed.
SPLIT_SEED = 42
train_df, test_df = finalData.randomSplit([0.75, 0.25], seed=SPLIT_SEED)

# Keep the unfitted estimator and the fitted model under separate names so
# that re-running this cell always fits a fresh estimator. `regressor`
# stays bound to the fitted model because later cells read
# `regressor.coefficients`, `regressor.intercept` and call
# `regressor.evaluate(...)`.
lr_estimator = LinearRegression(featuresCol='independentFeatures', labelCol='Salary')
regressor = lr_estimator.fit(train_df)

In [8]:
### Coefficients of the fitted model, one per entry of the feature vector
### (assembled in the order 'age', 'Experience').
regressor.coefficients

DenseVector([28.4757, 1271.3568])

In [9]:
### Intercept (constant term) of the fitted model.
regressor.intercept

14299.832495812996

In [10]:
### Evaluate the fitted model on the held-out split. Despite its name,
### `pred` is an evaluation result rather than a prediction table: the
### following cells read `.predictions` and the error metrics from it.
pred = regressor.evaluate(test_df)

In [11]:
# Show the held-out rows next to the model's predicted salary.
pred.predictions.show()

+-------------------+------+------------------+
|independentFeatures|Salary|        prediction|
+-------------------+------+------------------+
|        [31.0,10.0]| 30000|27896.147403685147|
+-------------------+------+------------------+



In [13]:
# Mean absolute error and mean squared error on the held-out split.
# When the split holds a single row, these are just that row's absolute and
# squared residual, so they say little about overall model quality.
pred.meanAbsoluteError, pred.meanSquaredError

(2103.852596314853, 4426195.747020748)