In [9]:
from pyspark.sql import SparkSession
# Create the Spark session for this notebook, or reuse one that is already running.
# NOTE(review): the app name 'Missin' looks like a leftover from another notebook — confirm.
spark = (
    SparkSession.builder
    .appName('Missin')
    .getOrCreate()
)
# Eager evaluation makes a bare DataFrame expression at the end of a cell
# render as a table instead of only showing its schema summary.
spark.conf.set(
    'spark.sql.repl.eagerEval.enabled',
    True,
)

In [10]:
# Load the CSV, treating its first row as column names and letting Spark
# infer each column's data type from the values.
training = (
    spark.read
    .option('header', True)
    .option('inferSchema', True)
    .csv('files/test1.csv')
)

In [12]:
# Show the loaded DataFrame; it renders as a table because eager evaluation
# was enabled on the session.
training

Name,age,Experience,Salary
Krish,31,10,30000
Sudhanshu,30,8,25000
Sunny,29,4,20000
Paul,24,3,20000
Harsha,21,1,15000
Shubham,23,2,18000


In [13]:
# Print the inferred schema (column names, types, nullability) to check that
# inferSchema produced numeric types for the numeric columns.
training.printSchema()

root
 |-- Name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- Experience: integer (nullable = true)
 |-- Salary: integer (nullable = true)



In [15]:
# List the column names of the loaded DataFrame.
training.columns

['Name', 'age', 'Experience', 'Salary']

In [16]:
# Keep the column names in a variable; they are sliced later to pick the
# feature columns for the assembler.
columns = training.columns
# Confirm that the column names come back as a plain Python list.
type(columns)

list

In [26]:
from pyspark.ml.feature import VectorAssembler
# Assemble the predictor columns into a single vector column, the input format
# Spark ML estimators expect. columns[1:3] selects the 2nd and 3rd columns by
# position ('age' and 'Experience' for the CSV loaded above), so this depends
# on the column order of the input file.
featureassenbker = VectorAssembler(
    inputCols=columns[1:3],
    outputCol="Independent Features",
)

In [27]:
# Apply the assembler: the result keeps the original columns and appends the
# "Independent Features" vector column.
output = featureassenbker.transform(training)
# Display the transformed DataFrame to inspect the new vector column.
output

Name,age,Experience,Salary,Independent Features
Krish,31,10,30000,"[31.0,10.0]"
Sudhanshu,30,8,25000,"[30.0,8.0]"
Sunny,29,4,20000,"[29.0,4.0]"
Paul,24,3,20000,"[24.0,3.0]"
Harsha,21,1,15000,"[21.0,1.0]"
Shubham,23,2,18000,"[23.0,2.0]"


In [28]:
# Check the column names after the transform, including the assembled vector column.
output.columns

['Name', 'age', 'Experience', 'Salary', 'Independent Features']

In [29]:
# Keep only what the regression needs: the feature vector and the label column.
finalized_data = output.select(
    "Independent Features",
    "Salary",
)
# Display the modelling dataset.
finalized_data

Independent Features,Salary
"[31.0,10.0]",30000
"[30.0,8.0]",25000
"[29.0,4.0]",20000
"[24.0,3.0]",20000
"[21.0,1.0]",15000
"[23.0,2.0]",18000


In [None]:
from pyspark.ml.regression import LinearRegression

In [44]:

# Fixed seed so the random 75/25 split is repeatable across runs. Without it
# each run trains and evaluates on different rows, so the coefficients and
# error metrics change from run to run.
SPLIT_SEED = 42
train_data, test_data = finalized_data.randomSplit([.75, .25], seed=SPLIT_SEED)

# Keep the unfitted estimator under its own name so that `regressor` only ever
# refers to the fitted model; re-running this cell then stays well-defined.
lr_estimator = LinearRegression(featuresCol="Independent Features", labelCol="Salary")

# Fit on the training split; `regressor` is the fitted model used by the cells below.
regressor = lr_estimator.fit(train_data)

22/10/26 02:15:09 WARN Instrumentation: [bad5333a] regParam is zero, which might cause numerical instability and overfitting.


In [45]:
# Fitted weights, one per entry of the feature vector (same order as the
# assembler's input columns).
regressor.coefficients

DenseVector([-102.53, 1688.6818])

In [46]:
# Fitted intercept (bias term) of the linear model.
regressor.intercept

16470.03994673731

In [47]:
# Evaluate the fitted model on the held-out split; the returned summary carries
# both the per-row predictions and the aggregate error metrics.
pred_results = regressor.evaluate(test_data)
# Show the test rows next to the model's prediction for each.
# NOTE(review): in the recorded run the test split held a single row, so the
# metrics below rest on one sample — treat them as illustrative only.
pred_results.predictions

Independent Features,Salary,prediction
"[30.0,8.0]",25000,26903.59520639148


In [48]:
# Report the mean absolute error and mean squared error on the test split as a pair.
(
    pred_results.meanAbsoluteError,
    pred_results.meanSquaredError,
)

(1903.595206391481, 3623674.709796625)