### Examples of PySpark ML

In [56]:
from pyspark.sql import SparkSession
# Start (or reuse) the Spark session that the rest of the notebook works through.
spark = (
    SparkSession.builder
    .appName('ml')
    .getOrCreate()
)

In [57]:
# Bare expression: displays the session object as this cell's output.
spark

In [58]:
# Load the salary dataset: the first row supplies the column names and
# Spark infers each column's type from the values.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('dataset_1.csv')
)
df.show()

+---------+---+----------+------+
|     Name|age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|    Azman| 31|        11| 18000|
|     Armi| 33|        14| 35000|
|    Majid| 40|        20| 44000|
|    Fauzy| 50|        29| 56000|
|      Ain| 39|        15| 33000|
|    Ahmad| 40|        18| 44000|
|    Irfan| 35|        13| 40000|
|      Bob| 36|        12| 32000|
|     Umar| 38|        18| 45000|
+---------+---+----------+------+



In [59]:
# Print each column's name, inferred type, and nullability.
df.printSchema()

root
 |-- Name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- Experience: integer (nullable = true)
 |-- Salary: integer (nullable = true)



In [60]:
# List the column names available for choosing features and the label.
df.columns

['Name', 'age', 'Experience', 'Salary']

## Data Preparation
In PySpark, all feature columns must be combined into a single vector column before training.\
`VectorAssembler` does this: [age, Experience] ---> one new column ("Independent Features") that holds the independent features.

In [61]:
from pyspark.ml.feature import VectorAssembler
# Pack the two numeric predictor columns into one vector column for training.
featureassembler = VectorAssembler(
    inputCols=["age", "Experience"],
    outputCol="Independent Features",
)

In [62]:
# Apply the assembler to df; the result carries the extra feature-vector column.
output=featureassembler.transform(df)

In [63]:
# Inspect the result: the original columns plus the new "Independent Features" vector.
output.show()

+---------+---+----------+------+--------------------+
|     Name|age|Experience|Salary|Independent Features|
+---------+---+----------+------+--------------------+
|    Krish| 31|        10| 30000|         [31.0,10.0]|
|Sudhanshu| 30|         8| 25000|          [30.0,8.0]|
|    Sunny| 29|         4| 20000|          [29.0,4.0]|
|     Paul| 24|         3| 20000|          [24.0,3.0]|
|   Harsha| 21|         1| 15000|          [21.0,1.0]|
|    Azman| 31|        11| 18000|         [31.0,11.0]|
|     Armi| 33|        14| 35000|         [33.0,14.0]|
|    Majid| 40|        20| 44000|         [40.0,20.0]|
|    Fauzy| 50|        29| 56000|         [50.0,29.0]|
|      Ain| 39|        15| 33000|         [39.0,15.0]|
|    Ahmad| 40|        18| 44000|         [40.0,18.0]|
|    Irfan| 35|        13| 40000|         [35.0,13.0]|
|      Bob| 36|        12| 32000|         [36.0,12.0]|
|     Umar| 38|        18| 45000|         [38.0,18.0]|
+---------+---+----------+------+--------------------+



In [64]:
# Confirm the assembled column is present alongside the original columns.
output.columns

['Name', 'age', 'Experience', 'Salary', 'Independent Features']

In [65]:
# Keep only what the regressor needs: the assembled feature vector and the target.
finalized_data = output.select(
    "Independent Features",
    "Salary",
)
finalized_data.show()

+--------------------+------+
|Independent Features|Salary|
+--------------------+------+
|         [31.0,10.0]| 30000|
|          [30.0,8.0]| 25000|
|          [29.0,4.0]| 20000|
|          [24.0,3.0]| 20000|
|          [21.0,1.0]| 15000|
|         [31.0,11.0]| 18000|
|         [33.0,14.0]| 35000|
|         [40.0,20.0]| 44000|
|         [50.0,29.0]| 56000|
|         [39.0,15.0]| 33000|
|         [40.0,18.0]| 44000|
|         [35.0,13.0]| 40000|
|         [36.0,12.0]| 32000|
|         [38.0,18.0]| 45000|
+--------------------+------+



## Model Training

### Linear Regression

In [66]:
from pyspark.ml.regression import LinearRegression

# Hold out part of the data for evaluation. The weights are proportions rather
# than exact row counts; the seed is fixed for repeatability.
train_data, test_data = finalized_data.randomSplit(weights=[0.8, 0.2], seed=123)

# Configure the estimator: predict Salary from the assembled feature vector.
lr = LinearRegression(
    labelCol='Salary',
    featuresCol='Independent Features',
)

# Learn the coefficients and intercept from the training rows only.
lr_model = lr.fit(train_data)

23/10/26 11:40:32 WARN Instrumentation: [d41cc392] regParam is zero, which might cause numerical instability and overfitting.


In [67]:
# Report the fitted parameters: one coefficient per input feature, then the intercept.
print(f"Coefficients: {lr_model.coefficients!s}")
print(f"Intercept: {lr_model.intercept!s}")

Coefficients: [234.59962756055324,1391.9925512103946]
Intercept: 7402.793296088717


In [68]:
# Evaluate the fitted model on the held-out rows. The returned summary object
# exposes both the per-row predictions and the aggregate error metrics.
lr_pred = lr_model.evaluate(test_data)

In [69]:
# Show the held-out rows with the model's prediction next to the actual Salary.
lr_pred.predictions.show()

+--------------------+------+-----------------+
|Independent Features|Salary|       prediction|
+--------------------+------+-----------------+
|          [29.0,4.0]| 20000|19774.15270018634|
|         [33.0,14.0]| 35000| 34632.4767225325|
|         [40.0,20.0]| 44000|44626.62942271874|
|         [50.0,29.0]| 56000|59500.55865921782|
+--------------------+------+-----------------+



In [71]:
# Mean absolute error on the test rows, in the same units as Salary.
lr_pred.meanAbsoluteError

1180.139664804431

In [72]:
# Mean squared error on the test rows (squared Salary units; take the square
# root to compare it with the mean absolute error).
lr_pred.meanSquaredError

3208163.9305888186