In [22]:
from pyspark.sql import SparkSession
# Build the SparkSession for this notebook (or reuse an existing one) under
# the application name 'Regression'.
spark = (
    SparkSession.builder
    .appName('Regression')
    .getOrCreate()
)

In [21]:
# Stop the Spark session -- run this manually, and only once the analysis is
# finished. The call is left commented out because this cell sits directly
# after the cell that creates `spark`: executed top to bottom (Restart & Run
# All) it would shut the session down before the cells below can use it.
# spark.stop()

In [23]:
# Echo the SparkSession object as this cell's output.
spark

In [24]:
# Load the salary CSV: the first row provides the column names and Spark
# infers each column's type from the values.
data = spark.read.csv(
    './DATASET/salary_dataset.csv',
    header=True,
    inferSchema=True,
)

In [29]:
# Preview the first five rows of the loaded data.
data.show(n=5)

+---+----------+----+---------+----------+------+
|age|experience| gpa|   degree|  position|salary|
+---+----------+----+---------+----------+------+
| 30|         7|3.94| bachelor|  engineer| 32500|
| 26|         2|2.86| bachelor|      NULL| 22500|
| 27|         0|3.13|doctorate| secretary| 37000|
| 32|      NULL| 3.1| bachelor|  engineer| 24500|
| 24|         1|3.81| bachelor|accountant| 23500|
+---+----------+----+---------+----------+------+
only showing top 5 rows



In [30]:
# Discard every row that contains a null in any column.
# NOTE: `data` is reassigned here, so the raw frame is no longer reachable
# under this name after the cell runs.
data = data.dropna()

In [31]:
# Print the rows that remain after null removal (first 20, the show() default).
data.show(n=20)

+---+----------+----+---------+----------+------+
|age|experience| gpa|   degree|  position|salary|
+---+----------+----+---------+----------+------+
| 30|         7|3.94| bachelor|  engineer| 32500|
| 27|         0|3.13|doctorate| secretary| 37000|
| 24|         1|3.81| bachelor|accountant| 23500|
| 35|         7|3.93|doctorate| secretary| 43500|
| 23|         1|3.78|   master|accountant| 30500|
| 32|         8|3.04| bachelor|accountant| 31500|
| 27|         2|3.52| bachelor| secretary| 18500|
| 35|        11|3.66|doctorate|accountant| 54000|
| 33|         4|2.59| bachelor| secretary| 26000|
| 25|         3|3.81| bachelor| secretary| 17000|
| 30|         4|3.17| bachelor|accountant| 23000|
| 22|         1|3.66| bachelor| secretary| 24000|
| 25|         0|3.65| bachelor|  engineer| 27500|
| 28|         4| 3.1|   master|  engineer| 37000|
| 24|         0|3.05|   master| secretary| 28500|
| 33|         3|3.34|doctorate|  engineer| 44000|
| 31|         0|2.65| bachelor|accountant| 24000|


In [32]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml.regression import LinearRegression

# Convert categorical columns to numeric indices


In [33]:
# Encode each categorical string column as a numeric index column.
# Each indexer is fitted on the current `data`, and the fitted model appends
# its output column. `data` is reassigned at every step, so re-running this
# cell operates on the already-indexed frame.
degree_indexer = StringIndexer(inputCol="degree", outputCol="degree_index")
degree_index_model = degree_indexer.fit(data)
data = degree_index_model.transform(data)

position_indexer = StringIndexer(inputCol="position", outputCol="position_index")
position_index_model = position_indexer.fit(data)
data = position_index_model.transform(data)

# Apply OneHotEncoder

In [35]:
# Turn each index column into a one-hot vector column.
# Each encoder is fitted on the current `data`, and the fitted model appends
# the corresponding *_vec column. `data` is reassigned at every step.
degree_encoder = OneHotEncoder(inputCol="degree_index", outputCol="degree_vec")
degree_vec_model = degree_encoder.fit(data)
data = degree_vec_model.transform(data)

position_encoder = OneHotEncoder(inputCol="position_index", outputCol="position_vec")
position_vec_model = position_encoder.fit(data)
data = position_vec_model.transform(data)

In [36]:
# Print the frame with the added index and one-hot columns (first 20 rows).
data.show(n=20)

+---+----------+----+---------+----------+------+------------+--------------+-------------+-------------+
|age|experience| gpa|   degree|  position|salary|degree_index|position_index|   degree_vec| position_vec|
+---+----------+----+---------+----------+------+------------+--------------+-------------+-------------+
| 30|         7|3.94| bachelor|  engineer| 32500|         0.0|           1.0|(2,[0],[1.0])|(2,[1],[1.0])|
| 27|         0|3.13|doctorate| secretary| 37000|         2.0|           2.0|    (2,[],[])|    (2,[],[])|
| 24|         1|3.81| bachelor|accountant| 23500|         0.0|           0.0|(2,[0],[1.0])|(2,[0],[1.0])|
| 35|         7|3.93|doctorate| secretary| 43500|         2.0|           2.0|    (2,[],[])|    (2,[],[])|
| 23|         1|3.78|   master|accountant| 30500|         1.0|           0.0|(2,[1],[1.0])|(2,[0],[1.0])|
| 32|         8|3.04| bachelor|accountant| 31500|         0.0|           0.0|(2,[0],[1.0])|(2,[0],[1.0])|
| 27|         2|3.52| bachelor| secretary| 185

# Assemble features

In [37]:
# Combine the numeric columns and the two one-hot vectors into a single
# `features` vector column, then print the result.
feature_columns = ["age", "experience", "gpa", "degree_vec", "position_vec"]
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")

data = assembler.transform(data)
data.show()

+---+----------+----+---------+----------+------+------------+--------------+-------------+-------------+--------------------+
|age|experience| gpa|   degree|  position|salary|degree_index|position_index|   degree_vec| position_vec|            features|
+---+----------+----+---------+----------+------+------------+--------------+-------------+-------------+--------------------+
| 30|         7|3.94| bachelor|  engineer| 32500|         0.0|           1.0|(2,[0],[1.0])|(2,[1],[1.0])|[30.0,7.0,3.94,1....|
| 27|         0|3.13|doctorate| secretary| 37000|         2.0|           2.0|    (2,[],[])|    (2,[],[])|(7,[0,2],[27.0,3....|
| 24|         1|3.81| bachelor|accountant| 23500|         0.0|           0.0|(2,[0],[1.0])|(2,[0],[1.0])|[24.0,1.0,3.81,1....|
| 35|         7|3.93|doctorate| secretary| 43500|         2.0|           2.0|    (2,[],[])|    (2,[],[])|(7,[0,1,2],[35.0,...|
| 23|         1|3.78|   master|accountant| 30500|         1.0|           0.0|(2,[1],[1.0])|(2,[0],[1.0])|[23.0,

# Split data into training and testing sets

In [38]:
# Randomly split the rows 80/20 into training and test sets, seeded with 42.
train_data, test_data = data.randomSplit(weights=[0.8, 0.2], seed=42)

# Train the regression model

In [39]:
# Configure a linear regression that predicts `salary` from the assembled
# `features` column, then fit it on the training split.
lr = LinearRegression(labelCol="salary", featuresCol="features")
model = lr.fit(train_data)

# Evaluate the model

In [41]:
# Score the test split with the fitted model and print the feature vector,
# the actual salary and the predicted salary side by side, untruncated.
predictions = model.transform(test_data)
(
    predictions
    .select("features", "salary", "prediction")
    .show(truncate=False)
)

+-------------------------------+------+------------------+
|features                       |salary|prediction        |
+-------------------------------+------+------------------+
|[21.0,0.0,3.12,1.0,0.0,1.0,0.0]|21500 |21647.59533015531 |
|(7,[0,2,3],[22.0,3.17,1.0])    |19000 |18026.517010779826|
|[22.0,1.0,2.56,1.0,0.0,1.0,0.0]|22500 |21872.262613668663|
|[23.0,0.0,3.81,0.0,1.0,0.0,1.0]|36500 |29816.384003785985|
|(7,[0,2,4],[24.0,3.05,1.0])    |28500 |22999.748702248853|
|[24.0,3.0,2.63,0.0,1.0,1.0,0.0]|26000 |29139.21006568931 |
|[25.0,4.0,2.8,0.0,1.0,1.0,0.0] |27500 |30627.771937498128|
|[27.0,2.0,3.52,1.0,0.0,0.0,0.0]|18500 |21656.509802306726|
|[29.0,2.0,3.77,1.0,0.0,0.0,0.0]|22000 |22513.054030344727|
|[29.0,3.0,2.63,0.0,1.0,0.0,1.0]|27500 |31991.622502598024|
|[29.0,6.0,2.88,0.0,1.0,1.0,0.0]|32500 |33578.44505979666 |
|[29.0,7.0,3.52,0.0,1.0,1.0,0.0]|33500 |35668.89547516976 |
|[30.0,4.0,3.17,1.0,0.0,1.0,0.0]|23000 |27570.34351308721 |
|[30.0,7.0,3.13,1.0,0.0,0.0,0.0]|20500 |

In [42]:
# Fitted model parameters: one coefficient per entry of the `features`
# vector, plus the intercept.
print(f"Coefficients: {model.coefficients}")
print(f"Intercept: {model.intercept}")

# `model.summary` describes the fit on the TRAINING split only, so these two
# numbers show how well the model fits the data it was trained on.
training_summary = model.summary
print(f"RMSE: {training_summary.rootMeanSquaredError}")
print(f"R2: {training_summary.r2}")

# Held-out performance: score the test split, which the model never saw
# during fitting. These are the figures to quote as the model's quality.
test_summary = model.evaluate(test_data)
print(f"Test RMSE: {test_summary.rootMeanSquaredError}")
print(f"Test R2: {test_summary.r2}")

Coefficients: [211.85180780402277,982.378447552427,1731.3624497198134,-14577.703048443514,-9820.411478616152,3919.498249665496,5712.651647554094]
Intercept: 22455.06132192303
RMSE: 2804.910231143828
R2: 0.9030454655486437
