# <center>Income prediction<center>

In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [2]:
spark = SparkSession.builder.appName("SalaryPrediction").getOrCreate()

In [3]:
data = spark.read.csv(r"C:\Users\Public\DW\ML\salary.csv", header=True, inferSchema=True)

In [4]:
data.printSchema()

root
 |-- gender: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- workclass: string (nullable = true)
 |-- education: string (nullable = true)
 |-- occupation: string (nullable = true)
 |-- race: string (nullable = true)
 |-- salary: string (nullable = true)



In [5]:
data.show(5)

+-------+---+-----------------+-------------+------------------+------+------+
| gender|age|        workclass|    education|        occupation|  race|salary|
+-------+---+-----------------+-------------+------------------+------+------+
|   Male| 41|        State-gov|    Doctorate|    Prof-specialty| White|  >50K|
|   Male| 49|          Private|      Masters|    Prof-specialty| White| <=50K|
|   Male| 51|          Private|      HS-grad| Machine-op-inspct| Black| <=50K|
| Female| 20|          Private| Some-college| Handlers-cleaners| White| <=50K|
|   Male| 69| Self-emp-not-inc|      HS-grad|             Sales| White| <=50K|
+-------+---+-----------------+-------------+------------------+------+------+
only showing top 5 rows



In [6]:
# performing string indexing on the target variable (because it is categorical)
label_indexer = StringIndexer(inputCol="salary", outputCol="label")

In [7]:
# performing string indexing on the others categorical columns
categorical_cols = ['gender', 'workclass', 'education', 'occupation', 'race']
indexers = [StringIndexer(inputCol=col, outputCol=col+'_index').fit(data) for col in categorical_cols]

In [8]:
# feature vector
feature_cols = [col+'_index' for col in categorical_cols] + ['age']
assembler = VectorAssembler(inputCols=feature_cols, outputCol='features')

In [9]:
# transforming
pipeline = Pipeline(stages=indexers + [assembler, label_indexer])
transformed_data = pipeline.fit(data).transform(data)

In [10]:
# tran test spliting
(training_data, testing_data) = transformed_data.randomSplit([0.95, 0.05], seed=42)

In [11]:
# building and training the machine learning model
rf = RandomForestClassifier(labelCol='label', featuresCol='features')
model = rf.fit(training_data)

In [12]:
# predictions on the testing data
predictions = model.transform(testing_data)

In [13]:
# evaluation
evaluator = MulticlassClassificationEvaluator(labelCol='label', predictionCol='prediction', metricName='accuracy')
accuracy = evaluator.evaluate(predictions)
print("Accuracy:", accuracy)

Accuracy: 0.8012422360248447


In [18]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(labelCol='label', predictionCol='prediction')

# precision
precision = evaluator.evaluate(predictions, {evaluator.metricName: "weightedPrecision"})

# recall
recall = evaluator.evaluate(predictions, {evaluator.metricName: "weightedRecall"})

# F1 score
f1_score = evaluator.evaluate(predictions, {evaluator.metricName: "f1"})

print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1_score)

Precision: 0.7944027874862367
Recall: 0.8101153504880213
F1 Score: 0.7959698741762633
