# <center>Ad Purchaser Classifier</center>

In [None]:
import os

from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.sql import SparkSession

In [3]:
# Attach to the already-running Spark session if there is one; otherwise
# start a new session registered under the application name "purchases_ads".
spark = (
    SparkSession.builder
    .appName("purchases_ads")
    .getOrCreate()
)

In [4]:
# Location of the ads CSV. Set the ADS_CSV_PATH environment variable to read
# the file from somewhere else; when it is unset the original local path is used.
ADS_CSV_PATH = os.environ.get("ADS_CSV_PATH", r"C:\Users\Public\DW\ML\Ads.csv")

# The first row of the file holds the column names, and column types are
# inferred from the values rather than all being read as strings.
data = spark.read.csv(ADS_CSV_PATH, header=True, inferSchema=True)

In [5]:
# Print each column's name, inferred type and nullability for the loaded data.
data.printSchema()

root
 |-- Gender: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- EstimatedSalary: integer (nullable = true)
 |-- Purchased: integer (nullable = true)



In [6]:
# Preview the first eight rows of the loaded data.
data.show(n=8)

+------+---+---------------+---------+
|Gender|Age|EstimatedSalary|Purchased|
+------+---+---------------+---------+
|  Male| 19|          19000|        0|
|  Male| 35|          20000|        0|
|Female| 26|          43000|        0|
|Female| 27|          57000|        0|
|  Male| 19|          76000|        0|
|  Male| 27|          58000|        0|
|Female| 27|          84000|        0|
|Female| 32|         150000|        1|
+------+---+---------------+---------+
only showing top 8 rows



In [7]:
# Convert the categorical "Gender" column to a numeric "GenderIndex" column
# using StringIndexer.
indexer = StringIndexer(inputCol="Gender", outputCol="GenderIndex")

# StringIndexer refuses to run when its output column already exists, and this
# cell overwrites `data`, so a second run would otherwise fail. Dropping the
# column first is a no-op on the first run and makes the cell safe to re-run.
gender_input = data.drop("GenderIndex")
data = indexer.fit(gender_input).transform(gender_input)

In [8]:
# Combine the three numeric inputs into the single "features" vector column.
assembler = VectorAssembler(inputCols=["GenderIndex", "Age", "EstimatedSalary"], outputCol="features")

# VectorAssembler refuses to run when its output column already exists, and
# this cell overwrites `data`, so a second run would otherwise fail. Dropping
# the column first is a no-op on the first run and makes the cell safe to re-run.
data = assembler.transform(data.drop("features"))

In [9]:
# Randomly assign rows to a training set (weight 0.8) and a test set
# (weight 0.2); the seed is fixed so the assignment does not change between runs.
train_data, test_data = data.randomSplit(
    weights=[0.8, 0.2],
    seed=42,
)

In [10]:
# Logistic regression estimator: reads the assembled "features" vector and
# learns to predict the "Purchased" label.
lr = LogisticRegression(
    featuresCol="features",
    labelCol="Purchased",
)

In [11]:
# Fit the logistic regression estimator on the training split; `model` is the
# fitted result used for prediction below.
model = lr.fit(train_data)

In [12]:
# Apply the fitted model to the held-out test split. The resulting DataFrame
# is what the evaluators read ("rawPrediction" and "prediction" columns).
predictions = model.transform(test_data)

In [13]:
# Evaluate the model on the test predictions using the area under the ROC curve.
# BinaryClassificationEvaluator does not compute accuracy: its metrics are
# "areaUnderROC" (the default) and "areaUnderPR", so the metric is named
# explicitly here instead of relying on the default.
evaluator = BinaryClassificationEvaluator(
    rawPredictionCol="rawPrediction",
    labelCol="Purchased",
    metricName="areaUnderROC",
)
area_under_roc = evaluator.evaluate(predictions)

# Kept under its historical name because the reporting cell reads `accuracy`;
# the value is an AUC, not a classification accuracy.
accuracy = area_under_roc

In [19]:
# `accuracy` holds the result of the BinaryClassificationEvaluator, whose
# default metric is the area under the ROC curve, so it is reported as an AUC
# rather than as a classification accuracy.
print("Area under ROC:", accuracy)

Accuracy: 0.9568452380952381


In [18]:
# Label-based metrics computed from the "prediction" column of the test
# predictions. Note that this rebinds `evaluator`, replacing the binary
# evaluator created in the earlier evaluation cell.
evaluator = MulticlassClassificationEvaluator(
    labelCol="Purchased",
    predictionCol="prediction",
    metricName="accuracy",
)

# accuracy: the metric the evaluator is configured with. Stored under its own
# name so the `accuracy` variable set by the earlier evaluation cell is not
# overwritten.
test_accuracy = evaluator.evaluate(predictions)

# The remaining metrics override `metricName` per call through a param map.
# "weightedPrecision", "weightedRecall" and "f1" are averages over the classes
# weighted by class frequency, not the scores of the positive class alone.

# precision
precision = evaluator.evaluate(predictions, {evaluator.metricName: "weightedPrecision"})

# recall
recall = evaluator.evaluate(predictions, {evaluator.metricName: "weightedRecall"})

# F1 score
f1_score = evaluator.evaluate(predictions, {evaluator.metricName: "f1"})

print("Accuracy:", test_accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1_score)

Precision: 0.8983008495752123
Recall: 0.896551724137931
F1 Score: 0.891513658755038
