## Classification Model

### Task
- **Dataset:** Bank Marketing Dataset (available on Kaggle).
- **Description:** The dataset contains information about a bank's marketing campaigns, including customer data, campaign details, and response outcomes.
- **Task:** Predict whether a customer will subscribe to a term deposit, based on their characteristics and the campaign information.

In [1]:
!pip install --ignore-install -q pyspark
!pip install --ignore-install -q findspark

In [5]:
import os

import findspark

# findspark has to run *before* anything from pyspark is imported: its job is
# to make the pyspark package importable, so calling it afterwards is too late
# to help the import below.
findspark.init()

from pyspark.sql import SparkSession

# All data and model paths in this notebook are built from the kernel's
# current working directory.
base_path = os.getcwd()

# Reuse an already-running session if there is one, otherwise start a new one
# registered under the application name 'Bank-Marketing'.
spark = SparkSession.builder.appName('Bank-Marketing').getOrCreate()

In [6]:
# Read the bank-marketing CSV: the first row supplies the column names and
# Spark infers each column's type from the values.
data = spark.read.csv(
    base_path + "/work/06-ml-using-spark/bank.csv",
    header=True,
    inferSchema=True,
)

# Show the inferred schema, then preview the first rows of the DataFrame.
data.printSchema()
data.show()

root
 |-- age: integer (nullable = true)
 |-- job: string (nullable = true)
 |-- marital: string (nullable = true)
 |-- education: string (nullable = true)
 |-- default: string (nullable = true)
 |-- balance: integer (nullable = true)
 |-- housing: string (nullable = true)
 |-- loan: string (nullable = true)
 |-- contact: string (nullable = true)
 |-- day: integer (nullable = true)
 |-- month: string (nullable = true)
 |-- duration: integer (nullable = true)
 |-- campaign: integer (nullable = true)
 |-- pdays: integer (nullable = true)
 |-- previous: integer (nullable = true)
 |-- poutcome: string (nullable = true)
 |-- deposit: string (nullable = true)

+---+-----------+--------+---------+-------+-------+-------+----+-------+---+-----+--------+--------+-----+--------+--------+-------+
|age|        job| marital|education|default|balance|housing|loan|contact|day|month|duration|campaign|pdays|previous|poutcome|deposit|
+---+-----------+--------+---------+-------+-------+-------+----+----

In [7]:
# Display the column names of the loaded DataFrame as a Python list.
data.columns

['age',
 'job',
 'marital',
 'education',
 'default',
 'balance',
 'housing',
 'loan',
 'contact',
 'day',
 'month',
 'duration',
 'campaign',
 'pdays',
 'previous',
 'poutcome',
 'deposit']

## Prepare the data
- Index Categorical Variables Including the Label
- Assemble Features

In [13]:
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import VectorAssembler

# Numeric columns go into the feature vector unchanged.
numeric_cols = ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']

# List of categorical (string) columns that need a numeric index
categorical_cols = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome']

# Index categorical features: each column `c` gets a companion `c_index`
indexers = [StringIndexer(inputCol=column, outputCol=column + "_index") for column in categorical_cols]

# Index the label column.
# StringIndexer's default ordering is by descending frequency, so which class
# ends up as 1.0 would depend on the class balance of whatever data the
# pipeline is fitted on. Alphabetical ordering pins the mapping instead.
# NOTE(review): assumes `deposit` holds "no"/"yes" (-> 0.0/1.0) - confirm.
label_indexer = StringIndexer(inputCol="deposit", outputCol="label", stringOrderType="alphabetAsc")
indexers.append(label_indexer)

# List of input columns for the assembler: numeric columns first, then the
# indexed categorical columns (the label is deliberately not included)
assembler_inputs = numeric_cols + [column + "_index" for column in categorical_cols]
print(assembler_inputs)

# Create the assembler that packs all inputs into a single "features" vector
assembler = VectorAssembler(inputCols=assembler_inputs, outputCol="features")

['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous', 'job_index', 'marital_index', 'education_index', 'default_index', 'housing_index', 'loan_index', 'contact_index', 'month_index', 'poutcome_index']


### Create and Run the pipeline

In [23]:
from pyspark.ml import Pipeline

# Chain every indexer (features and label) followed by the vector assembler,
# fit the chain on the raw data, and apply it to get "features" and "label".
pipeline = Pipeline(stages=indexers + [assembler])
pipeline_model = pipeline.fit(data)
data_transformed = pipeline_model.transform(data)
data_transformed.select("features", "label").show()

# Persist the fitted preprocessing pipeline. A plain `save()` raises if the
# target path already exists, which makes this cell fail on every re-run;
# `write().overwrite()` replaces the previous copy instead.
pipeline_model.write().overwrite().save(base_path + '/work/06-ml-using-spark/models/data_transformation_pipeline')

+--------------------+-----+
|            features|label|
+--------------------+-----+
|(16,[0,1,2,3,4,5,...|  1.0|
|(16,[0,1,2,3,4,5,...|  1.0|
|(16,[0,1,2,3,4,5,...|  1.0|
|(16,[0,1,2,3,4,5,...|  1.0|
|(16,[0,1,2,3,4,5,...|  1.0|
|[42.0,0.0,5.0,562...|  1.0|
|[56.0,830.0,6.0,1...|  1.0|
|[60.0,545.0,6.0,1...|  1.0|
|(16,[0,1,2,3,4,5,...|  1.0|
|[28.0,5090.0,6.0,...|  1.0|
|[38.0,100.0,7.0,7...|  1.0|
|(16,[0,1,2,3,4,5,...|  1.0|
|[29.0,199.0,7.0,1...|  1.0|
|[46.0,460.0,7.0,1...|  1.0|
|[31.0,703.0,8.0,9...|  1.0|
|[35.0,3837.0,8.0,...|  1.0|
|[32.0,611.0,8.0,5...|  1.0|
|(16,[0,1,2,3,4,5,...|  1.0|
|(16,[0,1,2,3,4,5,...|  1.0|
|[49.0,168.0,8.0,5...|  1.0|
+--------------------+-----+
only showing top 20 rows



### Model Training
1. Split the Data into Training and Test Sets
2. Train a Machine Learning Model

In [24]:
from pyspark.ml.classification import LogisticRegression

# Hold out 30% of the transformed rows for testing; the seed is fixed so the
# same split can be requested again later.
train_data, test_data = data_transformed.randomSplit(weights=[0.7, 0.3], seed=42)

# Logistic regression reading the assembled "features" vector and the
# indexed "label" column.
lr = LogisticRegression(labelCol='label', featuresCol='features')

# Fit on the training portion only.
lr_model = lr.fit(train_data)


### Make Predictions

In [25]:
# Score the held-out rows with the fitted model.
predictions = lr_model.transform(test_data)

# Preview five rows: true label, predicted class and the probability vector.
predictions.select(['label', 'prediction', 'probability']).show(n=5)

+-----+----------+--------------------+
|label|prediction|         probability|
+-----+----------+--------------------+
|  0.0|       0.0|[0.78045155772234...|
|  1.0|       0.0|[0.68340248372662...|
|  1.0|       0.0|[0.74065751071030...|
|  1.0|       1.0|[0.24912785618237...|
|  1.0|       1.0|[0.38382171928105...|
+-----+----------+--------------------+
only showing top 5 rows



### Evaluate the Model's Performance

In [26]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Evaluator that computes the area under the ROC curve from the model's
# raw prediction column and the indexed label.
evaluator = BinaryClassificationEvaluator(
    rawPredictionCol='rawPrediction',
    labelCol='label',
    metricName='areaUnderROC',
)

# Measure performance on the test-set predictions and report it.
auc = evaluator.evaluate(predictions)
print(f"Test Area Under ROC: {auc:.4f}")

Test Area Under ROC: 0.8653


### Save the Model

In [28]:
# Save the logistic regression model.
# A plain `save()` raises if the target path already exists, so re-running the
# notebook would fail here; `write().overwrite()` replaces the earlier copy.
lr_model.write().overwrite().save(base_path + "/work/06-ml-using-spark/models/logistic_regression_model")

In [32]:
from pyspark.ml import PipelineModel
from pyspark.ml.classification import LogisticRegressionModel

# Directory holding both persisted artifacts (note the trailing slash).
path_to_models = base_path + '/work/06-ml-using-spark/models/'

# Reload the fitted preprocessing pipeline and the trained classifier from disk.
new_pipeline_model = PipelineModel.load(f"{path_to_models}data_transformation_pipeline")
new_lr_model = LogisticRegressionModel.load(f"{path_to_models}logistic_regression_model")

In [35]:
# Run the reloaded preprocessing pipeline over the raw data.
new_data_transformed = new_pipeline_model.transform(data)

# Split with the same weights and seed as the training cell, so the displayed
# sample can be compared with the earlier predictions.
new_train_data, new_test_data = new_data_transformed.randomSplit(weights=[0.7, 0.3], seed=42)

# Score the test portion with the reloaded classifier and preview five rows.
new_predictions = new_lr_model.transform(new_test_data)
new_predictions.select(['label', 'prediction', 'probability']).show(n=5)

+-----+----------+--------------------+
|label|prediction|         probability|
+-----+----------+--------------------+
|  0.0|       0.0|[0.78045155772234...|
|  1.0|       0.0|[0.68340248372662...|
|  1.0|       0.0|[0.74065751071030...|
|  1.0|       1.0|[0.24912785618237...|
|  1.0|       1.0|[0.38382171928105...|
+-----+----------+--------------------+
only showing top 5 rows

