# Marketing Campaign Response Prediction using ML

## Project Goal (Business First)

Objective: Predict whether a customer will respond to a marketing campaign so marketing teams can target the right customers and reduce wasted spend.

Why this matters:

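- Marketing campaigns are expensive, so contacting customers who will not respond wastes budget.
- Predicting response lets the team target likely responders, which improves ROI.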



1. Importing necessary libraries

In [3]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    RocCurveDisplay,
    classification_report,
    confusion_matrix,
    roc_auc_score,
)
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Single seed passed to the train/test split and to both classifiers so that
# a fresh "Restart & Run All" reproduces the same numbers.
RANDOM_STATE = 42


## Loading and preprocessing the dataset



In [7]:
# Load the pre-cleaned campaign workbook; the path is relative to the
# notebook's working directory.
df = pd.read_excel(io="cleaned_marketing_campaign.xlsx")

# Preview the first five rows.
df.head(n=5)

Unnamed: 0,ID,Year_Birth,Education,Marital_Status,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,...,NumWebVisitsMonth,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Complain,Z_CostContact,Z_Revenue,Response
0,2174,1954,Graduation,Single,46344.0,1,1,2014-03-08,38,11,...,5,0,0,0,0,0,0,3,11,0
1,6182,1984,Graduation,Together,26646.0,1,0,2014-02-10,26,11,...,6,0,0,0,0,0,0,3,11,0
2,6177,1985,PhD,Married,33454.0,1,0,2013-05-08,32,76,...,8,0,0,0,0,0,0,3,11,0
3,4855,1974,PhD,Together,30351.0,1,0,2013-06-06,19,14,...,9,0,0,0,0,0,0,3,11,1
4,1994,1983,Graduation,Married,51381.5,1,0,2013-11-15,11,5,...,7,0,0,0,0,0,0,3,11,0


In [11]:
# Dimensions of the loaded frame as (rows, columns).
df.shape


(980, 29)

In [12]:
# Per-column non-null counts and dtypes; a quick check for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 980 entries, 0 to 979
Data columns (total 29 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   ID                   980 non-null    int64         
 1   Year_Birth           980 non-null    int64         
 2   Education            980 non-null    object        
 3   Marital_Status       980 non-null    object        
 4   Income               980 non-null    float64       
 5   Kidhome              980 non-null    int64         
 6   Teenhome             980 non-null    int64         
 7   Dt_Customer          980 non-null    datetime64[ns]
 8   Recency              980 non-null    int64         
 9   MntWines             980 non-null    int64         
 10  MntFruits            980 non-null    int64         
 11  MntMeatProducts      980 non-null    int64         
 12  MntFishProducts      980 non-null    int64         
 13  MntSweetProducts     980 non-null  

In [14]:
# Relative frequency of each value of the target column; normalize=True
# returns proportions instead of raw counts.
df['Response'].value_counts(normalize=True)

Unnamed: 0_level_0,proportion
Response,Unnamed: 1_level_1
0,0.911224
1,0.088776


## Data Cleaning

In [23]:
# Drop non-informative columns.
# Reassigning (instead of inplace=True) leaves the frame object that earlier
# cells displayed untouched; errors='ignore' keeps the cell re-runnable once
# the columns are already gone.
cols_to_drop = ['ID', 'Z_CostContact', 'Z_Revenue', 'Dt_Customer']
df = df.drop(columns=cols_to_drop, errors='ignore')

# Handle missing income by using median because it is robust to skew.
# NOTE(review): the median is computed on the full data set, i.e. before the
# train/test split further down. If Income ever contains missing values,
# move the imputation into the modelling pipeline so the test rows do not
# influence it.
income_median = df['Income'].median()
df['Income'] = df['Income'].fillna(income_median)


## Feature Engineering

In [25]:
# Year against which Year_Birth is converted into an age. The value is
# unchanged from the original hard-coded literal.
# NOTE(review): the Dt_Customer values previewed when the data was loaded are
# from 2013-2014, so these ages are "as of 2025", not "as of the campaign" --
# confirm which one is intended.
AGE_REFERENCE_YEAR = 2025

# Convert Year_Birth to Age. The membership guard keeps the cell re-runnable:
# on a second run Year_Birth is already gone and Age is left as it is.
if 'Year_Birth' in df.columns:
    df = (
        df
        .assign(Age=AGE_REFERENCE_YEAR - df['Year_Birth'])
        .drop(columns=['Year_Birth'])
    )

# Aggregate total spending over whichever product-amount columns are present.
spending_cols = [
    'MntWines', 'MntFruits', 'MntMeatProducts',
    'MntFishProducts', 'MntSweetProducts', 'MntGoldProds'
]
existing_spending_cols = [col for col in spending_cols if col in df.columns]
df['Total_Spending'] = df[existing_spending_cols].sum(axis=1)

# Combine the two household counts into a single feature.
df['Children'] = df['Kidhome'] + df['Teenhome']

df.head()


Unnamed: 0,Education,Marital_Status,Income,Kidhome,Teenhome,Recency,MntWines,MntFruits,MntMeatProducts,MntFishProducts,...,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Complain,Response,Age,Total_Spending,Children
0,Graduation,Single,46344.0,1,1,38,11,1,6,2,...,0,0,0,0,0,0,0,71,27,2
1,Graduation,Together,26646.0,1,0,26,11,4,20,10,...,0,0,0,0,0,0,0,41,53,1
2,PhD,Married,33454.0,1,0,32,76,10,56,3,...,0,0,0,0,0,0,0,40,169,1
3,PhD,Together,30351.0,1,0,19,14,0,24,3,...,0,0,0,0,0,0,1,51,46,1
4,Graduation,Married,51381.5,1,0,11,5,5,6,0,...,0,0,0,0,0,0,0,42,19,1


## Feature - Target split

In [26]:
# Separate the label from the predictors; `df` itself is not modified.
target_col = 'Response'
y = df[target_col]
X = df.drop(columns=[target_col])

## Define feature types

In [27]:
# Columns that need one-hot encoding; every other column of X is treated as
# numeric, in its original column order.
categorical_features = ['Education', 'Marital_Status']
is_categorical = X.columns.isin(categorical_features)
numerical_features = X.columns[~is_categorical].tolist()

## Pre-processing (Categorical + Numerical)

In [28]:
# Baseline preprocessing: one-hot encode the categorical columns and pass the
# numeric ones through unchanged. handle_unknown='ignore' makes categories
# unseen during fit encode as all-zero rows instead of raising.
# NOTE(review): the model cells shown below build their own scaled
# preprocessor and do not use this object -- confirm whether it is still needed.
one_hot_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
passthrough_step = ('num', 'passthrough', numerical_features)

preprocessor = ColumnTransformer(transformers=[one_hot_step, passthrough_step])


## Train-test split

In [29]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the responder
# rate the same in both partitions, which matters because the positive class
# is rare.
split_options = dict(test_size=0.2, random_state=RANDOM_STATE, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)


## Model build - Logistic Regression

In [31]:
# StandardScaler is imported together with the other dependencies in the
# imports cell at the top of the notebook.

# Numeric branch: standardise each numeric column.
numerical_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

# Preprocessor used for modelling: one-hot encode the categorical columns and
# scale the numeric ones (reuses the feature lists defined earlier).
scaled_preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
        ('num', numerical_transformer, numerical_features)
    ]
)

# Preprocessing and classifier live in one pipeline so the scaler and encoder
# are fitted on the training rows only. class_weight='balanced' re-weights the
# rare responder class; max_iter is raised to 1000 for the solver.
log_reg_pipeline = Pipeline(steps=[
    ('preprocessing', scaled_preprocessor),
    ('classifier', LogisticRegression(
        max_iter=1000,
        class_weight='balanced',
        random_state=RANDOM_STATE
    ))
])

log_reg_pipeline.fit(X_train, y_train)

## Evaluation - Logistic Regression

In [33]:
# Column 1 of predict_proba is the predicted probability of class 1.
y_prob = log_reg_pipeline.predict_proba(X_test)[:, 1]
y_pred = log_reg_pipeline.predict(X_test)

# Per-class precision/recall on the held-out rows, then the ROC-AUC computed
# from the predicted probabilities.
lr_report = classification_report(y_test, y_pred)
print(lr_report)
print("ROC-AUC:", roc_auc_score(y_test, y_prob))


              precision    recall  f1-score   support

           0       0.96      0.85      0.91       179
           1       0.30      0.65      0.41        17

    accuracy                           0.84       196
   macro avg       0.63      0.75      0.66       196
weighted avg       0.90      0.84      0.86       196

ROC-AUC: 0.8644429838974695


## Model build - Random Forest Classifier

In [34]:
# Give the forest its own unfitted copy of the preprocessing step.
# Pipeline.fit fits its steps in place, so passing `scaled_preprocessor`
# itself would make this fit re-fit the very object that `log_reg_pipeline`
# holds. clone() copies the configuration without any fitted state.
random_forest_pipeline = Pipeline(steps=[
    ('preprocessing', clone(scaled_preprocessor)),
    ('classifier', RandomForestClassifier(
        class_weight='balanced',
        random_state=RANDOM_STATE
    ))
])

random_forest_pipeline.fit(X_train, y_train)

## Evaluation - Logistic Regression and Random Forest Classifier

In [40]:
# Side-by-side ROC-AUC on the same test rows. `y_prob` holds the logistic
# regression probabilities from its evaluation cell; the forest's are
# computed here.
rf_probs = random_forest_pipeline.predict_proba(X_test)[:, 1]

lr_auc, rf_auc = (
    roc_auc_score(y_test, scores) for scores in (y_prob, rf_probs)
)

print(f"Logistic Regression ROC-AUC: {lr_auc:.3f}")
print(f"Random Forest ROC-AUC: {rf_auc:.3f}")

Logistic Regression ROC-AUC: 0.864
Random Forest ROC-AUC: 0.829


In [35]:
# Column 1 of predict_proba is the predicted probability of class 1.
y_prob_rf = random_forest_pipeline.predict_proba(X_test)[:, 1]
y_pred_rf = random_forest_pipeline.predict(X_test)

# Per-class precision/recall on the held-out rows, then the ROC-AUC computed
# from the predicted probabilities.
rf_report = classification_report(y_test, y_pred_rf)
print(rf_report)
print("ROC-AUC (Random Forest):", roc_auc_score(y_test, y_prob_rf))

              precision    recall  f1-score   support

           0       0.92      0.99      0.96       179
           1       0.67      0.12      0.20        17

    accuracy                           0.92       196
   macro avg       0.79      0.56      0.58       196
weighted avg       0.90      0.92      0.89       196

ROC-AUC (Random Forest): 0.8286230693394676


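## Final Model Selection
Random Forest was selected as the final model for its ability to capture non-linear customer behavior. Note that the test-set metrics above do not support this choice: Logistic Regression achieved the higher ROC-AUC (0.864 vs. 0.829) and much higher recall on responders (0.65 vs. 0.12), so the selection should be revisited before the model is used to target customers.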

The model outputs probabilities, allowing marketing teams to adjust decision thresholds based on budget and campaign capacity.

In [43]:
# Business assumptions: cost of contacting one customer and profit earned
# when a contacted customer responds.
campaign_cost = 500
profit_if_response = 5000

# Attach the forest's predicted response probability to each test row and
# derive the expected value of contacting that customer:
#     expected_value = response_prob * profit_if_response - campaign_cost
# NOTE(review): the forest was trained with class_weight='balanced', so its
# probabilities may not be calibrated -- confirm before reading
# expected_value as a monetary estimate.
df_results = X_test.assign(
    response_prob=y_prob_rf,
    expected_value=lambda frame: (
        frame["response_prob"] * profit_if_response
    ) - campaign_cost,
)

# Keep only the customers whose expected value is positive.
is_profitable = df_results["expected_value"] > 0
profitable_customers = df_results.loc[is_profitable]

# (customers worth contacting, customers scored)
(len(profitable_customers), len(df_results))

(48, 196)

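48: This is the number of customers in the test set that the Random Forest model flags as 'profitable' to contact. These are the customers whose expected value (predicted response probability × profit if they respond, minus the campaign cost) is greater than zero. With the assumed cost of 500 and profit of 5000, that means a predicted response probability above 0.10.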

196: This is the total number of customers in the X_test dataset.