<a href="https://colab.research.google.com/github/gulshan0201/DATA-Science/blob/main/Project_ML.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# ============================================================
# STEP 0: Import Libraries
# ============================================================
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score


In [2]:
# STEP 1: Load Train and Test Data
# ============================================================
# Both files are read by relative path, so train.csv and test.csv must be in
# the notebook's current working directory.
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# Sanity check: report the shape of each frame and list the training columns.
print("Train shape:", train.shape)
print("Test shape :", test.shape)
print("\nTrain columns:", train.columns.tolist())


Train shape: (45211, 17)
Test shape : (4521, 17)

Train columns: ['age', 'job', 'marital', 'education', 'default', 'balance', 'housing', 'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays', 'previous', 'poutcome', 'y']


In [3]:
# STEP 2: Create New Features (Function)
#   - financial_risk_index
#   - stability_score
#   - is_campaign_peak
# ============================================================
def add_selected_features(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with three engineered columns added.

    Added columns:
        financial_risk_index: ``balance`` plus 1000 when ``housing == "yes"``
            and plus another 1000 when ``loan == "yes"``.
        stability_score: sum of per-category scores looked up from ``job``,
            ``education`` and ``marital``; categories missing from the lookup
            tables contribute 0.
        is_campaign_peak: 1 when ``month`` (compared case-insensitively) is
            "may", "jun" or "aug", otherwise 0.

    Parameters:
        df: frame containing the columns ``balance``, ``housing``, ``loan``,
            ``job``, ``education``, ``marital`` and ``month``.

    Returns:
        A new DataFrame. The input frame is not modified, so the function can
        be re-run on the same input and always produces the same result.
    """
    # Work on a copy so the caller's frame is never mutated as a side effect.
    df = df.copy()

    # 1) Financial Risk Index
    # balance + 1000 if housing='yes' + 1000 if loan='yes'
    df["financial_risk_index"] = (
        df["balance"]
        + 1000 * (df["housing"] == "yes").astype(int)
        + 1000 * (df["loan"] == "yes").astype(int)
    )

    # 2) Stability Score (job + education + marital)
    job_stability = {
        "management": 2,
        "technician": 2,
        "admin.": 2,
        "blue-collar": 1,
        "services": 1,
        "self-employed": 1,
        "entrepreneur": 1,
        "retired": 1,
        "student": 1,
        "unemployed": 0,
        "housemaid": 0,
        "unknown": 0
    }

    education_stability = {
        "tertiary": 2,
        "secondary": 2,
        "primary": 1,
        "unknown": 0
    }

    marital_stability = {
        "married": 1,
        "single": 0,
        "divorced": 0
    }

    # .map() yields NaN for categories absent from a table; fillna(0) scores them as 0.
    df["stability_score"] = (
        df["job"].map(job_stability).fillna(0)
        + df["education"].map(education_stability).fillna(0)
        + df["marital"].map(marital_stability).fillna(0)
    )

    # 3) Seasonal Trend Feature: is_campaign_peak
    # Peak months: May, June, August (month values are lower-cased before comparing)
    peak_months = ["may", "jun", "aug"]
    df["is_campaign_peak"] = df["month"].str.lower().isin(peak_months).astype(int)

    return df


In [4]:
# STEP 3: Apply Feature Engineering to Train & Test
# ============================================================
# add_selected_features returns the frame carrying the three new columns;
# rebinding train/test keeps both names pointing at the enriched frames.
train = add_selected_features(train)
test = add_selected_features(test)

# Confirm the engineered columns are now present in the training frame.
print("\nNew columns added:")
print([c for c in train.columns if c in ["financial_risk_index", "stability_score", "is_campaign_peak"]])

# Optional: persist the enriched frames as CSV (index=False leaves out the row index).
train.to_csv("train_with_new_features.csv", index=False)
test.to_csv("test_with_new_features.csv", index=False)


New columns added:
['financial_risk_index', 'stability_score', 'is_campaign_peak']


In [6]:
# Preview the first rows of the training frame, including the engineered columns.
print(train.head())

   age           job  marital  education default  balance housing loan  \
0   58    management  married   tertiary      no     2143     yes   no   
1   44    technician   single  secondary      no       29     yes   no   
2   33  entrepreneur  married  secondary      no        2     yes  yes   
3   47   blue-collar  married    unknown      no     1506     yes   no   
4   33       unknown   single    unknown      no        1      no   no   

   contact  day month  duration  campaign  pdays  previous poutcome   y  \
0  unknown    5   may       261         1     -1         0  unknown  no   
1  unknown    5   may       151         1     -1         0  unknown  no   
2  unknown    5   may        76         1     -1         0  unknown  no   
3  unknown    5   may        92         1     -1         0  unknown  no   
4  unknown    5   may       198         1     -1         0  unknown  no   

   financial_risk_index  stability_score  is_campaign_peak  
0                  3143                5   

In [8]:
import pandas as pd
import numpy as np

def add_research_features(
    df: pd.DataFrame,
    balance_mean: "float | None" = None,
    balance_std: "float | None" = None,
) -> pd.DataFrame:
    """
    Return a copy of ``df`` with three research features added:
    1. CRSI  - Customer Relationship Strength Index
    2. ERS   - Economic Readiness Score
    3. EMF   - Engagement Momentum Feature

    Parameters:
        df: frame containing the columns ``pdays``, ``previous``, ``poutcome``,
            ``balance``, ``job``, ``loan``, ``housing``, ``duration`` and
            ``campaign``.
        balance_mean: mean used to standardize ``balance``. When None
            (default) the mean of ``df["balance"]`` is used.
        balance_std: standard deviation used to standardize ``balance``. When
            None (default) the population std (ddof=0) of ``df["balance"]``
            is used.

    Returns:
        A new DataFrame; the input frame is not modified.

    Note:
        With the defaults, every frame is standardized with its own
        statistics, so ERS values from two different frames (e.g. train and
        test) are not on the same scale. Pass the training frame's
        ``balance_mean`` / ``balance_std`` when transforming other frames to
        keep the feature comparable.
    """

    df = df.copy()

    # -------------------------------------------------------
    # 1) CRSI - Customer Relationship Strength Index
    # -------------------------------------------------------
    # Components:
    # - contact_before: 1 if pdays > 0 (has been contacted earlier), else 0
    # - prev_contacts: number of previous contacts (previous)
    # - past_success: 1 if poutcome == 'success', else 0
    # Weighted combination (the weights can be tuned)
    contact_before = (df["pdays"] > 0).astype(int)
    prev_contacts = df["previous"]
    past_success = (df["poutcome"] == "success").astype(int)

    df["CRSI"] = (
        0.5 * contact_before +
        0.3 * prev_contacts +
        0.2 * past_success
    )

    # -------------------------------------------------------
    # 2) ERS - Economic Readiness Score
    # -------------------------------------------------------
    # Components:
    # - normalized_balance: (balance - mean) / std
    # - job_security_score: based on job type
    # - loan, housing: penalize existing credit burdens
    #
    # ERS = 0.4 * normalized_balance
    #       + 0.3 * job_security_score
    #       - 0.2 * (loan == 'yes')
    #       - 0.1 * (housing == 'yes')
    # -------------------------------------------------------

    # Fall back to this frame's own statistics only when none were supplied.
    if balance_mean is None:
        balance_mean = df["balance"].mean()
    if balance_std is None:
        balance_std = df["balance"].std(ddof=0)

    # A zero std would divide by zero; in that case only center the balance.
    centered_balance = df["balance"] - balance_mean
    if balance_std == 0:
        normalized_balance = centered_balance
    else:
        normalized_balance = centered_balance / balance_std

    # Job security mapping (adjust to match your own assumptions)
    job_security_map = {
        "management": 3,
        "technician": 3,
        "admin.": 2,
        "blue-collar": 2,
        "services": 2,
        "self-employed": 2,
        "entrepreneur": 2,
        "retired": 3,
        "student": 1,
        "housemaid": 1,
        "unemployed": 0,
        "unknown": 1
    }

    # Job values missing from the map get the same score as "unknown" (1).
    job_security_score = df["job"].map(job_security_map).fillna(1)

    loan_flag = (df["loan"] == "yes").astype(int)
    housing_flag = (df["housing"] == "yes").astype(int)

    df["ERS"] = (
        0.4 * normalized_balance +
        0.3 * job_security_score -
        0.2 * loan_flag -
        0.1 * housing_flag
    )

    # -------------------------------------------------------
    # 3) EMF - Engagement Momentum Feature
    # -------------------------------------------------------
    # EMF = (duration / effective_campaign_calls) * (1 + previous)
    # where effective_campaign_calls = max(campaign, 1) to avoid divide-by-zero
    # -------------------------------------------------------
    effective_campaign = df["campaign"].clip(lower=1)  # avoids division by 0

    df["EMF"] = (
        (df["duration"] / effective_campaign) *
        (1 + df["previous"])
    )

    return df


In [9]:
# Reload the raw CSVs. This rebinds train/test, so from here on they no longer
# carry the columns added by add_selected_features in the earlier cell.
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# Add the 3 research features (CRSI, ERS, EMF) to a copy of each frame.
# NOTE(review): add_research_features standardizes balance with the mean/std of
# the frame it receives, so train and test are scaled with different statistics
# here — confirm this is intended.
train_with_features = add_research_features(train)
test_with_features = add_research_features(test)

# Optional: persist the enriched frames as CSV (index=False leaves out the row index).
train_with_features.to_csv("train_with_research_features.csv", index=False)
test_with_features.to_csv("test_with_research_features.csv", index=False)

# Quick check of the three new columns on the first training rows.
print(train_with_features[["CRSI", "ERS", "EMF"]].head())


   CRSI       ERS    EMF
0   0.0  0.902568  261.0
1   0.0  0.624842  151.0
2   0.0  0.121295   76.0
3   0.0  0.518882   92.0
4   0.0  0.121164  198.0
