In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline   
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

import statsmodels.api as sm
import statsmodels.formula.api as smf
import statsmodels.stats.api as sms
import statsmodels.stats.proportion as smp
import statsmodels.stats.multitest as smm
import statsmodels.stats.weightstats as smw
import statsmodels.stats.diagnostic as smd


from imblearn.over_sampling import SMOTE


In [None]:
# Function to preprocess the data for modeling
def preprocess_data(df):
    """Split ``df`` into features and target and build an unfitted preprocessor.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table. It must contain the ``Customer_Churn`` column, which is
        returned as the target. ``Customer_ID`` and ``Defaulted`` are removed
        from the features when they are present.

    Returns
    -------
    X : pandas.DataFrame
        ``df`` without ``Customer_ID``, ``Customer_Churn`` and ``Defaulted``.
    y : pandas.Series
        The ``Customer_Churn`` column of ``df``.
    preprocessor : ColumnTransformer
        Unfitted transformer that mean-imputes and standard-scales the numeric
        columns and most-frequent-imputes and one-hot-encodes the categorical
        columns of ``X``.

    Raises
    ------
    KeyError
        If ``df`` has no ``Customer_Churn`` column.
    """
    # Look the target up first so a missing target fails loudly right here.
    y = df['Customer_Churn']

    # errors='ignore' lets the function accept frames from which the
    # identifier (or 'Defaulted') has already been removed upstream; the
    # target itself is guaranteed to exist by the lookup above.
    X = df.drop(
        columns=['Customer_ID', 'Customer_Churn', 'Defaulted'],
        errors='ignore',
    )

    # Select by dtype family instead of exact 64-bit dtypes: the
    # ColumnTransformer below uses the default remainder='drop', so any column
    # that lands in neither list would be discarded without warning.
    numeric_features = X.select_dtypes(include='number').columns.tolist()
    categorical_features = X.select_dtypes(include=['object', 'category']).columns.tolist()

    # Numeric columns: fill gaps with the column mean, then standardise.
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler())
    ])

    # Categorical columns: fill gaps with the mode, then one-hot encode.
    # handle_unknown='ignore' keeps transform() from failing on categories
    # that were not seen during fit.
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    # Combine preprocessing steps
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)
        ]
    )

    return X, y, preprocessor



In [None]:
# Build and fit the logistic-regression pipeline
def train_logistic_regression(X, y, preprocessor):
    """Fit a two-step pipeline (preprocessing, then logistic regression).

    Parameters
    ----------
    X : features passed to the pipeline's ``fit``.
    y : target passed to the pipeline's ``fit``.
    preprocessor : transformer used as the pipeline's first step.

    Returns
    -------
    The fitted ``Pipeline`` with steps named ``'preprocessor'`` and
    ``'classifier'``.
    """
    # Fixed seed and a raised iteration cap, exactly as configured originally.
    classifier = LogisticRegression(max_iter=1000, random_state=42)
    stages = [
        ('preprocessor', preprocessor),
        ('classifier', classifier),
    ]

    fitted_pipeline = Pipeline(steps=stages)
    fitted_pipeline.fit(X, y)
    return fitted_pipeline



In [None]:
# Score a fitted classifier and print its reports
def evaluate_model(model, X, y):
    """Compute classification metrics for ``model`` on ``(X, y)`` and print reports.

    Parameters
    ----------
    model : object exposing ``predict`` and ``predict_proba``.
    X : features handed to both prediction methods.
    y : true labels the predictions are compared against.

    Returns
    -------
    tuple
        ``(accuracy, precision, recall, f1, roc_auc)``.

    Side effects
    ------------
    Prints the classification report and the confusion matrix.
    """
    predicted_labels = model.predict(X)
    # Column 1 of predict_proba is used as the score for ROC AUC
    # (presumably the positive class of a binary problem -- confirm).
    positive_scores = model.predict_proba(X)[:, 1]

    # Label-based metrics, evaluated in the same order as they are returned.
    label_scorers = (accuracy_score, precision_score, recall_score, f1_score)
    accuracy, precision, recall, f1 = (
        scorer(y, predicted_labels) for scorer in label_scorers
    )
    roc_auc = roc_auc_score(y, positive_scores)

    # Heading first, then the corresponding report body.
    for heading, build_report in (
        ("Classification Report:", classification_report),
        ("Confusion Matrix:", confusion_matrix),
    ):
        print(heading)
        print(build_report(y, predicted_labels))

    return accuracy, precision, recall, f1, roc_auc



In [None]:
# --- 1. Import Necessary Libraries ---
%pip install --upgrade scikit-learn>=1.3.0 causalml

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Scikit-learn for modeling and evaluation
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, r2_score, mean_squared_error
from sklearn.cluster import KMeans

# Advanced Models
import xgboost as xgb

# CausalML for Uplift Modeling
from causalml.inference.meta import SLearner
from causalml.metrics import plot_uplift_by_percentile

# Set plotting style
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

# --- 2. Load and Prepare the Dataset ---
# Customer_ID is just an identifier, so it is dropped right after loading.
df = pd.read_csv('cleaned_dataset.csv').drop(columns='Customer_ID')

# Display basic info to confirm data types and non-null counts.
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() appended a stray "None" line to the output.
print("--- Initial Data Info ---")
df.info()

# Note: The dataset has weird repeating decimals in some columns (e.g., 84398.0555...),
# which are likely the result of mean imputation for missing values. For this exercise,
# we'll proceed, but in a real-world scenario, this would warrant further investigation.

# --- 3. Feature Engineering and Preprocessing Setup ---

# The target variables should not be in either feature list
target_vars = ['Customer_Churn', 'Defaulted', 'Sales']

# Separate numerical and categorical features, excluding the targets from BOTH
# lists. Filtering only the numerical list would leave a non-numeric target
# (e.g. a string-coded churn flag) in the categorical list, where it would be
# one-hot encoded as a feature.
numerical_features = [
    col for col in df.select_dtypes(include=np.number).columns
    if col not in target_vars
]
categorical_features = [
    col for col in df.select_dtypes(exclude=np.number).columns
    if col not in target_vars
]

# Create a preprocessing pipeline for reuse
# 1. Scale numerical features
# 2. OneHotEncode categorical features
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ],
    remainder='passthrough' # Keep other columns (like targets) untouched for now
)
# NOTE(review): with remainder='passthrough', every column not listed above
# (including any target still present in the frame given to fit/transform) is
# appended to the transformed output. Split the targets off before using that
# output as model features -- confirm against the downstream cells.

print("\nNumerical Features:", numerical_features)
print("Categorical Features:", categorical_features)
