In [2]:
# Feature Engineering Pipeline (Jupyter Notebook Version)

# Import required libraries
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import (
    StandardScaler,
    OneHotEncoder,
    LabelEncoder,
    FunctionTransformer
)
from sklearn.impute import SimpleImputer, KNNImputer
from xverse.transformer import MonotonicBinning
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

# Load the data
data_path = r"C:\Users\Daniel.Temesgen\Desktop\KIAM-Rsc\week5\Data\data.csv"
data = pd.read_csv(data_path)

# Display basic info
print("Data shape:", data.shape)
data.head()

Data shape: (95662, 16)


Unnamed: 0,TransactionId,BatchId,AccountId,SubscriptionId,CustomerId,CurrencyCode,CountryCode,ProviderId,ProductId,ProductCategory,ChannelId,Amount,Value,TransactionStartTime,PricingStrategy,FraudResult
0,TransactionId_76871,BatchId_36123,AccountId_3957,SubscriptionId_887,CustomerId_4406,UGX,256,ProviderId_6,ProductId_10,airtime,ChannelId_3,1000.0,1000,2018-11-15T02:18:49Z,2,0
1,TransactionId_73770,BatchId_15642,AccountId_4841,SubscriptionId_3829,CustomerId_4406,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-20.0,20,2018-11-15T02:19:08Z,2,0
2,TransactionId_26203,BatchId_53941,AccountId_4229,SubscriptionId_222,CustomerId_4683,UGX,256,ProviderId_6,ProductId_1,airtime,ChannelId_3,500.0,500,2018-11-15T02:44:21Z,2,0
3,TransactionId_380,BatchId_102363,AccountId_648,SubscriptionId_2185,CustomerId_988,UGX,256,ProviderId_1,ProductId_21,utility_bill,ChannelId_3,20000.0,21800,2018-11-15T03:32:55Z,2,0
4,TransactionId_28195,BatchId_38780,AccountId_4841,SubscriptionId_3829,CustomerId_988,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-644.0,644,2018-11-15T03:34:21Z,2,0


In [3]:
# Aggregate features per customer
aggregate_features = data.groupby('CustomerId').agg({
    'Amount': [
        ('Total_Transaction_Amount', 'sum'),
        ('Average_Transaction_Amount', 'mean'),
        ('Transaction_Count', 'count'),
        ('Std_Transaction_Amount', 'std')
    ]
}).reset_index()

# Flatten column names
aggregate_features.columns = ['CustomerId', 'Total_Transaction_Amount', 
                            'Average_Transaction_Amount', 'Transaction_Count', 
                            'Std_Transaction_Amount']

# Merge aggregate features back to original dataset
data_enhanced = data.merge(aggregate_features, on='CustomerId', how='left')

# Display the first few rows of enhanced dataset
print("Enhanced Data shape:", data_enhanced.shape)
data_enhanced.head()

Enhanced Data shape: (95662, 20)


Unnamed: 0,TransactionId,BatchId,AccountId,SubscriptionId,CustomerId,CurrencyCode,CountryCode,ProviderId,ProductId,ProductCategory,ChannelId,Amount,Value,TransactionStartTime,PricingStrategy,FraudResult,Total_Transaction_Amount,Average_Transaction_Amount,Transaction_Count,Std_Transaction_Amount
0,TransactionId_76871,BatchId_36123,AccountId_3957,SubscriptionId_887,CustomerId_4406,UGX,256,ProviderId_6,ProductId_10,airtime,ChannelId_3,1000.0,1000,2018-11-15T02:18:49Z,2,0,109921.75,923.712185,119,3042.294251
1,TransactionId_73770,BatchId_15642,AccountId_4841,SubscriptionId_3829,CustomerId_4406,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-20.0,20,2018-11-15T02:19:08Z,2,0,109921.75,923.712185,119,3042.294251
2,TransactionId_26203,BatchId_53941,AccountId_4229,SubscriptionId_222,CustomerId_4683,UGX,256,ProviderId_6,ProductId_1,airtime,ChannelId_3,500.0,500,2018-11-15T02:44:21Z,2,0,1000.0,500.0,2,0.0
3,TransactionId_380,BatchId_102363,AccountId_648,SubscriptionId_2185,CustomerId_988,UGX,256,ProviderId_1,ProductId_21,utility_bill,ChannelId_3,20000.0,21800,2018-11-15T03:32:55Z,2,0,228727.2,6019.136842,38,17169.24161
4,TransactionId_28195,BatchId_38780,AccountId_4841,SubscriptionId_3829,CustomerId_988,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-644.0,644,2018-11-15T03:34:21Z,2,0,228727.2,6019.136842,38,17169.24161


In [4]:
# Convert TransactionStartTime to datetime
data_enhanced['TransactionStartTime'] = pd.to_datetime(data_enhanced['TransactionStartTime'])

# Extract temporal features
data_enhanced['Transaction_Hour'] = data_enhanced['TransactionStartTime'].dt.hour
data_enhanced['Transaction_Day'] = data_enhanced['TransactionStartTime'].dt.day
data_enhanced['Transaction_Month'] = data_enhanced['TransactionStartTime'].dt.month
data_enhanced['Transaction_Year'] = data_enhanced['TransactionStartTime'].dt.year

# Display the first few rows with new features
print("Data with extracted features shape:", data_enhanced.shape)
data_enhanced[['TransactionId', 'TransactionStartTime', 'Transaction_Hour', 
               'Transaction_Day', 'Transaction_Month', 'Transaction_Year']].head()

Data with extracted features shape: (95662, 24)


Unnamed: 0,TransactionId,TransactionStartTime,Transaction_Hour,Transaction_Day,Transaction_Month,Transaction_Year
0,TransactionId_76871,2018-11-15 02:18:49+00:00,2,15,11,2018
1,TransactionId_73770,2018-11-15 02:19:08+00:00,2,15,11,2018
2,TransactionId_26203,2018-11-15 02:44:21+00:00,2,15,11,2018
3,TransactionId_380,2018-11-15 03:32:55+00:00,3,15,11,2018
4,TransactionId_28195,2018-11-15 03:34:21+00:00,3,15,11,2018


In [5]:
# Identify categorical columns
categorical_cols = ['CurrencyCode', 'CountryCode', 'ProviderId', 'ProductId', 
                    'ProductCategory', 'ChannelId', 'PricingStrategy']

# Apply Label Encoding to columns with high cardinality
label_encoders = {}
for col in ['TransactionId', 'BatchId', 'AccountId', 'SubscriptionId', 'CustomerId']:
    le = LabelEncoder()
    data_enhanced[col + '_encoded'] = le.fit_transform(data_enhanced[col])
    label_encoders[col] = le

# Apply One-Hot Encoding to other categorical columns
one_hot_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
ohe_transformed = one_hot_encoder.fit_transform(data_enhanced[categorical_cols])
ohe_df = pd.DataFrame(ohe_transformed, 
                      columns=one_hot_encoder.get_feature_names_out(categorical_cols))

# Concatenate one-hot encoded columns with the dataset
data_encoded = pd.concat([data_enhanced, ohe_df], axis=1)

# Drop original categorical columns
data_encoded = data_encoded.drop(columns=categorical_cols + 
                               ['TransactionId', 'BatchId', 'AccountId', 
                                'SubscriptionId', 'CustomerId'])

# Display the first few rows of encoded dataset
print("Data with encoded features shape:", data_encoded.shape)
data_encoded.head()

Data with encoded features shape: (95662, 65)


Unnamed: 0,Amount,Value,TransactionStartTime,FraudResult,Total_Transaction_Amount,Average_Transaction_Amount,Transaction_Count,Std_Transaction_Amount,Transaction_Hour,Transaction_Day,...,ProductCategory_tv,ProductCategory_utility_bill,ChannelId_ChannelId_1,ChannelId_ChannelId_2,ChannelId_ChannelId_3,ChannelId_ChannelId_5,PricingStrategy_0,PricingStrategy_1,PricingStrategy_2,PricingStrategy_4
0,1000.0,1000,2018-11-15 02:18:49+00:00,0,109921.75,923.712185,119,3042.294251,2,15,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1,-20.0,20,2018-11-15 02:19:08+00:00,0,109921.75,923.712185,119,3042.294251,2,15,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,500.0,500,2018-11-15 02:44:21+00:00,0,1000.0,500.0,2,0.0,2,15,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
3,20000.0,21800,2018-11-15 03:32:55+00:00,0,228727.2,6019.136842,38,17169.24161,3,15,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
4,-644.0,644,2018-11-15 03:34:21+00:00,0,228727.2,6019.136842,38,17169.24161,3,15,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [6]:
# Check for missing values
print("Missing values before handling:\n", data_encoded.isnull().sum())

# Define numerical columns for imputation (including new features)
numerical_cols = ['Amount', 'Value', 'Total_Transaction_Amount', 
                  'Average_Transaction_Amount', 'Transaction_Count', 
                  'Std_Transaction_Amount', 'Transaction_Hour', 
                  'Transaction_Day', 'Transaction_Month', 'Transaction_Year']

# Apply KNN Imputation for numerical columns
knn_imputer = KNNImputer(n_neighbors=5)
data_encoded[numerical_cols] = knn_imputer.fit_transform(data_encoded[numerical_cols])

# Apply Simple Imputation (mode) for any remaining categorical columns
simple_imputer = SimpleImputer(strategy='most_frequent')
data_encoded[ohe_df.columns] = simple_imputer.fit_transform(data_encoded[ohe_df.columns])

# Remove rows with any remaining missing values (if any)
data_encoded = data_encoded.dropna()

# Verify no missing values remain
print("Missing values after handling:\n", data_encoded.isnull().sum())

# Display the first few rows of the final dataset
print("Final Data shape:", data_encoded.shape)
data_encoded.head()

Missing values before handling:
 Amount                      0
Value                       0
TransactionStartTime        0
FraudResult                 0
Total_Transaction_Amount    0
                           ..
ChannelId_ChannelId_5       0
PricingStrategy_0           0
PricingStrategy_1           0
PricingStrategy_2           0
PricingStrategy_4           0
Length: 65, dtype: int64
Missing values after handling:
 Amount                      0
Value                       0
TransactionStartTime        0
FraudResult                 0
Total_Transaction_Amount    0
                           ..
ChannelId_ChannelId_5       0
PricingStrategy_0           0
PricingStrategy_1           0
PricingStrategy_2           0
PricingStrategy_4           0
Length: 65, dtype: int64
Final Data shape: (95662, 65)


Unnamed: 0,Amount,Value,TransactionStartTime,FraudResult,Total_Transaction_Amount,Average_Transaction_Amount,Transaction_Count,Std_Transaction_Amount,Transaction_Hour,Transaction_Day,...,ProductCategory_tv,ProductCategory_utility_bill,ChannelId_ChannelId_1,ChannelId_ChannelId_2,ChannelId_ChannelId_3,ChannelId_ChannelId_5,PricingStrategy_0,PricingStrategy_1,PricingStrategy_2,PricingStrategy_4
0,1000.0,1000.0,2018-11-15 02:18:49+00:00,0,109921.75,923.712185,119.0,3042.294251,2.0,15.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1,-20.0,20.0,2018-11-15 02:19:08+00:00,0,109921.75,923.712185,119.0,3042.294251,2.0,15.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,500.0,500.0,2018-11-15 02:44:21+00:00,0,1000.0,500.0,2.0,0.0,2.0,15.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
3,20000.0,21800.0,2018-11-15 03:32:55+00:00,0,228727.2,6019.136842,38.0,17169.24161,3.0,15.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
4,-644.0,644.0,2018-11-15 03:34:21+00:00,0,228727.2,6019.136842,38.0,17169.24161,3.0,15.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [7]:
from sklearn.preprocessing import MinMaxScaler

# Define numerical columns for scaling
numerical_cols = ['Amount', 'Value', 'Total_Transaction_Amount', 
                  'Average_Transaction_Amount', 'Transaction_Count', 
                  'Std_Transaction_Amount', 'Transaction_Hour', 
                  'Transaction_Day', 'Transaction_Month', 'Transaction_Year']

# Apply Standardization
standard_scaler = StandardScaler()
data_encoded[numerical_cols] = standard_scaler.fit_transform(data_encoded[numerical_cols])

# Optionally apply Normalization (if preferred over standardization)
# normalizer = MinMaxScaler()
# data_encoded[numerical_cols] = normalizer.fit_transform(data_encoded[numerical_cols])

# Display the first few rows of the scaled dataset
print("Data with scaled features shape:", data_encoded.shape)
data_encoded[numerical_cols].head()

Data with scaled features shape: (95662, 65)


Unnamed: 0,Amount,Value,Total_Transaction_Amount,Average_Transaction_Amount,Transaction_Count,Std_Transaction_Amount,Transaction_Hour,Transaction_Day,Transaction_Month,Transaction_Year
0,-0.046371,-0.072291,0.170118,-0.067623,-0.311831,-0.168001,-2.15553,-0.100739,0.848684,-0.994246
1,-0.054643,-0.080251,0.170118,-0.067623,-0.311831,-0.168001,-2.15553,-0.100739,0.848684,-0.994246
2,-0.050426,-0.076352,0.165122,-0.072568,-0.444993,-0.20215,-2.15553,-0.100739,0.848684,-0.994246
3,0.107717,0.096648,0.175567,-0.008155,-0.40402,-0.009434,-1.949214,-0.100739,0.848684,-0.994246
4,-0.059704,-0.075183,0.175567,-0.008155,-0.40402,-0.009434,-1.949214,-0.100739,0.848684,-0.994246


#Proxy Target Variable Engineering 

#identifying disengaged customers as high-risk proxies

In [8]:
import pandas as pd
from datetime import datetime

# Assuming the data is loaded into a DataFrame (replace with actual data loading if needed)
data = pd.read_csv(data_path)

# Convert TransactionStartTime to datetime
data['TransactionStartTime'] = pd.to_datetime(data['TransactionStartTime'])

# Calculate customer-level features
latest_date = data['TransactionStartTime'].max()

customer_features = data.groupby('CustomerId').agg({
    'TransactionId': 'count',  # Total transactions
    'Amount': [
        lambda x: (x < 0).sum(),  # Count of negative transactions
        'mean'  # Average transaction amount
    ],
    'TransactionStartTime': lambda x: (latest_date - x.max()).total_seconds() / (24 * 3600),  # Days since last transaction
    'ProductCategory': lambda x: (x == 'financial_services').sum() / len(x),  # Proportion of financial_services transactions
    'FraudResult': 'max'  # Any fraud flags
}).reset_index()

# Rename columns for clarity
customer_features.columns = [
    'CustomerId', 
    'total_transactions', 
    'negative_transactions', 
    'avg_transaction_amount', 
    'days_since_last_transaction', 
    'prop_financial_services', 
    'has_fraud'
]

# Define disengagement criteria for high-risk
def assign_credit_risk(row):
    if (row['total_transactions'] < 2) or \
       (row['negative_transactions'] / row['total_transactions'] > 0.5) or \
       (row['days_since_last_transaction'] > 30) or \
       (row['prop_financial_services'] > 0.5 and row['avg_transaction_amount'] < 0) or \
       (row['has_fraud'] == 1):
        return 1  # High-risk (disengaged)
    return 0  # Low-risk

# Apply the criteria
customer_features['credit_risk'] = customer_features.apply(assign_credit_risk, axis=1)

# Merge back to original data (if needed for transaction-level analysis)
data = data.merge(customer_features[['CustomerId', 'credit_risk']], on='CustomerId', how='left')

# View results
print(customer_features[['CustomerId', 'total_transactions', 'negative_transactions', 
                        'days_since_last_transaction', 'prop_financial_services', 'credit_risk']])

           CustomerId  total_transactions  negative_transactions  \
0        CustomerId_1                   1                      1   
1       CustomerId_10                   1                      1   
2     CustomerId_1001                   5                      2   
3     CustomerId_1002                  11                      6   
4     CustomerId_1003                   6                      2   
...               ...                 ...                    ...   
3737   CustomerId_992                   6                      2   
3738   CustomerId_993                   5                      2   
3739   CustomerId_994                 101                     40   
3740   CustomerId_996                  17                      2   
3741   CustomerId_998                  22                      8   

      days_since_last_transaction  prop_financial_services  credit_risk  
0                       83.716829                 0.000000            1  
1                       83.716887  

#RFM segmentation

In [9]:
import pandas as pd
from datetime import datetime

# Provided data

# Convert TransactionStartTime to datetime
data['TransactionStartTime'] = pd.to_datetime(data['TransactionStartTime'])

# Define snapshot date (day after the latest transaction)
snapshot_date = pd.to_datetime('2018-11-16T00:00:00Z')

# Calculate RFM metrics
rfm_metrics = data.groupby('CustomerId').agg({
    'TransactionStartTime': lambda x: (snapshot_date - x.max()).days,  # Recency: days since last transaction
    'TransactionId': 'count',  # Frequency: number of transactions
    'Amount': 'sum'  # Monetary: total amount
}).reset_index()

# Rename columns for clarity
rfm_metrics.columns = ['CustomerId', 'Recency', 'Frequency', 'Monetary']

# Apply the criteria
customer_features['credit_risk'] = customer_features.apply(assign_credit_risk, axis=1)

# Merge back to original data (if needed for transaction-level analysis)
rfm_metrics = rfm_metrics.merge(customer_features[['CustomerId', 'credit_risk']], on='CustomerId', how='left')

# Display results
print(rfm_metrics)

           CustomerId  Recency  Frequency  Monetary  credit_risk
0        CustomerId_1       -6          1  -10000.0            1
1       CustomerId_10       -6          1  -10000.0            1
2     CustomerId_1001       -1          5   20000.0            1
3     CustomerId_1002      -64         11    4225.0            1
4     CustomerId_1003      -78          6   20000.0            0
...               ...      ...        ...       ...          ...
3737   CustomerId_992      -85          6   20000.0            0
3738   CustomerId_993      -64          5   20000.0            0
3739   CustomerId_994      -89        101  543873.0            0
3740   CustomerId_996      -22         17  139000.0            1
3741   CustomerId_998      -90         22  100000.0            0

[3742 rows x 5 columns]


#Cluster Customers

In [10]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans



# Preprocess RFM features
# Select RFM columns for clustering
rfm_features = rfm_metrics[['Recency', 'Frequency', 'Monetary','credit_risk']]

# Standardize features
scaler = StandardScaler()
rfm_scaled = scaler.fit_transform(rfm_features)

# Apply K-Means clustering
kmeans = KMeans(n_clusters=3, random_state=42)
rfm_metrics['Cluster'] = kmeans.fit_predict(rfm_scaled)

# Display results
print(rfm_metrics[['CustomerId', 'Recency', 'Frequency', 'Monetary', 'Cluster','credit_risk']])

           CustomerId  Recency  Frequency  Monetary  Cluster  credit_risk
0        CustomerId_1       -6          1  -10000.0        0            1
1       CustomerId_10       -6          1  -10000.0        0            1
2     CustomerId_1001       -1          5   20000.0        0            1
3     CustomerId_1002      -64         11    4225.0        0            1
4     CustomerId_1003      -78          6   20000.0        2            0
...               ...      ...        ...       ...      ...          ...
3737   CustomerId_992      -85          6   20000.0        2            0
3738   CustomerId_993      -64          5   20000.0        2            0
3739   CustomerId_994      -89        101  543873.0        2            0
3740   CustomerId_996      -22         17  139000.0        0            1
3741   CustomerId_998      -90         22  100000.0        2            0

[3742 rows x 6 columns]


#Model Training and Tracking

In [None]:
import pandas as pd
import mlflow
import mlflow.sklearn
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score
import warnings
warnings.filterwarnings("ignore")



# Features and target
X = rfm_metrics[['Recency', 'Frequency', 'Monetary']]
y = rfm_metrics['credit_risk']

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Define models and hyperparameter grids
models = {
    'LogisticRegression': {
        'model': LogisticRegression(random_state=42),
        'param_grid': {
            'C': [0.1, 1, 10],
            'solver': ['liblinear', 'lbfgs']
        }
    },
    'RandomForest': {
        'model': RandomForestClassifier(random_state=42),
        'param_grid': {
            'n_estimators': [50, 100],
            'max_depth': [None, 10],
            'min_samples_split': [2, 5]
        }
    }
}

# MLflow experiment tracking
mlflow.set_experiment("Credit_Risk_Modeling")

for model_name, config in models.items():
    with mlflow.start_run(run_name=model_name):
        # Log parameters
        mlflow.log_param("model_name", model_name)
        mlflow.log_param("test_size", 0.2)
        mlflow.log_param("random_state", 42)

        # Grid Search
        grid_search = GridSearchCV(
            estimator=config['model'],
            param_grid=config['param_grid'],
            cv=2,  # Small dataset, use 2-fold CV
            scoring='accuracy',
            n_jobs=-1
        )
        grid_search.fit(X_train, y_train)

        # Best model
        best_model = grid_search.best_estimator_
        mlflow.log_params(grid_search.best_params_)

        # Evaluate on test set
        if len(X_test) > 0:  # Ensure test set is not empty
            y_pred = best_model.predict(X_test)
            accuracy = accuracy_score(y_test, y_pred)
            precision = precision_score(y_test, y_pred, zero_division=0)
            recall = recall_score(y_test, y_pred, zero_division=0)
        else:
            accuracy, precision, recall = 0, 0, 0  # Handle small dataset case

        # Log metrics
        mlflow.log_metric("accuracy", accuracy)
        mlflow.log_metric("precision", precision)
        mlflow.log_metric("recall", recall)

        # Log model
        mlflow.sklearn.log_model(best_model, f"{model_name}_model")

        print(f"{model_name} Best Params: {grid_search.best_params_}")
        print(f"{model_name} Metrics: Accuracy={accuracy:.2f}, Precision={precision:.2f}, Recall={recall:.2f}")



LogisticRegression Best Params: {'C': 0.1, 'solver': 'lbfgs'}
LogisticRegression Metrics: Accuracy=0.83, Precision=0.96, Recall=0.71




RandomForest Best Params: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 100}
RandomForest Metrics: Accuracy=0.98, Precision=0.99, Recall=0.98


# Unit Testing

In [11]:
import pandas as pd
import pytest
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Load data
transaction_data = pd.read_csv(data_path)

# RFM data - remove the print() statement

df=rfm_metrics[['CustomerId', 'Recency', 'Frequency', 'Monetary', 'Cluster','credit_risk']]

rfm_data = df[['CustomerId', 'Recency', 'Frequency', 'Monetary', 'Cluster', 'credit_risk']]

# If you want to print it for viewing, do it separately
print(rfm_data[['CustomerId', 'Recency', 'Frequency', 'Monetary', 'Cluster', 'credit_risk']])

# Rename and merge
rfm_data = rfm_data.rename(columns={'credit_risk': 'is_high_risk'})[['CustomerId', 'Recency', 'Frequency', 'Monetary', 'is_high_risk']]
main_data = transaction_data.merge(rfm_data[['CustomerId', 'Recency', 'Frequency', 'Monetary', 'is_high_risk']], 
                                  on='CustomerId', how='left')

# Rest of your test functions remain the same
def test_data_integrity():
    """Test data for missing values and correct types."""
    assert not main_data[['Recency', 'Frequency', 'Monetary', 'is_high_risk']].isnull().any().any(), "Data contains missing values"
    assert main_data['Recency'].dtype == int, "Recency should be integer"
    assert main_data['Frequency'].dtype == int, "Frequency should be integer"
    assert main_data['Monetary'].dtype == float, "Monetary should be float"
    assert main_data['is_high_risk'].isin([0, 1]).all(), "is_high_risk should be 0 or 1"

def test_data_merge():
    """Test if merge preserves all transactions."""
    assert len(main_data) == len(transaction_data), "Merge should preserve all transactions"
    assert 'is_high_risk' in main_data.columns, "is_high_risk column missing"
    assert main_data['CustomerId'].nunique() == rfm_data['CustomerId'].nunique(), "Customer count mismatch after merge"

def test_data_split():
    """Test if train-test split maintains data integrity."""
    X = main_data[['Recency', 'Frequency', 'Monetary']]
    y = main_data['is_high_risk']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    assert len(X_train) + len(X_test) == len(X), "Train-test split size mismatch"
    assert len(y_train) + len(y_test) == len(y), "Target split size mismatch"

def test_model_output():
    """Test if model produces valid predictions."""
    X = main_data[['Recency', 'Frequency', 'Monetary']]
    y = main_data['is_high_risk']
    model = LogisticRegression(random_state=42)
    model.fit(X, y)
    predictions = model.predict(X)
    assert len(predictions) == len(y), "Prediction length mismatch"
    assert set(predictions).issubset([0, 1]), "Predictions should be 0 or 1"

           CustomerId  Recency  Frequency  Monetary  Cluster  credit_risk
0        CustomerId_1       -6          1  -10000.0        0            1
1       CustomerId_10       -6          1  -10000.0        0            1
2     CustomerId_1001       -1          5   20000.0        0            1
3     CustomerId_1002      -64         11    4225.0        0            1
4     CustomerId_1003      -78          6   20000.0        2            0
...               ...      ...        ...       ...      ...          ...
3737   CustomerId_992      -85          6   20000.0        2            0
3738   CustomerId_993      -64          5   20000.0        2            0
3739   CustomerId_994      -89        101  543873.0        2            0
3740   CustomerId_996      -22         17  139000.0        0            1
3741   CustomerId_998      -90         22  100000.0        2            0

[3742 rows x 6 columns]


In [12]:
import pandas as pd
import mlflow
import mlflow.sklearn
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve
import matplotlib.pyplot as plt
import numpy as np
import warnings
warnings.filterwarnings("ignore")

# RFM data - remove the print() statement

df=rfm_metrics[['CustomerId', 'Recency', 'Frequency', 'Monetary', 'Cluster','credit_risk']]

rfm_data = df[['CustomerId', 'Recency', 'Frequency', 'Monetary', 'Cluster', 'credit_risk']]

# Features and target
X = df[['Recency', 'Frequency', 'Monetary']]
y = df['credit_risk']

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Define models and hyperparameter grids
models = {
    'LogisticRegression': {
        'model': LogisticRegression(random_state=42),
        'param_grid': {
            'C': [0.1, 1, 10],
            'solver': ['liblinear', 'lbfgs']
        }
    },
    'RandomForest': {
        'model': RandomForestClassifier(random_state=42),
        'param_grid': {
            'n_estimators': [50, 100],
            'max_depth': [None, 10],
            'min_samples_split': [2, 5]
        }
    }
}

# MLflow experiment tracking
mlflow.set_experiment("Credit_Risk_Modeling")

for model_name, config in models.items():
    with mlflow.start_run(run_name=model_name):
        # Log parameters
        mlflow.log_param("model_name", model_name)
        mlflow.log_param("test_size", 0.2)
        mlflow.log_param("random_state", 42)

        # Grid Search
        grid_search = GridSearchCV(
            estimator=config['model'],
            param_grid=config['param_grid'],
            cv=2,
            scoring='accuracy',
            n_jobs=-1
        )
        grid_search.fit(X_train, y_train)

        # Best model
        best_model = grid_search.best_estimator_
        mlflow.log_params(grid_search.best_params_)

        # Evaluate on test set
        if len(X_test) > 0:
            y_pred = best_model.predict(X_test)
            y_pred_proba = best_model.predict_proba(X_test)[:, 1] if hasattr(best_model, 'predict_proba') else best_model.predict(X_test)
            accuracy = accuracy_score(y_test, y_pred)
            precision = precision_score(y_test, y_pred, zero_division=0)
            recall = recall_score(y_test, y_pred, zero_division=0)
            f1 = f1_score(y_test, y_pred, zero_division=0)
            roc_auc = roc_auc_score(y_test, y_pred_proba) if len(np.unique(y_test)) > 1 else 0
        else:
            accuracy, precision, recall, f1, roc_auc = 0, 0, 0, 0, 0

        # Log metrics
        mlflow.log_metric("accuracy", accuracy)
        mlflow.log_metric("precision", precision)
        mlflow.log_metric("recall", recall)
        mlflow.log_metric("f1_score", f1)
        mlflow.log_metric("roc_auc", roc_auc)

        # Log model
        mlflow.sklearn.log_model(best_model, f"{model_name}_model")

        # Plot and log ROC curve (if test set exists and model supports probabilities)
        if len(X_test) > 0 and hasattr(best_model, 'predict_proba'):
            fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
            plt.figure()
            plt.plot(fpr, tpr, label=f"{model_name} (AUC = {roc_auc:.2f})")
            plt.plot([0, 1], [0, 1], 'k--')
            plt.xlabel("False Positive Rate")
            plt.ylabel("True Positive Rate")
            plt.title(f"ROC Curve - {model_name}")
            plt.legend(loc="best")
            plt.savefig(f"roc_curve_{model_name}.png")
            mlflow.log_artifact(f"roc_curve_{model_name}.png")
            plt.close()

        print(f"{model_name} Best Params: {grid_search.best_params_}")
        print(f"{model_name} Metrics: Accuracy={accuracy:.2f}, Precision={precision:.2f}, Recall={recall:.2f}, F1={f1:.2f}, ROC-AUC={roc_auc:.2f}")



LogisticRegression Best Params: {'C': 0.1, 'solver': 'lbfgs'}
LogisticRegression Metrics: Accuracy=0.83, Precision=0.96, Recall=0.71, F1=0.82, ROC-AUC=0.91




RandomForest Best Params: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 100}
RandomForest Metrics: Accuracy=0.98, Precision=0.99, Recall=0.98, F1=0.99, ROC-AUC=1.00


#Model Evaluation

In [13]:
import mlflow.sklearn
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

# Define rfm_metrics
# RFM data 

df=rfm_metrics[['CustomerId', 'Recency', 'Frequency', 'Monetary', 'Cluster','credit_risk']]

rfm_data = df[['CustomerId', 'Recency', 'Frequency', 'Monetary', 'Cluster', 'credit_risk']]

# Features and target
X = rfm_metrics[['Recency', 'Frequency', 'Monetary']]
y = rfm_metrics['credit_risk']

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Replace with actual run IDs from MLflow UI
run_ids = {
    'LogisticRegression': '<logistic_run_id>',
    'RandomForest': '<random_forest_run_id>'
}

for model_name, run_id in run_ids.items():
    try:
        # Load model
        model = mlflow.sklearn.load_model(f"runs:/{run_id}/{model_name}_model")

        # Predict
        y_pred = model.predict(X_test)
        y_pred_proba = model.predict_proba(X_test)[:, 1] if hasattr(model, 'predict_proba') else y_pred

        # Compute metrics
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, zero_division=0)
        recall = recall_score(y_test, y_pred, zero_division=0)
        f1 = f1_score(y_test, y_pred, zero_division=0)
        roc_auc = roc_auc_score(y_test, y_pred_proba) if len(np.unique(y_test)) > 1 else 0

        # Print results
        print(f"\n{model_name} Metrics:")
        print(f"Accuracy: {accuracy:.2f}")
        print(f"Precision: {precision:.2f}")
        print(f"Recall: {recall:.2f}")
        print(f"F1 Score: {f1:.2f}")
        print(f"ROC-AUC: {roc_auc:.2f}")

        # Plot ROC curve
        if len(X_test) > 0 and hasattr(model, 'predict_proba'):
            fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
            plt.figure()
            plt.plot(fpr, tpr, label=f"{model_name} (AUC = {roc_auc:.2f})")
            plt.plot([0, 1], [0, 1], 'k--')
            plt.xlabel("False Positive Rate")
            plt.ylabel("True Positive Rate")
            plt.title(f"ROC Curve - {model_name}")
            plt.legend(loc="best")
            plt.show()
    except Exception as e:
        print(f"Error loading {model_name} model: {e}")

Error loading LogisticRegression model: Invalid value "<logistic_run_id>" for parameter 'run_id' supplied.
Error loading RandomForest model: Invalid value "<random_forest_run_id>" for parameter 'run_id' supplied.
