In [21]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline

In [22]:
df = pd.read_csv('SF_Mandarin_dataset_processed.csv')

In [23]:
# List of target variables for each bank
banks = ['BankA_decision', 'BankB_decision', 'BankC_decision', 'BankD_decision', 'BankE_decision']


# Log Regression

In [24]:
for bank in banks:
    # Split the data into features and target variable
    X = df.drop(banks, axis=1)
    y = df[bank]

    # Define numerical and categorical features
    numerical_features = X.select_dtypes(include=['int64', 'int32', 'int16', 'float64']).columns
    categorical_features = X.select_dtypes(include=['category']).columns

    # Create transformers for numerical and categorical features
    numerical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler())
    ])

    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    # Combine transformers using ColumnTransformer
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_features),
            ('cat', categorical_transformer, categorical_features)
        ])

    # Initialize logistic regression model
    model = Pipeline(steps=[('preprocessor', preprocessor),
                             ('classifier', LogisticRegression())])

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

    # Train the model
    model.fit(X_train, y_train)

    # Make predictions on the test set
    y_pred = model.predict(X_test)

    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    classification_rep = classification_report(y_test, y_pred)

    print(f"Bank: {bank}")
    print(f"Accuracy: {accuracy}")
    print("Classification Report:")
    print(classification_rep)
    print("\n")

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Bank: BankA_decision
Accuracy: 0.7517401392111369
Classification Report:
              precision    recall  f1-score   support

      denied       0.00      0.00      0.00       611
       error       0.00      0.00      0.00        31
     success       0.75      1.00      0.86      1944

    accuracy                           0.75      2586
   macro avg       0.25      0.33      0.29      2586
weighted avg       0.57      0.75      0.65      2586



Bank: BankB_decision
Accuracy: 0.7374323279195669
Classification Report:
              precision    recall  f1-score   support

      denied       0.00      0.00      0.00       649
       error       0.00      0.00      0.00        30
     success       0.74      1.00      0.85      1907

    accuracy                           0.74      2586
   macro avg       0.25      0.33      0.28      2586
weighted avg       0.54      0.74      0.63      2586



Bank: BankC_decision
Accuracy: 0.7521268368136118
Classification Report:
              p

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# SMOTE

In [25]:
for bank in banks:
    # Split the data into features and target variable
    X = df.drop(banks, axis=1)
    y = df[bank]

    # Define numerical and categorical features
    numerical_features = X.select_dtypes(include=['int64', 'int32', 'int16', 'float64']).columns
    categorical_features = X.select_dtypes(include=['category']).columns

    # Create transformers for numerical and categorical features
    numerical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler())
    ])

    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    # Combine transformers using ColumnTransformer
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_features),
            ('cat', categorical_transformer, categorical_features)
        ])

    # Use imblearn's Pipeline with SMOTE
    model = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('smote', SMOTE(random_state=42)),
        ('classifier', LogisticRegression())
    ])

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Train the model
    model.fit(X_train, y_train)

    # Make predictions on the test set
    y_pred = model.predict(X_test)

    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    classification_rep = classification_report(y_test, y_pred)

    print(f"Bank: {bank}")
    print(f"Accuracy: {accuracy}")
    print("Classification Report:")
    print(classification_rep)
    print("\n")

Bank: BankA_decision
Accuracy: 0.27300850734725446
Classification Report:
              precision    recall  f1-score   support

      denied       0.22      0.31      0.26       313
       error       0.01      0.38      0.02        16
     success       0.74      0.26      0.38       964

    accuracy                           0.27      1293
   macro avg       0.32      0.32      0.22      1293
weighted avg       0.60      0.27      0.35      1293



Bank: BankB_decision
Accuracy: 0.31167826759474093
Classification Report:
              precision    recall  f1-score   support

      denied       0.24      0.22      0.23       320
       error       0.01      0.31      0.01        13
     success       0.75      0.34      0.47       960

    accuracy                           0.31      1293
   macro avg       0.33      0.29      0.24      1293
weighted avg       0.62      0.31      0.41      1293



Bank: BankC_decision
Accuracy: 0.3665893271461717
Classification Report:
             