In [None]:
pip install numpy pandas scikit-learn pyro-ppl causalml econml pgmpy dowhy

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
#from causalml.inference.tree import CausalForest
from dowhy import CausalModel
from pgmpy.models import BayesianNetwork
from pgmpy.estimators import MaximumLikelihoodEstimator
from econml.dml import CausalForestDML
from pyro.infer import SVI, Trace_ELBO
import pyro.distributions as dist
import pyro
import torch

In [None]:
# Build a small synthetic credit-risk dataset.
# Draw order matters for reproducibility: covariates, then treatment, then noise.
np.random.seed(42)
n = 1000

# Three standard-normal covariates: credit score, income, loan amount
X = np.random.normal(0, 1, size=(n, 3))

# Coin-flip treatment: 0 (low interest rate), 1 (high interest rate)
T = np.random.binomial(1, p=0.5, size=(n,))

# Outcome (default or not): a noisy linear score thresholded at zero.
# Only the first two covariates and the treatment enter the score.
noise = np.random.normal(0, 1, size=n)
latent_score = X[:, 0] * 0.8 - X[:, 1] * 0.3 + T * 1.5 + noise
Y = latent_score > 0

# column_stack promotes the int treatment and bool outcome to float64 alongside X.
stacked = np.column_stack([X, T, Y])
data = pd.DataFrame(
    stacked,
    columns=["credit_score", "income", "loan_amount", "interest_rate", "default"],
)


In [None]:
# Define the SCM using DoWhy and specify the causal graph.
# `data` is the synthetic frame built in the simulation cell above.
model = CausalModel(
    data=data,
    treatment="interest_rate",
    outcome="default",
    common_causes=["credit_score", "income", "loan_amount"],
    instruments=None  # You can add instruments for IV methods
)

# Render the assumed causal graph. This has to run *after* `model` is constructed;
# previously it sat in a cell above the definition and raised NameError on a fresh kernel.
model.view_model()

# Identify causal effect
identified_estimand = model.identify_effect()

# Estimate the effect using propensity score matching
estimate = model.estimate_effect(identified_estimand, method_name="backdoor.propensity_score_matching")
print("Estimated Causal Effect:", estimate.value)

In [None]:
# Import required libraries

import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score

In [None]:
# Raw-content URL of the credit-risk CSV hosted on GitHub
url = 'https://raw.githubusercontent.com/dzrich/PhD_Research_in_CausalML/main/Data/CreditRiskDataset.csv'

# Read the remote CSV straight into a DataFrame
CreditRiskDataset = pd.read_csv(url)

# Preview the first five rows as plain text
print(CreditRiskDataset.head(5))

   person_age  person_income person_home_ownership  person_emp_length  \
0          22          59000                  RENT              123.0   
1          21           9600                   OWN                5.0   
2          25           9600              MORTGAGE                1.0   
3          23          65500                  RENT                4.0   
4          24          54400                  RENT                8.0   

  loan_intent loan_grade  loan_amnt  loan_int_rate  loan_status  \
0    PERSONAL          D      35000          16.02            1   
1   EDUCATION          B       1000          11.14            0   
2     MEDICAL          C       5500          12.87            1   
3     MEDICAL          C      35000          15.23            1   
4     MEDICAL          C      35000          14.27            1   

   loan_percent_income cb_person_default_on_file  cb_person_cred_hist_length  \
0                 0.59                         Y                           3  

# Step 1: Data Engineering


In [None]:
# Step 1: Data Engineering
def data_engineering(CreditRiskDataset):
    """Clean the raw credit-risk frame and split it into model-ready features and target.

    Steps:
      1. Drop feature columns that are entirely missing. A single all-NaN column
         would otherwise make the row-wise ``dropna()`` discard every row and leave
         nothing to train on.
      2. Drop the remaining rows that contain any missing value.
      3. One-hot encode non-numeric feature columns so scikit-learn estimators can
         consume them.
      4. Encode a non-numeric target as integers (``'Y'`` -> 1, anything else -> 0)
         so metrics with the default ``pos_label=1`` work.

    Parameters
    ----------
    CreditRiskDataset : pandas.DataFrame
        Raw data; must contain the column ``'cb_person_default_on_file'``.

    Returns
    -------
    X : pandas.DataFrame
        Encoded feature matrix (target column removed).
    y : pandas.Series
        Target aligned with ``X``; integer-coded unless it was already numeric.
    """
    target_col = 'cb_person_default_on_file'

    # Only look for empty columns when there is data; on a zero-row frame every
    # column is trivially "all missing" and nothing should be removed.
    has_rows = len(CreditRiskDataset) > 0
    empty_cols = [
        col for col in CreditRiskDataset.columns
        if has_rows and col != target_col and CreditRiskDataset[col].isna().all()
    ]
    cleaned = CreditRiskDataset.drop(columns=empty_cols).dropna()

    # Features: everything except the target, with categoricals expanded to indicators.
    X = pd.get_dummies(cleaned.drop(columns=[target_col]))

    # Target: keep numeric targets untouched, map 'Y'/'N' style flags to 1/0.
    y = cleaned[target_col]
    if not pd.api.types.is_numeric_dtype(y):
        y = (y.astype(str).str.strip().str.upper() == 'Y').astype(int)
    return X, y

In [None]:
# Step 2: Correlational ML Techniques - Training Correlational ML Models
def train_corr_ml_model(X_train, y_train):
    """Fit a default-configured random forest classifier and return it.

    Parameters
    ----------
    X_train : array-like
        Training features.
    y_train : array-like
        Training labels.

    Returns
    -------
    The fitted ``RandomForestClassifier`` instance.
    """
    # scikit-learn estimators return themselves from fit(), so this yields the
    # trained classifier directly.
    return RandomForestClassifier().fit(X_train, y_train)

In [None]:
pip install dowhy

Collecting dowhy
  Downloading dowhy-0.11.1-py3-none-any.whl.metadata (17 kB)
Collecting causal-learn>=0.1.3.0 (from dowhy)
  Downloading causal_learn-0.1.3.8-py3-none-any.whl.metadata (4.2 kB)
Downloading dowhy-0.11.1-py3-none-any.whl (383 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m383.4/383.4 kB[0m [31m20.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading causal_learn-0.1.3.8-py3-none-any.whl (174 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m174.5/174.5 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: causal-learn, dowhy
Successfully installed causal-learn-0.1.3.8 dowhy-0.11.1


In [None]:
import dowhy
from dowhy import CausalModel

In [None]:
# Step 3: Causal ML Techniques - Training Causal ML Models
def train_causal_ml_model(
    X_train,
    y_train,
    CreditRiskDataset,
    treatment="treatment",
    outcome="outcome",
    common_causes=None,
    method_name="backdoor.linear_regression",
):
    """Estimate a causal effect with DoWhy on the given DataFrame.

    Parameters
    ----------
    X_train, y_train
        Accepted for signature compatibility with the other training step; the
        causal model is built from ``CreditRiskDataset`` and does not use them.
    CreditRiskDataset : pandas.DataFrame
        Data containing the treatment, outcome and common-cause columns.
    treatment : str
        Name of the treatment column. The default is a placeholder and must be
        replaced with a real column name.
    outcome : str
        Name of the outcome column. The default is a placeholder.
    common_causes : list of str, optional
        Names of the confounder columns. Defaults to the placeholder list
        ``["common_cause1", "common_cause2"]``.
    method_name : str
        DoWhy estimator identifier passed to ``estimate_effect``.

    Returns
    -------
    The estimate object returned by ``CausalModel.estimate_effect`` (callers read
    its ``.value`` attribute).
    """
    if common_causes is None:
        common_causes = ["common_cause1", "common_cause2"]  # Placeholder; adjust as needed

    # The DataFrame must be passed as `data`; it was previously passed under the
    # keyword `CreditRiskDataset`, so the model never received its data.
    causal_model = CausalModel(
        data=CreditRiskDataset,
        treatment=treatment,
        outcome=outcome,
        common_causes=list(common_causes),
    )
    # Identify causal effect
    identified_estimand = causal_model.identify_effect()
    # Estimate the causal effect with an explicitly chosen estimator rather than
    # leaving method_name unset.
    causal_estimate = causal_model.estimate_effect(identified_estimand, method_name=method_name)
    return causal_estimate

In [None]:
# Step 4: Performance Evaluation
def evaluate_model(model, X_test, y_test):
    """Score a fitted classifier on held-out data.

    Parameters
    ----------
    model
        Object exposing ``predict(X)``.
    X_test
        Held-out features.
    y_test
        True labels for ``X_test``.

    Returns
    -------
    tuple
        ``(accuracy, f1)`` computed from the model's predictions on ``X_test``.
    """
    y_pred = model.predict(X_test)
    # Accuracy first, then F1 - same order as the returned pair.
    return accuracy_score(y_test, y_pred), f1_score(y_test, y_pred)

In [None]:
# Step 5: Main Execution Workflow
def main(url=None):
    """Run the correlational pipeline end to end and return its artifacts.

    Loads the credit-risk CSV, engineers features, makes an 80/20 split
    (``random_state=42``), trains the correlational model, evaluates it and
    prints the scores.

    Parameters
    ----------
    url : str, optional
        Location of the CSV to load. Defaults to the dataset in the project's
        GitHub repository.

    Returns
    -------
    dict
        Keys ``CreditRiskDataset``, ``X_train``, ``X_test``, ``y_train``,
        ``y_test``, ``corr_model``, ``corr_accuracy`` and ``corr_f1``. Returning
        them lets later notebook cells use the split and the model, which are
        otherwise local to this function.

    Raises
    ------
    ValueError
        If data engineering leaves no rows to train on.
    """
    if url is None:
        # The raw link to the CSV file in the GitHub repository
        url = 'https://raw.githubusercontent.com/dzrich/PhD_Research_in_CausalML/main/Data/CreditRiskDataset.csv'

    # Load the CSV file into a pandas DataFrame
    CreditRiskDataset = pd.read_csv(url)

    # Data Engineering
    X, y = data_engineering(CreditRiskDataset)
    if len(X) == 0:
        # Fail here with an actionable message instead of letting train_test_split
        # raise its generic "train set will be empty" ValueError.
        raise ValueError(
            "data_engineering() returned no rows; every row was dropped during cleaning. "
            "Check the dataset for columns that are entirely missing."
        )

    # Split Data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Train Correlational ML Model
    corr_model = train_corr_ml_model(X_train, y_train)
    corr_accuracy, corr_f1 = evaluate_model(corr_model, X_test, y_test)
    print(f"Correlational Model Accuracy: {corr_accuracy}, F1-Score: {corr_f1}")

    return {
        "CreditRiskDataset": CreditRiskDataset,
        "X_train": X_train,
        "X_test": X_test,
        "y_train": y_train,
        "y_test": y_test,
        "corr_model": corr_model,
        "corr_accuracy": corr_accuracy,
        "corr_f1": corr_f1,
    }


if __name__ == "__main__":
    # Keep the artifacts so subsequent cells can inspect them.
    results = main()


ValueError: With n_samples=0, test_size=0.2 and train_size=None, the resulting train set will be empty. Adjust any of the aforementioned parameters.

In [None]:
# `X_train` is a local variable of main() and is not visible at notebook scope, so
# rebuild the same split here (same test_size and random_state) before previewing it.
features, target = data_engineering(CreditRiskDataset)
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)
print(X_train.head(5))


NameError: name 'X_train' is not defined

In [None]:
 # Train Causal ML Model (Placeholder - actual implementation depends on data specifics)
def run_generalizability_check(
    X_train,
    y_train,
    CreditRiskDataset,
    corr_accuracy,
    accuracy_threshold=0.8,
    effect_threshold=0.1,
):
    """Train the causal model and report which model, if any, passes the example checks.

    This cell used to contain the indented tail of ``main()`` at cell top level,
    which cannot run on its own and referenced variables local to ``main()``.
    It is now a function that receives everything it needs explicitly.

    Parameters
    ----------
    X_train, y_train
        Training split, forwarded to ``train_causal_ml_model``.
    CreditRiskDataset : pandas.DataFrame
        Data forwarded to ``train_causal_ml_model``.
    corr_accuracy : float
        Accuracy of the correlational model.
    accuracy_threshold : float
        Example threshold for the correlational model - adjust based on context.
    effect_threshold : float
        Example threshold for the causal estimate - adjust based on context.

    Returns
    -------
    The causal estimate returned by ``train_causal_ml_model``.
    """
    causal_estimate = train_causal_ml_model(X_train, y_train, CreditRiskDataset)
    print(f"Causal Estimate: {causal_estimate.value}")

    # Generalizability Check (Example Decision Logic)
    # The correlational model takes precedence; the causal estimate is only
    # consulted when the accuracy threshold is not met.
    if corr_accuracy > accuracy_threshold:
        print("Correlational ML Model is generalizable. Deploying...")
    elif causal_estimate.value > effect_threshold:
        print("Causal ML Model is generalizable. Deploying...")
    else:
        print("Models are not generalizable. Reiterate data engineering or model training steps.")

    return causal_estimate

https://github.com/dzrich/PhD_Research_in_CausalML/tree/main/Data