### Structural equation fitting on sparse data and known DAG:

let's say H is our target variable,  
we want to compare predicting H using sparse data and ML to   
predicting H using sparse data and causal AI methods  
this is before any intervention or counterfactual questions are asked.

for the causal AI method:
 - classic constraints from DAG
 - twin networks
 - makes sense to treat different subgraphs differently? I.e. the 3 'setting' variables leading to F could simply MLed on

In [1]:
import numpy as np
import pandas as pd
import sympy as sp
import networkx as nx
import matplotlib.pyplot as plt
from scipy.stats import skewnorm
# import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score

from sklearn.preprocessing import KBinsDiscretizer
import optuna
import xgboost as xgb

from xgboost import XGBClassifier

In [2]:
# def data_gen_process(seed, n_samples):
#     np.random.seed(seed)
#     n_samples = n_samples

#     data = {
#         'A': np.random.binomial(1, 0.25, n_samples),
#         'B': np.random.poisson(3, n_samples),
#         'C': np.random.poisson(2, n_samples),
#         'D': np.random.poisson(10, n_samples),
#         'E': np.round(1 + 4 * np.random.beta(3, 2, n_samples)).astype(int),}

#     df = pd.DataFrame(data)
#     df['G'] = np.where(
#         df['A'] == 0,
#         np.exp(-0.5 * df['B']),
#         1 - np.exp(-0.3 * df['B']))

#     df['F'] = (0.5 * df['C'] + 0.3 * df['D'] + np.random.normal(0, 1, n_samples)).astype(int)
#     df['F'] = df['F'].clip(0, 4)  # F is between 0 and 4

#     logit_H = (
#         df['G']
#         - 0.5 * df['C']
#         + 0.3 * df['D']
#         + 0.2 * df['E']
#         - 0.4 * df['F']
#         + 1.5 * df['B']
#         + np.random.normal(0, 1, n_samples)
#     )
#     df['H'] = (logit_H > 0).astype(int)

#     # reverse the influence of B on H when conditioned on A (Simpson's)
#     df['H'] = np.where(
#         df['A'] == 1,
#         (logit_H - 3 * df['B'] > 0).astype(int),
#         df['H'])
    
#     return df

In [3]:
## Simpson's:

# # Calculate the mean of H for different values of B overall
# overall_means = df.groupby('B')['H'].mean()

# # Calculate the mean of H for different values of B conditioned on A
# conditioned_means_A0 = df[df['A'] == 0].groupby('B')['H'].mean()
# conditioned_means_A1 = df[df['A'] == 1].groupby('B')['H'].mean()

# # Plot the results
# plt.figure(figsize=(12, 6))

# plt.plot(overall_means.index, overall_means.values, label='Overall', marker='o')
# plt.plot(conditioned_means_A0.index, conditioned_means_A0.values, label='Conditioned on A=0', marker='o')
# plt.plot(conditioned_means_A1.index, conditioned_means_A1.values, label='Conditioned on A=1', marker='o')

# plt.xlabel('B')
# plt.ylabel('Mean of H')
# plt.title('Simpson\'s Paradox: Relationship between B and H')
# plt.legend()
# plt.grid(True)
# plt.show()

In [4]:
# the DAG describing this data generating process:
G = nx.DiGraph()
G.add_edges_from([
    ('A', 'G'),
    ('B', 'G'),
    ('G', 'H'),
    ('C', 'F'),
    ('D', 'F'),
    ('E', 'F'),
    ('F', 'H'),
    ('B', 'H'),
])
# plot it
plt.figure(figsize=(12, 12))
pos = nx.spring_layout(G)
# nx.draw(G, pos, with_labels=True, node_size=5000, node_color='lightblue', font_size=20, font_weight='bold', arrowsize=20)
# plt.show()

<Figure size 1200x1200 with 0 Axes>

### Trad ML pred

In [2]:
## Data Generation
def data_gen_process(seed=42, n_samples=1000):
    np.random.seed(seed)

    A = np.random.binomial(1, 0.5, n_samples)
    B = np.random.normal(0, 1, n_samples)
    C = np.random.normal(0, 1, n_samples)
    D = np.random.normal(0, 1, n_samples)
    E = np.random.normal(0, 1, n_samples)

    G = np.tanh(A + 0.5 * B) + np.random.normal(0, 0.5, n_samples)  # Increase noise
    F = np.sin(C) + np.log(np.abs(D) + 1) + E**2 + np.random.normal(0, 0.5, n_samples)  # Increase noise

    H_base = G**2 + 2 * np.cos(F) + 0.5 * B**2 + np.random.normal(0, 0.5, n_samples)  # More complex non-linear relationship
    H = np.where(B > 0, H_base - 1.5 * G, H_base + 1.5 * G)  # Simpson's effect

    Z = np.random.binomial(1, 0.5, n_samples)
    H += Z * np.random.normal(0, 0.5, n_samples)

    # Discretize H
    discretizer = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='uniform')
    H_discrete = discretizer.fit_transform(H.reshape(-1, 1)).flatten()

    return pd.DataFrame({
        'A': A,
        'B': B,
        'C': C,
        'D': D,
        'E': E,
        'G': G,
        'F': F,
        'H': H_discrete,
        'Z': Z 
    })

### NN arch using TensorFlow/Keras

In [7]:
data = data_gen_process()
# Split the data into train, validation, and test sets
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)
train_data, val_data = train_test_split(train_data, test_size=0.25, random_state=42)  # 0.25 x 0.8 = 0.2

X_train = train_data.drop('H', axis=1)
y_train = train_data['H']
X_val = val_data.drop('H', axis=1)
y_val = val_data['H']
X_test = test_data.drop('H', axis=1)
y_test = test_data['H']

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam

def create_model(input_dim):
    model = Sequential()
    model.add(Dense(128, input_dim=input_dim, activation='relu'))
    model.add(Dropout(0.3))
    model.add(BatchNormalization())
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.3))
    model.add(BatchNormalization())
    model.add(Dense(32, activation='relu'))
    model.add(Dense(1, activation='linear'))
    
    model.compile(optimizer=Adam(learning_rate=0.001), loss='mse', metrics=['mae'])
    return model

# Assuming X_train and y_train are prepared from the data generation process
model = create_model(input_dim=X_train.shape[1])
model.fit(X_train, y_train, epochs=100, batch_size=32, validation_split=0.2)

# Evaluate the model on the test set
test_loss, test_mae = model.evaluate(X_test, y_test)
print(f'Test MAE: {test_mae:.2f}')
# accuracy
y_pred = model.predict(X_test)
y_pred = np.round(y_pred)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')


Epoch 1/100


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 16ms/step - loss: 3.4712 - mae: 1.5444 - val_loss: 1.8575 - val_mae: 1.1388
Epoch 2/100
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 1.5953 - mae: 0.9956 - val_loss: 1.3442 - val_mae: 0.9665
Epoch 3/100
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 1.4260 - mae: 0.9189 - val_loss: 1.1060 - val_mae: 0.8799
Epoch 4/100
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.9921 - mae: 0.7819 - val_loss: 0.9735 - val_mae: 0.8197
Epoch 5/100
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.8070 - mae: 0.7150 - val_loss: 0.8816 - val_mae: 0.7717
Epoch 6/100
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.7263 - mae: 0.7058 - val_loss: 0.7961 - val_mae: 0.7229
Epoch 7/100
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.7493 - mae

### XGBClassifier

In [6]:
data = data_gen_process()

# Split the data into train, validation, and test sets
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)
train_data, val_data = train_test_split(train_data, test_size=0.25, random_state=42)  # 0.25 x 0.8 = 0.2

X_train = train_data.drop('H', axis=1)
y_train = train_data['H']
X_val = val_data.drop('H', axis=1)
y_val = val_data['H']
X_test = test_data.drop('H', axis=1)
y_test = test_data['H']

# Optuna objective function
def objective(trial):
    param = {
        'objective': 'multi:softprob',
        'eval_metric': 'mlogloss',
        'num_class': 5,
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_loguniform('learning_rate', 1e-5, 1e-1),
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'gamma': trial.suggest_loguniform('gamma', 1e-8, 1.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'subsample': trial.suggest_uniform('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 1.0),
    }
    
    model = XGBClassifier(**param, use_label_encoder=False)
    model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)
    
    preds = model.predict(X_val)
    accuracy = accuracy_score(y_val, preds)
    return 1.0 - accuracy

# Optimize hyperparameters with Optuna
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=100)

# Train final model with best hyperparameters
best_params = study.best_params
model = XGBClassifier(**best_params, use_label_encoder=False)
model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)

# Evaluate model on the test set
test_preds = model.predict(X_test)
test_accuracy = accuracy_score(y_test, test_preds)

print(f'Test Accuracy: {test_accuracy:.4f}')
print(f'Best Hyperparameters: {best_params}')

[I 2024-07-19 11:15:53,968] A new study created in memory with name: no-name-5f25bbc0-cb1f-4d9a-a58c-1cdd195ccbfd


  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-5, 1e-1),
  'gamma': trial.suggest_loguniform('gamma', 1e-8, 1.0),
  'subsample': trial.suggest_uniform('subsample', 0.5, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 1.0),
Parameters: { "use_label_encoder" } are not used.

[I 2024-07-19 11:15:55,559] Trial 0 finished with value: 0.405 and parameters: {'max_depth': 3, 'learning_rate': 0.00016356023841197014, 'n_estimators': 985, 'gamma': 1.2642689276226094e-05, 'min_child_weight': 9, 'subsample': 0.5167032509478577, 'colsample_bytree': 0.7102751650019303}. Best is trial 0 with value: 0.405.
  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-5, 1e-1),
  'gamma': trial.suggest_loguniform('gamma', 1e-8, 1.0),
  'subsample': trial.suggest_uniform('subsample', 0.5, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 1.0),
Parameters: { "use_label_encoder" } are not used.

[I 2024-07-19 11:15:56,991] Trial 1 finis

Test Accuracy: 0.6650
Best Hyperparameters: {'max_depth': 5, 'learning_rate': 0.06024998776829071, 'n_estimators': 214, 'gamma': 1.0433466910423212e-08, 'min_child_weight': 1, 'subsample': 0.9058544802754726, 'colsample_bytree': 0.7450794802932229}


### MLPClassifier

In [13]:
df = data_gen_process(seed=42, n_samples=10000)
X = df[['A', 'B', 'C', 'D', 'E', 'F', 'G', 'Z']]  # Include Z as a feature
y = df['H']
y_binary = (y > np.median(y)).astype(int)  # Convert to binary target

X_train, X_test, y_train, y_test = train_test_split(X, y_binary, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Train a neural network classifier
clf = MLPClassifier(hidden_layer_sizes=(100, 100), max_iter=1000, random_state=42)
clf.fit(X_train_scaled, y_train)

X_test_scaled = scaler.transform(X_test)
y_pred = clf.predict(X_test_scaled)
accuracy_initial = accuracy_score(y_test, y_pred)
accuracy_initial
# Check on new data from the same DGP
# df_new = data_gen_process(seed=43, n_samples=5000)
# X_new = df_new[['A', 'B', 'C', 'D', 'E', 'F', 'G', 'Z']]
# y_new = df_new['H']
# y_new_binary = (y_new > np.median(y_new)).astype(int)  # Convert to binary target

# X_new_scaled = scaler.transform(X_new)
# y_pred_new = clf.predict(X_new_scaled)
# accuracy_new = accuracy_score(y_new_binary, y_pred_new)

# accuracy_initial, accuracy_new

0.8515

### XGBoost

In [11]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

df = data_gen_process(seed=42, n_samples=10000)
# Assuming df is the DataFrame generated from data_gen_process
X = df.drop('H', axis=1)
y = df['H']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = xgb.XGBRegressor(n_estimators=1000, learning_rate=0.01, max_depth=6, subsample=0.8, colsample_bytree=0.8, random_state=42)
model.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=True)

y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')
# accuracy
y_pred = model.predict(X_test)
y_pred = np.round(y_pred)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

[0]	validation_0-rmse:0.77958
[1]	validation_0-rmse:0.77609
[2]	validation_0-rmse:0.77262
[3]	validation_0-rmse:0.76760
[4]	validation_0-rmse:0.76269
[5]	validation_0-rmse:0.75938
[6]	validation_0-rmse:0.75459
[7]	validation_0-rmse:0.75109
[8]	validation_0-rmse:0.74639
[9]	validation_0-rmse:0.74377
[10]	validation_0-rmse:0.73917
[11]	validation_0-rmse:0.73476
[12]	validation_0-rmse:0.73223
[13]	validation_0-rmse:0.72775
[14]	validation_0-rmse:0.72467
[15]	validation_0-rmse:0.72178
[16]	validation_0-rmse:0.71737
[17]	validation_0-rmse:0.71448
[18]	validation_0-rmse:0.71167
[19]	validation_0-rmse:0.70904
[20]	validation_0-rmse:0.70568
[21]	validation_0-rmse:0.70143
[22]	validation_0-rmse:0.69731
[23]	validation_0-rmse:0.69496
[24]	validation_0-rmse:0.69228
[25]	validation_0-rmse:0.68983
[26]	validation_0-rmse:0.68668
[27]	validation_0-rmse:0.68411
[28]	validation_0-rmse:0.68016
[29]	validation_0-rmse:0.67638
[30]	validation_0-rmse:0.67328
[31]	validation_0-rmse:0.66951
[32]	validation_0-

### Causal Discovery upto Markov