In [1]:
import numpy as np
from sklearn.datasets import make_spd_matrix
from sklearn.linear_model import LinearRegression
import math

from opossum import UserInterface
import matplotlib.pyplot as plt
import causalml
from causalml.inference.meta import LRSRegressor
from xgboost import XGBRegressor
from causalml.inference.meta import BaseRRegressor
from causalml.propensity import ElasticNetPropensityModel

from numpy import mean
from numpy import std
from sklearn.datasets import make_regression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from matplotlib import pyplot as plt

The sklearn.utils.testing module is  deprecated in version 0.22 and will be removed in version 0.24. The corresponding classes / functions should instead be imported from sklearn.utils. Anything that cannot be imported from sklearn.utils is now part of the private API.


In [2]:
## Simulation settings and the containers that collect every generated dataset.
## Each generated array is later cut into three parts (see splitArrays), so every
## entry of the lists below holds three sub-arrays: train, stacking-train and test.
# N and k are passed straight to opossum's UserInterface
# (presumably number of observations and number of covariates -- confirm against opossum docs).
N, k, seed = 30, 3, 5

u = UserInterface(N, k, seed=seed, categorical_covariates=None)

# One independent list per quantity; addDatasets appends one entry per generated dataset.
X, assignment, y, treatment, propensityScores = [], [], [], [], []

def splitArrays(l, n_parts=3):
    """Split ``l`` along its first axis into ``n_parts`` consecutive chunks.

    Parameters
    ----------
    l : array-like
        Data to split; rows keep their original order (no shuffling).
    n_parts : int, default 3
        Number of chunks. The default of 3 matches the train / stacking / test
        layout used throughout this notebook.

    Returns
    -------
    list of numpy.ndarray
        ``n_parts`` chunks. When the length is not divisible by ``n_parts``
        the leading chunks are one element longer (``np.array_split`` semantics).
    """
    return np.array_split(l, n_parts)

def addDatasets(y_gen, X_gen, assignment_gen, treatment_gen):
    """Split one generated dataset and append it to the module-level lists.

    Each argument is split with ``splitArrays`` and the resulting list of
    parts is appended to the matching global container (``X``, ``y``,
    ``assignment``, ``treatment``). Nothing is returned; the containers are
    mutated in place, so calling this twice with the same data stores it twice.
    """
    containers_and_data = (
        (X, X_gen),
        (y, y_gen),
        (assignment, assignment_gen),
        (treatment, treatment_gen),
    )
    for container, generated in containers_and_data:
        container.append(splitArrays(generated))
    
    
####IMPORTANT: after generating the data, we wish to split the dataset into three parts:
## one for training the R learner => 0
## one for fitting the OLS stacking model => 1
## and one for testing. => 2

### because of the nature of the opossum data set output, we will use numpy arrays
### after generating a dataset, we add the dataset to an array of datasets. each entry in this (super)array of datasets
### contains three sub arrays. one train, one stacking, one test array. 

## so accessing the training dataset of the second data generating function is then: X[1][0]
## accessing the treatment vector (true treatment effect) of the first data set, testing data: treatment[0][2]

In [3]:
##############################################################################################
## setupA => difficult nuisance components and an easy treatment effect function
##############################################################################################
# Treatment options for setup A, collected in one place and unpacked into the call.
setup_a_treatment_options = {
    'random_assignment': False,
    'assignment_prob': 'low',
    'constant_pos': False,
    'constant_neg': False,
    'heterogeneous_pos': True,
    'heterogeneous_neg': False,
    'no_treatment': False,
    'discrete_heterogeneous': False,
    'treatment_option_weights': None,
    'intensity': 10,
}
u.generate_treatment(**setup_a_treatment_options)

# Draw the simulated data and unpack the four arrays returned by output_data.
y_A, X_A, assignment_A, treatment_A = u.output_data(
    binary=False,
    x_y_relation='nonlinear_interaction',
)

# Split into three parts and store in the module-level dataset lists.
addDatasets(y_A, X_A, assignment_A, treatment_A)

In [9]:
##############################################################################################
## setupB => randomized trial
##############################################################################################
# Treatment options for setup B; options not listed here keep opossum's defaults.
# NOTE(review): treatment_option_weights presumably holds one weight per
# treatment-effect type -- confirm the expected order against the opossum docs.
setup_b_treatment_options = {
    'random_assignment': True,
    'assignment_prob': 0.5,
    'treatment_option_weights': [0.0, 0.0, 0.4, 0.6, 0.0, 0.0],
    'intensity': 5,
}
u.generate_treatment(**setup_b_treatment_options)

# Draw the simulated data and unpack the four arrays returned by output_data.
y_B, X_B, assignment_B, treatment_B = u.output_data(
    binary=False,
    x_y_relation='linear_simple',
)

# Split into three parts and store in the module-level dataset lists.
addDatasets(y_B, X_B, assignment_B, treatment_B)

array([False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False])

In [10]:
##############################################################################################
## setupC => easy propensity score and difficult baseline
##############################################################################################
# Treatment options for setup C, collected in one place and unpacked into the call.
setup_c_treatment_options = {
    'random_assignment': False,
    'assignment_prob': 'low',
    'constant_pos': True,
    'constant_neg': False,
    'heterogeneous_pos': False,
    'heterogeneous_neg': False,
    'no_treatment': False,
    'discrete_heterogeneous': False,
    'treatment_option_weights': None,
    'intensity': 10,
}
u.generate_treatment(**setup_c_treatment_options)

# Draw the simulated data and unpack the four arrays returned by output_data.
y_C, X_C, assignment_C, treatment_C = u.output_data(
    binary=False,
    x_y_relation='nonlinear_interaction',
)

# Split into three parts and store in the module-level dataset lists.
addDatasets(y_C, X_C, assignment_C, treatment_C)


##############################################################################################
## setupD => unrelated treatment and control arms???
##############################################################################################


In [None]:
## Get propensity scores using the CausalML package: one score vector per dataset,
## estimated on that dataset's training split and stored in propensityScores.
# Every entry of X / assignment is a list [train, stacking, test] (see addDatasets),
# so the model must be given one split, not the whole entry.
TRAIN_SPLIT = 0

propensityScores = []

# Iterate over however many datasets were generated instead of a hard-coded 3.
for dataset_idx in range(len(X)):
    pm = ElasticNetPropensityModel(n_fold=5, random_state=42)
    estimatedpropensityscores = pm.fit_predict(
        X[dataset_idx][TRAIN_SPLIT],
        assignment[dataset_idx][TRAIN_SPLIT],
    )
    propensityScores.append(estimatedpropensityscores)

In [None]:
##############################################################################################
## Estimate the ATE with the R-learner on ONE dataset, once per base learner.
##############################################################################################
# Imports kept in this cell because later cells may rely on them.
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import LassoCV
from sklearn.linear_model import RidgeCV

# Dataset under study: index into X / y / assignment / treatment, i.e. the order of the
# addDatasets calls (0 = setup A, 1 = setup B, 2 = setup C).
i = 0

# Each dataset entry is a list [train, stacking, test]; the R-learner uses the training part.
# propensityScores[i] must hold the scores for the same rows as X_train.
X_train = X[i][0]
assignment_train = assignment[i][0]
y_train = y[i][0]

# R Learner with propensity score input
# Calling the Base Learner class and feeding in XGB
learner_r = BaseRRegressor(learner=XGBRegressor())
ate_r_XGBRegressor = learner_r.estimate_ate(X=X_train, treatment=assignment_train, p=propensityScores[i], y=y_train)
print('Using the BaseRRegressor class and using XGB:')
print(ate_r_XGBRegressor)

# Calling the Base Learner class and feeding in LinearRegression
# NOTE(review): presumably any sklearn-style regressor can be passed as `learner` -- confirm.
learner_rLinearRegression = BaseRRegressor(learner=LinearRegression())
ate_r_LinearRegression = learner_rLinearRegression.estimate_ate(X=X_train, treatment=assignment_train, p=propensityScores[i], y=y_train)
print('Using the BaseRRegressor class and using Linear Regression:')
print(ate_r_LinearRegression)

# Calling the Base Learner class and feeding in a decision tree
learner_p = BaseRRegressor(learner=DecisionTreeRegressor())
ate_r = learner_p.estimate_ate(X=X_train, treatment=assignment_train, p=propensityScores[i], y=y_train)
print('Using the BaseRRegressor class and using DecisionTree:')
# Print the estimate computed just above (the name `ate_r_A` was never defined).
print(ate_r)

In [None]:
# CATE predictions from the XGB-based R-learner, compared with the true treatment effect.
# These are IN-SAMPLE predictions on the training split of dataset i.
# TODO: fit on the training split and predict on a holdout split instead.
# Each dataset entry is a list [train, stacking, test]; index 0 selects the training part.
cate_r = learner_r.fit_predict(X[i][0], treatment=assignment[i][0], y=y[i][0], p=propensityScores[i])
true_effect = treatment[i][0]

### NOTE(review): not sure this plot is meaningful -- it overlays the PREDICTED
### treatment effect (cate_r) with the true effect stored by the simulation.

alpha = 0.2
bins = 30
plt.figure(figsize=(12, 8))
# Flatten so both histograms are built from plain 1-D vectors.
plt.hist(np.ravel(cate_r), alpha=alpha, bins=bins, label='R Learner')
plt.hist(np.ravel(true_effect), alpha=alpha, bins=bins, label='true treatment effect')
plt.title('Distribution of CATE Predictions by Meta Learner')
plt.xlabel('Individual Treatment Effect (ITE/CATE)')
plt.ylabel('# of Samples')
# Legend objects have no `subtract` method; just draw the legend.
plt.legend();

In [None]:
### Mean squared error of the CATE predictions:
### take actual - predicted, square each difference, and average.

#### NOTE (original author): there are not as many treated units, and the comparison
#### on non-treated units may not be meaningful -- revisit before trusting this number.

# Flatten both sides: a column vector minus a 1-D vector would silently broadcast
# to an (n, n) matrix and produce a wrong mean.
a = np.asarray(cate_r).ravel()
# Each dataset entry is a list [train, stacking, test]; index 0 is the training split.
b = np.asarray(treatment[i][0]).ravel()
if a.shape != b.shape:
    raise ValueError(
        f"predicted and true treatment effects differ in length: {a.shape} vs {b.shape}"
    )
mses = ((a - b) ** 2).mean()