In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
import ylearn

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Seed NumPy's legacy global random generator with a named constant.
SEED = 2022
np.random.seed(SEED)

In [3]:
import os
import sys
# Project root directory: two levels up from the notebook file's absolute path.
# The relative notebook path is resolved against the current working directory.
notebook_path = os.path.abspath('./grasp_Xlearn_XGB.ipynb')
BASE_DIR = os.path.dirname(os.path.dirname(notebook_path))
print(BASE_DIR)
# Put the project root at the front of the module search path.
sys.path.insert(0, BASE_DIR)

d:\Demo\PythonDemo\2022WAIC_CausalLearing


In [4]:
# Read the train and test tables from the data folder under BASE_DIR.
train, test = (
    pd.read_csv(BASE_DIR + rel_path)
    for rel_path in ('/data/train.csv', '/data/test.csv')
)

In [5]:
# Replace NaN values and normalise low-cardinality numeric columns.
def build_data(train):
    """Return a cleaned copy of ``train`` with missing values imputed.

    Each column is processed independently:

    * If the column contains NaN, the number of missing entries is printed.
      Numeric columns are then filled with the column mean; non-numeric
      columns are filled with their most frequent value, because a mean
      cannot be computed for them.
    * Numeric columns with at most 20 distinct values are cast to ``int``.
      The cast truncates, so a mean-imputed value such as 1.37 becomes 1.
      The cast is skipped while the column still contains NaN (for example a
      column with no observed value at all), which ``astype(int)`` rejects.

    Parameters
    ----------
    train : pandas.DataFrame
        Input table. It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame with the same column names and order.
    """
    cleaned = {}
    for col in train.columns:
        # Preprocess each column on its own.
        series = train[col]
        is_numeric = pd.api.types.is_numeric_dtype(series)

        n_missing = series.isna().sum()
        if n_missing:
            print(n_missing)
            if is_numeric:
                # Fill missing numeric values with the column mean.
                fill_value = series.mean()
            else:
                # Fill missing non-numeric values with the most frequent one.
                modes = series.mode()
                fill_value = modes.iloc[0] if len(modes) else np.nan
            # A column with no observed value has nothing to impute with.
            if pd.notna(fill_value):
                series = series.fillna(fill_value)

        if is_numeric and series.nunique() <= 20 and not series.isna().any():
            series = series.astype(int)
        cleaned[col] = series

    return pd.DataFrame(cleaned)

# Clean both splits with the same routine, train first and then test.
train, test = build_data(train), build_data(test)

8
25
2
5


In [6]:
# Names of every column in the cleaned training frame, in frame order.
all_cov = train.columns.tolist()
# Keeps one column's transformed values together with the transformer fitted on it.
class TransData:
    """Fit-and-transform helper for a single DataFrame column.

    Calling an instance on a frame stores the raw column in ``self.df`` and the
    transformed values, shaped ``(n, 1)``, in ``self.data``:

    * ``object`` dtype: ``OrdinalEncoder`` codes cast to ``int``; ``is_obj``
      becomes ``True``.
    * any other dtype not equal to ``int``: ``StandardScaler`` output.
    * ``int`` dtype: the values are kept unchanged and ``transformer`` stays
      ``None``.

    NOTE(review): the ``dtype != int`` comparison is against NumPy's default
    integer type, whose width may differ by platform / NumPy version -- confirm
    that integer columns of other widths are meant to be standard-scaled.
    """

    def __init__(self, name, is_obj=False):
        """Remember the column ``name``; no transformer is fitted yet."""
        self.name = name
        self.is_obj = is_obj
        self.transformer = None

    def __call__(self, data):
        """Fit the transformer matching the column dtype on ``data[self.name]``."""
        column = data[self.name]
        self.df = column
        # Single-feature 2-D layout.
        values = column.to_numpy().reshape(-1, 1)
        dtype = column.dtype

        if dtype == object:
            self.is_obj = True
            self.transformer = OrdinalEncoder()
            encoded = self.transformer.fit_transform(values)
            self.data = encoded.astype(int)
            return

        if dtype != int:
            self.transformer = StandardScaler()
            self.data = self.transformer.fit_transform(values)
            return

        # Default-integer columns are used as they are.
        self.data = values

In [7]:
# Data preprocessing: fit one TransData per training column, then apply the
# fitted transformer to the matching test column.
data_dict = {}
cat_name = []
test_dict = {}

for name in all_cov:
    t = TransData(name=name)
    t(train)
    data_dict[name] = t.data.reshape(-1, )
    if t.is_obj:
        cat_name.append(name)
    # 'treatment' and 'outcome' are not copied into the test features.
    if name in ('treatment', 'outcome'):
        continue
    if t.transformer is None:
        # Pass-through column: no transformer was fitted, keep the raw test values.
        test_i = test[name]
    else:
        try:
            test_i = t.transformer.transform(test[name].values.reshape(-1, 1)).reshape(-1, )
            if t.is_obj:
                # Match the int cast applied to the training codes in TransData.
                test_i = test_i.astype(int)
        except ValueError as err:
            # Best-effort fallback (e.g. a category not seen during fitting):
            # keep the raw column, but report it instead of hiding the failure.
            print(f"[preprocess] could not transform test column {name!r}: {err}; keeping raw values")
            test_i = test[name]
    test_dict[name] = test_i
train_transformed = pd.DataFrame(data_dict)
test_data = pd.DataFrame(test_dict)
print(cat_name)

['V_8', 'V_10', 'V_14', 'V_26']


In [8]:
'''
V 特征列  x 干预方案 y 作用结果
因果图已知 需根据数据分析 提取各特征列对应的影响变量
构造模型
实现因果预测的模型
'''
# English translation of the note above:
#   V = feature columns, x = treatment (intervention), y = outcome.
#   The causal graph is known; the influencing variables of each feature
#   column must be extracted by analysing the data.
#   Build the model: a model that performs causal prediction.
V = train_transformed.drop(columns=['treatment', 'outcome']).to_numpy()
x = train_transformed['treatment'].to_numpy()
y = train_transformed['outcome'].to_numpy()

In [9]:
# Causal discovery input: feature columns 35..39, then treatment, then outcome.
# NOTE(review): only five feature columns are kept here, while the commented-out
# GRaSP code below indexes nodes[40] / nodes[41] -- confirm the intended slice.
CFdata = np.hstack((V[:, 35:40], x.reshape(-1, 1), y.reshape(-1, 1)))

# from causallearn.search.PermutationBased.GRaSP import grasp
# G = grasp(X=CFdata,score_func='local_score_BIC', maxP=8 )
# nodes = G.get_nodes()
# print(len(nodes))


# def get_G_parents(G, node):
#     parent = G.get_parents(node)
#     parents = []
#     for i in parent:
#         parents.append(i.get_name())
#     return parents


# node_tr = nodes[40]
# parents_tr = get_G_parents(G,node_tr)
# node_out = nodes[41]
# parents_out = get_G_parents(G,node_out)

In [10]:
# def x2V_in_parents(parents):
#     new_parents = []
#     for i in parents:
#         new_parents.append(str(i).replace("x","V_"))
#     return new_parents

# grasp_confounder_list = x2V_in_parents(parents_tr)
# grasp_convariate_list = x2V_in_parents(parents_out[:-1])
# print(grasp_confounder_list,grasp_convariate_list)

In [11]:
# Hard-coded variable selections, presumably the saved output of the
# commented-out GRaSP cells above. Column names follow the 'V_<index>' pattern.
_confounder_ids = (1, 2, 3, 11, 18, 33, 35, 37)
_covariate_ids = (
    1, 2, 3, 4, 9, 10, 11, 14, 16, 17, 18, 19,
    21, 22, 28, 29, 30, 31, 32, 33, 35, 36, 39,
)

grasp_confounder_list = [f'V_{idx}' for idx in _confounder_ids]

grasp_convariate_list = [f'V_{idx}' for idx in _covariate_ids]

In [12]:
def get_ce(data, x1_model, x2_model):
    """Estimate effects on ``data`` with both models and place them side by side.

    ``x1_model.estimate(data)`` is called first, then ``x2_model.estimate(data)``.
    Each result is flattened into a single column; the returned 2-D array has
    the first model's estimates in column 0 and the second model's in column 1.
    """
    columns = []
    for estimator in (x1_model, x2_model):
        effect = estimator.estimate(data)
        columns.append(effect.reshape(-1, 1))
    return np.hstack(columns)

In [13]:
from ylearn.estimator_model import TLearner, XLearner,SLearner
from sklearn.ensemble import ExtraTreesRegressor,GradientBoostingRegressor
from sklearn import svm
import xgboost as xgb
# Base regressor handed to both X-learners.
# Alternatives tried earlier and left disabled:
#   RandomForestRegressor(n_estimators=150)
#   a project-local `Regressor` module (imported from `model`, reloaded with `imp`)
#   GradientBoostingRegressor(learning_rate=0.01, criterion="friedman_mse",
#                             n_estimators=150, max_features=0.6)
model = xgb.XGBRegressor(
    seed=1850,
    n_estimators=1000,
    learning_rate=0.1,
    max_depth=11,
)

# Arguments shared by both fits; only the treated level differs.
fit_kwargs = dict(
    data=train_transformed,
    treatment='treatment',
    outcome='outcome',
    control=0,
    covariate=grasp_confounder_list,
)

# tl1 compares treatment level 1 against control 0, tl2 compares level 2 against control 0.
tl1 = XLearner(model=model)
tl2 = XLearner(model=model)
tl1.fit(treat=1, **fit_kwargs)
tl2.fit(treat=2, **fit_kwargs)

for learner in (tl1, tl2):
    print(learner.estimate(quantity="CATE"))

  from pandas import MultiIndex, Int64Index


0.3977107
0.7562739


In [None]:
# Effect estimates from both fitted learners, for the training and the test frames.
ce = get_ce(train_transformed, x1_model=tl1, x2_model=tl2)
ce_test = get_ce(test_data, tl1, tl2)
# Stack the training block on top of the test block.
ce_total = np.vstack((ce, ce_test))
# Save the result
# ce2csv(ce_total,"GraspConfounder_Xlearn_XGB_n1000_depth11.csv")