# Causal Analysis of Knowledge Extraction Dataset

In [1]:
%load_ext autoreload
%autoreload 2
import networkx as nx
import numpy as np
import pandas as pd
import scipy
import random
import seaborn as sns

from utils import treatment_effect, xray_util

# Plot styling: use seaborn's "notebook" plotting context.
sns.set_context("notebook")
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

# --- Reproducibility --------------------------------------------------------
# Seed both NumPy's global RNG and the stdlib RNG with the same fixed value.
np.random.seed(0)
random.seed(0)

# --- Raw column names -------------------------------------------------------
# Column holding the ground-truth label; it is renamed to OUTCOME further down.
OUTCOME_NAME = 'truth'
# Covariates that are cast to float (all remaining covariates are treated as
# categorical).
CONT_COVARIATES = ['confidence']

# --- Treatment definition ---------------------------------------------------
# A row counts as treated when its TREATMENT_COL value contains TREATMENT_VAL.
TREATMENT_COL, TREATMENT_VAL = 'arg2_0', 'PRON'
# Columns excluded from the covariates together with TREATMENT_COL.
treatment_subdimensions = ['arg2_1']

# --- Short names of the derived treatment / outcome columns -----------------
TREATMENT, OUTCOME = 'T', 'O'

In [None]:
# Read the Data X-Ray input file into `dataset`.
# The second argument lists the field names handed to the reader.
xray_input_path = 'data/woe_parse-posAsSet-input.txt'
xray_input_fields = ['confidence', 'arg1', 'relation', 'arg2']
dataset = xray_util.read_xray_input(xray_input_path, xray_input_fields)

In [27]:
# Introduce the binary treatment column as proposed by Data X-Ray: a row is
# treated when its TREATMENT_COL value contains the tag TREATMENT_VAL.
#   regex=False -> match the tag literally; tags such as '.' occur in the data
#                  and would otherwise be interpreted as regex wildcards.
#   na=False    -> rows with a missing TREATMENT_COL value count as untreated
#                  instead of putting NaN into the boolean treatment column.
dataset[TREATMENT] = dataset[TREATMENT_COL].str.contains(TREATMENT_VAL, regex=False, na=False)

# Give the outcome column its short name (reassign instead of mutating in place).
dataset = dataset.rename(columns={OUTCOME_NAME: OUTCOME})

# Covariates are all columns except treatment, outcome, and the columns the
# treatment was derived from (TREATMENT_COL and its sub-dimensions).
non_covariate_cols = [TREATMENT, OUTCOME, TREATMENT_COL] + treatment_subdimensions
COVARIATES = dataset.drop(columns=non_covariate_cols).columns.values
CAT_COVARIATES = [c for c in COVARIATES if c not in CONT_COVARIATES]

# Continuous covariates must be numeric.
for col in CONT_COVARIATES:
    dataset[col] = dataset[col].astype(float)

## Analysis with Matching and Propensity Score Stratification

In [29]:
# Preview the first five rows of the prepared dataset.
dataset.head(n=5)

Unnamed: 0,confidence,arg1_0,arg1_1,relation_0,relation_1,arg2_0,arg2_1,O,T
0,0.6,PRON,they,VERB,plan-raise,NOUN,premiums,False,False
1,0.6,PRON,they,VERB,plan-reduce,NOUN,benefits,False,False
2,0.7,PRT-DET-ADJ-NOUN,The-two-year-note-'s-yield,ADP-VERB-ADJ,was-unchanged-at,NOUN-NUM,5.95-percent,True,False
3,0.9,DET-NUM,The-12,VERB,is,.-NUM,$-70.00,False,False
4,0.9,DET-ADJ-NOUN,The-principal-opposition-parties,VERB,boycotted,DET-NOUN,the-polls,True,False


In [33]:
# Estimate the treatment effect with matching followed by stratification.
# No variables are matched on exactly, so the match list is left empty.
match_vars = list()
att = treatment_effect.match_then_stratify(
    dataset,
    match_vars,
    TREATMENT,
    TREATMENT_COL,
    treatment_subdimensions,
    OUTCOME,
    CONT_COVARIATES,
    CAT_COVARIATES,
    n_models=10,
    additional_excludes=[],
)

Training model 1/10
Training model 2/10
Training model 3/10
Training model 4/10
Training model 5/10
Training model 6/10
Train Accuracy:  0.9107358262967431
Please provide more data of treated population with the following characteristics:
Propensity score: 0.027145922819829943-0.04936559907633777
NOUN         0.331325
DET-NOUN     0.265060
DET          0.084337
NOUN-VERB    0.060241
NOUN-PRT     0.048193
Name: arg1_0, dtype: float64
that             0.030120
Story-Filed      0.024096
people           0.024096
the-Senate       0.024096
half-of-which    0.018072
Name: arg1_1, dtype: float64
ADP-VERB        0.337349
VERB            0.228916
NOUN-ADP        0.078313
ADP-VERB-ADJ    0.054217
ADP-VERB-PRT    0.048193
Name: relation_0, dtype: float64
stuffed-with             0.042169
would-be-inhabited-by    0.030120
rose                     0.024096
sponsored-by             0.018072
's                       0.018072
Name: relation_1, dtype: float64
min    0.1
25%    0.7
50%    0.9
75%    1.0

  .format(op=op_str, alt_op=unsupported[op_str]))


#### Get statistical significance by comparing with placebo treatment

In [36]:
# Number of placebo iterations passed to the placebo-effect generator.
n_iter = 10
# Generate placebo effects using the same dataset and column configuration
# as the real treatment-effect estimate.
placebo_results = treatment_effect.generate_placebo_effects(
    dataset,
    match_vars,
    TREATMENT,
    TREATMENT_COL,
    treatment_subdimensions,
    OUTCOME,
    CONT_COVARIATES,
    CAT_COVARIATES,
    n_iter=n_iter,
)

In [37]:
# Compare the estimated effect (`att`) with the placebo effects; the printed
# result reports whether it lies within the 95% bounds of the placebo effect.
treatment_effect.check_treatment_effect(att, placebo_results)

Treatment effect within 95%-bounds of placebo effect: -0.12983939256613164 <= -0.09136310223266747 <= 0.02971386889911014
