# Causal Analysis of Knowledge Extraction Dataset
We use the knowledge extraction data to analyze whether pronouns in `arg2` cause erroneous extractions.

In [None]:
%load_ext autoreload
%autoreload 2
import networkx as nx
import numpy as np
import pandas as pd
import scipy
import random
import seaborn as sns

from utils import treatment_effect, xray_util

sns.set_context("notebook")
%matplotlib inline

random.seed(0)
np.random.seed(0)

TREATMENT_COL = 'arg2_0'
treatment_subdimensions = ['arg2_1']
TREATMENT_VAL = 'PRON'

OUTCOME_NAME = 'truth'

CONT_COVARIATES = ['confidence']

TREATMENT = 'T'
OUTCOME = 'O'

In [57]:
# Load the Data X-Ray input file.
# NOTE(review): the list presumably names the fields of each input record —
# confirm against xray_util.read_xray_input.
dataset = xray_util.read_xray_input(
    'data/woe_parse-posAsSet-input.txt',
    ['confidence', 'arg1', 'relation', 'arg2'],
)

In [58]:
# Introduce binary treatment column as proposed by Data X-Ray.
# regex=False: TREATMENT_VAL is a literal tag, not a regular-expression pattern.
dataset[TREATMENT] = dataset[TREATMENT_COL].str.contains(TREATMENT_VAL, regex=False)
# Give the outcome column its short name (reassign instead of mutating in place).
dataset = dataset.rename(columns={OUTCOME_NAME: OUTCOME})

# Every column that is not the treatment, the outcome, or one of the columns the
# treatment is derived from counts as a covariate.
COVARIATES = dataset.drop([TREATMENT, OUTCOME, TREATMENT_COL] + treatment_subdimensions, axis=1).columns.values
CAT_COVARIATES = [c for c in COVARIATES if c not in CONT_COVARIATES]

# Continuous covariates must be numeric.
for c in CONT_COVARIATES:
    dataset[c] = dataset[c].astype(float)

# Last expression of the cell so the preview is actually rendered
# (mid-cell, the value of head() was silently discarded).
dataset.head()

In [60]:
# Extend the dataset with the extra samples collected after the first round of
# analysis showed which data was lacking.
# NOTE: this cell appends to `dataset`; re-running it without re-running the
# cells above appends the extra rows again.
additional_data, additional_data_1, additional_data_2 = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        'data/knowledge-extraction-additionalData-500K.csv',
        'data/knowledge-extraction-additionalData-500K+100K.csv',
        'data/knowledge-extraction-additionalData-treated-9M+100K.csv',
    )
)
dataset = pd.concat(
    [dataset, additional_data, additional_data_1, additional_data_2],
    ignore_index=True,
)

In [65]:
# Main analysis: estimate the treatment effect via propensity stratification.
# No matching variables are supplied (empty list).
match_vars = []
att = treatment_effect.match_then_stratify(
    dataset,
    match_vars,
    TREATMENT,
    TREATMENT_COL,
    treatment_subdimensions,
    OUTCOME,
    CONT_COVARIATES,
    CAT_COVARIATES,
    n_models=20,
    additional_excludes=[],
)

Training model 1/20
Training model 2/20
Training model 3/20
Training model 4/20
Training model 5/20
Training model 6/20
Training model 11/20
Train Accuracy:  0.9394957983193277
Please provide more data of treated population with the following characteristics:
Propensity score: 0.014599911267853035-0.07271655267849762
PRON            0.378151
NOUN            0.260504
DET-ADJ-NOUN    0.105042
NOUN-ADJ        0.084034
NOUN-VERB       0.029412
Name: arg1_0, dtype: float64
he      0.121849
they    0.071429
you     0.042017
It      0.029412
We      0.029412
Name: arg1_1, dtype: float64
VERB             0.747899
ADP-VERB-ADJ     0.037815
VERB-PRT         0.033613
NOUN-ADP         0.029412
NOUN-VERB-ADP    0.021008
Name: relation_0, dtype: float64
stuffed-with    0.029412
moved           0.025210
has             0.016807
make            0.016807
have            0.016807
Name: relation_1, dtype: float64
min    0.1
25%    0.4
50%    0.5
75%    0.9
max    1.0
Name: confidence, dtype: float64
Plea

  .format(op=op_str, alt_op=unsupported[op_str]))


#### Assess statistical significance by comparing with placebo treatments

In [66]:
# Generate placebo effects with the same dataset and column configuration as the
# main analysis; n_iter is forwarded to the helper as its iteration count.
n_iter = 10
placebo_results = treatment_effect.generate_placebo_effects(
    dataset,
    match_vars,
    TREATMENT,
    TREATMENT_COL,
    treatment_subdimensions,
    OUTCOME,
    CONT_COVARIATES,
    CAT_COVARIATES,
    n_iter=n_iter,
)

In [67]:
# Compare the estimated treatment effect with the placebo effects; the helper
# reports whether the estimate lies outside the 95%-bounds of the placebo effect.
treatment_effect.check_treatment_effect(att, placebo_results)

Treatment effect outside 95%-bounds of placebo effect: -0.36362820913319394 not in [-0.05562623147695818, 0.12054995274705321]
