# 03 DataSet Augmentation and Classification Model Training

This code aims to augment the *Train Set* by making use of linguisitic patterns, train the cause-effect relation classification model (LSTM) and evaluate this Classification Model. 

* **Input**: The processed dataset including the sentences tagged with NPs Pairs.
* **Approaches**: Define the labeling functions by using linguisitic patterns and use *Label Model* to get augmented *Train Set*; Apply LSTM to classify the sentences with cause-effect pairs.
* **Output**: the cause-effect relation classification model based on LSTM

In [1]:
### import and install necessary packages

import os
import re
import random
import glob
import copy

import pandas as pd
import numpy as np

import time

from collections import Counter

path_here = os.getcwd()


## 03-A. Partition datasets
train set: 70%; dev set: 20%; test set: 10%;


In [2]:

### purpose: to get the binary labels of one pair  (1: postive pairs; -1: negative pairs)
### input: (dataframe containg each sentence, all seed pairs)
### output: the binary labels of one pair
def get_BiLabel(df_enwiki_causality, causal_pairs):
    
    tuple_pairs = [[i,j] for i, j, k in causal_pairs] # the first two 
    label_ls = []

    for index, row in df_enwiki_causality.iterrows():
        if row['pairs'] in tuple_pairs:
            ind = tuple_pairs.index(row['pairs'])
            label_ls.append(1)
        else:
            label_ls.append(-1)
    
    print(Counter(label_ls))
    return label_ls

### purpose: partition the dataset into train, deve and test
### input: one dataframe containing each sentence
### output: 3 datamfranes containing each sentence
def partition(df_enwiki_causality_v4):
    nb_train = 6
    nb_dev = 2
    nb_test = 2
    
    index_shuffle = [i for i in df_enwiki_causality_v4.index]
    random.shuffle(index_shuffle)
    list_shuffle = [index_shuffle[i::10] for i in range(10)]
    
    df_train = df_enwiki_causality_v4.iloc[[j for i in list_shuffle[:nb_train] for j in i]]
    df_dev = df_enwiki_causality_v4.iloc[[j for i in list_shuffle[nb_train:nb_train+nb_dev] for j in i]]
    df_test = df_enwiki_causality_v4.iloc[[j for i in list_shuffle[nb_train+nb_dev:10] for j in i]]
    
    return df_train, df_dev, df_test


In [3]:

# partition dataset
df_enwiki_causality_v4 = pd.read_pickle(path_here + '/res/df_enwiki_causality.pkl')
df_train, df_dev, df_test = partition(df_enwiki_causality_v4)
# len(df_train) + len(df_dev) + len(df_test) == len(df_enwiki_causality_v4)  # test: correct

# get the binary labels for those pairs (whether they are causal pairs)
df_causalpairs_train = pd.read_csv(path_here + '/res/df_causalpairs_train2.csv', index_col = 0)
causal_pairs = [df_causalpairs_train.loc[i].to_list()[1:4] for i in range(len(df_causalpairs_train))] 
Y_dev = get_BiLabel(df_dev, causal_pairs)

df_causalpairs_test = pd.read_csv(path_here + '/res/df_causalpairs_test2.csv', index_col = 0)
causal_pairs_test = [df_causalpairs_test.loc[i].to_list()[1:4] for i in range(len(df_causalpairs_test))] 
Y_test = get_BiLabel(df_test, causal_pairs_test)

## 03-B. Define the labeing functions

In [5]:

from preprocessors_03 import get_NPs_text, get_text_between, get_left_tokens, get_right_tokens

POSITIVE = 1
NEGATIVE = 0
ABSTAIN = -1


In [6]:
### define the labeling functions


from snorkel.labeling import labeling_function


# Check for the `causal connectives` words appearing between the mentions
causalConnectives = {'for this reason', 'with the result that', 'because of','thanks to', 'due to',
                    'beause', 'as', 'since', 'so', 'so that',
                    'the reason was that', 'is due to'}
@labeling_function(resources=dict(causalConnectives=causalConnectives), pre=[get_text_between])
def lf_causalConnectives(x, causalConnectives):
    if len(causalConnectives.intersection(set(x.text_between))) > 0 :
        return POSITIVE   
    else:
        return ABSTAIN
    

    
    
# Check for the `causation verbs` words appearing between the mentions
causationVerbs = {'cause', 'lead to', 'bring about', 'generate', 'make', 'force', 'allow',
                 'kill', 'melt', 'dry', 'break', 'drop', 
                 'poison', 'hang', 'punch', 'clean',
                 'blacken', 'sweeten', 'thicken', 'nullify', 'liquefy', 'verify',
                 'kill', 'feed', 'die', 'eat'}
@labeling_function(resources=dict(causationVerbs=causationVerbs), pre=[get_text_between])
def lf_causationVerbs(x, causationVerbs):
    return POSITIVE if len(causationVerbs.intersection(set(x.text_between))) > 0 else ABSTAIN




# Check for the `causation adverbs` words appearing to the left and right of the mentions
causationAdverbs = {'audibly', 'visibly',
                   'manilestly', 'publicly', 'conspicuously',
                   'successfully', 'plausibly', 'conveniently', 'amusingly', 'pleasantly',
                   'irrevocably', 'ously', 'rudely',
                   'obediently', 'gratefully', 'consequently', 'painfully',
                   'mechanically', 'magically'}

@labeling_function(resources=dict(causationAdverbs=causationAdverbs), pre=[get_left_tokens])
def lf_causationAdverbs_left(x, causationAdverbs):
    if len(set(causationAdverbs).intersection(set(x.ele1_left_tokens))) > 0:
        return POSITIVE
    elif len(set(causationAdverbs).intersection(set(x.ele2_left_tokens))) > 0:
        return POSITIVE
    else:
        return ABSTAIN
    
@labeling_function(resources=dict(causationAdverbs=causationAdverbs), pre=[get_right_tokens])
def lf_causationAdverbs_right(x, causationAdverbs):
    if len(set(causationAdverbs).intersection(set(x.ele1_right_tokens))) > 0:
        return POSITIVE
    elif len(set(causationAdverbs).intersection(set(x.ele2_right_tokens))) > 0:
        return POSITIVE
    else:
        return ABSTAIN    
    

In [10]:
### define the Distant Supervision labeling function 

# the methods to download the knowledges from Wiki Data --> apply "has cause (P828)" to the query in this link <https://query.wikidata.org/#PREFIX%20wd%3A%20%3Chttp%3A%2F%2Fwww.wikidata.org%2Fentity%2F%3E%0APREFIX%20wdt%3A%20%3Chttp%3A%2F%2Fwww.wikidata.org%2Fprop%2Fdirect%2F%3E%0APREFIX%20rdfs%3A%20%3Chttp%3A%2F%2Fwww.w3.org%2F2000%2F01%2Frdf-schema%23%3E%0APREFIX%20p%3A%20%3Chttp%3A%2F%2Fwww.wikidata.org%2Fprop%2F%3E%0APREFIX%20v%3A%20%3Chttp%3A%2F%2Fwww.wikidata.org%2Fprop%2Fstatement%2F%3E%0A%0ASELECT%20%3Fx%20%3Fx_label%20%3Fy%20%3Fy_label%20WHERE%20%7B%0A%09%3Fx%20wdt%3AP828%20%3Fy%20.%0A%09OPTIONAL%20%7B%0A%09%09%3Fx%20rdfs%3Alabel%20%3Fx_label%20FILTER%20%28lang%28%3Fx_label%29%20%3D%20%22en%22%29%20.%0A%20%20%20%20%20%20%20%20%3Fy%20rdfs%3Alabel%20%3Fy_label%20FILTER%20%28lang%28%3Fy_label%29%20%3D%20%22en%22%29%20.%0A%09%7D%0A%7D> 
known_Causality = pd.read_csv(path_here +'/data/query.tsv', sep='\t')

# delete the labels' rows including NaN;
known_Causality = known_Causality.dropna()
known_CauseEffect_pairs = list(zip(known_Causality['x_label'].str.lower(), known_Causality['y_label'].str.lower()))

@labeling_function(resources=dict(known_CauseEffect_pairs=known_CauseEffect_pairs), pre=[get_NPs_text])
def lf_distant_supervision(x, known_CauseEffect_pairs):
    p1, p2 = x.NPs
    if (p1, p2) in known_CauseEffect_pairs:
        return POSITIVE
    else:
        return ABSTAIN

In [12]:
### Apply Labeling Functions to the Data

from snorkel.labeling import PandasLFApplier

lfs = [
    lf_causalConnectives,
    lf_causationVerbs, 
    lf_causationAdverbs_left,
    lf_causationAdverbs_right,
    lf_distant_supervision
    ]
applier = PandasLFApplier(lfs)


In [13]:
from snorkel.labeling import LFAnalysis

L_dev = applier.apply(df_dev)
L_train = applier.apply(df_train)

100%|██████████| 82060/82060 [03:16<00:00, 417.17it/s]
100%|██████████| 246184/246184 [09:49<00:00, 417.71it/s]


In [102]:
dev_analysis = LFAnalysis(L_dev, lfs).lf_summary(np.array(Y_dev))
dev_analysis



Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts,Correct,Incorrect,Emp. Acc.
lf_causalConnectives,0,[1],0.098672,0.002084,0.0,33,0,0.004076
lf_causationVerbs,1,[1],0.011687,0.001999,0.0,9,0,0.009385
lf_causationAdverbs_left,2,[1],0.000256,3.7e-05,0.0,0,0,0.0
lf_causationAdverbs_right,3,[1],0.00039,7.3e-05,0.0,0,0,0.0
lf_distant_supervision,4,[1],0.000427,2.4e-05,0.0,0,0,0.0


## 03-C. Train labeling model for augmentated Train Set

In [107]:
### training labeling model (try to converge several LFs into the single models)

from snorkel.labeling.model import LabelModel

label_model = LabelModel(cardinality=2, verbose=True)
label_model.fit(L_train, np.array(Y_dev), n_epochs=5000, log_freq=500, seed=12345)


In [108]:
### Evaluate the label model

from snorkel.analysis import metric_score
from snorkel.utils import probs_to_preds

probs_dev = label_model.predict_proba(L_dev)
preds_dev = probs_to_preds(probs_dev)
# change the results only containing <0 and 1>
Y_dev_01 = np.array([x if x==1 else 0 for x in Y_dev])


print(f"Label model accuracy score: {metric_score(Y_dev_01, preds_dev, probs=probs_dev, metric='accuracy')}")
print(f"Label model precision score: {metric_score(Y_dev_01, preds_dev, probs=probs_dev, metric='precision')}")
print(f"Label model recall score: {metric_score(Y_dev_01, preds_dev, probs=probs_dev, metric='recall')}")
print(f"Label model f1 score: {metric_score(Y_dev_01, preds_dev, probs=probs_dev, metric='f1')}")
print(f"Label model roc-auc: {metric_score(Y_dev_01, preds_dev, probs=probs_dev, metric='roc_auc')}")


Label model accuracy score: 0.9936997319034853
Label model precision score: 0.0
Label model recall score: 0.0
Label model f1 score: 0.0
Label model roc-auc: 0.5248609225508569


In [19]:
### Filter out training data points which did not recieve a label from any LF

from snorkel.labeling import filter_unlabeled_dataframe

probs_train = label_model.predict_proba(L_train)
df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe(
    X=df_train, y=probs_train, L=L_train)

# convert the label into unite
Y_train_filter = [0 if i[0] > i[1] else 1 for i in probs_train_filtered]        
Y_train_filter_tf= [False if i==0 else True for i in Y_train_filter] 


## 03-D. Train cause-effect relation classification model

In [87]:

### purpose: to add dataframe with more columns for the usage of LSTM
### input: one dataframe containing each sentence
### output: one datamfrane has 3 columns more
def add3colDf(df_train_filtered):
    
    # be aware it will be the new dataframe
    new_list = df_train_filtered.columns.to_list()
    t_list = ['text_between', 'ele1_left_tokens', 'ele2_right_tokens']
    new_list.extend(t_list)
    df = pd.DataFrame(columns = new_list)

    for ind, rows in df_train_filtered.iterrows():
        text_between = rows['tokens'][rows['ele1_word_idx'][1]+1: rows['ele2_word_idx'][0]]
        ele1_left_tokens = rows['tokens'][:rows['ele1_word_idx'][0]]
        ele2_right_tokens = rows['tokens'][rows['ele2_word_idx'][1]+1:]

        ### add the new row
        dict_newrow = dict(rows)
        dict_newrow.update({ 'ele1_left_tokens': ele1_left_tokens, 
                  'text_between': text_between, 'ele2_right_tokens': ele2_right_tokens})

        ### new dataframe
        df = df.append(dict_newrow, ignore_index = True)
        
    return df


In [91]:
### Training our End Extraction Model (LSTM)

from tf_model_03 import get_model, get_feature_arrays
from utils_03 import get_n_epochs

# prepare the train dataframe with more columns
df_train_filtered_3more = add3colDf(df_train_filtered)

X_train = get_feature_arrays(df_train_filtered_3more)
model = get_model()
batch_size = 64
model.fit(X_train, probs_train_filtered, batch_size=batch_size, epochs=get_n_epochs())


Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<tensorflow.python.keras.callbacks.History at 0x1d6595748>

In [97]:
### Evaluating the trained model on the test set(LSTM)

# prepare the test dataframe with more columns
df_test_3more = add3colDf(df_test)

X_test = get_feature_arrays(df_test_3more)
probs_test = model.predict(X_test)
preds_test = probs_to_preds(probs_test)

# change the results only containing <0 and 1>
Y_test_01 = np.array([x if x==1 else 0 for x in Y_test])

print(f"Test accuracy when trained with soft labels: {metric_score(Y_test_01, preds_dev, probs=probs_dev, metric='accuracy')}")
print(f"Test precision when trained with soft labels: {metric_score(Y_test_01, preds_dev, probs=probs_dev, metric='precision')}")
print(f"Test recall when trained with soft labels: {metric_score(Y_test_01, preds_dev, probs=probs_dev, metric='recall')}")
print(f"Test F1 when trained with soft labels: {metric_score(Y_test_01, preds_dev, probs=probs_dev, metric='f1')}")
print(f"Test ROC-AUC when trained with soft labels: {metric_score(Y_test_01, preds_dev, probs=probs_dev, metric='roc_auc')}")



Test F1 when trained with soft labels: 0.0004102717117654738
Test ROC-AUC when trained with soft labels: 0.5283391564394855


Test accuracy when trained with soft labels: 0.9967462832074092
Test precision when trained with soft labels: 0.0
Test recall when trained with soft labels: 0.0
Test F1 when trained with soft labels: 0.0
Test ROC-AUC when trained with soft labels: 0.5355835641424763
