In [1]:
import pandas as pd
import numpy as np
from spacy.matcher import Matcher 
import spacy
import snorkel
from sklearn.model_selection import train_test_split
from snorkel.labeling.model import LabelModel

import tensorflow as tf
from tensorflow.keras.layers import (
    Bidirectional,
    Concatenate,
    Dense,
    Embedding,
    Input,
    LSTM,
)


  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
import warnings
warnings.filterwarnings('ignore')

# Relation extraction based on Weak-Supervision

In this notebook, we aim to extract supply-chain relations using weak-supervision concepts.
The datasets are the text extracted from 10-K on SEC.

In [3]:
# We first load the Spacy vocab core
nlp = spacy.load("en_core_web_lg")

## Data Load and Pre-processing

In [4]:
def string_to_tuple(df):
    """
        This function serves to change the position tuple into a list
    """
    tups = []
    
    for idx, row in df.iterrows():
        temp = []

        for tok in row['position'].split(", "):
            num = int(tok.replace("(", "").replace(")", "")) 
            temp.append(num) 
        
        tups.append(temp)
    return tups

In [5]:
def load_data():
    """
        This function loads the data and replaces some values
    """
    # load data
    df = pd.read_csv("final_merged.csv")
    df = df[['Column','source','target_x','supply','sentence','position']]
    df = df.rename(columns={'target_x':'target'})
    df = df[df['position'].notnull()]
    
    # change string into tuples for the position value 
    tups_lst = string_to_tuple(df)
    df['position'] = tups_lst

    # replace string to numbers
    df['supply'] = df['supply'].replace('0',0)
    df['supply'] = df['supply'].replace('0.0',0)
    df['supply'] = df['supply'].replace('1.0',1)
    df['supply'] = df['supply'].replace('1',1)
    df['supply'] = df['supply'].replace('?',0)
    
    return df

In [6]:
def data_preprocess(df):
    """
        This function pre-processes the loaded data.
        For each given sentence, we create a list of tokens. 
        Then, based on the position of the named entity(company name), we extract the left and right tokens
        Then, it splits the data into dev, train, and test set as pandas dataframes.
    """
    # Initiate new lists to store the pre-processed values
    tokens_lst = []
    left_tokens_lst = []
    right_tokens_lst = []

    # Data pre-processing
    for idx, row in df.iterrows():

        # change the sentence into spacy's object
        doc = nlp(row['sentence'])

        # token list of the sentence        
        toks = [tok.orth_ for tok in doc]

        # store position
        start, end = row['position']

        # append the values to the lists
        tokens_lst.append(toks)
        left_tokens_lst.append(toks[:start])
        right_tokens_lst.append(toks[end+1:])

    # Assign those computed lists into the datafamr
    df['tokens'] = tokens_lst
    df['left_tokens'] = left_tokens_lst
    df['right_tokens'] = right_tokens_lst

    # split the dataframe based on the labels: 1, 0, unknown
    df_zero = df[df['supply'] == 0]
    df_one = df[df['supply'] == 1]
    df_null = df[df['supply'].isnull()]

    # types into integers
    df_zero['supply'] = df_zero['supply'].astype('int64')
    df_one['supply'] = df_one['supply'].astype('int64')

    # unlabeled data become training set
    df_train = df_null[['source','target','sentence','position','tokens','left_tokens','right_tokens']]

    # creating the dataframes
    X_one = df_one[['source','target','sentence','position','tokens','left_tokens','right_tokens']]
    Y_one = np.array(df_one['supply'])


    X_zero = df_zero[['source','target','sentence','position','tokens','left_tokens','right_tokens']]
    Y_zero = np.array(df_zero['supply'])


    # split the labeled dataframe into dev and test set
    X_one_val, X_one_test, Y_one_val, Y_one_test = train_test_split(X_one, Y_one, test_size = 0.5)
    X_zero_val, X_zero_test, Y_zero_val, Y_zero_test = train_test_split(X_zero, Y_zero, test_size = 0.5)

    # concatenate the 1, 0 labeled data
    df_dev = pd.concat([X_one_val,X_zero_val])
    Y_dev = np.append(Y_one_val,Y_zero_val)

    df_test = pd.concat([X_one_test,X_zero_test])
    Y_test = np.append(Y_one_test,Y_zero_test)

    return df_dev, Y_dev, df_train, df_test, Y_test


In [7]:
# we load the data
df = load_data()

# pre-process the data and split then into dev, train, and test set
df_dev, Y_dev, df_train, df_test, Y_test = data_preprocess(df)

FileNotFoundError: [Errno 2] File b'final_merged.csv' does not exist: b'final_merged.csv'

## Weak-supervision

We defined different labeling functions to extract supply-chain relations from the text.
Please note that those labeling functions can be re-used to extract other relations

### Labeling Functions

In [None]:
from snorkel.labeling import labeling_function

# Possible outputs for labeling function. 
POSITIVE = 1
NEGATIVE = 0
ABSTAIN = -1

# Check for the `spouse` words appearing between the person mentions
supplying = {"supplier", "supplied",  "supplying", "supplies", "supply"}
@labeling_function(resources=dict(supplying=supplying))
def lf_supply(row, supplying):
    for term in supplying:
        if term in row['sentence']:
             return POSITIVE
    return ABSTAIN

customer = {"customers","customer"}
@labeling_function(resources=dict(customer=customer))
def lf_customer(row, customer):
    for term in customer:
        if term in row['sentence']:
            return POSITIVE
    return ABSTAIN

sales_to = {"sales to"}
@labeling_function(resources=dict(sales_to=sales_to))
def lf_sales_to(row, sales_to):
    for term in sales_to:
        if term in row['sentence']:
            return POSITIVE
    return ABSTAIN

our_customer = {"our", "customers"}
@labeling_function(resources=dict(our_customer=our_customer))
def lf_our_customer(row, our_customer):
    if "our" in row['sentence'] and "customers" in row['sentence']:
        return POSITIVE
    return ABSTAIN

acquisition= {"acquisition", "acquired"}
@labeling_function(resources=dict(acquisition=acquisition))
def lf_acquisition(row, acquisition):
    for term in acquisition:
        if term in row['sentence']:
            return NEGATIVE
    return ABSTAIN

people = {"CEO",'ceo','manager','Manager','Mr.','Mrs.','Ms.'}
@labeling_function(resources=dict(people=people))
def lf_people(row, people):
    for term in people:
        if term in row['sentence']:
            return NEGATIVE
    return ABSTAIN

sold = {"sold to"}
@labeling_function(resources=dict(sold=sold))
def lf_sold(row, sold):
    for term in sold:
        if term in row['sentence']:
            return POSITIVE
    return ABSTAIN

relations = {"relationship","with"}
@labeling_function(resources=dict(relations=relations))
def lf_relation(row, relations):
    if "relation" in row['sentence'] and "with" in row['sentence']:
        return POSITIVE
    return ABSTAIN

competition = {"competitors","competition"}
@labeling_function(resources=dict(competition=competition))
def lf_competition(row, competition):
    for term in competition:
        if term in row['sentence']:
            return NEGATIVE
    return ABSTAIN

### Apply LFs

In [8]:
from snorkel.labeling import PandasLFApplier
from snorkel.labeling import LFAnalysis
from snorkel.analysis import metric_score
from snorkel.utils import probs_to_preds

# Accumulate all the labeling_functions for supply
supply_lfs = [
    lf_supply,
    lf_customer,
    lf_sales_to,
    lf_our_customer,
    lf_acquisition,
    lf_people,
    lf_sold,
    lf_relation,
    lf_competition
]

# Apply the above labeling functions to the data in Pandas dataframe formats
applier = PandasLFApplier(supply_lfs)

# Use the applier of the labeling functions to both development set and train set
L_dev = applier.apply(df_dev)
L_train = applier.apply(df_train)

# Analyze the performance of the labeling functions.
# Our development set had hand-labeled labels so we can check the accuracies

LFAnalysis(L_dev, lfs).lf_summary(Y_dev)

NameError: name 'lf_supply' is not defined

### Prediction and evaluation

In [255]:
# Baseline model: Majority voting among all the labeling functions
from snorkel.labeling.model import MajorityLabelVoter
from snorkel.labeling import filter_unlabeled_dataframe

majority = MajorityLabelVoter()
preds_train = majority.predict(L=L_train)

""" 
    Our model: Snorkel Label Model
    This model considers the probabilistic aspects of the labeling functions
    It produces a single set of noise-aware labels
"""

# caridnality : 2 (True and False)
label_model = LabelModel(cardinality=2, verbose=True)

# Fit the label_model
label_model.fit(L_train, Y_dev, n_epochs=5000, log_freq=500)

# accruacy for the majority model using the test set
majority_acc = majority.score(L=L_test, Y=Y_test, tie_break_policy="random")[
    "accuracy"
]
print(f"{'Majority Vote Accuracy:':<25} {majority_acc * 100:.1f}%")

# accuracy for the label model using the test set
label_model_acc = label_model.score(L=L_test, Y=Y_test, tie_break_policy="random")[
    "accuracy"
]
print(f"{'Label Model Accuracy:':<25} {label_model_acc * 100:.1f}%")
      
# check the F-1 score and ROC_AUC score
probs_dev = label_model.predict_proba(L_dev)
preds_dev = probs_to_preds(probs_dev)
print(
    f"Label model f1 score: {metric_score(Y_dev, preds_dev, probs=probs_dev, metric='f1')}"
)
print(
    f"Label model roc-auc: {metric_score(Y_dev, preds_dev, probs=probs_dev, metric='roc_auc')}"
)

Majority Vote Accuracy:   78.0%
Label Model Accuracy:     90.4%
Label model f1 score: 0.6293333333333333
Label model roc-auc: 0.940890357300998


## Train the extraction model

The output of the label model are the set of probabilities among the binary choice. Those probabilities(train labels) still contain noises. To achieve a high accuracy of the model, we can utilize the tokens of the sentences to train our end extraction model.

In this case, we filter out those data points that did not receive any labels from any of the labelinng function to minimize the potential noises. Then, we will train a bidrectional LSTM model with the train data points.

### Data filtering

In [234]:
"""To train the extraction model, 
we first output the probabilities of the binary choices: True and False from our label model.
Then, using the probabilities, we train our end model
"""

# extract the probabiliteis from the training set using our label model
probs_train = label_model.predict_proba(L_train)

# Since we cannot use the data points that did not receive any labels (Not covered by our labeling functions),
# we filter them out

# extract only the data points that received any labels from the labeling functions
df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe(
    X=df_train, y=probs_train, L=L_train
)


In [272]:
from typing import Tuple

#def uniform_length(df: pd.DataFrame) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
def uniform_length(df: pd.DataFrame) -> Tuple[np.ndarray, np.ndarray]:

    """ 
        since the length of sentence varies much, we make the lengths uniform
    """
    
    # extract three: tokens, left_tokens, right_tokens
    toks = df.tokens
    left_toks = df.left_tokens
    right_toks = df.right_tokens
    
    
    def token_filter(l, max_len=50):
        return l[:max_len] + [""] * (max_len - len(l))

    tokens = np.array(list(map(token_filter, toks)))
    left_tokens = np.array(list(map(token_filter, left_toks)))
    right_tokens = np.array(list(map(token_filter, right_toks)))
    
    return left_tokens, right_tokens
    #return tokens, left_tokens, right_tokens

In [273]:
def bidirectional_lstm(tokens: tf.Tensor, rnn_state_size: int = 64, num_buckets: int = 40000, embed_dim: int = 36,):
    """
        Bidirectional LSTM model
    """
    
    # Converts each string in the input Tensor to its hash mod by a number of buckets.
    ids = tf.strings.to_hash_bucket_fast(tokens, num_buckets)
    
    # Turns positive integers (indexes) into dense vectors of fixed size
    embedded_input = Embedding(num_buckets, embed_dim)(ids)
    
    # return the bidrecitonal LSTM
    return Bidirectional(LSTM(rnn_state_size, activation=tf.nn.relu))(
        embedded_input, mask=tf.strings.length(tokens)
    )


In [None]:
def rnn_model(
    rnn_state_size: int = 64, num_buckets: int = 40000, embed_dim: int = 12) -> tf.keras.Model:
    
    """
    This lstm model predicts the label probailities given the embedded tokens
    
    rnn_state_size: state size of LSTM model
    num_buckets: Number of buckets to hash strings to integers
    embed_dim: Size of token embeddings

    """
    #toks_ph = Input((None,), dtype="string")
    #toks_embs = bidirectional_lstm(b_ph, rnn_state_size, num_buckets, embed_dim
    #layer = Concatenate(1)([left_embs, bet_embs, right_embs])
    
    # Instantiate Input Keras Object. Data type : string
    left_obj = Input((None,), dtype="string")
    right_obj = Input((None,), dtype="string")
    
    # intput embeddings
    left_lstm = bidirectional_lstm(left_obj, rnn_state_size, num_buckets, embed_dim)
    right_lstm = bidirectional_lstm(right_obj, rnn_state_size, num_buckets, embed_dim)
    
    # concatenate two inputs
    layer = Concatenate(1)([left_lstm, right_lstm])
    
    # Dense layers with relu activations
    layer = Dense(64, activation=tf.nn.relu)(layer)
    layer = Dense(32, activation=tf.nn.relu)(layer)
    
    # Output layer with softmax activation
    probabilities = Dense(2, activation=tf.nn.softmax)(layer)
    
    #  final model using the characteristics above
    model = tf.keras.Model(inputs=[left_obj, right_obj], outputs=probabilities)
    
    #model = tf.keras.Model(inputs=[bet_ph, left_ph, right_ph], outputs=probabilities)
    
    # compile the model: AdagradOptimizer, cross_entropy
    model.compile(tf.train.AdagradOptimizer(0.1), "categorical_crossentropy")
    return model

In [309]:
X_train = uniform_length(df_train_filtered)
model = rnn_model()
batch_size = 64
model.fit(X_train, probs_train_filtered, batch_size=batch_size, epochs=50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x1aff1b4ef0>

### Evaluation

After filtting the model, we evaluate our results using the test set. Note that the test set is fairly unbalanced. 

In [321]:
X_test = uniform_length(df_test)
probs_test = model.predict(X_test)
preds_test = probs_to_preds(probs_test)

print(
    f"Test accuracy when trained with soft labels: {metric_score(Y_test, preds=preds_test, metric='accuracy')}"
)

print(
    f"Test F1 when trained with soft labels: {metric_score(Y_test, preds=preds_test, metric='f1')}"
)
print(
    f"Test ROC-AUC when trained with soft labels: {metric_score(Y_test, probs=probs_test, metric='roc_auc')}"
)

Test accuracy when trained with soft labels: 0.8053668478260869
Test F1 when trained with soft labels: 0.4609595484477893
Test ROC-AUC when trained with soft labels: 0.9074948056813925


### Close-up

Let's look at what sentences did our model label as supply relations

In [331]:
# predict the probabilities of the train set
sp = model.predict(X_train)

# outputs
sp_label = probs_to_preds(sp)

# create a temp df
temp_train = df_train_filtered[['source','target','sentence']]
temp_train['supply'] = sp_label

pd.set_option('display.max_colwidth', -1)
temp_train[temp_train['supply'] == 1]


Unnamed: 0,source,target,sentence,supply
31,"Motivnation, Inc.",TrixMotive,"On December 7, 2005, a customer of TrixMotive filed a lawsuit in the Superior Court of Santa Clara County of California against TrixMotive claiming for breach of contract and warranty, intentional and negligence misrepresentation for a customized vehicle.",1
32,"Motivnation, Inc.",TrixMotive,"On December 7, 2005, a customer of TrixMotive filed a lawsuit in the Superior Court of Santa Clara County of California against TrixMotive claiming for breach of contract and warranty, intentional and negligence misrepresentation for a customized vehicle.",1
36,"Motivnation, Inc.",TrixMotive,"On January 24, 2008, a customer of TrixMotive filed a lawsuit in the Superior Court of Middlesex County of New Jersey against TrixMotive claiming for breach of contract and warranty, intentional and negligence misrepresentation for a customized vehicle.",1
37,"Motivnation, Inc.",TrixMotive,"On January 24, 2008, a customer of TrixMotive filed a lawsuit in the Superior Court of Middlesex County of New Jersey against TrixMotive claiming for breach of contract and warranty, intentional and negligence misrepresentation for a customized vehicle.",1
160,Stanadyne Corp,John Deere,"Deere was the only customer that accounted for more than 10% of Stanadyne’s net sales in 2012 and 2011 , at 41.4% , and 38.3% , respectively.",1
...,...,...,...,...
38846,"Remy International, Inc.",Hyundai,"Net sales to our other largest customer, Hyundai, accounted for approximately 12% , 10% , 9% , and 9% of our net sales for the years ended December 31, 2014 and 2013 , the period August 15, 2012 to December 31, 2012 and the period January 1, 2012 to August 14, 2012 , respectively.",1
38849,"Remy International, Inc.",Hyundai,Hyundai is our fastest growing OEM customer.,1
38897,"Remy International, Inc.",Hyundai,"In 2014, Hyundai became our largest customer and accounted for approximately 12% and 10% of our net sales for the years ended December 31, 2014 and 2013 , respectively.",1
38898,"Remy International, Inc.",General Motors Co,"GM, our second largest customer, accounted for 12% and 16% of our net sales for 2014 and 2013 , respectively.",1
