In [1]:
from sklearn.metrics import confusion_matrix,classification_report,f1_score,accuracy_score
import pandas as pd
import pickle,spacy,itertools,random,re
nlp = spacy.load('en_core_web_lg')

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Load parsed documents.
with open('../dataset/docs.pkl','rb') as p:
    docs = pickle.load(p)

### Trigger Similarity (Dumbest)

<div class="alert alert-block alert-info">
<b>Trigger similarity approach is basically checking the similarity score of two trigger words with spacy's similarity() method.</b>
</div>

In [4]:
def triggerSimilarity(word1,word2,trs):
    """
    This func. will get triggers and a threshold value. It will check the similarity of the given triggers and
    return 1 if the similarity is greater threshold value, or 0 if similarity is less than threshold. 
    """
    word1 = nlp(max(word1.lower().split()))[0]
    word2 = nlp(max(word2.lower().split()))[0]
    
    if word1.similarity(word2) > trs:
        return 1
    else:
        return 0

In [5]:
def prepare_trigger_pairs():
    """
    This func. will check each mention in each event of each document. Pair the coreferent triggers. Then, add some
    negative samples of mention pairs. 
    """
    allTrigers = []
    for doc in docs:
        pCombs = []
        ments = []
        for event in doc['Events']:
            mentions = []
            for mention in event['MENTIONS']:
                mentions.append(mention['TEXT'])
            mentions = list(set(mentions))  
            ments.append(mentions)
            combinations = list(itertools.combinations(mentions,2))
            for c in combinations:
                c= list(c)
                c.append(1)
                pCombs.append(c)
        try:
            nCombs = []   
            for im in range(len(ments)):
                for trig in ments[im]:
                    if random.sample([1,2,3,4],1)[0] == 1: #[BALANCED]#
                        M = ments.copy()
                        M.pop(im)

                        nTrig = random.sample(random.sample(M,1)[0],1)[0]
                        nCombs.append([trig,nTrig,0])
            allTrigers += pCombs+nCombs
        except:
            pass
    return allTrigers

In [6]:
allTrigers = prepare_trigger_pairs()

In [13]:
for trs in [0.30,0.315,0.33,0.339]:
    df = pd.DataFrame(allTrigers,columns=['word1','word2','tLabel'])
    df['pLabel'] = [triggerSimilarity(df.iloc[i].word1,df.iloc[i].word2,trs) for i in range(len(df))]
    print(f1_score(df.tLabel,df.pLabel))
    print(classification_report(df.tLabel,df.pLabel))

0.637989366843886
              precision    recall  f1-score   support

           0       0.79      0.47      0.59      1138
           1       0.52      0.82      0.64       805

    accuracy                           0.61      1943
   macro avg       0.65      0.64      0.61      1943
weighted avg       0.68      0.61      0.61      1943

0.6362735381565906
              precision    recall  f1-score   support

           0       0.78      0.50      0.61      1138
           1       0.53      0.80      0.64       805

    accuracy                           0.62      1943
   macro avg       0.65      0.65      0.62      1943
weighted avg       0.67      0.62      0.62      1943

0.6454592098512058
              precision    recall  f1-score   support

           0       0.78      0.55      0.64      1138
           1       0.55      0.78      0.65       805

    accuracy                           0.64      1943
   macro avg       0.66      0.66      0.64      1943
weighted avg      

- *There are no random parameter of this approach. The only hyperparameter is the similarity threshold.*
- **Best threshold was .33 and got ~0.64 F1 Score.**

### Trigger+Context Similarity (Second Dumbest)

<div class="alert alert-block alert-info">
<b>Trigger+Context similarity approach is basically checking the similarity score of previousWord+trigger1+nextWord and previousWord+trigger2+nextWord with spacy's similarity() method.</b>
</div>

In [8]:
def trigger_contextSimilarity(triggerNcontext1,triggerNcontext2,trs):
    triggerNcontext1 = nlp(triggerNcontext1.lower())
    triggerNcontext2 = nlp(triggerNcontext2.lower())
    
    if triggerNcontext1.similarity(triggerNcontext2) > trs:
        return 1
    else:
        return 0

In [9]:
def triggerNcontext(trig,charseq,raw):
    """
    This func. will get the trigger word with previous and next word.
    """
    return re.search(f'.\s{trig}\s[\w\d]*\s[\w\d]*|[\w\d]*\s[\w\d]*\s{trig}\.|[\w\d]*\s{trig}\s[\w\d]*',raw[charseq[0]-35:charseq[1]+35])[0].replace('\n',' ').strip()

In [14]:
def prepare_triggerNcontexts_pairs():
    """
    This func. will check each mention in each event of each document. Pair the coreferent triggers+context words.
    Then, add some negative samples of mention pairs. 
    """
    allTrigerContexts = []
    for doc in docs:
        pCombs = []
        ments = []
        for event in doc['Events']:
            mentions = []
            for mention in event['MENTIONS']:
                try:
                    mentions.append(triggerNcontext(mention['TEXT'],mention['CHARSEQ'],doc['rawSGM']))
                except:
                    mentions.append(mention['TEXT'])
            mentions = list(set(mentions))  
            ments.append(mentions)
            combinations = list(itertools.combinations(mentions,2))
            for c in combinations:
                c= list(c)
                c.append(1)
                pCombs.append(c)
        try:
            nCombs = []   
            for im in range(len(ments)):
                for trig in ments[im]:
                    if random.sample([1,2],1)[0] == 1:
                        M = ments.copy()
                        M.pop(im)

                        nTrig = random.sample(random.sample(M,1)[0],1)[0]
                        nCombs.append([trig,nTrig,0])
            allTrigerContexts += pCombs+nCombs
        except:
            pass
    return allTrigerContexts

In [15]:
allTriggerContexts = prepare_triggerNcontexts_pairs()

In [17]:
for trs in [0.595,0.6,0.61]:
    df = pd.DataFrame(allTriggerContexts,columns=['word1','word2','tLabel'])
    df['pLabel'] = [trigger_contextSimilarity(df.iloc[i].word1,df.iloc[i].word2,trs) for i in range(len(df))]
    print(f1_score(df.tLabel,df.pLabel))
    print(classification_report(df.tLabel,df.pLabel))

0.7007903650733911
              precision    recall  f1-score   support

           0       0.75      0.59      0.66      2617
           1       0.63      0.78      0.70      2374

    accuracy                           0.68      4991
   macro avg       0.69      0.69      0.68      4991
weighted avg       0.69      0.68      0.68      4991

0.6991622239146992
              precision    recall  f1-score   support

           0       0.75      0.60      0.67      2617
           1       0.64      0.77      0.70      2374

    accuracy                           0.68      4991
   macro avg       0.69      0.69      0.68      4991
weighted avg       0.69      0.68      0.68      4991

0.6960077896786757
              precision    recall  f1-score   support

           0       0.74      0.63      0.68      2617
           1       0.65      0.75      0.70      2374

    accuracy                           0.69      4991
   macro avg       0.69      0.69      0.69      4991
weighted avg     

- *There are no random parameter of this approach. The only hyperparameter is the similarity threshold.*
- **Best threshold was .595 and got ~0.70 F1 Score.**

### **TO DO:**
- Use different class distributions. (Imbalanced)
- Use arguments?
- Build clusters to check with different metrics (MUC, CEAF, B^3, BLANC)