In [1]:
import torch
# Ask PyTorch to release its cached, currently unused GPU memory before the run starts.
torch.cuda.empty_cache()

In [2]:
import sklearn
import pandas as pd
import numpy as np

In [3]:
import os
from os import listdir
import sys
import json

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn import metrics

In [5]:
from transformers import AutoTokenizer
from datasets import Dataset, DatasetDict
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer


In [6]:
# Root directory of the annotation-related data. Defaults to the original cluster
# location; set the ANNOTATIONS_PROJECT_DIR environment variable to run the notebook
# on a machine where the data lives somewhere else.
project_dir = os.environ.get(
    "ANNOTATIONS_PROJECT_DIR",
    "/xdisk/msurdeanu/alexeeva/data/annotations-related",
)

In [7]:
# Directory whose .tsv files are read further down as the annotated data set.
annotated_data = os.path.join(project_dir, "annotated_as_of_dec13_both_uganda_and_rice")
# Echo the resolved path as the cell output.
annotated_data

'/xdisk/msurdeanu/alexeeva/data/annotations-related/annotated_as_of_dec13_both_uganda_and_rice'

In [8]:
# df = pd.read_csv(os.path.join(annotated_data, "Subtask1-MainTask-double-annotation-prep-as-of-Nov2.tsv"), sep="\t")

In [9]:
# df.head()

In [10]:
# Load the annotated data: read every .tsv file in `annotated_data` and stack the rows.
annotation_columns = [
    "paragraph",
    "mention text (just a few words around the trigger)",
    "trigger",
    "sentence",
    "annotation: b (belief or attitude), n (not a belief and not an attitude)",
]
annotated_frames = []
# os.listdir returns entries in arbitrary order; sorting keeps the row order (and with
# it the seeded cross-validation splits computed later) stable between runs.
for file in sorted(listdir(annotated_data)):
    print(file)
    if not file.endswith(".tsv"):
        continue
    f_path = os.path.join(annotated_data, file)
    # Rows with a missing value in any of the selected columns are discarded.
    temp_df = pd.read_csv(f_path, sep='\t', usecols=annotation_columns).dropna()
    print(len(temp_df))
    annotated_frames.append(temp_df)

# Concatenate once instead of growing the frame inside the loop (which re-copies all
# previously loaded rows on every iteration); fall back to an empty frame when the
# directory contains no .tsv file.
adf = pd.concat(annotated_frames) if annotated_frames else pd.DataFrame()

Subtask1-MainTask-double-annotation-prep-as-of-Nov2.tsv
1026


In [11]:
adf.head()

Unnamed: 0,sentence,trigger,mention text (just a few words around the trigger),"annotation: b (belief or attitude), n (not a belief and not an attitude)",paragraph
0,"Their willingness to use storage is , however ...",willingness,"Their willingness to use storage is, however, ...",b,"Exhibit 14: Average Quantity of Maize, Rice an..."
1,"They appreciated the participatory approach , ...",consider,they consider,b,"As part of our survey, most farmers recognize ..."
2,"Based on this study , 30 m of Shuttle Radar To...",considered,) data was considered in our CFD simulation us...,n,"In terms of simulation accuracy, the shapes of..."
3,Mechanical threshing was more popular in the D...,popular,was more popular in the Delta than in he Middl...,b,The Delta had less bird damage than the Middle...
4,"The authors hope that the thriving features , ...",hope,"The authors hope that the thriving features, c...",n,This monograph describes the genetic resources...


In [12]:
# Trim surrounding whitespace from every sentence, then keep a single row for each
# (sentence, mention text) pair.
adf["sentence"] = adf["sentence"].map(lambda sentence: sentence.strip())
dedup_keys = ["sentence", "mention text (just a few words around the trigger)"]
adf = adf.drop_duplicates(subset=dedup_keys)

In [13]:
# len(df)

In [14]:
# Annotation values ("b" / "n", per the column header) of the rows kept after deduplication.
anns = adf["annotation: b (belief or attitude), n (not a belief and not an attitude)"]

In [15]:
list(anns).count("b")

377

In [16]:
# percentage of sentences annotated as beliefs
386/1026

0.3762183235867446

In [17]:
# annotated + sampled
df = adf
print("Annotated =", len(df))

Annotated = 1001


In [18]:
# Binary target: 1 for rows annotated "b", 0 for every other annotation value.
annotation_col = 'annotation: b (belief or attitude), n (not a belief and not an attitude)'
df['label'] = np.where(df[annotation_col] == "b", 1, 0)

In [19]:
df.head()

Unnamed: 0,sentence,trigger,mention text (just a few words around the trigger),"annotation: b (belief or attitude), n (not a belief and not an attitude)",paragraph,label
0,"Their willingness to use storage is , however ...",willingness,"Their willingness to use storage is, however, ...",b,"Exhibit 14: Average Quantity of Maize, Rice an...",1
1,"They appreciated the participatory approach , ...",consider,they consider,b,"As part of our survey, most farmers recognize ...",1
2,"Based on this study , 30 m of Shuttle Radar To...",considered,) data was considered in our CFD simulation us...,n,"In terms of simulation accuracy, the shapes of...",0
3,Mechanical threshing was more popular in the D...,popular,was more popular in the Delta than in he Middl...,b,The Delta had less bird damage than the Middle...,1
4,"The authors hope that the thriving features , ...",hope,"The authors hope that the thriving features, c...",n,This monograph describes the genetic resources...,0


In [20]:
# from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Name of the pretrained checkpoint; it is used for the tokenizer here and again when
# the per-fold models are created in the cross-validation loop.
# Disabled alternative checkpoint:
# transformer_name = 'roberta-base'
transformer_name = "bert-base-cased"
# The model is deliberately not created in this cell: for cross validation it has to
# be initialized inside the CV loop so that each fold starts from a fresh model.
# model = AutoModelForSequenceClassification.from_pretrained(transformer_name, num_labels=2)
tokenizer = AutoTokenizer.from_pretrained(transformer_name)

In [21]:
def tokenize(batch):
    """Tokenize the 'sentence' entries of a batch with the notebook-level tokenizer.

    Truncation is enabled; the tokenizer's output is returned unchanged.
    """
    sentences = batch['sentence']
    encoded = tokenizer(sentences, truncation=True)
    return encoded

In [22]:
from sklearn.metrics import accuracy_score
from sklearn import metrics

def compute_metrics(eval_pred):
    """Print a per-class report and return the F1 score for one evaluation pass.

    Args:
        eval_pred: object exposing ``label_ids`` (the gold labels) and
            ``predictions`` (per-class scores); the arg-max over the last axis
            of the scores is taken as the predicted class.

    Returns:
        dict: ``{'f1': <score>}`` where the score comes from
        ``sklearn.metrics.f1_score`` called with its default settings.
    """
    y_true = eval_pred.label_ids
    y_pred = np.argmax(eval_pred.predictions, axis=-1)
    # Shown on every evaluation so per-class precision/recall can be followed
    # alongside the single F1 number that is returned.
    print("report: \n", metrics.classification_report(y_true, y_pred))
    return {'f1': metrics.f1_score(y_true, y_pred)}

In [23]:


from torch import nn
from transformers.modeling_outputs import SequenceClassifierOutput
from transformers.models.bert.modeling_bert import BertModel, BertPreTrainedModel

# https://github.com/huggingface/transformers/blob/65659a29cf5a079842e61a63d57fa24474288998/src/transformers/models/bert/modeling_bert.py#L1486

class BertForSequenceClassification(BertPreTrainedModel):
    """BERT encoder followed by dropout and a linear classifier.

    The classifier reads the hidden state at sequence position 0 of the
    encoder's last layer and produces ``config.num_labels`` logits.
    """

    def __init__(self, config):
        """Build the encoder, the dropout layer and the classification head from `config`."""
        super().__init__(config)
        self.num_labels = config.num_labels
        self.bert = BertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
        self.init_weights()

    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, labels=None, **kwargs):
        """Run the encoder and classify each sequence.

        Extra keyword arguments are forwarded to the encoder. When `labels` is
        given, a cross-entropy loss between the logits and the labels is
        included in the output; otherwise the loss is None.
        """
        encoder_out = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            **kwargs,
        )
        # Hidden state at position 0 of every sequence in the batch.
        first_token_state = encoder_out.last_hidden_state[:, 0, :]
        logits = self.classifier(self.dropout(first_token_state))

        if labels is None:
            loss = None
        else:
            loss = nn.CrossEntropyLoss()(logits, labels)

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=encoder_out.hidden_states,
            attentions=encoder_out.attentions,
        )



In [24]:
# trainer = Trainer(
#     model=model,
#     args=training_args,
#     compute_metrics=compute_metrics,
#     train_dataset=train_ds,
#     eval_dataset=eval_ds,
#     tokenizer=tokenizer,
#     )

In [25]:
from sklearn.model_selection import KFold

In [26]:
from transformers import AutoConfig


In [27]:
def get_sample_based_on_idx(data, indeces):
    """Select rows of `data` by integer position and renumber them from 0.

    The previous index is not discarded: `reset_index()` moves it into a
    regular column (named 'index' for an unnamed index).
    """
    selected_rows = data.iloc[indeces, :]
    return selected_rows.reset_index()

In [28]:
df.head(50)

Unnamed: 0,sentence,trigger,mention text (just a few words around the trigger),"annotation: b (belief or attitude), n (not a belief and not an attitude)",paragraph,label
0,"Their willingness to use storage is , however ...",willingness,"Their willingness to use storage is, however, ...",b,"Exhibit 14: Average Quantity of Maize, Rice an...",1
1,"They appreciated the participatory approach , ...",consider,they consider,b,"As part of our survey, most farmers recognize ...",1
2,"Based on this study , 30 m of Shuttle Radar To...",considered,) data was considered in our CFD simulation us...,n,"In terms of simulation accuracy, the shapes of...",0
3,Mechanical threshing was more popular in the D...,popular,was more popular in the Delta than in he Middl...,b,The Delta had less bird damage than the Middle...,1
4,"The authors hope that the thriving features , ...",hope,"The authors hope that the thriving features, c...",n,This monograph describes the genetic resources...,0
5,"Of these , the EAC , COMESA , and SADC are all...",considered,SADC are all considered significant regional i...,n,"Of these, the EAC, COMESA, and SADC are all co...",0
6,"Moreover , consumers already prefer local rice...",prefer,", consumers already prefer local rice in that ...",b,Since the introduction of improved rice variet...,1
7,You need to be especially careful with items t...,want,don’t want to inadvertently violate the EAR.,n,"Suppliers, consultants, and other third-partie...",0
8,"Needless to say , it is the reflection of the ...",say,"Needless to say, it is the reflection of the c...",n,"After PAPRIZ started its activity, it became c...",0
9,Government and political stability were long r...,perceived,is no longer perceived as a major constraint.,b,New actions are possible to address the multip...,1


In [29]:
len(df)

1001

In [30]:
set(df["label"])

{0, 1}

In [31]:
# Make the sentence available under a second column name, "text"
# (the tokenize() helper in this notebook reads the "sentence" column).
df["text"] = df["sentence"]

In [32]:
df.head()

Unnamed: 0,sentence,trigger,mention text (just a few words around the trigger),"annotation: b (belief or attitude), n (not a belief and not an attitude)",paragraph,label,text
0,"Their willingness to use storage is , however ...",willingness,"Their willingness to use storage is, however, ...",b,"Exhibit 14: Average Quantity of Maize, Rice an...",1,"Their willingness to use storage is , however ..."
1,"They appreciated the participatory approach , ...",consider,they consider,b,"As part of our survey, most farmers recognize ...",1,"They appreciated the participatory approach , ..."
2,"Based on this study , 30 m of Shuttle Radar To...",considered,) data was considered in our CFD simulation us...,n,"In terms of simulation accuracy, the shapes of...",0,"Based on this study , 30 m of Shuttle Radar To..."
3,Mechanical threshing was more popular in the D...,popular,was more popular in the Delta than in he Middl...,b,The Delta had less bird damage than the Middle...,1,Mechanical threshing was more popular in the D...
4,"The authors hope that the thriving features , ...",hope,"The authors hope that the thriving features, c...",n,This monograph describes the genetic resources...,0,"The authors hope that the thriving features , ..."


In [33]:
# Fine-tuning hyper-parameters; the resulting arguments object is reused for every
# cross-validation fold.
num_epochs = 20
batch_size = 8
weight_decay = 0.01

training_args = TrainingArguments(
    # where checkpoints are written / how much is logged
    output_dir="./results",
    log_level='error',
    # optimisation settings
    num_train_epochs=num_epochs,
    weight_decay=weight_decay,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # evaluate and checkpoint once per epoch, then reload the checkpoint with the
    # best "eval_f1" when training finishes
    evaluation_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    metric_for_best_model="eval_f1",
)

In [34]:


# 10-fold cross validation: for every fold a fresh model is fine-tuned on the training
# split and scored on the held-out split.
annotation_column = 'annotation: b (belief or attitude), n (not a belief and not an attitude)'
# Columns dropped from the tokenized datasets (identical for both splits).
columns_to_drop = ['index', 'sentence', 'trigger', annotation_column, 'paragraph']

fold_f1_scores = []
kfold = KFold(n_splits=10, shuffle=True, random_state=1)
for fold, (train_df_idx, eval_df_idx) in enumerate(kfold.split(df)):
    print("FOLD: ", fold)

    train_df = get_sample_based_on_idx(df, train_df_idx)
    print("LEN DF: ", len(train_df))
    print("done train df")
    eval_df = get_sample_based_on_idx(df, eval_df_idx)
    print("done eval df")
    print("LEN EVAL: ", len(eval_df))
    print(eval_df.head())

    ds = DatasetDict()
    ds['train'] = Dataset.from_pandas(train_df)
    ds['validation'] = Dataset.from_pandas(eval_df)
    train_ds = ds['train'].map(tokenize, batched=True, remove_columns=columns_to_drop)
    eval_ds = ds['validation'].map(tokenize, batched=True, remove_columns=columns_to_drop)
    print(ds["validation"])

    # The model is re-created in every fold so no fine-tuned weights carry over
    # from the previous fold.
    config = AutoConfig.from_pretrained(transformer_name, num_labels=2)
    model = BertForSequenceClassification.from_pretrained(transformer_name, config=config)
    trainer = Trainer(
        model=model,
        args=training_args,
        compute_metrics=compute_metrics,
        train_dataset=train_ds,
        eval_dataset=eval_ds,
        tokenizer=tokenizer,
    )
    trainer.train()

    # training_args sets load_best_model_at_end=True, so the prediction below is
    # made with the checkpoint that reached the best eval_f1 during training.
    preds = trainer.predict(eval_ds)
    final_preds = np.argmax(preds.predictions, axis=-1)
    # Labels are read once from the pandas frame; indexing the datasets column inside
    # a loop would rebuild the whole column on every access.
    true_labels = eval_df["label"].to_numpy()

    # f1_score expects (y_true, y_pred) in that order.
    real_f1 = metrics.f1_score(true_labels, final_preds)
    print("F-1: ", real_f1)
    fold_f1_scores.append(real_f1)

    # Misclassified evaluation examples of this fold, built in one step instead of
    # concatenating one-row frames.
    misclassified = final_preds != true_labels
    new_df = pd.DataFrame({
        "sentence": eval_df["sentence"].to_numpy()[misclassified],
        "real": true_labels[misclassified],
        "predicted": final_preds[misclassified],
    })
    # A wrong prediction of 0 is a false negative; any other wrong prediction is
    # counted as a false positive.
    count_f_n = int((new_df["predicted"] == 0).sum())
    count_f_p = len(new_df) - count_f_n
    print(count_f_n)
    print(count_f_p)

    # Optional export of the errors for manual inspection, e.g.:
    # new_df.to_csv("false_annotations_" + str(fold) + ".tsv", sep="\t")

# Summary over all folds.
print("MEAN F-1: ", np.mean(fold_f1_scores))
    

FOLD:  0
LEN DF:  900
done train df
done eval df
LEN EVAL:  101
   index                                           sentence    trigger  \
0      6  Moreover , consumers already prefer local rice...     prefer   
1     17  Farmers may not need insurance as rice grown i...  perceived   
2     34  Crystal ( 2004 ) found that the world is facin...      found   
3     35  This study contributes by presenting and compa...  perceived   
4     41  While there are good reasons to believe that g...    believe   

  mention text (just a few words around the trigger)  \
0  , consumers already prefer local rice in that ...   
1  yields and was perceived to have lower risks t...   
2                               Crystal (2004) found   
3  econometrically estimated and perceived bird d...   
4  good reasons to believe that general equilibri...   

  annotation: b (belief or attitude), n (not a belief and not an attitude)  \
0                                                  b                        

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

Dataset({
    features: ['index', 'sentence', 'trigger', 'mention text (just a few words around the trigger)', 'annotation: b (belief or attitude), n (not a belief and not an attitude)', 'paragraph', 'label', 'text'],
    num_rows: 101
})


Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

Epoch,Training Loss,Validation Loss,F1
1,No log,0.571428,0.617647
2,No log,0.677112,0.586207
3,No log,0.881812,0.596491
4,No log,1.173479,0.712329
5,0.305000,1.357123,0.666667
6,0.305000,1.268511,0.65625
7,0.305000,1.469745,0.702703
8,0.305000,1.592169,0.646154
9,0.029400,1.427627,0.695652
10,0.029400,1.645962,0.646154


report: 
               precision    recall  f1-score   support

           0       0.82      0.79      0.81        68
           1       0.60      0.64      0.62        33

    accuracy                           0.74       101
   macro avg       0.71      0.72      0.71       101
weighted avg       0.75      0.74      0.74       101

rep type:  <class 'str'>
report: 
               precision    recall  f1-score   support

           0       0.79      0.88      0.83        68
           1       0.68      0.52      0.59        33

    accuracy                           0.76       101
   macro avg       0.73      0.70      0.71       101
weighted avg       0.75      0.76      0.75       101

rep type:  <class 'str'>
report: 
               precision    recall  f1-score   support

           0       0.79      0.90      0.84        68
           1       0.71      0.52      0.60        33

    accuracy                           0.77       101
   macro avg       0.75      0.71      0.72     

report: 
               precision    recall  f1-score   support

           0       0.89      0.79      0.84        68
           1       0.65      0.79      0.71        33

    accuracy                           0.79       101
   macro avg       0.77      0.79      0.77       101
weighted avg       0.81      0.79      0.80       101

rep type:  <class 'str'>
F-1:  0.7123287671232875
7
14
FOLD:  1
LEN DF:  901
done train df
done eval df
LEN EVAL:  100
   index                                           sentence     trigger  \
0      2  Based on this study , 30 m of Shuttle Radar To...  considered   
1      3  Mechanical threshing was more popular in the D...     popular   
2      8  Needless to say , it is the reflection of the ...         say   
3     13  An additional 6 farmers are also expected to b...    expected   
4     50  The farmers trust the front-line government sp...       trust   

  mention text (just a few words around the trigger)  \
0  ) data was considered in our CFD s

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

Dataset({
    features: ['index', 'sentence', 'trigger', 'mention text (just a few words around the trigger)', 'annotation: b (belief or attitude), n (not a belief and not an attitude)', 'paragraph', 'label', 'text'],
    num_rows: 100
})




Epoch,Training Loss,Validation Loss,F1
1,No log,0.644154,0.5
2,No log,1.001799,0.684932
3,No log,1.26797,0.727273
4,No log,1.616623,0.714286
5,0.270100,1.543015,0.702703
6,0.270100,1.140814,0.731707
7,0.270100,1.809203,0.704545
8,0.270100,1.788921,0.74359
9,0.036900,1.901645,0.717949
10,0.036900,2.041246,0.691358


report: 
               precision    recall  f1-score   support

           0       0.68      0.98      0.81        59
           1       0.93      0.34      0.50        41

    accuracy                           0.72       100
   macro avg       0.81      0.66      0.65       100
weighted avg       0.79      0.72      0.68       100

rep type:  <class 'str'>
report: 
               precision    recall  f1-score   support

           0       0.76      0.88      0.82        59
           1       0.78      0.61      0.68        41

    accuracy                           0.77       100
   macro avg       0.77      0.75      0.75       100
weighted avg       0.77      0.77      0.76       100

rep type:  <class 'str'>
report: 
               precision    recall  f1-score   support

           0       0.80      0.86      0.83        59
           1       0.78      0.68      0.73        41

    accuracy                           0.79       100
   macro avg       0.79      0.77      0.78     

report: 
               precision    recall  f1-score   support

           0       0.81      0.86      0.84        59
           1       0.78      0.71      0.74        41

    accuracy                           0.80       100
   macro avg       0.80      0.79      0.79       100
weighted avg       0.80      0.80      0.80       100

rep type:  <class 'str'>
F-1:  0.7435897435897435
12
8
FOLD:  2
LEN DF:  901
done train df
done eval df
LEN EVAL:  100
   index                                           sentence      trigger  \
0     12  They are considered in press : Comparative Stu...   considered   
1     16  Next , we regress the absolute differences ( d...  willingness   
2     19  When IDPs and refugees returned to their farms...        found   
3     23  d. Long Run Stock Options Incentive Plan : on ...   considered   
4     47  Gomis and McCoy ( 2005 ) , for example , found...        found   

  mention text (just a few words around the trigger)  \
0  They are considered in press

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

Dataset({
    features: ['index', 'sentence', 'trigger', 'mention text (just a few words around the trigger)', 'annotation: b (belief or attitude), n (not a belief and not an attitude)', 'paragraph', 'label', 'text'],
    num_rows: 100
})




Epoch,Training Loss,Validation Loss,F1
1,No log,0.567833,0.423077
2,No log,0.604085,0.705882
3,No log,0.858895,0.72093
4,No log,1.007233,0.692308
5,0.334900,1.495419,0.641026
6,0.334900,1.45837,0.714286
7,0.334900,1.572162,0.710526
8,0.334900,2.253654,0.608696
9,0.045800,1.657921,0.693333
10,0.045800,1.770324,0.647887


report: 
               precision    recall  f1-score   support

           0       0.68      0.97      0.80        61
           1       0.85      0.28      0.42        39

    accuracy                           0.70       100
   macro avg       0.76      0.62      0.61       100
weighted avg       0.74      0.70      0.65       100

rep type:  <class 'str'>
report: 
               precision    recall  f1-score   support

           0       0.83      0.74      0.78        61
           1       0.65      0.77      0.71        39

    accuracy                           0.75       100
   macro avg       0.74      0.75      0.74       100
weighted avg       0.76      0.75      0.75       100

rep type:  <class 'str'>
report: 
               precision    recall  f1-score   support

           0       0.85      0.74      0.79        61
           1       0.66      0.79      0.72        39

    accuracy                           0.76       100
   macro avg       0.75      0.77      0.76     

report: 
               precision    recall  f1-score   support

           0       0.85      0.74      0.79        61
           1       0.66      0.79      0.72        39

    accuracy                           0.76       100
   macro avg       0.75      0.77      0.76       100
weighted avg       0.78      0.76      0.76       100

rep type:  <class 'str'>
F-1:  0.7209302325581396
8
16
FOLD:  3
LEN DF:  901
done train df
done eval df
LEN EVAL:  100
   index                                           sentence      trigger  \
0      0  Their willingness to use storage is , however ...  willingness   
1     14  Note : Statistics are based on import data fro...     believed   
2     40  Preliminary research conducted by WARDA sugges...   appreciate   
3     46  In time , as the 1998 law is implemented , the...     expected   
4     57  All farmers already preferred to use fertilize...    preferred   

  mention text (just a few words around the trigger)  \
0  Their willingness to use sto

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

Dataset({
    features: ['index', 'sentence', 'trigger', 'mention text (just a few words around the trigger)', 'annotation: b (belief or attitude), n (not a belief and not an attitude)', 'paragraph', 'label', 'text'],
    num_rows: 100
})




Epoch,Training Loss,Validation Loss,F1
1,No log,0.601725,0.542373
2,No log,0.811868,0.44
3,No log,0.708159,0.712329
4,No log,1.136772,0.698795
5,0.328800,1.443211,0.675325
6,0.328800,1.722877,0.666667
7,0.328800,1.451835,0.701299
8,0.328800,1.695451,0.686567
9,0.050000,1.544532,0.69697
10,0.050000,2.311984,0.681818


report: 
               precision    recall  f1-score   support

           0       0.74      0.89      0.81        64
           1       0.70      0.44      0.54        36

    accuracy                           0.73       100
   macro avg       0.72      0.67      0.68       100
weighted avg       0.72      0.73      0.71       100

rep type:  <class 'str'>
report: 
               precision    recall  f1-score   support

           0       0.71      0.95      0.81        64
           1       0.79      0.31      0.44        36

    accuracy                           0.72       100
   macro avg       0.75      0.63      0.63       100
weighted avg       0.74      0.72      0.68       100

rep type:  <class 'str'>
report: 
               precision    recall  f1-score   support

           0       0.84      0.83      0.83        64
           1       0.70      0.72      0.71        36

    accuracy                           0.79       100
   macro avg       0.77      0.78      0.77     

report: 
               precision    recall  f1-score   support

           0       0.84      0.83      0.83        64
           1       0.70      0.72      0.71        36

    accuracy                           0.79       100
   macro avg       0.77      0.78      0.77       100
weighted avg       0.79      0.79      0.79       100

rep type:  <class 'str'>
F-1:  0.7123287671232876
10
11
FOLD:  4
LEN DF:  901
done train df
done eval df
LEN EVAL:  100
   index                                           sentence    trigger  \
0      9  Government and political stability were long r...  perceived   
1     11  SRI should consequently not be thought of as a...    thought   
2     29  Rebuilding West Africa 's food potential408 Ho...       want   
3     30  While we do not observe any systematic differe...       said   
4     31  If your company wants to maintain this kind of...      wants   

  mention text (just a few words around the trigger)  \
0      is no longer perceived as a major c

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

Dataset({
    features: ['index', 'sentence', 'trigger', 'mention text (just a few words around the trigger)', 'annotation: b (belief or attitude), n (not a belief and not an attitude)', 'paragraph', 'label', 'text'],
    num_rows: 100
})




Epoch,Training Loss,Validation Loss,F1
1,No log,0.556975,0.666667
2,No log,0.571777,0.75
3,No log,0.834342,0.753247
4,No log,1.280302,0.727273
5,0.297400,1.157688,0.756757
6,0.297400,1.305523,0.739726
7,0.297400,1.629023,0.746667
8,0.297400,1.788442,0.736842
9,0.030100,1.709833,0.739726
10,0.030100,1.688769,0.727273


report: 
               precision    recall  f1-score   support

           0       0.89      0.52      0.65        62
           1       0.53      0.89      0.67        38

    accuracy                           0.66       100
   macro avg       0.71      0.71      0.66       100
weighted avg       0.75      0.66      0.66       100

rep type:  <class 'str'>
report: 
               precision    recall  f1-score   support

           0       0.83      0.89      0.86        62
           1       0.79      0.71      0.75        38

    accuracy                           0.82       100
   macro avg       0.81      0.80      0.80       100
weighted avg       0.82      0.82      0.82       100

rep type:  <class 'str'>
report: 
               precision    recall  f1-score   support

           0       0.85      0.84      0.85        62
           1       0.74      0.76      0.75        38

    accuracy                           0.81       100
   macro avg       0.80      0.80      0.80     

report: 
               precision    recall  f1-score   support

           0       0.84      0.87      0.86        62
           1       0.78      0.74      0.76        38

    accuracy                           0.82       100
   macro avg       0.81      0.80      0.81       100
weighted avg       0.82      0.82      0.82       100

rep type:  <class 'str'>
F-1:  0.7567567567567567
10
8
FOLD:  5
LEN DF:  901
done train df
done eval df
LEN EVAL:  100
   index                                           sentence     trigger  \
0      4  The authors hope that the thriving features , ...        hope   
1      5  Of these , the EAC , COMESA , and SADC are all...  considered   
2     18  Respondent 's Number Date Location Translator ...       think   
3     27  Approvals from spouses to own a business or le...  considered   
4     33  Parents described an interest in participating...        felt   

  mention text (just a few words around the trigger)  \
0  The authors hope that the thriving

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

Dataset({
    features: ['index', 'sentence', 'trigger', 'mention text (just a few words around the trigger)', 'annotation: b (belief or attitude), n (not a belief and not an attitude)', 'paragraph', 'label', 'text'],
    num_rows: 100
})




Epoch,Training Loss,Validation Loss,F1
1,No log,0.558854,0.597015
2,No log,1.107185,0.566667
3,No log,1.612828,0.682353
4,No log,1.348987,0.717391
5,0.266700,1.3238,0.666667
6,0.266700,1.426789,0.741573
7,0.266700,1.550642,0.712644
8,0.266700,1.443941,0.7
9,0.052100,1.51055,0.683544
10,0.052100,1.825619,0.727273


report: 
               precision    recall  f1-score   support

           0       0.73      0.88      0.80        60
           1       0.74      0.50      0.60        40

    accuracy                           0.73       100
   macro avg       0.73      0.69      0.70       100
weighted avg       0.73      0.73      0.72       100

rep type:  <class 'str'>
report: 
               precision    recall  f1-score   support

           0       0.71      0.95      0.81        60
           1       0.85      0.42      0.57        40

    accuracy                           0.74       100
   macro avg       0.78      0.69      0.69       100
weighted avg       0.77      0.74      0.72       100

rep type:  <class 'str'>
report: 
               precision    recall  f1-score   support

           0       0.80      0.73      0.77        60
           1       0.64      0.72      0.68        40

    accuracy                           0.73       100
   macro avg       0.72      0.73      0.72     

report: 
               precision    recall  f1-score   support

           0       0.86      0.73      0.79        60
           1       0.67      0.82      0.74        40

    accuracy                           0.77       100
   macro avg       0.77      0.78      0.77       100
weighted avg       0.79      0.77      0.77       100

rep type:  <class 'str'>
F-1:  0.7415730337078652
7
16
FOLD:  6
LEN DF:  901
done train df
done eval df
LEN EVAL:  100
   index                                           sentence   trigger  \
0      1  They appreciated the participatory approach , ...  consider   
1     36  As a result , women were better partners for g...      want   
2     51  The authors also compare the cost data with ma...      find   
3     70  WSIS Stocktaking Pla \ orm www.wsis.org/stockt...      felt   
4     83  Buyers generally expect to receive the benefit...    expect   

  mention text (just a few words around the trigger)  \
0                                      they consi

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

Dataset({
    features: ['index', 'sentence', 'trigger', 'mention text (just a few words around the trigger)', 'annotation: b (belief or attitude), n (not a belief and not an attitude)', 'paragraph', 'label', 'text'],
    num_rows: 100
})




Epoch,Training Loss,Validation Loss,F1
1,No log,0.652166,0.4
2,No log,0.850278,0.619048
3,No log,1.370759,0.553846
4,No log,1.805766,0.617021
5,0.286400,1.950497,0.631579
6,0.286400,1.968059,0.648649
7,0.286400,1.858481,0.65
8,0.286400,2.239042,0.674419
9,0.043400,2.233651,0.591549
10,0.043400,2.415809,0.611111


report: 
               precision    recall  f1-score   support

           0       0.66      0.93      0.77        60
           1       0.73      0.28      0.40        40

    accuracy                           0.67       100
   macro avg       0.70      0.60      0.59       100
weighted avg       0.69      0.67      0.62       100

rep type:  <class 'str'>
report: 
               precision    recall  f1-score   support

           0       0.75      0.70      0.72        60
           1       0.59      0.65      0.62        40

    accuracy                           0.68       100
   macro avg       0.67      0.68      0.67       100
weighted avg       0.69      0.68      0.68       100

rep type:  <class 'str'>
report: 
               precision    recall  f1-score   support

           0       0.71      0.88      0.79        60
           1       0.72      0.45      0.55        40

    accuracy                           0.71       100
   macro avg       0.71      0.67      0.67     

report: 
               precision    recall  f1-score   support

           0       0.80      0.72      0.75        60
           1       0.63      0.72      0.67        40

    accuracy                           0.72       100
   macro avg       0.71      0.72      0.71       100
weighted avg       0.73      0.72      0.72       100

rep type:  <class 'str'>
F-1:  0.6744186046511628
11
17
FOLD:  7
LEN DF:  901
done train df
done eval df
LEN EVAL:  100
   index                                           sentence        trigger  \
0     21  Provide identification information , test date...           want   
1     24  A woman worker in Seychelles must give her emp...       expected   
2     28  The business case for renewables was strong , ...       expected   
3     32  Because the maximum storage time is two years ...      preferred   
4     44  Yield measured on samples overestimated the yi...  overestimated   

  mention text (just a few words around the trigger)  \
0  score recipient

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

Dataset({
    features: ['index', 'sentence', 'trigger', 'mention text (just a few words around the trigger)', 'annotation: b (belief or attitude), n (not a belief and not an attitude)', 'paragraph', 'label', 'text'],
    num_rows: 100
})




Epoch,Training Loss,Validation Loss,F1
1,No log,0.51485,0.617647
2,No log,1.190132,0.651163
3,No log,1.945873,0.613333
4,No log,2.016858,0.6
5,0.293700,1.830593,0.589744
6,0.293700,1.9998,0.557377
7,0.293700,1.679148,0.651163
8,0.293700,2.434522,0.644444
9,0.054200,2.241391,0.637681
10,0.054200,2.525415,0.607595


report: 
               precision    recall  f1-score   support

           0       0.75      0.87      0.80        61
           1       0.72      0.54      0.62        39

    accuracy                           0.74       100
   macro avg       0.74      0.70      0.71       100
weighted avg       0.74      0.74      0.73       100

rep type:  <class 'str'>
report: 
               precision    recall  f1-score   support

           0       0.79      0.69      0.74        61
           1       0.60      0.72      0.65        39

    accuracy                           0.70       100
   macro avg       0.69      0.70      0.69       100
weighted avg       0.72      0.70      0.70       100

rep type:  <class 'str'>
report: 
               precision    recall  f1-score   support

           0       0.75      0.79      0.77        61
           1       0.64      0.59      0.61        39

    accuracy                           0.71       100
   macro avg       0.69      0.69      0.69     

report: 
               precision    recall  f1-score   support

           0       0.79      0.69      0.74        61
           1       0.60      0.72      0.65        39

    accuracy                           0.70       100
   macro avg       0.69      0.70      0.69       100
weighted avg       0.72      0.70      0.70       100

rep type:  <class 'str'>
F-1:  0.6511627906976744
11
19
FOLD:  8
LEN DF:  901
done train df
done eval df
LEN EVAL:  100
   index                                           sentence      trigger  \
0     10  Rice fields harbour a surprisingly rich level ...      thought   
1     20  Govermental Organizations that have interest i...  interest in   
2     75  They also find that improved tenure security l...         find   
3     77  Hani Salem Sonbol , Chief Executive Officer , ...         said   
4     87  These models describe four possible climate fu...   considered   

  mention text (just a few words around the trigger)  \
0  of biodiversity, thought to

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

Dataset({
    features: ['index', 'sentence', 'trigger', 'mention text (just a few words around the trigger)', 'annotation: b (belief or attitude), n (not a belief and not an attitude)', 'paragraph', 'label', 'text'],
    num_rows: 100
})




Epoch,Training Loss,Validation Loss,F1
1,No log,0.480274,0.7
2,No log,0.636271,0.704545
3,No log,0.973702,0.736842
4,No log,1.270357,0.702703
5,0.298600,1.397358,0.675325
6,0.298600,1.212073,0.746667
7,0.298600,1.576771,0.72973
8,0.298600,1.637221,0.72093
9,0.045000,1.738612,0.72973
10,0.045000,1.627166,0.72


report: 
               precision    recall  f1-score   support

           0       0.80      0.80      0.80        60
           1       0.70      0.70      0.70        40

    accuracy                           0.76       100
   macro avg       0.75      0.75      0.75       100
weighted avg       0.76      0.76      0.76       100

rep type:  <class 'str'>
report: 
               precision    recall  f1-score   support

           0       0.83      0.72      0.77        60
           1       0.65      0.78      0.70        40

    accuracy                           0.74       100
   macro avg       0.74      0.75      0.74       100
weighted avg       0.75      0.74      0.74       100

rep type:  <class 'str'>
report: 
               precision    recall  f1-score   support

           0       0.81      0.87      0.84        60
           1       0.78      0.70      0.74        40

    accuracy                           0.80       100
   macro avg       0.80      0.78      0.79     

report: 
               precision    recall  f1-score   support

           0       0.82      0.88      0.85        60
           1       0.80      0.70      0.75        40

    accuracy                           0.81       100
   macro avg       0.81      0.79      0.80       100
weighted avg       0.81      0.81      0.81       100

rep type:  <class 'str'>
F-1:  0.7466666666666666
12
7
FOLD:  9
LEN DF:  901
done train df
done eval df
LEN EVAL:  100
   index                                           sentence      trigger  \
0      7  You need to be especially careful with items t...         want   
1     15  As a result , the Poordi variety was the most ...  appreciated   
2     22  Elissa Golberg , Assistant Deputy Minister , S...         said   
3     25  As such , they were pre-disposed to value farm...        value   
4     26  She suggested that standardization and transpa...    confident   

  mention text (just a few words around the trigger)  \
0       don’t want to inadverte

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

Dataset({
    features: ['index', 'sentence', 'trigger', 'mention text (just a few words around the trigger)', 'annotation: b (belief or attitude), n (not a belief and not an attitude)', 'paragraph', 'label', 'text'],
    num_rows: 100
})




Epoch,Training Loss,Validation Loss,F1
1,No log,0.52084,0.439024
2,No log,0.410993,0.655738
3,No log,0.88191,0.655738
4,No log,0.911643,0.746667
5,0.309300,1.066156,0.666667
6,0.309300,1.4077,0.647887
7,0.309300,1.757258,0.615385
8,0.309300,1.815687,0.642857
9,0.040900,1.920841,0.641026
10,0.040900,1.733687,0.646154


report: 
               precision    recall  f1-score   support

           0       0.76      0.99      0.86        69
           1       0.90      0.29      0.44        31

    accuracy                           0.77       100
   macro avg       0.83      0.64      0.65       100
weighted avg       0.80      0.77      0.73       100

rep type:  <class 'str'>
report: 
               precision    recall  f1-score   support

           0       0.84      0.86      0.85        69
           1       0.67      0.65      0.66        31

    accuracy                           0.79       100
   macro avg       0.75      0.75      0.75       100
weighted avg       0.79      0.79      0.79       100

rep type:  <class 'str'>
report: 
               precision    recall  f1-score   support

           0       0.84      0.86      0.85        69
           1       0.67      0.65      0.66        31

    accuracy                           0.79       100
   macro avg       0.75      0.75      0.75     

report: 
               precision    recall  f1-score   support

           0       0.95      0.77      0.85        69
           1       0.64      0.90      0.75        31

    accuracy                           0.81       100
   macro avg       0.79      0.84      0.80       100
weighted avg       0.85      0.81      0.82       100

rep type:  <class 'str'>
F-1:  0.7466666666666666
3
16


In [35]:
# Disabled debug check: uncomment to print the "label" column of train_df.
# NOTE(review): train_df is presumably whatever the last executed fold of the
# cross-validation loop left behind -- confirm before relying on it.
# print(train_df["label"])

In [36]:
# Disabled diagnostic: uncomment to obtain PyTorch's CUDA memory allocator
# report. torch.cuda.memory_summary returns the report as a string;
# device=None selects the current CUDA device and abbreviated=False requests
# the full (non-abbreviated) statistics.
# NOTE(review): as a bare last expression the string is displayed as a repr
# with escaped newlines; wrap the call in print() for a readable table.
# torch.cuda.memory_summary(device=None, abbreviated=False)