In [1]:
import pandas as pd
import numpy as np
import pickle

In [2]:
df = pd.read_excel("train_bsq_abstract_chatgpt.xlsx", index_col=0)
df

Unnamed: 0,t,ab,q,label
0,Does carbon farming provide a cost-effective o...,"In this study, we apply a whole farm bioeconom...",Does the article discuss agroecological practi...,1
1,Influence of Organic and Mineral Fertilizers o...,The intensive use of mineral (M) fertilizers m...,Does the article discuss agroecological practi...,1
2,Climate Change Mitigation Options in the Fores...,The Intergovernmental Panel on Climate Change ...,Does the article discuss agroecological practi...,1
3,Rye cover crop incorporation and high watertab...,Drainage and cultivation of peat soils almost ...,Does the article discuss agroecological practi...,1
4,Emerging Issues and Potential Opportunities in...,The rice-wheat cropping system (RWCS) is the b...,Does the article discuss agroecological practi...,1
...,...,...,...,...
995,Consequences of field N2O emissions for the en...,One way of reducing the emissions of fossil fu...,Does the article discuss the impact of methane...,0
996,Estimation of net greenhouse gas balance using...,"The net greenhouse gas balance (NGHGB), estima...",Does the article discuss the impact of methane...,0
997,Farmers' adaptation to climate-smart agricultu...,Some of the measures to be taken to reduce gre...,Does the article discuss the impact of methane...,0
998,Valorization Methodology for Agriculture Secto...,Agriculture sector holds an essential role in ...,Does the article discuss the impact of methane...,1


In [3]:
df_train = df.sample(frac=0.9, random_state=2023)
df_val = df.loc[[ix for ix in df.index if ix not in df_train.index]]

df_train = df_train.reset_index()
df_val = df_val.reset_index()

In [4]:
%%capture
!pip install transformers
!pip install datasets
!pip install torch

In [5]:
from transformers import BertTokenizer, BertModel
import torch

In [6]:
model_name = "bert-base-cased"
bert_model = BertModel.from_pretrained(model_name)
bert_tokenizer = BertTokenizer.from_pretrained(model_name, do_lower_case=False)
e = bert_model.eval()

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [7]:
bert_tokenizer.all_special_tokens, bert_tokenizer.all_special_ids

(['[UNK]', '[SEP]', '[PAD]', '[CLS]', '[MASK]'], [100, 102, 0, 101, 103])

In [8]:
for s in df["q"].unique():
    print(len(bert_tokenizer.tokenize(s)))

10
8
12
15
12
7
9
22
15
15


In [9]:
from torch.utils.data import Dataset
class DatasetTaskChatGPT(Dataset):
    def __init__(self, df, maxlen_t=0, maxlen_ab=483, maxlen_q=25):
        self.df = df
        self.tokenizer = bert_tokenizer
        self.maxlen_t = maxlen_t
        self.maxlen_ab = maxlen_ab
        self.maxlen_q = maxlen_q

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        sentence1 = str(self.df.loc[index, 't'])
        sentence2 = str(self.df.loc[index, 'ab'])
        sentence3 = str(self.df.loc[index, 'q'])

        label = int(self.df.loc[index, "label"])
        
        tokens1 = self.tokenizer.tokenize(sentence1) if len(sentence1)>0 else ["[UNK]"]
        tokens2 = self.tokenizer.tokenize(sentence2) if len(sentence2)>0 else ["[UNK]"]
        tokens3 = self.tokenizer.tokenize(sentence3) if len(sentence3)>0 else ["[UNK]"]

        if len(tokens1) <= self.maxlen_t:
            tokens1 = tokens1 + ['[PAD]' for _ in range(self.maxlen_t - len(tokens1))]
        else:
            tokens1 = tokens1[:self.maxlen_t]

        if len(tokens2) <= self.maxlen_ab:
            tokens2 = tokens2 + ['[PAD]' for _ in range(self.maxlen_ab - len(tokens2))]
        else:
            tokens2 = tokens2[:self.maxlen_ab]
        
        if len(tokens3) <= self.maxlen_q:
            tokens3 = tokens3 + ['[PAD]' for _ in range(self.maxlen_q - len(tokens3))]
        else:
            tokens3 = tokens3[:self.maxlen_q]
          
        tokens = ["[CLS]"]+tokens1+["[SEP]"]+tokens2+["[SEP]"]+tokens3+["[SEP]"]
        tokens_ids = self.tokenizer.convert_tokens_to_ids(tokens)
        tokens_ids_tensor = torch.tensor(tokens_ids)
        attn_mask = (tokens_ids_tensor != 0).long() # [PAD] => 0

        return tokens_ids_tensor, attn_mask, label

In [10]:
from torch.utils.data import DataLoader

train_set = DatasetTaskChatGPT(df = df_train)
val_set = DatasetTaskChatGPT(df = df_val)

train_loader = DataLoader(train_set, batch_size=16, num_workers = 2, shuffle=False)
val_loader = DataLoader(val_set, batch_size=16, num_workers = 2, shuffle=False)

In [11]:
import torch.nn as nn
class Classifier(nn.Module):
    def __init__(self):
        super(Classifier, self).__init__()
        torch.manual_seed(2022)
        
        self.bert_layer = BertModel.from_pretrained(model_name).cuda()
        self.cls_layer = nn.Linear(768, 2).cuda()

    def forward(self, seq, attn_masks):

        cont_reps = self.bert_layer(seq, attention_mask=attn_masks)
        cls_rep = cont_reps.last_hidden_state[:, 0]
        logits = self.cls_layer(cls_rep)

        return logits

In [12]:
import torch.optim as optim

net = Classifier()

weights = torch.tensor([1., 1.])#torch.tensor([1., 2.188])
criterion = nn.CrossEntropyLoss(weight=weights, reduction='mean').cuda()

opti = optim.Adam(net.parameters(), lr = 8e-5)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [13]:
def get_accuracy_from_logits(logits, labels):
    probs = torch.sigmoid(logits)
    soft_probs = probs.argmax(1)
    acc = (soft_probs.squeeze() == labels).float().mean()
    return acc
    
def evaluate(net, criterion, dataloader):
    net.eval()
    mean_acc, mean_loss = 0, 0
    count = 0
    with torch.no_grad():
        for seq, attn_masks, labels in dataloader:
            seq, attn_masks, labels = seq.cuda(), attn_masks.cuda(), labels.cuda()
            logits = net(seq, attn_masks)
            mean_loss += criterion(logits, labels).item()
            mean_acc += get_accuracy_from_logits(logits, labels)
            count += 1

    return mean_acc / count, mean_loss / count

from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import classification_report

def evaluate_precision_recall_fscore_support(net, dataloader):
    net.eval()
    preds = []
    tests = []
    with torch.no_grad():
        for seq, attn_masks, labels in dataloader:
            seq, attn_masks, labels = seq.cuda(), attn_masks.cuda(), labels.cuda()
            logits = net(seq, attn_masks)
            probs = torch.sigmoid(logits)
            soft_probs = probs.argmax(1)
            preds += soft_probs.squeeze().tolist()
            tests += labels.tolist()
    return tests, preds

In [14]:
def train(net, criterion, opti, train_loader, val_loader, epochs):
    for ep in range(epochs):
        for it, (seq, attn_masks, labels) in enumerate(train_loader):
            opti.zero_grad()  

            seq, attn_masks, labels = seq.cuda(), attn_masks.cuda(), labels.cuda()

            logits = net(seq, attn_masks)

            loss = criterion(logits, labels)

            loss.backward()

            opti.step()

            if (it + 1) % 10 == 0:
                acc = get_accuracy_from_logits(logits, labels)
                # tests, preds = evaluate_precision_recall_fscore_support(net, train_loader)
                # print(classification_report(tests, preds))
                print("Iteration {} of epoch {} complete. Loss : {} Train Accuracy : {}".format(it+1, ep+1, loss.item(), acc))
        val_acc, val_loss = evaluate(net, criterion, val_loader)
        tests, preds = evaluate_precision_recall_fscore_support(net, val_loader)
        print(classification_report(tests, preds))
        print("Epoch {} complete! Validation Accuracy : {}, Validation Loss : {}".format(ep+1, val_acc, val_loss))

In [16]:
epochs = 7
train(net, criterion, opti, train_loader, val_loader, epochs)

Iteration 10 of epoch 1 complete. Loss : 0.28662875294685364 Train Accuracy : 0.875
Iteration 20 of epoch 1 complete. Loss : 0.2613835632801056 Train Accuracy : 0.875
Iteration 30 of epoch 1 complete. Loss : 0.3063660264015198 Train Accuracy : 0.8125
Iteration 40 of epoch 1 complete. Loss : 0.5068246126174927 Train Accuracy : 0.625
Iteration 50 of epoch 1 complete. Loss : 0.45571690797805786 Train Accuracy : 0.75
              precision    recall  f1-score   support

           0       0.59      0.63      0.61        35
           1       0.79      0.77      0.78        65

    accuracy                           0.72       100
   macro avg       0.69      0.70      0.70       100
weighted avg       0.72      0.72      0.72       100

Epoch 1 complete! Validation Accuracy : 0.6964285969734192, Validation Loss : 0.6241320861237389
Iteration 10 of epoch 2 complete. Loss : 0.23604127764701843 Train Accuracy : 0.875
Iteration 20 of epoch 2 complete. Loss : 0.26598864793777466 Train Accuracy

In [20]:
train_acc, train_loss = evaluate(net, criterion, train_loader)
tests, preds = evaluate_precision_recall_fscore_support(net, train_loader)
print(classification_report(tests, preds))
print("Complete! Train Accuracy : {}, Train Loss : {}".format(train_acc, train_loss))

              precision    recall  f1-score   support

           0       0.82      0.90      0.86       336
           1       0.94      0.88      0.91       564

    accuracy                           0.89       900
   macro avg       0.88      0.89      0.88       900
weighted avg       0.89      0.89      0.89       900

Complete! Train Accuracy : 0.8892543911933899, Train Loss : 0.26945389192878155


In [18]:
val_acc, val_loss = evaluate(net, criterion, val_loader)
tests, preds = evaluate_precision_recall_fscore_support(net, val_loader)
print(classification_report(tests, preds))
print("Complete! Validation Accuracy : {}, Validation Loss : {}".format(val_acc, val_loss))

              precision    recall  f1-score   support

           0       0.65      0.86      0.74        35
           1       0.91      0.75      0.82        65

    accuracy                           0.79       100
   macro avg       0.78      0.81      0.78       100
weighted avg       0.82      0.79      0.79       100

Complete! Validation Accuracy : 0.7589285969734192, Validation Loss : 0.6366780655724662


In [21]:
from huggingface_hub import notebook_login

In [1]:
notebook_login()

In [23]:
!git config --global credential.helper store

In [1]:
repo_name = "bert_ft_binary_chatgpt"
net.bert_layer.push_to_hub(repo_name)

In [2]:
val_set.tokenizer.push_to_hub(repo_name)

In [26]:
torch.save(net.cls_layer, "cls_layer.torch")