In [None]:
!pip install transformers



In [None]:
# Importing stock ml libraries
import warnings
warnings.simplefilter('ignore')
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn import metrics
import transformers
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import DistilBertTokenizer, DistilBertModel
import logging
logging.basicConfig(level=logging.ERROR)

In [None]:
from torch import cuda

# Run on the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [None]:
device

'cuda'

In [None]:
def hamming_score(y_true, y_pred, normalize=True, sample_weight=None):
    """Mean per-sample overlap (intersection over union) of true and predicted labels.

    For each row, the positions of the truthy entries in ``y_true`` and
    ``y_pred`` are treated as two label sets. The row's score is
    ``|true & pred| / |true | pred|``, or 1 when both sets are empty.

    Args:
        y_true: Array-like of label-indicator rows (one row per sample).
        y_pred: Array-like with the same layout as ``y_true``.
        normalize: Accepted for signature compatibility; not used.
        sample_weight: Accepted for signature compatibility; not used.

    Returns:
        The arithmetic mean of the per-row scores, as computed by ``np.mean``.
    """
    # Coerce once so plain nested lists work as well as ndarrays
    # (the original required an object with a ``.shape`` attribute).
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    acc_list = []
    for i in range(y_true.shape[0]):
        set_true = set(np.where(y_true[i])[0])
        set_pred = set(np.where(y_pred[i])[0])
        if not set_true and not set_pred:
            # Nothing to predict and nothing predicted counts as a perfect match.
            row_score = 1
        else:
            row_score = len(set_true & set_pred) / float(len(set_true | set_pred))
        acc_list.append(row_score)
    return np.mean(acc_list)

In [None]:
# Load the annotated posts; the recorded output shows the columns `annotate` and `post`.
# The path lives under /content/drive, presumably a mounted Google Drive -- confirm before running.
#data = pd.read_csv('/content/drive/MyDrive/BERT MODEL/DATASET/classified1.csv')
data = pd.read_csv('/content/drive/MyDrive/BERT MODEL/DATASET/fosData.csv')

In [None]:
data

Unnamed: 0,annotate,post
0,"[0, 0, 0, 1]","Ang balay sa akong silingan kay paabangan na,..."
1,"[0, 0, 0, 1]","Sa pagsugod sa iyong recipe, batili una ang m..."
2,"[1, 0, 0, 0]","Sa pagkakita nako sa iyang mga gi-post, nagdu..."
3,"[1, 0, 0, 0]",Ang iyang batan-on nga dugo nagdala og bibo s...
4,"[0, 1, 0, 0]","Dili ko gusto moapil sa group chat nila, kay ..."
...,...,...
1318,"[0, 0, 1, 0]",Nakahinomdom ko sa gitabi sa akong amiga nga ...
1319,"[0, 0, 1, 0]",Daghan ko nadungog nga mga istorya bahin sa t...
1320,"[1, 0, 0, 0]","Walay problema, pasagdi lang na kay abli ang ..."
1321,"[1, 0, 0, 0]",Naabot sa dunggan ang ngisi sa mga bata sa su...


In [None]:
#data.drop(['language'], inplace=True, axis=1)

In [None]:
data

Unnamed: 0,annotate,post
0,"[0, 0, 0, 1]","Ang balay sa akong silingan kay paabangan na,..."
1,"[0, 0, 0, 1]","Sa pagsugod sa iyong recipe, batili una ang m..."
2,"[1, 0, 0, 0]","Sa pagkakita nako sa iyang mga gi-post, nagdu..."
3,"[1, 0, 0, 0]",Ang iyang batan-on nga dugo nagdala og bibo s...
4,"[0, 1, 0, 0]","Dili ko gusto moapil sa group chat nila, kay ..."
...,...,...
1318,"[0, 0, 1, 0]",Nakahinomdom ko sa gitabi sa akong amiga nga ...
1319,"[0, 0, 1, 0]",Daghan ko nadungog nga mga istorya bahin sa t...
1320,"[1, 0, 0, 0]","Walay problema, pasagdi lang na kay abli ang ..."
1321,"[1, 0, 0, 0]",Naabot sa dunggan ang ngisi sa mga bata sa su...


In [None]:
# Keep only the two columns the model needs, renamed to the names used downstream.
new_df = pd.DataFrame({
    'text': data['post'],
    'labels': data['annotate'].values.tolist(),
})

In [None]:
new_df.head()

Unnamed: 0,text,labels
0,"Ang balay sa akong silingan kay paabangan na,...","[0, 0, 0, 1]"
1,"Sa pagsugod sa iyong recipe, batili una ang m...","[0, 0, 0, 1]"
2,"Sa pagkakita nako sa iyang mga gi-post, nagdu...","[1, 0, 0, 0]"
3,Ang iyang batan-on nga dugo nagdala og bibo s...,"[1, 0, 0, 0]"
4,"Dili ko gusto moapil sa group chat nila, kay ...","[0, 1, 0, 0]"


In [None]:
# Each label cell arrives as a string; strip the list punctuation
# ([ ] , ') and split on whitespace to get a list of string tokens.

import re
new_df['labels'] = new_df['labels'].apply(
    lambda raw: re.sub(r'[\[\],\']', '', raw).split()
)
new_df.head()


Unnamed: 0,text,labels
0,"Ang balay sa akong silingan kay paabangan na,...","[0, 0, 0, 1]"
1,"Sa pagsugod sa iyong recipe, batili una ang m...","[0, 0, 0, 1]"
2,"Sa pagkakita nako sa iyang mga gi-post, nagdu...","[1, 0, 0, 0]"
3,Ang iyang batan-on nga dugo nagdala og bibo s...,"[1, 0, 0, 0]"
4,"Dili ko gusto moapil sa group chat nila, kay ...","[0, 1, 0, 0]"


In [None]:
type(new_df['labels'][0])

list

In [None]:
for item in new_df['labels'][0]:
  print(type(item))

<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>


In [None]:
# Cast every label token from str to int.

new_df['labels'] = new_df['labels'].apply(lambda tokens: list(map(int, tokens)))


In [None]:
new_df['labels'][0]

[0, 0, 0, 1]

In [None]:
for item in new_df['labels'][0]:
  print(type(item))

<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>


In [None]:
# Sections of config

# Seed the NumPy and PyTorch RNGs before any model or loader is created so that
# dropout, DataLoader shuffling and classifier-head initialisation repeat across runs.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Defining some key variables that will be used later on in the training
MAX_LEN = 128            # token length every example is padded to
TRAIN_BATCH_SIZE = 32
VALID_BATCH_SIZE = 32
EPOCHS = 15
LEARNING_RATE = 2e-05
#use uncased distilbert tokenizer
# NOTE(review): the recorded run still printed "Truncation was not explicitly
# activated", so `truncation=True` here does not appear to enable truncation at
# encode time; it has to be passed to the encode call itself.
tokenizer = DistilBertTokenizer.from_pretrained('/content/drive/MyDrive/BERT MODEL/DistilBERT_FINAL/uncased_model/model', truncation=True, do_lower_case=True)

In [None]:
class MultiLabelDataset(Dataset):
    """Torch dataset that tokenizes one text row at a time for multi-label training.

    Args:
        dataframe: Frame with a ``text`` column and a ``labels`` column; each
            ``labels`` cell is a sequence of numbers (converted to a float tensor).
        tokenizer: Object exposing ``encode_plus`` that returns ``input_ids``,
            ``attention_mask`` and ``token_type_ids``.
        max_len: Length every encoded example is padded / truncated to.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.text
        self.targets = self.data.labels
        self.max_len = max_len

    def __len__(self):
        """Number of rows in the underlying frame."""
        return len(self.text)

    def __getitem__(self, index):
        """Return the encoded tensors and float target vector for row ``index``.

        ``index`` is positional (0 .. len-1), which is what a DataLoader sampler
        yields, so the frame's own index labels do not need to be reset.
        """
        # .iloc instead of label lookup: works for any index, not only 0..n-1.
        text = str(self.text.iloc[index])
        # Collapse runs of whitespace to single spaces.
        text = " ".join(text.split())

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',   # replaces the deprecated pad_to_max_length=True
            truncation=True,        # explicit, so over-long texts are cut without a warning
            return_token_type_ids=True
        )

        return {
            'ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
            'mask': torch.tensor(inputs['attention_mask'], dtype=torch.long),
            'token_type_ids': torch.tensor(inputs["token_type_ids"], dtype=torch.long),
            'targets': torch.tensor(self.targets.iloc[index], dtype=torch.float)
        }

In [None]:
# Creating the dataset and dataloader for the neural network

# 90% of the rows are sampled (fixed random_state) for training; whatever is
# left becomes the test split. Both frames get a fresh 0..n-1 index.
train_size = 0.9
train_data = new_df.sample(frac=train_size, random_state=200)
test_data = new_df.drop(index=train_data.index).reset_index(drop=True)
train_data = train_data.reset_index(drop=True)


print("FULL Dataset: {}".format(new_df.shape))
print("TRAIN Dataset: {}".format(train_data.shape))
print("TEST Dataset: {}".format(test_data.shape))

training_set, testing_set = (
    MultiLabelDataset(frame, tokenizer, MAX_LEN)
    for frame in (train_data, test_data)
)

FULL Dataset: (1323, 2)
TRAIN Dataset: (1191, 2)
TEST Dataset: (132, 2)


In [None]:
train_data.head()

Unnamed: 0,text,labels
0,"Nakita nimo si Sarah? Patsoy-tsoy kaayo, walay...","[0, 1, 0, 0]"
1,Ang mga tawo nga bagag lips maningkamot gyud ...,"[1, 0, 0, 0]"
2,Dili ko gusto nga mahibalaan sa akong mga gini...,"[1, 0, 0, 0]"
3,Kinsa man na imong sala nga wala ka nagtuon sa...,"[0, 1, 0, 0]"
4,Wala ka kahibalo unsa ang gibati sa mga tao b...,"[0, 0, 1, 0]"


In [None]:
training_set[0]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


{'ids': tensor([    2,   977,   934,   199, 17605,    37,   782,  1176,   122,    19,
          3123,   516,  1160,    18,   427,   790,   126,   192,   269,    18,
           206,    19,  1019,  2420,   441,   166,   234,   564,     7,     3,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,  

In [None]:
# Training batches are reshuffled on every pass over the loader.
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

# Evaluation gains nothing from shuffling; a fixed order keeps the collected
# outputs aligned with the row order of the test frame.
test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': False,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [None]:
# Creating the customized model, by adding a drop out and a dense layer on top of distil bert to get the final output for the model.

class DistilBERTClass(torch.nn.Module):
    """DistilBERT encoder followed by a dense -> tanh -> dropout -> linear head.

    ``forward`` returns raw logits, one per label; no sigmoid is applied here.

    Args:
        pretrained_path: Directory or model id handed to
            ``DistilBertModel.from_pretrained`` (e.g. "distilbert-base-uncased").
        num_labels: Size of the output layer.
        dropout: Dropout probability applied before the output layer.
    """

    def __init__(self,
                 pretrained_path='/content/drive/MyDrive/BERT MODEL/DistilBERT_FINAL/uncased_model/distilBERT',
                 num_labels=4,
                 dropout=0.1):
        super().__init__()
        # Attribute names (l1, pre_classifier, dropout, classifier) are part of
        # the state_dict keys, so they must not be renamed.
        self.l1 = DistilBertModel.from_pretrained(pretrained_path)
        # Read the encoder width from its config instead of hard-coding 768.
        hidden_size = self.l1.config.dim
        self.pre_classifier = torch.nn.Linear(hidden_size, hidden_size)
        self.dropout = torch.nn.Dropout(dropout)
        self.classifier = torch.nn.Linear(hidden_size, num_labels)

    def forward(self, input_ids, attention_mask, token_type_ids=None):
        """Return logits for a batch.

        ``token_type_ids`` is accepted so callers can pass it, but it is not
        forwarded to the encoder.
        """
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = output_1[0]
        # Use the hidden state at sequence position 0 as the pooled representation.
        pooler = hidden_state[:, 0]
        pooler = self.pre_classifier(pooler)
        pooler = torch.tanh(pooler)
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        return output

model = DistilBERTClass()
model.to(device)

DistilBERTClass(
  (l1): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30000, 768, padding_idx=0)
      (position_embeddings): Embedding(256, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Linear(in

In [None]:
def loss_fn(outputs, targets):
    """Binary cross-entropy on raw logits, averaged over all elements.

    Args:
        outputs: Logits produced by the model.
        targets: Float tensor of the same shape holding the 0/1 labels.

    Returns:
        A scalar loss tensor.
    """
    return torch.nn.functional.binary_cross_entropy_with_logits(outputs, targets)

In [None]:
# Adam over every parameter returned by model.parameters(), stepped at LEARNING_RATE.
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

In [None]:
def train(epoch, log_every=5000):
    """Run one optimisation pass over ``training_loader``.

    Uses the notebook-level ``model``, ``optimizer``, ``loss_fn``, ``device``
    and ``training_loader``.

    Args:
        epoch: Epoch number, used only in the progress / log messages.
        log_every: Print the batch loss whenever the batch index is a multiple
            of this value (index 0 is always printed).
    """
    model.train()
    # Wrap the loader itself (not enumerate) so tqdm knows the total and can
    # show a real progress bar with an ETA.
    progress = tqdm(training_loader, desc=f'Epoch {epoch}')
    for step, batch in enumerate(progress):
        ids = batch['ids'].to(device, dtype=torch.long)
        mask = batch['mask'].to(device, dtype=torch.long)
        token_type_ids = batch['token_type_ids'].to(device, dtype=torch.long)
        targets = batch['targets'].to(device, dtype=torch.float)

        # Clear gradients left over from the previous step before the new backward pass.
        optimizer.zero_grad()
        outputs = model(ids, mask, token_type_ids)
        loss = loss_fn(outputs, targets)
        if step % log_every == 0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')

        loss.backward()
        optimizer.step()

In [None]:
# Fine-tune for EPOCHS passes over the training loader.
for epoch in range(EPOCHS):
    train(epoch)

0it [00:00, ?it/s]

Epoch: 0, Loss:  0.7297529578208923


38it [00:06,  5.50it/s]
2it [00:00, 10.92it/s]

Epoch: 1, Loss:  0.41336485743522644


38it [00:05,  6.82it/s]
2it [00:00, 10.18it/s]

Epoch: 2, Loss:  0.34077775478363037


38it [00:05,  6.82it/s]
2it [00:00, 10.25it/s]

Epoch: 3, Loss:  0.3268541693687439


38it [00:05,  6.75it/s]
2it [00:00, 10.11it/s]

Epoch: 4, Loss:  0.19096912443637848


38it [00:05,  6.70it/s]
2it [00:00, 10.04it/s]

Epoch: 5, Loss:  0.08913826197385788


38it [00:05,  6.65it/s]
2it [00:00, 10.07it/s]

Epoch: 6, Loss:  0.03610047698020935


38it [00:05,  6.60it/s]
2it [00:00, 10.29it/s]

Epoch: 7, Loss:  0.031091604381799698


38it [00:05,  6.51it/s]
2it [00:00, 10.16it/s]

Epoch: 8, Loss:  0.02354883775115013


38it [00:05,  6.43it/s]
2it [00:00, 10.15it/s]

Epoch: 9, Loss:  0.020784907042980194


38it [00:05,  6.38it/s]
2it [00:00,  9.93it/s]

Epoch: 10, Loss:  0.016221608966588974


38it [00:05,  6.38it/s]
2it [00:00,  9.92it/s]

Epoch: 11, Loss:  0.0253440011292696


38it [00:05,  6.38it/s]
2it [00:00,  9.93it/s]

Epoch: 12, Loss:  0.013330377638339996


38it [00:05,  6.39it/s]
2it [00:00, 10.11it/s]

Epoch: 13, Loss:  0.00831431895494461


38it [00:05,  6.45it/s]
2it [00:00,  9.92it/s]

Epoch: 14, Loss:  0.00838787853717804


38it [00:05,  6.48it/s]


In [None]:
# Disabled earlier draft of validation(): it is wrapped in a bare string literal,
# so nothing here executes (the cell only echoes the string back).
'''
def validation(testing_loader):
    model.eval()
    fin_targets=[]
    fin_outputs=[]
    i = 0
    with torch.no_grad():
        for _, data in tqdm(enumerate(testing_loader, 0)):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.float)
            outputs = model(ids, mask, token_type_ids)
            fin_targets.extend(targets.cpu().detach().numpy().tolist())
            fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
            print(f'{len(data)}:{data}:{fin_outputs[i]}')
            print("="*200)
            i = i + 1
    return fin_outputs, fin_targets
  '''

'\ndef validation(testing_loader):\n    model.eval()\n    fin_targets=[]\n    fin_outputs=[]\n    i = 0\n    with torch.no_grad():\n        for _, data in tqdm(enumerate(testing_loader, 0)):\n            ids = data[\'ids\'].to(device, dtype = torch.long)\n            mask = data[\'mask\'].to(device, dtype = torch.long)\n            token_type_ids = data[\'token_type_ids\'].to(device, dtype = torch.long)\n            targets = data[\'targets\'].to(device, dtype = torch.float)\n            outputs = model(ids, mask, token_type_ids)\n            fin_targets.extend(targets.cpu().detach().numpy().tolist())\n            fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())\n            print(f\'{len(data)}:{data}:{fin_outputs[i]}\')\n            print("="*200)\n            i = i + 1\n    return fin_outputs, fin_targets\n  '

In [None]:
#  from the validation function, let me see the actual string being passed and not just the number or tensor conversion

def validation(testing_loader):
    """Run the model over ``testing_loader`` and collect probabilities and targets.

    For every sample the decoded input text is printed next to that sample's
    own sigmoid probabilities. (Previously a per-batch counter indexed the
    per-sample output list, so the printed probabilities did not belong to
    the printed batch.)

    Uses the notebook-level ``model``, ``device`` and ``tokenizer``.

    Args:
        testing_loader: Iterable of batches with the keys ``ids``, ``mask``,
            ``token_type_ids`` and ``targets``.

    Returns:
        ``(fin_outputs, fin_targets)``: two lists with one entry per sample,
        holding the sigmoid probabilities and the target vector respectively.
    """
    model.eval()
    fin_targets = []
    fin_outputs = []

    with torch.no_grad():
        for batch in tqdm(testing_loader):
            ids = batch['ids'].to(device, dtype=torch.long)
            mask = batch['mask'].to(device, dtype=torch.long)
            token_type_ids = batch['token_type_ids'].to(device, dtype=torch.long)
            targets = batch['targets'].to(device, dtype=torch.float)

            outputs = model(ids, mask, token_type_ids)
            batch_probs = torch.sigmoid(outputs).cpu().numpy().tolist()

            fin_targets.extend(targets.cpu().numpy().tolist())
            fin_outputs.extend(batch_probs)

            # Pair each sample with its own probabilities; decode from plain
            # Python ints and drop special/padding tokens to recover the text.
            for sample_ids, sample_probs in zip(ids.cpu().tolist(), batch_probs):
                text = tokenizer.decode(sample_ids, skip_special_tokens=True)
                print(f'TEXT:{text}:{sample_probs}')
            print("="*200)
    return fin_outputs, fin_targets


In [None]:
testing_loader

<torch.utils.data.dataloader.DataLoader at 0x7ccf54c5d510>

In [None]:
testing_set[0]

{'ids': tensor([   2,  177,  468,  166,  304, 4984,  307, 3576,  752,  163,  192,   18,
          786, 1171,  176, 2756,  355,    3,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0]),
 'mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0,

In [None]:
# `outputs` holds the per-sample sigmoid probabilities, `targets` the matching label vectors.
outputs, targets = validation(testing_loader)

# Binarise: a label counts as predicted when its probability is at least 0.5.
final_outputs = np.array(outputs) >= 0.5

5it [00:00, 17.96it/s]

IDS:tensor([[    2,   513,  5337,  ...,     0,     0,     0],
        [    2,   377, 21421,  ...,     0,     0,     0],
        [    2,   166,  3804,  ...,     0,     0,     0],
        ...,
        [    2, 18466,   301,  ...,     0,     0,     0],
        [    2,  2595,   355,  ...,     0,     0,     0],
        [    2,   343,  1440,  ...,     0,     0,     0]], device='cuda:0'):[0.9912343621253967, 0.002745784353464842, 0.010861113667488098, 0.0032037259079515934]
IDS:tensor([[    2, 26264,    19,  ...,     0,     0,     0],
        [    2,   223,    19,  ...,     0,     0,     0],
        [    2, 19193,    19,  ...,     0,     0,     0],
        ...,
        [    2,  3838,   265,  ...,     0,     0,     0],
        [    2, 14767,    18,  ...,     0,     0,     0],
        [    2,  8962,   681,  ...,     0,     0,     0]], device='cuda:0'):[0.00899861752986908, 0.9922512173652649, 0.003790445625782013, 0.0030755088664591312]
IDS:tensor([[    2,   900,   440,  ...,     0,     0,     0




In [None]:
len(final_outputs)

132

In [None]:
final_outputs[2]

array([False,  True, False, False])

In [None]:
# Hamming loss (scikit-learn): fraction of individual labels that are predicted incorrectly.
val_hamming_loss = metrics.hamming_loss(targets, final_outputs)
# hamming_score (defined earlier in this notebook): mean per-sample intersection-over-union of label sets.
val_hamming_score = hamming_score(np.array(targets), np.array(final_outputs))

print(f"Hamming Score = {val_hamming_score}")
print(f"Hamming Loss = {val_hamming_loss}")

Hamming Score = 0.8484848484848485
Hamming Loss = 0.06818181818181818


In [None]:
from sklearn.metrics import f1_score

In [None]:
#f1 score test
# average='weighted': per-label F1 scores averaged with each label weighted by its support.
score_dat = f1_score(np.array(targets),np.array(final_outputs),average='weighted')
print(score_dat)

0.867453560371517


In [None]:
# Unshuffled loader over the test split, so predictions come back in the
# same row order as test_data.
test_set = MultiLabelDataset(test_data, tokenizer, MAX_LEN)
testing_params = {'batch_size': VALID_BATCH_SIZE,  # evaluation loader: use the validation batch size
               'shuffle': False,
               'num_workers': 2
                }
test_loader = DataLoader(test_set, **testing_params)

In [None]:
all_test_pred = []

def test(epoch):
    """Predict label probabilities for every batch in ``test_loader``.

    The per-batch probability tensors are stored in the notebook-level list
    ``all_test_pred``. The list is rebuilt on every call, so running this
    function again does not stack a second copy of the predictions and still
    works after ``all_test_pred`` has been rebound to a concatenated tensor.

    Uses the notebook-level ``model``, ``device`` and ``test_loader``.

    Args:
        epoch: Unused; kept so existing calls such as ``test(model)`` keep working.

    Returns:
        The probability tensor of the last batch, or ``None`` when the loader
        yields no batches.
    """
    global all_test_pred
    all_test_pred = []
    probas = None  # defined up front so an empty loader does not raise UnboundLocalError

    model.eval()

    with torch.inference_mode():
        for batch in tqdm(test_loader):
            ids = batch['ids'].to(device, dtype=torch.long)
            mask = batch['mask'].to(device, dtype=torch.long)
            token_type_ids = batch['token_type_ids'].to(device, dtype=torch.long)
            outputs = model(ids, mask, token_type_ids)
            probas = torch.sigmoid(outputs)

            all_test_pred.append(probas)
    return probas

In [None]:
probas = test(model)

5it [00:00, 17.41it/s]


In [None]:
all_test_pred = torch.cat(all_test_pred)

In [None]:
all_test_pred

tensor([[1.2826e-03, 5.0564e-03, 9.8597e-01, 4.8129e-02],
        [9.9579e-01, 4.9877e-03, 3.6394e-03, 7.9960e-03],
        [1.3660e-02, 9.8508e-01, 2.0532e-03, 3.2903e-03],
        [6.3954e-03, 3.2585e-03, 5.4087e-03, 9.9042e-01],
        [9.9133e-01, 2.7732e-03, 1.9265e-02, 2.4169e-03],
        [9.4441e-01, 3.8950e-03, 1.0372e-02, 2.5589e-03],
        [2.0440e-03, 1.8881e-03, 3.8213e-02, 9.7690e-01],
        [1.4727e-02, 1.0098e-03, 9.7556e-01, 2.1024e-02],
        [9.8088e-01, 2.5395e-03, 5.4396e-03, 6.8487e-03],
        [1.9330e-03, 1.0127e-02, 1.0034e-02, 9.9362e-01],
        [9.9118e-01, 8.4632e-03, 6.1029e-03, 2.2989e-03],
        [9.8618e-02, 6.0984e-01, 6.9674e-03, 6.5050e-04],
        [3.0618e-03, 7.4839e-04, 3.2927e-01, 4.9361e-01],
        [3.2122e-03, 2.2648e-02, 9.9306e-01, 5.6617e-03],
        [1.5865e-03, 3.8459e-02, 9.8691e-01, 5.0884e-03],
        [1.6871e-03, 9.9509e-01, 1.3921e-02, 7.8884e-03],
        [8.2670e-03, 1.2171e-02, 9.8761e-01, 2.0620e-03],
        [9.953

In [None]:
submit_df = test_data.copy()

In [None]:
submit_df

Unnamed: 0,text,labels
0,"Ang balay sa akong silingan kay paabangan na,...","[0, 0, 0, 1]"
1,"Pagkadawat sa tamang balita, abot sa dalungga...","[1, 0, 0, 0]"
2,"Imo naman gi-interview ang tanan, CI ka?","[0, 1, 0, 0]"
3,Dili na lang ko magbutang og kwarta dinhi kay...,"[0, 0, 0, 1]"
4,Mag-istorya ra ba na siya abot sa pikas bukid...,"[1, 0, 0, 0]"
...,...,...
127,"Kanang blender sa kusina, daot na, walay ting...","[0, 0, 0, 1]"
128,Bugog papa gapinatka ra ug yawyaw,"[0, 1, 0, 0]"
129,"Kanang sekreto para bibo, wala ko'y plano nga...","[0, 1, 0, 0]"
130,"Wala koy kwarta karon, kinahanglan pa kog mang...","[0, 0, 0, 1]"


In [None]:
#label_columns = ["AURI", "COVID", "PN", "TB"]
label_columns = ["Idioms", "Catchphrases", "Euphemisms", "Literal"]

In [None]:
# One probability column per label: column i of the prediction tensor goes
# under the i-th label name. Convert to a NumPy array explicitly so pandas
# stores plain floats rather than relying on implicit tensor conversion.
for i, name in enumerate(label_columns):
    submit_df[name] = all_test_pred[:, i].cpu().numpy()

In [None]:
submit_df

Unnamed: 0,text,labels,Idioms,Catchphrases,Euphemisms,Literal
0,"Ang balay sa akong silingan kay paabangan na,...","[0, 0, 0, 1]",0.001283,0.005056,0.985971,0.048129
1,"Pagkadawat sa tamang balita, abot sa dalungga...","[1, 0, 0, 0]",0.995792,0.004988,0.003639,0.007996
2,"Imo naman gi-interview ang tanan, CI ka?","[0, 1, 0, 0]",0.013660,0.985084,0.002053,0.003290
3,Dili na lang ko magbutang og kwarta dinhi kay...,"[0, 0, 0, 1]",0.006395,0.003259,0.005409,0.990415
4,Mag-istorya ra ba na siya abot sa pikas bukid...,"[1, 0, 0, 0]",0.991334,0.002773,0.019265,0.002417
...,...,...,...,...,...,...
127,"Kanang blender sa kusina, daot na, walay ting...","[0, 0, 0, 1]",0.110954,0.386711,0.007194,0.001057
128,Bugog papa gapinatka ra ug yawyaw,"[0, 1, 0, 0]",0.000750,0.983576,0.113733,0.004514
129,"Kanang sekreto para bibo, wala ko'y plano nga...","[0, 1, 0, 0]",0.067490,0.610728,0.005254,0.001265
130,"Wala koy kwarta karon, kinahanglan pa kog mang...","[0, 0, 0, 1]",0.000216,0.165274,0.486057,0.400041


In [None]:
submit_df.to_csv('predictions.csv')

In [None]:
# Saving the files for inference

#output_model_file = '/content/drive/MyDrive/BERT MODEL/Multi_label_classification_models/multi_model1/model_file1/model1.pt'
#output_vocab_file = '/content/drive/MyDrive/BERT MODEL/Multi_label_classification_models/multi_model1/vocab1/vocab1.bin'

#torch.save(model.state_dict(), output_model_file)
#tokenizer.save_vocabulary(output_vocab_file)

#print('Saved')

In [None]:
#model = model1()
# NOTE(review): the save cell above is commented out, so this replaces the
# weights trained in this session with whatever is already stored at this path.
# map_location lets a checkpoint saved on a GPU load on a CPU-only machine too.
model.load_state_dict(torch.load('/content/drive/MyDrive/BERT MODEL/Multi_label_classification_models/multi_model1/model_file1/model1.pt', map_location=device))
model.eval()

In [None]:
#using the model

# Preprocess the input text
input_text = "lantaw sa ko ug tv lol"
encoded_text = tokenizer.encode_plus(
    input_text,
    None,
    add_special_tokens=True,
    max_length=MAX_LEN,
    padding='max_length',   # replaces the deprecated pad_to_max_length=True
    truncation=True,        # explicit, so over-long input is cut to MAX_LEN without a warning
    return_token_type_ids=True
)

# Convert the input to tensors; unsqueeze(0) adds the batch dimension
input_ids = torch.tensor(encoded_text['input_ids']).unsqueeze(0)
input_mask = torch.tensor(encoded_text['attention_mask']).unsqueeze(0)
segment_ids = torch.tensor(encoded_text['token_type_ids']).unsqueeze(0)

# Move tensors to the device
input_ids = input_ids.to(device)
input_mask = input_mask.to(device)
segment_ids = segment_ids.to(device)

# Make predictions
with torch.no_grad():
    outputs = model(input_ids, input_mask, segment_ids)

# Apply sigmoid activation function to turn the logits into per-label probabilities
outputs = torch.sigmoid(outputs)

# Convert the outputs to numpy array
outputs = outputs.cpu().detach().numpy()

# Print the predictions
print(outputs)
