In [None]:
!pip install transformers



In [None]:
# Importing stock ml libraries
import warnings
warnings.simplefilter('ignore')
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn import metrics
import transformers
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import DistilBertTokenizer, DistilBertModel
import logging
logging.basicConfig(level=logging.ERROR)

In [None]:
from torch import cuda

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [None]:
device

'cuda'

In [None]:
def hamming_score(y_true, y_pred, normalize=True, sample_weight=None):
    """Average, over samples, of |true ∩ pred| / |true ∪ pred| on the active labels.

    Args:
        y_true: 2-D array-like with a ``shape`` attribute; nonzero entries mark true labels.
        y_pred: 2-D array-like indexed the same way; nonzero entries mark predicted labels.
        normalize: accepted but not used by the body.
        sample_weight: accepted but not used by the body.

    Returns:
        The mean of the per-sample scores. A sample with no true and no
        predicted labels scores 1.
    """
    def _row_score(row_idx):
        true_labels = set(np.where(y_true[row_idx])[0])
        pred_labels = set(np.where(y_pred[row_idx])[0])
        # Both sets empty: the prediction is exactly right, and the ratio would be 0/0.
        if not true_labels and not pred_labels:
            return 1
        return len(true_labels & pred_labels) / float(len(true_labels | pred_labels))

    return np.mean([_row_score(row_idx) for row_idx in range(y_true.shape[0])])

In [None]:
# Load the annotated posts from CSV. The path looks like a Colab Google Drive mount,
# so this cell presumably needs Drive mounted first — TODO confirm.
data = pd.read_csv('/content/drive/MyDrive/BERT MODEL/DATASET/classified1.csv')

In [None]:
data

Unnamed: 0,annotate,post,language
0,"[0, 0, 0, 1]",natug nga sakit ang dughan samot diay igmata,__label__ceb_Latn
1,"[0, 0, 0, 0]","nganong ni uso naman sab ning parvo run uy, ba...",__label__ceb_Latn
2,"[0, 0, 0, 0]",bay di nako musubaybay ug national finals suno...,__label__ceb_Latn
3,"[0, 0, 0, 0]","hello, its me again! kahuot na ka halang sa du...",__label__ceb_Latn
4,"[0, 0, 0, 0]","hello, its me again! kahuot na ka halang sa du...",__label__ceb_Latn
...,...,...,...
1335,"[1, 1, 1, 0]",nga a subong pa ko gi lagnat? huhu okininam,__label__ceb_Latn
1336,"[0, 0, 0, 0]","in another life ha? tangina, wag na pagod na ako",__label__war_Latn
1337,"[0, 0, 0, 0]",pagod nako.,__label__ceb_Latn
1338,"[0, 0, 0, 0]",pagod na pagod na sha,__label__war_Latn


In [None]:
# Drop the language tag column. Rebinding (instead of inplace=True) together with
# errors='ignore' keeps this cell safe to re-run after the column is already gone.
data = data.drop(columns=['language'], errors='ignore')

In [None]:
data

Unnamed: 0,annotate,post
0,"[0, 0, 0, 1]",natug nga sakit ang dughan samot diay igmata
1,"[0, 0, 0, 0]","nganong ni uso naman sab ning parvo run uy, ba..."
2,"[0, 0, 0, 0]",bay di nako musubaybay ug national finals suno...
3,"[0, 0, 0, 0]","hello, its me again! kahuot na ka halang sa du..."
4,"[0, 0, 0, 0]","hello, its me again! kahuot na ka halang sa du..."
...,...,...
1335,"[1, 1, 1, 0]",nga a subong pa ko gi lagnat? huhu okininam
1336,"[0, 0, 0, 0]","in another life ha? tangina, wag na pagod na ako"
1337,"[0, 0, 0, 0]",pagod nako.
1338,"[0, 0, 0, 0]",pagod na pagod na sha


In [None]:
# Two-column working frame: the post text and its (still string-encoded) label list.
new_df = pd.DataFrame({
    'text': data['post'],
    'labels': data['annotate'].values.tolist(),
})

In [None]:
new_df.head()

Unnamed: 0,text,labels
0,natug nga sakit ang dughan samot diay igmata,"[0, 0, 0, 1]"
1,"nganong ni uso naman sab ning parvo run uy, ba...","[0, 0, 0, 0]"
2,bay di nako musubaybay ug national finals suno...,"[0, 0, 0, 0]"
3,"hello, its me again! kahuot na ka halang sa du...","[0, 0, 0, 0]"
4,"hello, its me again! kahuot na ka halang sa du...","[0, 0, 0, 0]"


In [None]:
# Turn each label string such as "[0, 0, 0, 1]" into a list of string tokens:
# strip brackets, commas and quotes, then split on whitespace.

import re
new_df['labels'] = new_df['labels'].apply(
    lambda raw: re.sub(r'[\[\],\']', '', raw).split()
)
new_df.head()


Unnamed: 0,text,labels
0,natug nga sakit ang dughan samot diay igmata,"[0, 0, 0, 1]"
1,"nganong ni uso naman sab ning parvo run uy, ba...","[0, 0, 0, 0]"
2,bay di nako musubaybay ug national finals suno...,"[0, 0, 0, 0]"
3,"hello, its me again! kahuot na ka halang sa du...","[0, 0, 0, 0]"
4,"hello, its me again! kahuot na ka halang sa du...","[0, 0, 0, 0]"


In [None]:
type(new_df['labels'][0])

list

In [None]:
for item in new_df['labels'][0]:
  print(type(item))

<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>


In [None]:
# Cast every entry of each row's label list from str to int.

new_df['labels'] = new_df['labels'].apply(lambda row: list(map(int, row)))


In [None]:
new_df['labels'][0]

[0, 0, 0, 1]

In [None]:
for item in new_df['labels'][0]:
  print(type(item))

<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>


In [None]:
# Sections of config

# Defining some key variables that will be used later on in the training
MAX_LEN = 128  # fixed token length passed to the tokenizer as max_length
TRAIN_BATCH_SIZE = 4
VALID_BATCH_SIZE = 4
EPOCHS = 10
# The previous value of 1e-09 is too small for Adam to move the weights: the training
# log recorded in this notebook stays around 0.7-0.8 for all ten epochs. 1e-05 is a
# conventional fine-tuning rate for BERT-style encoders.
LEARNING_RATE = 1e-05
#use uncased distilbert tokenizer
# NOTE: passing truncation=True here does not switch truncation on for later encode
# calls (the tokenizer still warns that truncation was not explicitly activated);
# it has to be requested on each encode call.
tokenizer = DistilBertTokenizer.from_pretrained('/content/drive/MyDrive/BERT MODEL/DistilBERT_FINAL/uncased_model/model', truncation=True, do_lower_case=True)

In [None]:
class MultiLabelDataset(Dataset):
    """Dataset that tokenizes one text row at a time for multi-label training.

    Args:
        dataframe: pandas frame with a ``text`` column and a ``labels`` column;
            each ``labels`` entry is converted with ``torch.tensor(..., dtype=torch.float)``.
        tokenizer: object exposing ``encode_plus`` (a Hugging Face tokenizer in this notebook).
        max_len: fixed length every example is padded or truncated to.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.text
        self.targets = self.data.labels
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        """Return the tensors for the row at position ``index`` (0-based)."""
        # Positional lookup: a DataLoader asks for 0..len-1, which only matches
        # label-based lookup when the frame's index happens to have been reset.
        text = str(self.text.iloc[index])
        # Collapse runs of whitespace into single spaces.
        text = " ".join(text.split())

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            # Explicit replacements for the deprecated pad_to_max_length flag and
            # for the implicit truncation the tokenizer warned about.
            padding='max_length',
            truncation=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets.iloc[index], dtype=torch.float)
        }

In [None]:
# Creating the dataset and dataloader for the neural network

train_size = 0.9
# Sample the training rows first so their index labels can be excluded for the test
# split; both splits then get a fresh 0..n-1 index.
_train_rows = new_df.sample(frac=train_size, random_state=200)
test_data = new_df.drop(_train_rows.index).reset_index(drop=True)
train_data = _train_rows.reset_index(drop=True)


print("FULL Dataset: {}".format(new_df.shape))
print("TRAIN Dataset: {}".format(train_data.shape))
print("TEST Dataset: {}".format(test_data.shape))

# Wrap each split so rows are tokenized on access.
training_set = MultiLabelDataset(train_data, tokenizer, MAX_LEN)
testing_set = MultiLabelDataset(test_data, tokenizer, MAX_LEN)

FULL Dataset: (1340, 2)
TRAIN Dataset: (1206, 2)
TEST Dataset: (134, 2)


In [None]:
train_data.head()

Unnamed: 0,text,labels
0,gi na hilantan man pud q pero nganong dli man ...,"[0, 0, 0, 0]"
1,3 days nakong may lagnat,"[1, 1, 1, 1]"
2,du sakit lgi ni siya gamay sa dughan poide nd ...,"[0, 0, 0, 1]"
3,pagod na pagod na ako,"[0, 0, 0, 0]"
4,ubo nga gipugngan.jk,"[1, 1, 1, 0]"


In [None]:
training_set[0]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


{'ids': tensor([    2,   200,   192,   806, 12544,   116,   276,  6299,    61,  7035,
          1596,    48,   662,   276,   322,   331,  1857,   166,  9638,    37,
             3,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,  

In [None]:
# Loader settings for the two splits; both shuffle and load in the main process.
train_params = dict(batch_size=TRAIN_BATCH_SIZE, shuffle=True, num_workers=0)
test_params = dict(batch_size=VALID_BATCH_SIZE, shuffle=True, num_workers=0)

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [None]:
# Creating the customized model, by adding a drop out and a dense layer on top of distil bert to get the final output for the model.

class DistilBERTClass(torch.nn.Module):
    """DistilBERT encoder followed by a Linear -> tanh -> dropout -> Linear head with 4 outputs."""

    def __init__(self):
        super().__init__()
        #self.l1 = DistilBertModel.from_pretrained("distilbert-base-uncased")
        self.l1 = DistilBertModel.from_pretrained('/content/drive/MyDrive/BERT MODEL/DistilBERT_FINAL/uncased_model/distilBERT')
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(0.1)
        self.classifier = torch.nn.Linear(768, 4)

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return raw logits (no sigmoid applied).

        ``token_type_ids`` is accepted but not passed to the encoder.
        """
        encoder_out = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        # First element of the encoder output, sliced at position 0 along dim 1
        # (presumably the [CLS] token's hidden state — TODO confirm).
        first_token = encoder_out[0][:, 0]
        projected = torch.tanh(self.pre_classifier(first_token))
        return self.classifier(self.dropout(projected))

# Build the network and move it to the selected device; the bare name shows its layout.
model = DistilBERTClass().to(device)
model

DistilBERTClass(
  (l1): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30000, 768, padding_idx=0)
      (position_embeddings): Embedding(256, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Linear(in

In [None]:
def loss_fn(outputs, targets):
    """Binary cross-entropy computed on raw logits, using the loss module's default settings.

    Args:
        outputs: logits produced by the model.
        targets: float tensor of 0/1 labels with the same shape as ``outputs``.
    """
    criterion = torch.nn.BCEWithLogitsLoss()
    return criterion(outputs, targets)

In [None]:
# Adam over every parameter of the model (encoder and head alike).
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [None]:
def train(epoch):
    """Run one pass over ``training_loader``, updating ``model`` through ``optimizer``.

    Args:
        epoch: epoch number; used only in the printed progress message.
    """
    model.train()
    # Wrap the loader itself (not enumerate) so tqdm can read its length and show
    # a real progress bar instead of a bare counter.
    for step, batch in enumerate(tqdm(training_loader)):
        ids = batch['ids'].to(device, dtype=torch.long)
        mask = batch['mask'].to(device, dtype=torch.long)
        token_type_ids = batch['token_type_ids'].to(device, dtype=torch.long)
        targets = batch['targets'].to(device, dtype=torch.float)

        # Clear gradients left over from the previous step before the new backward pass.
        optimizer.zero_grad()
        outputs = model(ids, mask, token_type_ids)
        loss = loss_fn(outputs, targets)
        if step % 5000 == 0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')

        loss.backward()
        optimizer.step()

In [None]:
# Run EPOCHS full passes over the training loader.
for epoch in range(EPOCHS):
    train(epoch)

0it [00:00, ?it/s]

Epoch: 0, Loss:  0.7772393822669983


302it [00:12, 23.48it/s]
3it [00:00, 27.67it/s]

Epoch: 1, Loss:  0.80511474609375


302it [00:11, 25.79it/s]
3it [00:00, 27.56it/s]

Epoch: 2, Loss:  0.7798329591751099


302it [00:11, 25.62it/s]
3it [00:00, 27.55it/s]

Epoch: 3, Loss:  0.7746496200561523


302it [00:11, 25.61it/s]
3it [00:00, 27.55it/s]

Epoch: 4, Loss:  0.7480774521827698


302it [00:11, 25.50it/s]
3it [00:00, 25.45it/s]

Epoch: 5, Loss:  0.7225768566131592


302it [00:12, 25.15it/s]
3it [00:00, 27.55it/s]

Epoch: 6, Loss:  0.6894816160202026


302it [00:11, 25.27it/s]
3it [00:00, 27.15it/s]

Epoch: 7, Loss:  0.714937150478363


302it [00:11, 25.32it/s]
3it [00:00, 27.46it/s]

Epoch: 8, Loss:  0.7054945230484009


302it [00:11, 25.55it/s]
3it [00:00, 27.52it/s]

Epoch: 9, Loss:  0.7803914546966553


302it [00:11, 25.60it/s]


In [None]:
# Earlier draft of validation(), disabled by keeping it inside a string literal;
# nothing in this cell defines a function.
'''
def validation(testing_loader):
    model.eval()
    fin_targets=[]
    fin_outputs=[]
    i = 0
    with torch.no_grad():
        for _, data in tqdm(enumerate(testing_loader, 0)):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.float)
            outputs = model(ids, mask, token_type_ids)
            fin_targets.extend(targets.cpu().detach().numpy().tolist())
            fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
            print(f'{len(data)}:{data}:{fin_outputs[i]}')
            print("="*200)
            i = i + 1
    return fin_outputs, fin_targets
  '''

In [None]:
#  from the validation function, let me see the actual string being passed and not just the number or tensor conversion

def validation(testing_loader, verbose=True):
    """Run ``model`` over a loader and collect sigmoid probabilities and targets.

    Args:
        testing_loader: iterable of batches with 'ids', 'mask', 'token_type_ids'
            and 'targets' tensors.
        verbose: when True (the default), print each batch's token ids next to
            that same batch's probabilities.

    Returns:
        ``(fin_outputs, fin_targets)``: two lists with one entry per sample.
    """
    model.eval()
    fin_targets = []
    fin_outputs = []

    with torch.no_grad():
        for batch in tqdm(testing_loader):
            ids = batch['ids'].to(device, dtype=torch.long)
            mask = batch['mask'].to(device, dtype=torch.long)
            token_type_ids = batch['token_type_ids'].to(device, dtype=torch.long)
            targets = batch['targets'].to(device, dtype=torch.float)
            outputs = model(ids, mask, token_type_ids)

            batch_probs = torch.sigmoid(outputs).cpu().numpy().tolist()
            fin_targets.extend(targets.cpu().numpy().tolist())
            fin_outputs.extend(batch_probs)

            if verbose:
                # Print this batch's own probabilities. Indexing the running
                # per-sample list by batch number paired the ids with an
                # unrelated sample whenever the batch held more than one row.
                print(f'IDS:{ids}:{batch_probs}')
                print("="*200)
    return fin_outputs, fin_targets


In [None]:
testing_loader

<torch.utils.data.dataloader.DataLoader at 0x7c89004b1d20>

In [None]:
testing_set[0]

{'ids': tensor([    2, 13892,   176,  1377,   177,  3710,   580,   232,  2350,   564,
          9559,     3,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,  

In [None]:
# Collect per-sample probabilities and targets for the held-out split.
outputs, targets = validation(testing_loader)

# Binarise: a label counts as predicted when its probability is at least 0.5.
final_outputs = np.array(outputs) >= 0.5

12it [00:00, 58.83it/s]

IDS:tensor([[    2,   181,   211,   192,   355,     3,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     

25it [00:00, 59.38it/s]

IDS:tensor([[    2, 19017,   112, 10453, 19079,  4879, 20109,     3,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     

34it [00:00, 59.14it/s]

IDS:tensor([[    2, 10453,   799,     3,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     




In [None]:
len(final_outputs)

134

In [None]:
final_outputs[2]

array([False, False, False, False])

In [None]:
# Per-sample set-overlap score from hamming_score(), plus scikit-learn's hamming loss.
val_hamming_score = hamming_score(np.array(targets), np.array(final_outputs))
val_hamming_loss = metrics.hamming_loss(targets, final_outputs)

print(f"Hamming Score = {val_hamming_score}")
print(f"Hamming Loss = {val_hamming_loss}")

Hamming Score = 0.7636815920398009
Hamming Loss = 0.14738805970149255


In [None]:
# Unshuffled loader over the held-out rows, so prediction i lines up with row i of test_data.
test_set = MultiLabelDataset(test_data, tokenizer, MAX_LEN)
testing_params = {'batch_size': VALID_BATCH_SIZE,  # evaluation loader: use the validation batch size
               'shuffle': False,
               'num_workers': 2
                }
test_loader = DataLoader(test_set, **testing_params)

In [None]:
# Per-batch probability tensors collected by test(); reset whenever this cell runs.
all_test_pred = []

def test(epoch):
    """Score every batch of ``test_loader`` and append the probabilities to ``all_test_pred``.

    Args:
        epoch: not used by the body; kept so existing calls keep working.

    Returns:
        The probabilities of the last batch, or ``None`` when the loader yields nothing.
    """
    model.eval()
    # Stays None for an empty loader instead of raising UnboundLocalError at return.
    probas = None

    with torch.inference_mode():
        for batch in tqdm(test_loader):
            ids = batch['ids'].to(device, dtype=torch.long)
            mask = batch['mask'].to(device, dtype=torch.long)
            token_type_ids = batch['token_type_ids'].to(device, dtype=torch.long)
            outputs = model(ids, mask, token_type_ids)
            probas = torch.sigmoid(outputs)

            all_test_pred.append(probas)
    return probas

In [None]:
# test() ignores its argument; the full set of predictions accumulates in all_test_pred,
# while the return value holds only the last batch.
probas = test(model)

34it [00:00, 109.23it/s]


In [None]:
# Merge the per-batch tensors into one tensor. The guard keeps the cell re-runnable:
# after the first run the name already holds the merged tensor, which must not be
# passed to torch.cat a second time.
if isinstance(all_test_pred, list):
    all_test_pred = torch.cat(all_test_pred)

In [None]:
all_test_pred

tensor([[0.3263, 0.5134, 0.4299, 0.4660],
        [0.9697, 0.8687, 0.7766, 0.8173],
        [0.8019, 0.6170, 0.3938, 0.3812],
        [0.8619, 0.6339, 0.3762, 0.3565],
        [0.7603, 0.5409, 0.1406, 0.1552],
        [0.8593, 0.6193, 0.3999, 0.3257],
        [0.6607, 0.4226, 0.0809, 0.0482],
        [0.0147, 0.0128, 0.0037, 0.0112],
        [0.3371, 0.3393, 0.2529, 0.1522],
        [0.0191, 0.0083, 0.0053, 0.0128],
        [0.0283, 0.0086, 0.0055, 0.0098],
        [0.3020, 0.3173, 0.2287, 0.2497],
        [0.2590, 0.1518, 0.0961, 0.0502],
        [0.3382, 0.3565, 0.2822, 0.2391],
        [0.2967, 0.2965, 0.2217, 0.1853],
        [0.3181, 0.3568, 0.2970, 0.2373],
        [0.0537, 0.0233, 0.0136, 0.0396],
        [0.0184, 0.0068, 0.0062, 0.0100],
        [0.0217, 0.0089, 0.0066, 0.0092],
        [0.0219, 0.0108, 0.0062, 0.0157],
        [0.0141, 0.0075, 0.0063, 0.0110],
        [0.3302, 0.3842, 0.3239, 0.3033],
        [0.0391, 0.0131, 0.0153, 0.0172],
        [0.0198, 0.0066, 0.0075, 0

In [None]:
# Work on a copy so the prediction columns added below do not alter test_data.
submit_df = test_data.copy()

In [None]:
submit_df

Unnamed: 0,text,labels
0,natug nga sakit ang dughan samot diay igmata,"[0, 0, 0, 1]"
1,grabe akong hilanat. pahilot pa more. ning gaw...,"[1, 0, 0, 0]"
2,pila na ni ka adlaw ang hilanat kag ubo i miss...,"[1, 1, 1, 0]"
3,tambal sa hilanat ug subaw,"[1, 1, 0, 0]"
4,ataya ani nga hilanat oy,"[1, 1, 0, 0]"
...,...,...
129,ayyy okay nagevaporate ang lagnat,"[0, 0, 0, 0]"
130,pagod na nga tapos ganon pa maririnig,"[0, 0, 0, 0]"
131,pagod na.,"[0, 0, 0, 0]"
132,pagod na pagod na akk,"[0, 0, 0, 0]"


In [None]:
# Column names given, in order, to the four model outputs when they are added to submit_df.
label_columns = ["AURI", "COVID", "PN", "TB"]

In [None]:
# Add one probability column per label: column i of the prediction tensor goes to
# label_columns[i]. Converting to NumPy hands pandas a plain array rather than a tensor.
for i, name in enumerate(label_columns):
    submit_df[name] = all_test_pred[:, i].cpu().numpy()

In [None]:
submit_df

Unnamed: 0,text,labels,AURI,COVID,PN,TB
0,natug nga sakit ang dughan samot diay igmata,"[0, 0, 0, 1]",0.326286,0.513393,0.429914,0.466029
1,grabe akong hilanat. pahilot pa more. ning gaw...,"[1, 0, 0, 0]",0.969697,0.868701,0.776618,0.817344
2,pila na ni ka adlaw ang hilanat kag ubo i miss...,"[1, 1, 1, 0]",0.801879,0.617044,0.393825,0.381176
3,tambal sa hilanat ug subaw,"[1, 1, 0, 0]",0.861898,0.633919,0.376201,0.356461
4,ataya ani nga hilanat oy,"[1, 1, 0, 0]",0.760264,0.540941,0.140581,0.155152
...,...,...,...,...,...,...
129,ayyy okay nagevaporate ang lagnat,"[0, 0, 0, 0]",0.898035,0.964393,0.795623,0.582514
130,pagod na nga tapos ganon pa maririnig,"[0, 0, 0, 0]",0.021640,0.008440,0.006722,0.035005
131,pagod na.,"[0, 0, 0, 0]",0.012877,0.007949,0.005584,0.009796
132,pagod na pagod na akk,"[0, 0, 0, 0]",0.011482,0.007161,0.005656,0.011447


In [None]:
# Write the test rows and their predicted probabilities to the working directory.
# With pandas' default settings the frame's index is written as the first column.
submit_df.to_csv('predictions.csv')

In [None]:
# Saving the files for inference

# Destination paths for the weights and the tokenizer vocabulary.
output_model_file = '/content/drive/MyDrive/BERT MODEL/Multi_label_classification_models/multi_model1/model_file1/model1.pt'
output_vocab_file = '/content/drive/MyDrive/BERT MODEL/Multi_label_classification_models/multi_model1/vocab1/vocab1.bin'

# Only the state_dict is saved, so loading requires building DistilBERTClass first.
torch.save(model.state_dict(), output_model_file)
tokenizer.save_vocabulary(output_vocab_file)

print('Saved')

Saved


In [None]:
#model = model1()
# map_location places the checkpoint tensors on the device chosen for this session,
# so weights saved from a GPU run still load on a CPU-only runtime.
model.load_state_dict(torch.load('/content/drive/MyDrive/BERT MODEL/Multi_label_classification_models/multi_model1/model_file1/model1.pt', map_location=device))
model.eval()

DistilBERTClass(
  (l1): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30000, 768, padding_idx=0)
      (position_embeddings): Embedding(256, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Linear(in

In [None]:
#using the model

# Preprocess the input text
input_text = "lantaw sa ko ug tv lol"
encoded_text = tokenizer.encode_plus(
    input_text,
    None,
    add_special_tokens=True,
    max_length=MAX_LEN,
    # Explicit replacements for the deprecated pad_to_max_length flag; truncation is
    # requested outright so inputs longer than MAX_LEN are cut without a warning.
    padding='max_length',
    truncation=True,
    return_token_type_ids=True
)

# Convert the input to tensors, adding a leading batch dimension of 1
input_ids = torch.tensor(encoded_text['input_ids']).unsqueeze(0)
input_mask = torch.tensor(encoded_text['attention_mask']).unsqueeze(0)
segment_ids = torch.tensor(encoded_text['token_type_ids']).unsqueeze(0)

# Move tensors to the device
input_ids = input_ids.to(device)
input_mask = input_mask.to(device)
segment_ids = segment_ids.to(device)

# Make predictions (no gradients are tracked, so no detach is needed afterwards)
with torch.no_grad():
    outputs = model(input_ids, input_mask, segment_ids)

    # Apply sigmoid activation function to turn logits into per-label probabilities
    outputs = torch.sigmoid(outputs)

# Convert the outputs to numpy array
outputs = outputs.cpu().numpy()

# Print the predictions
print(outputs)


[[0.22535858 0.2253054  0.14383467 0.20684576]]
