In [1]:
!pip install transformers



In [2]:
# Importing stock ml libraries
import warnings
warnings.simplefilter('ignore')
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn import metrics
import transformers
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import DistilBertTokenizer, DistilBertModel
import logging
logging.basicConfig(level=logging.ERROR)

In [3]:
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

In [4]:
device

'cuda'

In [5]:
def hamming_score(y_true, y_pred, normalize=True, sample_weight=None):
    acc_list = []
    for i in range(y_true.shape[0]):
        set_true = set( np.where(y_true[i])[0] )
        set_pred = set( np.where(y_pred[i])[0] )
        tmp_a = None
        if len(set_true) == 0 and len(set_pred) == 0:
            tmp_a = 1
        else:
            tmp_a = len(set_true.intersection(set_pred))/\
                    float( len(set_true.union(set_pred)) )
        acc_list.append(tmp_a)
    return np.mean(acc_list)

In [6]:
data = pd.read_csv('/content/drive/MyDrive/BERT MODEL/DATASET/classified1.csv')

In [7]:
data

Unnamed: 0,annotate,post,language
0,"[0, 0, 0, 1]",natug nga sakit ang dughan samot diay igmata,__label__ceb_Latn
1,"[0, 0, 0, 0]","nganong ni uso naman sab ning parvo run uy, ba...",__label__ceb_Latn
2,"[0, 0, 0, 0]",bay di nako musubaybay ug national finals suno...,__label__ceb_Latn
3,"[0, 0, 0, 0]","hello, its me again! kahuot na ka halang sa du...",__label__ceb_Latn
4,"[0, 0, 0, 0]","hello, its me again! kahuot na ka halang sa du...",__label__ceb_Latn
...,...,...,...
1335,"[1, 1, 1, 0]",nga a subong pa ko gi lagnat? huhu okininam,__label__ceb_Latn
1336,"[0, 0, 0, 0]","in another life ha? tangina, wag na pagod na ako",__label__war_Latn
1337,"[0, 0, 0, 0]",pagod nako.,__label__ceb_Latn
1338,"[0, 0, 0, 0]",pagod na pagod na sha,__label__war_Latn


In [8]:
data.drop(['language'], inplace=True, axis=1)

In [9]:
data

Unnamed: 0,annotate,post
0,"[0, 0, 0, 1]",natug nga sakit ang dughan samot diay igmata
1,"[0, 0, 0, 0]","nganong ni uso naman sab ning parvo run uy, ba..."
2,"[0, 0, 0, 0]",bay di nako musubaybay ug national finals suno...
3,"[0, 0, 0, 0]","hello, its me again! kahuot na ka halang sa du..."
4,"[0, 0, 0, 0]","hello, its me again! kahuot na ka halang sa du..."
...,...,...
1335,"[1, 1, 1, 0]",nga a subong pa ko gi lagnat? huhu okininam
1336,"[0, 0, 0, 0]","in another life ha? tangina, wag na pagod na ako"
1337,"[0, 0, 0, 0]",pagod nako.
1338,"[0, 0, 0, 0]",pagod na pagod na sha


In [10]:
new_df = pd.DataFrame()
new_df['text'] = data['post']
new_df['labels'] = data['annotate'].values.tolist()

In [11]:
new_df.head()

Unnamed: 0,text,labels
0,natug nga sakit ang dughan samot diay igmata,"[0, 0, 0, 1]"
1,"nganong ni uso naman sab ning parvo run uy, ba...","[0, 0, 0, 0]"
2,bay di nako musubaybay ug national finals suno...,"[0, 0, 0, 0]"
3,"hello, its me again! kahuot na ka halang sa du...","[0, 0, 0, 0]"
4,"hello, its me again! kahuot na ka halang sa du...","[0, 0, 0, 0]"


In [12]:
#convert the labels column values from string to list and remove the characters , and [ and ] from the list

import re
new_df['labels'] = new_df['labels'].apply(lambda x: re.sub(r'[\[\],\']', '', x))
new_df['labels'] = new_df['labels'].apply(lambda x: x.split())
new_df.head()


Unnamed: 0,text,labels
0,natug nga sakit ang dughan samot diay igmata,"[0, 0, 0, 1]"
1,"nganong ni uso naman sab ning parvo run uy, ba...","[0, 0, 0, 0]"
2,bay di nako musubaybay ug national finals suno...,"[0, 0, 0, 0]"
3,"hello, its me again! kahuot na ka halang sa du...","[0, 0, 0, 0]"
4,"hello, its me again! kahuot na ka halang sa du...","[0, 0, 0, 0]"


In [13]:
type(new_df['labels'][0])

list

In [14]:
for item in new_df['labels'][0]:
  print(type(item))

<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>


In [15]:
#change the datatype of the items in the list into int values

new_df['labels'] = new_df['labels'].apply(lambda x: [int(item) for item in x])


In [16]:
new_df['labels'][0]

[0, 0, 0, 1]

In [17]:
for item in new_df['labels'][0]:
  print(type(item))

<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>


In [18]:
# Sections of config

# Defining some key variables that will be used later on in the training
MAX_LEN = 128
TRAIN_BATCH_SIZE = 32
VALID_BATCH_SIZE = 32
EPOCHS = 15
LEARNING_RATE = 1e-05
#use uncased distilbert tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('/content/drive/MyDrive/BERT MODEL/DistilBERT_FINAL/uncased_model/model', truncation=True, do_lower_case=True)

In [19]:
class MultiLabelDataset(Dataset):

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.text
        self.targets = self.data.labels
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        text = str(self.text[index])
        text = " ".join(text.split())

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]


        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }

In [20]:
# Creating the dataset and dataloader for the neural network

train_size = 0.9
train_data=new_df.sample(frac=train_size,random_state=200)
test_data=new_df.drop(train_data.index).reset_index(drop=True)
train_data = train_data.reset_index(drop=True)


print("FULL Dataset: {}".format(new_df.shape))
print("TRAIN Dataset: {}".format(train_data.shape))
print("TEST Dataset: {}".format(test_data.shape))

training_set = MultiLabelDataset(train_data, tokenizer, MAX_LEN)
testing_set = MultiLabelDataset(test_data, tokenizer, MAX_LEN)

FULL Dataset: (1340, 2)
TRAIN Dataset: (1206, 2)
TEST Dataset: (134, 2)


In [21]:
train_data.head()

Unnamed: 0,text,labels
0,gi na hilantan man pud q pero nganong dli man ...,"[0, 0, 0, 0]"
1,3 days nakong may lagnat,"[1, 1, 1, 1]"
2,du sakit lgi ni siya gamay sa dughan poide nd ...,"[0, 0, 0, 1]"
3,pagod na pagod na ako,"[0, 0, 0, 0]"
4,ubo nga gipugngan.jk,"[1, 1, 1, 0]"


In [22]:
training_set[0]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


{'ids': tensor([    2,   200,   192,   806, 12544,   116,   276,  6299,    61,  7035,
          1596,    48,   662,   276,   322,   331,  1857,   166,  9638,    37,
             3,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,  

In [23]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [24]:
# Creating the customized model, by adding a drop out and a dense layer on top of distil bert to get the final output for the model.

class DistilBERTClass(torch.nn.Module):
    def __init__(self):
        super(DistilBERTClass, self).__init__()
        #self.l1 = DistilBertModel.from_pretrained("distilbert-base-uncased")
        self.l1 = DistilBertModel.from_pretrained('/content/drive/MyDrive/BERT MODEL/DistilBERT_FINAL/uncased_model/distilBERT')
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(0.1)
        self.classifier = torch.nn.Linear(768, 4)

    def forward(self, input_ids, attention_mask, token_type_ids):
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = output_1[0]
        pooler = hidden_state[:, 0]
        pooler = self.pre_classifier(pooler)
        pooler = torch.nn.Tanh()(pooler)
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        return output

model = DistilBERTClass()
model.to(device)

DistilBERTClass(
  (l1): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30000, 768, padding_idx=0)
      (position_embeddings): Embedding(256, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Linear(in

In [25]:
def loss_fn(outputs, targets):
    return torch.nn.BCEWithLogitsLoss()(outputs, targets)

In [26]:
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

In [27]:
def train(epoch):
    model.train()
    for _,data in tqdm(enumerate(training_loader, 0)):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.float)

        outputs = model(ids, mask, token_type_ids)

        optimizer.zero_grad()
        loss = loss_fn(outputs, targets)
        if _%5000==0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')

        loss.backward()
        optimizer.step()

In [28]:
for epoch in range(EPOCHS):
    train(epoch)

0it [00:00, ?it/s]

Epoch: 0, Loss:  0.6488316655158997


38it [00:06,  5.62it/s]
2it [00:00,  8.45it/s]

Epoch: 1, Loss:  0.5111367106437683


38it [00:05,  6.95it/s]
2it [00:00,  8.30it/s]

Epoch: 2, Loss:  0.421830415725708


38it [00:05,  6.94it/s]
2it [00:00,  8.22it/s]

Epoch: 3, Loss:  0.4125766158103943


38it [00:05,  6.86it/s]
2it [00:00,  8.33it/s]

Epoch: 4, Loss:  0.4152654707431793


38it [00:05,  6.91it/s]
2it [00:00,  8.06it/s]

Epoch: 5, Loss:  0.3380683660507202


38it [00:05,  6.90it/s]
2it [00:00,  8.22it/s]

Epoch: 6, Loss:  0.39669251441955566


38it [00:05,  6.88it/s]
2it [00:00,  8.21it/s]

Epoch: 7, Loss:  0.4480550289154053


38it [00:05,  6.87it/s]
2it [00:00,  8.14it/s]

Epoch: 8, Loss:  0.31162452697753906


38it [00:05,  6.89it/s]
2it [00:00,  8.18it/s]

Epoch: 9, Loss:  0.3388897776603699


38it [00:05,  6.88it/s]
2it [00:00,  8.25it/s]

Epoch: 10, Loss:  0.3796604871749878


38it [00:05,  6.87it/s]
2it [00:00,  8.26it/s]

Epoch: 11, Loss:  0.21936964988708496


38it [00:05,  6.86it/s]
2it [00:00,  8.28it/s]

Epoch: 12, Loss:  0.29467910528182983


38it [00:05,  6.85it/s]
2it [00:00,  8.25it/s]

Epoch: 13, Loss:  0.2746157646179199


38it [00:05,  6.84it/s]
2it [00:00,  8.21it/s]

Epoch: 14, Loss:  0.39140233397483826


38it [00:05,  6.83it/s]


In [29]:
'''
def validation(testing_loader):
    model.eval()
    fin_targets=[]
    fin_outputs=[]
    i = 0
    with torch.no_grad():
        for _, data in tqdm(enumerate(testing_loader, 0)):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.float)
            outputs = model(ids, mask, token_type_ids)
            fin_targets.extend(targets.cpu().detach().numpy().tolist())
            fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
            print(f'{len(data)}:{data}:{fin_outputs[i]}')
            print("="*200)
            i = i + 1
    return fin_outputs, fin_targets
  '''

'\ndef validation(testing_loader):\n    model.eval()\n    fin_targets=[]\n    fin_outputs=[]\n    i = 0\n    with torch.no_grad():\n        for _, data in tqdm(enumerate(testing_loader, 0)):\n            ids = data[\'ids\'].to(device, dtype = torch.long)\n            mask = data[\'mask\'].to(device, dtype = torch.long)\n            token_type_ids = data[\'token_type_ids\'].to(device, dtype = torch.long)\n            targets = data[\'targets\'].to(device, dtype = torch.float)\n            outputs = model(ids, mask, token_type_ids)\n            fin_targets.extend(targets.cpu().detach().numpy().tolist())\n            fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())\n            print(f\'{len(data)}:{data}:{fin_outputs[i]}\')\n            print("="*200)\n            i = i + 1\n    return fin_outputs, fin_targets\n  '

In [30]:
#  from the validation function, let me see the actual string being passed and not just the number or tensor conversion

def validation(testing_loader):
    model.eval()
    fin_targets=[]
    fin_outputs=[]

    i = 0
    with torch.no_grad():
        for _, data in tqdm(enumerate(testing_loader, 0)):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.float)
            outputs = model(ids, mask, token_type_ids)
            fin_targets.extend(targets.cpu().detach().numpy().tolist())
            fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
            print(f'IDS:{ids}:{fin_outputs[i]}')
            print("="*200)
            i = i + 1
    return fin_outputs, fin_targets


In [31]:
testing_loader

<torch.utils.data.dataloader.DataLoader at 0x78d672b47490>

In [32]:
testing_set[0]

{'ids': tensor([    2, 13892,   176,  1377,   177,  3710,   580,   232,  2350,   564,
          9559,     3,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,  

In [33]:
outputs, targets = validation(testing_loader)

final_outputs = np.array(outputs) >= 0.5

4it [00:00, 14.64it/s]

IDS:tensor([[    2,  1122,   799,  ...,     0,     0,     0],
        [    2,   181,   211,  ...,     0,     0,     0],
        [    2, 10453,   799,  ...,     0,     0,     0],
        ...,
        [    2,   247, 29748,  ...,     0,     0,     0],
        [    2, 17021, 11448,  ...,     0,     0,     0],
        [    2, 10453,   166,  ...,     0,     0,     0]], device='cuda:0'):[0.757794201374054, 0.48084917664527893, 0.2973741590976715, 0.38846006989479065]
IDS:tensor([[    2,  1233,  2131,  ...,   838, 22424,     3],
        [    2,   181,   211,  ...,     0,     0,     0],
        [    2,   181,   211,  ...,     0,     0,     0],
        ...,
        [    2, 18436, 13426,  ...,     0,     0,     0],
        [    2,   719,   307,  ...,     0,     0,     0],
        [    2,  4638,   276,  ...,     0,     0,     0]], device='cuda:0'):[0.019454624503850937, 0.016285838559269905, 0.012967376969754696, 0.02356971800327301]
IDS:tensor([[    2, 27674,   164,  ...,     0,     0,     0],
  

5it [00:00, 15.08it/s]

IDS:tensor([[    2,   450,   175,     7,     7,     7,  3535, 17021,   300,  9652,
          5768,  1765,  4915,  3624,  2454,    20,  6809,   782,   115,  1999,
            37,     3,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     




In [34]:
len(final_outputs)

134

In [35]:
final_outputs[2]

array([False, False, False, False])

In [36]:
val_hamming_loss = metrics.hamming_loss(targets, final_outputs)
val_hamming_score = hamming_score(np.array(targets), np.array(final_outputs))

print(f"Hamming Score = {val_hamming_score}")
print(f"Hamming Loss = {val_hamming_loss}")

Hamming Score = 0.7605721393034826
Hamming Loss = 0.16044776119402984


In [37]:
from sklearn.metrics import f1_score

In [38]:
#f1 score test
score_dat = f1_score(np.array(targets),np.array(final_outputs),average='weighted')
print(score_dat)

0.5564557171938974


In [39]:
test_set = MultiLabelDataset(test_data, tokenizer, MAX_LEN)
testing_params = {'batch_size': TRAIN_BATCH_SIZE,
               'shuffle': False,
               'num_workers': 2
                }
test_loader = DataLoader(test_set, **testing_params)

In [40]:
all_test_pred = []

def test(epoch):
    model.eval()

    with torch.inference_mode():

        for _, data in tqdm(enumerate(test_loader, 0)):


            ids = data['ids'].to(device, dtype=torch.long)
            mask = data['mask'].to(device, dtype=torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
            outputs = model(ids, mask, token_type_ids)
            probas = torch.sigmoid(outputs)

            all_test_pred.append(probas)
    return probas

In [41]:
probas = test(model)

5it [00:00, 18.59it/s]


In [42]:
all_test_pred = torch.cat(all_test_pred)

In [43]:
all_test_pred

tensor([[0.2517, 0.4728, 0.5209, 0.6870],
        [0.8648, 0.6153, 0.4770, 0.5255],
        [0.6785, 0.3536, 0.2533, 0.2993],
        [0.7439, 0.4160, 0.2840, 0.3786],
        [0.5250, 0.1932, 0.0754, 0.1401],
        [0.8239, 0.4405, 0.3229, 0.3411],
        [0.4198, 0.1785, 0.0789, 0.0625],
        [0.0377, 0.0434, 0.0138, 0.0251],
        [0.1669, 0.1336, 0.1476, 0.0950],
        [0.0327, 0.0227, 0.0241, 0.0519],
        [0.0548, 0.0292, 0.0293, 0.0317],
        [0.2321, 0.2001, 0.2231, 0.2179],
        [0.0668, 0.0429, 0.0582, 0.0605],
        [0.2770, 0.2085, 0.3068, 0.3571],
        [0.0783, 0.0438, 0.0600, 0.0808],
        [0.2716, 0.2111, 0.1536, 0.1394],
        [0.0249, 0.0195, 0.0266, 0.0420],
        [0.0396, 0.0198, 0.0176, 0.0316],
        [0.0276, 0.0166, 0.0217, 0.0310],
        [0.0341, 0.0249, 0.0314, 0.0445],
        [0.0354, 0.0243, 0.0298, 0.0403],
        [0.1822, 0.1758, 0.2228, 0.2747],
        [0.0387, 0.0263, 0.0253, 0.0328],
        [0.0373, 0.0251, 0.0364, 0

In [44]:
submit_df = test_data.copy()

In [45]:
submit_df

Unnamed: 0,text,labels
0,natug nga sakit ang dughan samot diay igmata,"[0, 0, 0, 1]"
1,grabe akong hilanat. pahilot pa more. ning gaw...,"[1, 0, 0, 0]"
2,pila na ni ka adlaw ang hilanat kag ubo i miss...,"[1, 1, 1, 0]"
3,tambal sa hilanat ug subaw,"[1, 1, 0, 0]"
4,ataya ani nga hilanat oy,"[1, 1, 0, 0]"
...,...,...
129,ayyy okay nagevaporate ang lagnat,"[0, 0, 0, 0]"
130,pagod na nga tapos ganon pa maririnig,"[0, 0, 0, 0]"
131,pagod na.,"[0, 0, 0, 0]"
132,pagod na pagod na akk,"[0, 0, 0, 0]"


In [46]:
label_columns = ["AURI", "COVID", "PN", "TB"]

In [47]:
for i,name in enumerate(label_columns):

    submit_df[name] = all_test_pred[:, i].cpu()
    submit_df.head()

In [48]:
submit_df

Unnamed: 0,text,labels,AURI,COVID,PN,TB
0,natug nga sakit ang dughan samot diay igmata,"[0, 0, 0, 1]",0.251655,0.472816,0.520946,0.687012
1,grabe akong hilanat. pahilot pa more. ning gaw...,"[1, 0, 0, 0]",0.864776,0.615338,0.476998,0.525525
2,pila na ni ka adlaw ang hilanat kag ubo i miss...,"[1, 1, 1, 0]",0.678504,0.353632,0.253286,0.299339
3,tambal sa hilanat ug subaw,"[1, 1, 0, 0]",0.743897,0.416011,0.283952,0.378571
4,ataya ani nga hilanat oy,"[1, 1, 0, 0]",0.524960,0.193204,0.075446,0.140146
...,...,...,...,...,...,...
129,ayyy okay nagevaporate ang lagnat,"[0, 0, 0, 0]",0.714260,0.762773,0.432582,0.338782
130,pagod na nga tapos ganon pa maririnig,"[0, 0, 0, 0]",0.019734,0.015952,0.012553,0.029234
131,pagod na.,"[0, 0, 0, 0]",0.019455,0.016286,0.012967,0.023570
132,pagod na pagod na akk,"[0, 0, 0, 0]",0.017343,0.016335,0.013033,0.024710


In [49]:
submit_df.to_csv('predictions.csv')

In [50]:
# Saving the files for inference

#output_model_file = '/content/drive/MyDrive/BERT MODEL/Multi_label_classification_models/multi_model1/model_file1/model1.pt'
#output_vocab_file = '/content/drive/MyDrive/BERT MODEL/Multi_label_classification_models/multi_model1/vocab1/vocab1.bin'

#torch.save(model.state_dict(), output_model_file)
#tokenizer.save_vocabulary(output_vocab_file)

#print('Saved')

In [51]:
#model = model1()
model.load_state_dict(torch.load('/content/drive/MyDrive/BERT MODEL/Multi_label_classification_models/multi_model1/model_file1/model1.pt'))
model.eval()

DistilBERTClass(
  (l1): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30000, 768, padding_idx=0)
      (position_embeddings): Embedding(256, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Linear(in

In [52]:
#using the model

# Preprocess the input text
input_text = "lantaw sa ko ug tv lol"
encoded_text = tokenizer.encode_plus(
    input_text,
    None,
    add_special_tokens=True,
    max_length=MAX_LEN,
    pad_to_max_length=True,
    return_token_type_ids=True
)

# Convert the input to tensors
input_ids = torch.tensor(encoded_text['input_ids']).unsqueeze(0)
input_mask = torch.tensor(encoded_text['attention_mask']).unsqueeze(0)
segment_ids = torch.tensor(encoded_text['token_type_ids']).unsqueeze(0)

# Move tensors to the device
input_ids = input_ids.to(device)
input_mask = input_mask.to(device)
segment_ids = segment_ids.to(device)

# Make predictions
with torch.no_grad():
    outputs = model(input_ids, input_mask, segment_ids)

# Apply sigmoid activation function
outputs = torch.sigmoid(outputs)

# Convert the outputs to numpy array
outputs = outputs.cpu().detach().numpy()

# Print the predictions
print(outputs)


[[0.22535858 0.2253054  0.14383467 0.20684576]]
