<a href="https://colab.research.google.com/github/dohyeongkim97/papers/blob/master/bert_clf.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [2]:
import torch
import pandas as pd
import numpy as np

import random
from transformers import set_seed

seed = 42
random.seed(seed)
torch.manual_seed(seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)
set_seed(seed)

In [3]:
torch.cuda.is_available()

True

In [4]:
import os
print(os.getcwd())

/content


In [5]:
df = pd.read_csv("./drive/MyDrive/paper_data-master/train.csv")
test = pd.read_csv("./drive/MyDrive/paper_data-master/test.csv")

In [6]:
df

Unnamed: 0,ID,first_party,second_party,facts,first_party_winner
0,TRAIN_0000,Phil A. St. Amant,Herman A. Thompson,"On June 27, 1962, Phil St. Amant, a candidate ...",1
1,TRAIN_0001,Stephen Duncan,Lawrence Owens,Ramon Nelson was riding his bike when he suffe...,0
2,TRAIN_0002,Billy Joe Magwood,"Tony Patterson, Warden, et al.",An Alabama state court convicted Billy Joe Mag...,1
3,TRAIN_0003,Linkletter,Walker,Victor Linkletter was convicted in state court...,0
4,TRAIN_0004,William Earl Fikes,Alabama,"On April 24, 1953 in Selma, Alabama, an intrud...",1
...,...,...,...,...,...
2473,TRAIN_2473,"HollyFrontier Cheyenne Refining, LLC, et al.","Renewable Fuels Association, et al.",Congress amended the Clean Air Act through the...,1
2474,TRAIN_2474,"Grupo Mexicano de Desarrollo, S. A.","Alliance Bond Fund, Inc.","Alliance Bond Fund, Inc., an investment fund, ...",1
2475,TRAIN_2475,Peguero,United States,"In 1992, the District Court sentenced Manuel D...",0
2476,TRAIN_2476,Immigration and Naturalization Service,St. Cyr,"On March 8, 1996, Enrico St. Cyr, a lawful per...",0


In [7]:
os.listdir('./drive/MyDrive/paper_data-master/')

['sample_submission.csv',
 'test.csv',
 'train.csv',
 'judge',
 'training.1600000.processed.noemoticon.csv',
 'model_bert_classification']

In [8]:
from transformers import BertTokenizer
from torch.utils.data import TensorDataset, DataLoader
from torch.utils.data import RandomSampler, SequentialSampler
from torch import optim
from transformers import BertForSequenceClassification
from torch import nn
import math
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

In [9]:
!pip install googletrans
!pip install nltk



In [10]:
import random
from googletrans import Translator
import nltk
from nltk.corpus import wordnet
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [11]:
def synonym_replacement(text, replace_ratio=0.1):
    words = text.split()
    new_words = words[:]
    random_word_list = list(set([word for word in words if wordnet.synsets(word)]))
    random.shuffle(random_word_list)
    num_replacements = max(1, int(len(words) * replace_ratio))

    for random_word in random_word_list[:num_replacements]:
        synonyms = wordnet.synsets(random_word)
        synonym_words = [syn.lemmas()[0].name() for syn in synonyms]
        if synonym_words:
            synonym = random.choice(synonym_words)
            new_words = [synonym if word == random_word else word for word in new_words]

    return ' '.join(new_words)

def back_translation(text, lang='ko'):
    translated = self.translator.translate(text, src='en', dest=lang).text
    back_translated = self.translator.translate(translated, src=lang, dest='en').text
    return back_translated

def random_insertion(text, insert_word="convincingly"):
    words = text.split()
    insert_position = random.randint(0, len(words))
    words.insert(insert_position, insert_word)
    return ' '.join(words)

def order_switching( text):
    if "and" in text:
        parts = text.split("and")
        switched_text = "and".join(parts[::-1]).strip()
        return switched_text
    return text

def add_typo(text, typo_probability=0.1):
    chars = list(text)
    for i in range(len(chars)):
        if random.random() < typo_probability:
            chars[i] = random.choice("abcdefghijklmnopqrstuvwxyz")
    return ''.join(chars)

def augment_text( text):
    augmentations = [
        self.synonym_replacement,
        self.back_translation,
        self.random_insertion,
        self.order_switching,
        self.add_typo
    ]

    augmented_texts = {}
    for func in augmentations:
        augmented_texts[func.__name__] = func(text)

    return augmented_texts

In [12]:
txt = df.loc[1, 'facts']

In [13]:
txt

'Ramon Nelson was riding his bike when he suffered a lethal blow to the back of his head with a baseball bat. After two eyewitnesses identified Lawrence Owens from an array of photos and then a lineup, he was tried and convicted for Nelson’s death. Because Nelson was carrying cocaine and crack cocaine potentially for distribution, the judge at Owens’ bench trial ruled that Owens was probably also a drug dealer and was trying to “knock [Nelson] off.” Owens was found guilty of first-degree murder and sentenced to 25 years in prison.\nOwens filed a petition for a writ of habeas corpus on the grounds that his constitutional right to due process was violated during the trial. He argued that the eyewitness identification should have been inadmissible based on unreliability and that the judge impermissibly inferred a motive when a motive was not an element of the offense. The district court denied the writ of habeas corpus, and Owens appealed. The U.S. Court of Appeals for the Seventh Circuit

In [14]:
augmented = synonym_replacement(txt, replace_ratio = 0.2)

In [15]:
random.uniform(0, 1)

0.06352770615195713

In [16]:
new_df = pd.DataFrame(columns = ['facts', 'first_party_winner'])
for _ in range(0, 5):
    for i in range(len(df)):
        if random.uniform(0, 1) < 0.25:
            txt = df.loc[i, 'facts']
            augmented = synonym_replacement(txt, replace_ratio = 0.2)
            new_df.loc[len(new_df), 'first_party'] = df.loc[i, 'first_party']
            new_df.loc[len(new_df)-1, 'second_party'] = df.loc[i, 'second_party']
            new_df.loc[len(new_df)-1, 'facts'] = augmented
            new_df.loc[len(new_df)-1, 'first_party_winner'] = df.loc[i, 'first_party_winner']

In [17]:
new_df

Unnamed: 0,facts,first_party_winner,first_party,second_party
0,"A New York town, Clarkstown, allowed angstrom ...",1,"C & A Carbone, Inc., et al.",Town of Clarkstown
1,"Since its first express Constitution in 1796, ...",1,Paul A. McDaniel,"Selma Cash Paty, et al."
2,mention plaintiff James peg and others bring a...,0,"James J. Thole, et al.","U.S. Bank, N.A., et al."
3,A revision to the Texas education law in 1975 ...,0,Plyler,Doe
4,"Board No. 47, Louisville, Kentucky, denied the...",1,Clay,United States
...,...,...,...,...
3137,"in 1992, Michael Bies was convict of kidnappin...",1,"David Bobby, Warden",Michael Bies
3138,J. W. gamble exist a prisoner inch the Hunting...,1,"W. J. Estelle, Jr., Director, Texas Department...",J. W. Gamble
3139,Jeffrey Heffernan was angstrom police officer ...,1,Jeffrey Heffernan,City of Paterson
3140,"Indiana the spring of 1994, Hana Bank, A Korea...",0,"Hana Financial, Inc.","Hana Bank, et al."


In [24]:
def frame_make(df):
    if 'first_party_winner' in df.columns:
        data = pd.DataFrame(columns = ['text', 'target'])
        df['first_party_winner'] = df['first_party_winner'].astype(int)
        data['text'] = 'first_party:' + df['first_party'] + 'second_party:' + df['second_party'] + 'facts:' + df['facts'] + '\nwinner: '
        data['target'] = df['first_party_winner']
        return data
    else:
        data = pd.DataFrame(columns = ['text'])
        data['text'] = 'first_party:' + df['first_party'] + 'second_party:' + df['second_party'] + 'facts:' + df['facts'] + '\nwinner: '
        return data


In [19]:
df = df[['first_party', 'second_party', 'facts', 'first_party_winner']]

In [20]:
df = pd.concat([df,  new_df], axis=0)
df

Unnamed: 0,first_party,second_party,facts,first_party_winner
0,Phil A. St. Amant,Herman A. Thompson,"On June 27, 1962, Phil St. Amant, a candidate ...",1
1,Stephen Duncan,Lawrence Owens,Ramon Nelson was riding his bike when he suffe...,0
2,Billy Joe Magwood,"Tony Patterson, Warden, et al.",An Alabama state court convicted Billy Joe Mag...,1
3,Linkletter,Walker,Victor Linkletter was convicted in state court...,0
4,William Earl Fikes,Alabama,"On April 24, 1953 in Selma, Alabama, an intrud...",1
...,...,...,...,...
3137,"David Bobby, Warden",Michael Bies,"in 1992, Michael Bies was convict of kidnappin...",1
3138,"W. J. Estelle, Jr., Director, Texas Department...",J. W. Gamble,J. W. gamble exist a prisoner inch the Hunting...,1
3139,Jeffrey Heffernan,City of Paterson,Jeffrey Heffernan was angstrom police officer ...,1
3140,"Hana Financial, Inc.","Hana Bank, et al.","Indiana the spring of 1994, Hana Bank, A Korea...",0


In [21]:
df = df.reset_index(drop=True)

In [23]:
df

Unnamed: 0,first_party,second_party,facts,first_party_winner
0,Phil A. St. Amant,Herman A. Thompson,"On June 27, 1962, Phil St. Amant, a candidate ...",1
1,Stephen Duncan,Lawrence Owens,Ramon Nelson was riding his bike when he suffe...,0
2,Billy Joe Magwood,"Tony Patterson, Warden, et al.",An Alabama state court convicted Billy Joe Mag...,1
3,Linkletter,Walker,Victor Linkletter was convicted in state court...,0
4,William Earl Fikes,Alabama,"On April 24, 1953 in Selma, Alabama, an intrud...",1
...,...,...,...,...
5615,"David Bobby, Warden",Michael Bies,"in 1992, Michael Bies was convict of kidnappin...",1
5616,"W. J. Estelle, Jr., Director, Texas Department...",J. W. Gamble,J. W. gamble exist a prisoner inch the Hunting...,1
5617,Jeffrey Heffernan,City of Paterson,Jeffrey Heffernan was angstrom police officer ...,1
5618,"Hana Financial, Inc.","Hana Bank, et al.","Indiana the spring of 1994, Hana Bank, A Korea...",0


In [25]:
def make_dataset(data, tokenizer, device):
    tokenized = tokenizer(
        text = data.text.tolist(),
        padding= 'longest',
        truncation = True,
        return_tensors = 'pt'
    )
    input_ids = tokenized['input_ids'].to(device)
    attention_mask = tokenized['attention_mask'].to(device)
    if 'label' in data.columns:
        labels = torch.tensor(data['label'].values, dtype=torch.long).to(device)
        return TensorDataset(input_ids, attention_mask, labels)
    else:
        return TensorDataset(input_ids, attention_mask)

In [26]:
def get_dataloader(dataset, sampler, batch_size):
    data_sampler = sampler(dataset)
    dataloader = DataLoader(dataset, sampler = data_sampler, batch_size = batch_size)
    return dataloader

In [27]:
df = frame_make(df)

In [28]:
df.columns = ['text', 'label']

In [29]:
epochs = 5
batch_size = 16
device = 'cuda' if torch.cuda.is_available() else 'cpu'
tokenizer = BertTokenizer.from_pretrained(
    pretrained_model_name_or_path='bert-base-multilingual-cased',
    do_lower_case = False
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [30]:
df

Unnamed: 0,text,label
0,first_party:Phil A. St. Amantsecond_party:Herm...,1
1,first_party:Stephen Duncansecond_party:Lawrenc...,0
2,first_party:Billy Joe Magwoodsecond_party:Tony...,1
3,first_party:Linklettersecond_party:Walkerfacts...,0
4,first_party:William Earl Fikessecond_party:Ala...,1
...,...,...
5615,"first_party:David Bobby, Wardensecond_party:Mi...",1
5616,"first_party:W. J. Estelle, Jr., Director, Texa...",1
5617,first_party:Jeffrey Heffernansecond_party:City...,1
5618,"first_party:Hana Financial, Inc.second_party:H...",0


In [31]:
class_0 = df[df['label'] == 0]
class_1 = df[df['label'] == 1]

class_0_oversampled = class_0.sample(len(class_1)+100, replace=True, random_state=42)
df = pd.concat([class_0_oversampled, class_1], axis=0).sample(frac=1).reset_index(drop=True)

In [32]:
df

Unnamed: 0,text,label
0,first_party:Denver Area Educational Telecommun...,0
1,first_party:Albert Ohraliksecond_party:Ohio St...,0
2,first_party:Kansassecond_party:Jonathan and Re...,1
3,first_party:Keith Lavon Burgesssecond_party:Un...,0
4,"first_party:Wisconsin Central Ltd., et al.seco...",1
...,...,...
7529,"first_party:Employment Division, Department of...",1
7530,"first_party:Brigham City, Utahsecond_party:Cha...",1
7531,first_party:Bordenkirchersecond_party:Hayesfac...,1
7532,first_party:Great-West Life and Annuity Insura...,0


In [33]:
train_df, valid, test = np.split(
    df.sample(frac = 1, random_state = 42), [int(0.6*len(df)), int(0.8*len(df))]
)

  return bound(*args, **kwds)


In [34]:
train_dataset = make_dataset(train_df, tokenizer, device)
train_dataloader = get_dataloader(train_dataset, RandomSampler, batch_size)

valid_dataset = make_dataset(valid, tokenizer, device)
valid_dataloader = get_dataloader(valid_dataset, RandomSampler, batch_size)

test_dataset = make_dataset(test, tokenizer, device)
test_dataloader = get_dataloader(test_dataset, RandomSampler, batch_size)

In [35]:
train_dataset[15]

(tensor([   101,  10422,    168,  14039,    131,  10900,  42071,    117,  10131,
            189,  10686,    119,  11132,    168,  14039,    131,  30416,    160,
            119,  10838,  11233,  11446,    117,  11164,  13634,  11104,  81839,
          24366,  13369,  39159,    131,  13735,  13634,  16277,  10193,    118,
          10296,    118,  10186,    113,    171,    114,    113,    127,    114,
            113,    100,  28498,  10186,    113,    171,    114,    113,    127,
            114,    100,    114,  11419,  10451,  83609,  10188,  11388,  25468,
          10105,  32185,  18245,  10108,  11299,  11388,  10111,  11436,  13559,
          74115,  70445,  10107,  10111,  13255,  64736,  23579,    117,  70807,
          10472,  11457,  11438,  18453,  10107,    119,  62020,  23631,  10575,
          10900,  10111,  65368,  42071,  10968,  10525,  10189,  10531,  29369,
            168,  19367,  21379,  10124,  10142,  71810,  10155,  11598,    158,
            119,    156,    

In [36]:
import datetime

In [37]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [38]:
model = BertForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path = 'bert-base-multilingual-cased',
    num_labels = 2,
    output_attentions = False,
    output_hidden_states = False
).to(device)

optimizer = optim.AdamW(model.parameters(), lr = 2e-4, eps = 1e-4)

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [39]:
def calc_accuracy(preds, labels):
    pred_flat = np.argmax(preds.cpu().numpy(), axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [40]:
def train(model, optimizer, dataloader, device):
    model = model.to(device)
    model.train()
    train_loss = 0.0
    j = 0

    for input_ids, attention_mask, labels in dataloader:
        optimizer.zero_grad()
        outputs = model(
            input_ids = input_ids.to(device),
            attention_mask = attention_mask.to(device),
            labels = labels.to(device)
        )

        loss = outputs.loss
        train_loss += loss.item()

        # optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    train_loss = train_loss / len(dataloader)
    return train_loss

In [41]:
def evaluation(model, dataloader):
    with torch.no_grad():
        model.eval()
        criterion = nn.CrossEntropyLoss()
        val_loss, val_accuracy = 0.0, 0.0

        for input_ids, attention_mask, labels in dataloader:
            outputs = model(
                input_ids = input_ids,
                attention_mask = attention_mask,
                labels = labels
            )
            logits = outputs.logits

            loss = criterion(logits, labels)
            logtis = logits.detach().cpu().numpy()
            labels_ids = labels.to('cpu').numpy()
            accuracy = calc_accuracy(logits, labels_ids)

            # val_loss += loss
            val_loss += loss.item()
            val_accuracy += accuracy

        val_loss = val_loss/len(dataloader)
        val_accuracy = val_accuracy / len(dataloader)
        return val_loss, val_accuracy

In [None]:
model.save_pretrained('./drive/MyDrive/paper_data-master/model_bert_classification')

In [None]:
from transformers import BertForSequenceClassification, get_linear_schedule_with_warmup


model = BertForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path='bert-base-multilingual-cased',
    num_labels=2,
    output_attentions=False,
    output_hidden_states=False
).to(device)

optimizer = optim.AdamW(model.parameters(), lr=2e-4, eps=1e-4)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(train_dataloader) * num_epochs)

def train(model, optimizer, scheduler, dataloader, device):
    model.train()
    train_loss = 0.0

    for input_ids, attention_mask, labels in dataloader:
        optimizer.zero_grad()

        outputs = model(
            input_ids=input_ids.to(device),
            attention_mask=attention_mask.to(device),
            labels=labels.to(device)
        )

        loss = outputs.loss
        train_loss += loss.item()

        loss.backward()
        optimizer.step()
        scheduler.step()

    train_loss /= len(dataloader)
    return train_loss

def evaluation(model, dataloader, device):
    model.eval()
    val_loss, val_accuracy = 0.0, 0.0
    criterion = nn.CrossEntropyLoss()

    with torch.no_grad():
        for input_ids, attention_mask, labels in dataloader:
            outputs = model(
                input_ids=input_ids.to(device),
                attention_mask=attention_mask.to(device),
                labels=labels.to(device)
            )

            loss = outputs.loss
            val_loss += loss.item()
            logits = outputs.logits
            labels_ids = labels.cpu().numpy()
            accuracy = calc_accuracy(logits, labels_ids)
            val_accuracy += accuracy

    val_loss /= len(dataloader)
    val_accuracy /= len(dataloader)
    return val_loss, val_accuracy

In [64]:
best_loss = 10000
for epoch in range(epochs):
    train_loss = train(model, optimizer, train_dataloader, device)
    val_loss, val_accuracy = evaluation(model, valid_dataloader)
    print(f'Epoch: {epoch+1} train loss: {train_loss:.4f} val loss: {val_loss:.4f} val_acc: {val_accuracy:.4f}')

    if val_loss < best_loss:
        best_loss = val_loss

Epoch: 1 train loss: 0.7067 val loss: 0.6996 val_acc: 0.4996
Epoch: 2 train loss: 0.6977 val loss: 0.6937 val_acc: 0.5053


KeyboardInterrupt: 

In [65]:
def predict(model, dataloader, device):
    model.to(device)
    model.eval()
    predictions = []

    with torch.no_grad():
        for input_ids, attention_mask in dataloader:
            outputs = model(
                input_ids = input_ids,
                attention_mask = attention_mask
            )
            logits = outputs.logits

            predicted_labels = torch.argmax(logits, dim=1)
            predictions.extend(predicted_labels.cpu().numpy())

    return predictions

In [66]:
test_data = pd.read_csv("./drive/MyDrive/paper_data-master/test.csv")

In [67]:
test_data = frame_make(test_data)

In [68]:
test_data

Unnamed: 0,text
0,first_party:Salernosecond_party:United Statesf...
1,first_party:Milberg Weiss Bershad Hynes and Le...
2,first_party:No. 07-582\t Title: \t Federal Com...
3,first_party:Harold Kaufman second_party:United...
4,first_party:Bergersecond_party:Hanlonfacts:In ...
...,...
1235,"first_party:Haitian Centers Council, Inc., et ..."
1236,first_party:Whitmansecond_party:American Truck...
1237,first_party:Linda A. Matteo and John J. Madiga...
1238,first_party:Washington State Apple Advertising...


In [47]:
valid.label.value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
0,755
1,752


In [48]:
test_data_dataset = make_dataset(test_data, tokenizer, device)
test_data_dataloader = get_dataloader(test_data_dataset, RandomSampler, batch_size)

In [None]:
model_path = './drive/MyDrive/paper_data-master/model_bert_classification'

model = BertForSequenceClassification.from_pretrained(model_path)
# tokenizer = BertTokenizer.from_pretrained(model_path)

In [69]:
test_predicted = predict(model, test_data_dataloader, device)

In [70]:
len(test_predicted)

1240

In [71]:
sum(test_predicted)

1240

In [72]:
def predict_for_test(model, dataloader, device):
    model.eval()
    predictions = []

    with torch.no_grad():
        for input_ids, attention_mask, _ in dataloader:
            outputs = model(
                input_ids = input_ids,
                attention_mask = attention_mask
            )
            logits = outputs.logits

            predicted_labels = torch.argmax(logits, dim=1)
            predictions.extend(predicted_labels.cpu().numpy())

    return predictions

In [73]:
train_predicted = predict_for_test(model, train_dataloader, device)

In [78]:
len(train_predicted)

4520

In [79]:
sum(train_predicted)

4520

In [76]:
import torch.nn.functional as F

def predict_probabilities(model, dataloader, device):
    model.to(device)
    model.eval()
    probabilities = []

    with torch.no_grad():
        for input_ids, attention_mask in dataloader:
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            model.to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits

            probs = torch.sigmoid(logits).cpu().numpy()
            probabilities.extend(probs)

    return probabilities

In [77]:
train_dataset[0]

(tensor([   101,  10422,    168,  14039,    131,  10694,    156,    119,  19687,
            117,  21344,  10136,  10108,  10105,  18493,  11104,  52559,  22454,
          94917,  11132,    168,  14039,    131,  66701,  58313,  12480,  30787,
          13369,  39159,    131,    138,  18493,  11388,  14866,  61487,  66701,
          12480,  30787,  10106,  10105,  82523,  10108,  18014,  14321, 105529,
            119,  10117,  10893,  11922,  18713,  11327,  10957,  10134,  40792,
            119,  14321, 105529,    100,    187,  38877,  11736,  20750,  10662,
          10105,  28005,  10160,    169,  15034,  11825,    100,  11897,  10741,
            100,    119,  10117,    100,  11897,  10741,    100,  10134,  13213,
          10114,    169,  12117,  10741,    117,  10473,  31763,  10105,  97881,
          24087,    119,  10117,  15034,  19288,  10379,  10374,  10105,  97881,
          23763,  47320,  10107,  40792,    119,  14321, 105529,  32240,  10485,
          16342,  10165,  23

In [59]:
probs = predict_probabilities(model, test_data_dataloader, device)

In [60]:
probs2 = pd.DataFrame(probs)

In [61]:
probs2[:30]

Unnamed: 0,0,1
0,0.396066,0.455202
1,0.396024,0.45515
2,0.396212,0.455335
3,0.39624,0.455298
4,0.396132,0.455292
5,0.396353,0.454583
6,0.396252,0.455334
7,0.396396,0.455344
8,0.396208,0.455325
9,0.396218,0.455323


In [None]:
probs2[0].unique()

In [None]:
def predict_probabilities2(model, dataloader, device):
    model.eval()
    probabilities = []

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch[0].to(device)
            attention_mask = batch[1].to(device)

            model.to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits

            # 로짓 값과 확률 값 출력
            print("Logits:", logits)  # 로짓 값 출력
            probs = torch.sigmoid(logits).cpu().numpy()  # 확률로 변환
            print("Probabilities:", probs)  # 확률 값 출력

            probabilities.extend(probs)

    return probabilities