# Setup

In [12]:
# Install specific libraries
! pip install transformers
! pip install pycaret



In [13]:
import numpy as np
import pandas as pd
import pycaret
import transformers
from transformers import AutoModel, BertTokenizerFast
import matplotlib.pyplot as plt
from transformers import BertTokenizer, BertModel
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import AdamW
from sklearn.feature_extraction.text import TfidfVectorizer
import re
# specify GPU
device = torch.device("cuda")

# Dataset Preparation

In [14]:
# Load Dataset
true_data = pd.read_csv('/kaggle/input/yai25-toyproject/News _dataset/True.csv')
fake_data = pd.read_csv('/kaggle/input/yai25-toyproject/News _dataset/Fake.csv')

# Generate labels True/Fake under new Target Column in 'true_data' and 'fake_data'
true_data['label'] = 1
fake_data['label'] = 0

# Merge 'true_data' and 'fake_data', by random mixing into a single df called 'data'
data = pd.concat([true_data, fake_data], ignore_index = True).sample(frac=1).reset_index().drop(columns = ['index'])

# See how the data looks like
print(data.shape)
data.head()

(44898, 5)


Unnamed: 0,title,text,subject,date,label
0,AFTER GM’S TAXPAYER BAILOUT AND $10 BILLION IN...,Was the GM bailout REALLY about American jobs?...,politics,"Mar 31, 2016",0
1,UN NEWS AGENCY SCRUBS TWEET Calling On America...,The United Nations News Centre the official ...,politics,"Sep 30, 2016",0
2,"Boiler Room EP #70 – Sticks, Stones & The Medi...",Tune in to the Alternate Current Radio Network...,Middle-east,"August 25, 2016",0
3,U.S. House Republican gun bill draws the ire o...,WASHINGTON (Reuters) - Republicans in the U.S....,politicsNews,"July 1, 2016",1
4,KRISPY KREME Worker REFUSES To Serve Cop: ” I ...,She doesn t do POlice until someone threaten...,politics,"May 6, 2016",0


In [15]:
# Load BERT model and tokenizer via HuggingFace Transformers
bert1 = AutoModel.from_pretrained('bert-base-uncased')
bert2 = AutoModel.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

In [16]:
# Train-Validation-Test set split into 70:15:15 ratio about 'text'
# Train-Temp split
train_text, temp_text, train_text_labels, temp_text_labels = train_test_split(data['text'], data['label'],
                                                                    random_state=2018,
                                                                    test_size=0.3,
                                                                    stratify=data['label'])
# Validation-Test split
val_text, test_text, val_text_labels, test_text_labels = train_test_split(temp_text, temp_text_labels,
                                                                random_state=2018,
                                                                test_size=0.5,
                                                                stratify=temp_text_labels)

In [17]:
# Train-Validation-Test set split into 70:15:15 ratio about 'title'
# Train-Temp split
train_title, temp_title, train_title_labels, temp_title_labels = train_test_split(data['title'], data['label'],
                                                                    random_state=2018,
                                                                    test_size=0.3,
                                                                    stratify=data['label'])
# Validation-Test split
val_title, test_title, val_title_labels, test_title_labels = train_test_split(temp_title, temp_title_labels,
                                                                random_state=2018,
                                                                test_size=0.5,
                                                                stratify=temp_title_labels)

# Preprocessing

In [18]:
# Describe data 'text'
length = data['text'].apply(len)
length.describe()

count    44898.000000
mean      2469.109693
std       2171.617091
min          1.000000
25%       1234.000000
50%       2186.000000
75%       3105.000000
max      51794.000000
Name: text, dtype: float64

In [19]:
# we set max title length as 64
MAX_LENGHT = 64
# Tokenize and encode sequences in the train set
tokens_title_train = tokenizer.batch_encode_plus(
    train_title.tolist(),
    max_length = MAX_LENGHT,
    pad_to_max_length=True,
    truncation=True
)
# tokenize and encode sequences in the validation set
tokens_title_val = tokenizer.batch_encode_plus(
    val_title.tolist(),
    max_length = MAX_LENGHT,
    pad_to_max_length=True,
    truncation=True
)
# tokenize and encode sequences in the test set
tokens_title_test = tokenizer.batch_encode_plus(
    test_title.tolist(),
    max_length = MAX_LENGHT,
    pad_to_max_length=True,
    truncation=True
)

In [20]:
def extract_sentences(articles):
    """여러 개의 기사에서 문장을 추출하여 리스트로 반환"""
    sentences = []
    for article in articles:
        article_sentences = re.split(r'(?<=[.!?]) +', article.strip())  # 문장 분할
        sentences.append(article_sentences)
    return sentences

In [21]:
def bert_sentence_embedding(sentences):
    """BERT를 사용하여 여러 문장의 CLS 벡터를 한 번에 반환"""
    inputs = tokenizer.batch_encode_plus(
        sentences,
        return_tensors="pt",
        max_length=64,
        padding="max_length",
        truncation=True
    )

    inputs = {key: val.to(device) for key, val in inputs.items()}  # GPU 이동

    with torch.no_grad():
        with torch.cuda.amp.autocast():  # FP16 가속
            outputs = model(**inputs)

    cls_vectors = outputs.last_hidden_state[:, 0, :].to(device)  # GPU 유지
    return cls_vectors

In [22]:
import torch
import numpy as np
from transformers import BertModel, BertTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
import re

# BERT 모델 Load
model = BertModel.from_pretrained("bert-base-uncased").to(device).eval()  # 평가 모드

# 점수 계산 및 핵심 문장 선택 함수
def score(articles, n=3, alpha=0.5, beta=0.3, gamma=0.2):
    """
    각 기사에서 가장 중요한 문장을 Headline으로 선정
    각 문장과 Headline의 유사도를 계산 후 상위 n개 문장 선택
    """
    selected_sentences = []
    all_articles_sentences = extract_sentences(articles)  # 기사별 문장 리스트
    count = 0
    for sentences in all_articles_sentences:
        if count % 1000 == 0:
            print("count: {}, selected_sentences Length: {}".format(count, len(selected_sentences)))

        if len(sentences) < 3:  # 문장의 길이가 3보다 작으면 해당 문장을 모두 넣어기기
            selected_sentences.append(sentences)
            count += 1
            continue



        # 🔹 Headline 선정 (TF-IDF 기준)
        vectorizer = TfidfVectorizer()
        tfidf_matrix = vectorizer.fit_transform(sentences)
        headline_idx = np.argmax(tfidf_matrix.sum(axis=1))  # TF-IDF 점수 최대 문장(doc) 선택
        headline = sentences[headline_idx]

        # 🔹 문장 임베딩 계산 (Batch 처리)
        sentence_embeddings = bert_sentence_embedding(sentences)  # GPU 텐서
        headline_embedding = bert_sentence_embedding([headline])  # GPU 텐서

        # 🔹 유사도 계산
        rel_scores = torch.nn.functional.cosine_similarity(sentence_embeddings, headline_embedding)

        # 🔹 TF-IDF 점수 (GPU 변환)
        sentence_tfidf = torch.tensor(tfidf_matrix.sum(axis=1)).float().squeeze().to(device)

        # 🔹 문장 위치 점수 (GPU 변환)
        position_scores = torch.tensor(1.0 / (np.arange(1, len(sentences) + 1)), dtype=torch.float32).to(device)

        # 🔹 최종 점수 계산
        worth_scores = alpha * rel_scores + beta * sentence_tfidf + gamma * position_scores
        top_indices = torch.topk(worth_scores, n).indices.cpu().tolist()  # GPU → CPU 변환 후 리스트로 변환

        selected_sentences.append([sentences[i] for i in top_indices])  # 최종 선택된 문장 저장
        count += 1

    return selected_sentences

articles = train_text.tolist()
print(np.shape(articles))

with torch.cuda.amp.autocast():  # FP16 가속
    result = score(articles, n=3)

train_top_text = result


(31428,)
count: 0, selected_sentences Length: 0
count: 1000, selected_sentences Length: 1000
count: 2000, selected_sentences Length: 2000
count: 3000, selected_sentences Length: 3000
count: 4000, selected_sentences Length: 4000
count: 5000, selected_sentences Length: 5000
count: 6000, selected_sentences Length: 6000
count: 7000, selected_sentences Length: 7000
count: 8000, selected_sentences Length: 8000
count: 9000, selected_sentences Length: 9000
count: 10000, selected_sentences Length: 10000
count: 11000, selected_sentences Length: 11000
count: 12000, selected_sentences Length: 12000
count: 13000, selected_sentences Length: 13000
count: 14000, selected_sentences Length: 14000
count: 15000, selected_sentences Length: 15000
count: 16000, selected_sentences Length: 16000
count: 17000, selected_sentences Length: 17000
count: 18000, selected_sentences Length: 18000
count: 19000, selected_sentences Length: 19000
count: 20000, selected_sentences Length: 20000
count: 21000, selected_sentenc

In [23]:
articles = val_text.tolist()
print(np.shape(articles))

with torch.cuda.amp.autocast():  # FP16 가속
    result = score(articles, n=3)

val_top_text = result

(6735,)
count: 0, selected_sentences Length: 0
count: 1000, selected_sentences Length: 1000
count: 2000, selected_sentences Length: 2000
count: 3000, selected_sentences Length: 3000
count: 4000, selected_sentences Length: 4000
count: 5000, selected_sentences Length: 5000
count: 6000, selected_sentences Length: 6000


In [24]:
articles = test_text.tolist()
print(np.shape(articles))

with torch.cuda.amp.autocast():  # FP16 가속
    result = score(articles, n=3)

test_top_text = result

(6735,)
count: 0, selected_sentences Length: 0
count: 1000, selected_sentences Length: 1000
count: 2000, selected_sentences Length: 2000
count: 3000, selected_sentences Length: 3000
count: 4000, selected_sentences Length: 4000
count: 5000, selected_sentences Length: 5000
count: 6000, selected_sentences Length: 6000


In [25]:
import copy
copy = copy.deepcopy(train_top_text)
copy = [" ".join(sublist) for sublist in copy]
np.shape(copy)

(31428,)

In [26]:
import copy
train_copy_text = copy.deepcopy(train_top_text)
train_copy_text = [" ".join(sublist) for sublist in train_copy_text]
val_copy_text = copy.deepcopy(val_top_text)
val_copy_text = [" ".join(sublist) for sublist in val_copy_text]
test_copy_text = copy.deepcopy(test_top_text)
test_copy_text = [" ".join(sublist) for sublist in test_copy_text]
print("Train shape: {}, Validation shape: {}, Test shape: {}".format(np.shape(train_copy_text), np.shape(val_copy_text),
                                                                    np.shape(test_copy_text)))

Train shape: (31428,), Validation shape: (6735,), Test shape: (6735,)


In [27]:
import csv

def save_list_to_csv(data, filename):
    """
    리스트 데이터를 CSV 파일로 저장하는 함수

    :param data: 리스트 (각 요소가 문장이어야 함)
    :param filename: 저장할 CSV 파일 이름 (예: 'output.csv')
    """
    with open(filename, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        for row in data:
            writer.writerow([row])  # 문자열을 하나의 컬럼으로 저장

# 사용 예시
data = [
    "Hello world, this is a test.",
    "Python is great for data processing!",
    "CSV files are easy to handle."
]

save_list_to_csv(data, "output.csv")
print("CSV 파일 저장 완료!")


CSV 파일 저장 완료!


In [28]:
save_list_to_csv(train_copy_text, "/kaggle/working/train_copy_text.csv")

In [29]:
save_list_to_csv(val_copy_text, "/kaggle/working/val_copy_text.csv")
save_list_to_csv(test_copy_text, "/kaggle/working/test_copy_text.csv")

In [30]:
import csv

def load_csv_to_list(filename):
    """
    CSV 파일을 읽어서 리스트로 변환하는 함수

    :param filename: 읽을 CSV 파일 이름
    :return: 리스트 (각 행이 하나의 문장으로 저장됨)
    """
    with open(filename, mode='r', newline='', encoding='utf-8') as file:
        reader = csv.reader(file)
        data = [row[0] for row in reader]  # 첫 번째 열만 리스트로 저장
    return data

# 사용 예시
filename = "output.csv"
loaded_data = load_csv_to_list(filename)

# 결과 출력
print("CSV 파일을 리스트로 변환:")
for row in loaded_data:
    print(row)


CSV 파일을 리스트로 변환:
Hello world, this is a test.
Python is great for data processing!
CSV files are easy to handle.


In [31]:
train_copy_text = load_csv_to_list("/kaggle/working/train_copy_text.csv")
val_copy_text = load_csv_to_list("/kaggle/working/val_copy_text.csv")
test_copy_text = load_csv_to_list("/kaggle/working/test_copy_text.csv")

In [32]:
# we set max text length as 512
MAX_LENGHT = 512
# Tokenize and encode sequences in the train set
tokens_text_train = tokenizer.batch_encode_plus(
    train_copy_text,
    max_length = MAX_LENGHT,
    pad_to_max_length=True,
    truncation=True
)
# tokenize and encode sequences in the validation set
tokens_text_val = tokenizer.batch_encode_plus(
    val_copy_text,
    max_length = MAX_LENGHT,
    pad_to_max_length=True,
    truncation=True
)
# tokenize and encode sequences in the test set
tokens_text_test = tokenizer.batch_encode_plus(
    test_copy_text,
    max_length = MAX_LENGHT,
    pad_to_max_length=True,
    truncation=True
)

In [33]:
# Convert lists to tensors
train_title_seq = torch.tensor(tokens_title_train['input_ids'])
train_title_mask = torch.tensor(tokens_title_train['attention_mask'])
train_title_y = torch.tensor(train_title_labels.tolist())

val_title_seq = torch.tensor(tokens_title_val['input_ids'])
val_title_mask = torch.tensor(tokens_title_val['attention_mask'])
val_title_y = torch.tensor(val_title_labels.tolist())

test_title_seq = torch.tensor(tokens_title_test['input_ids'])
test_title_mask = torch.tensor(tokens_title_test['attention_mask'])
test_title_y = torch.tensor(test_title_labels.tolist())

In [34]:
# Convert lists to tensors
train_text_seq = torch.tensor(tokens_text_train['input_ids'])
train_text_mask = torch.tensor(tokens_text_train['attention_mask'])
train_text_y = torch.tensor(train_text_labels.tolist())

val_text_seq = torch.tensor(tokens_text_val['input_ids'])
val_text_mask = torch.tensor(tokens_text_val['attention_mask'])
val_text_y = torch.tensor(val_text_labels.tolist())

test_text_seq = torch.tensor(tokens_text_test['input_ids'])
test_text_mask = torch.tensor(tokens_text_test['attention_mask'])
test_text_y = torch.tensor(test_text_labels.tolist())

In [35]:
# Data Loader structure definition
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
batch_size = 32                                               #define a batch size

train_title_data = TensorDataset(train_title_seq, train_title_mask, train_title_y)    # wrap tensors
train_title_sampler = RandomSampler(train_title_data)                     # sampler for sampling the data during training
train_title_dataloader = DataLoader(train_title_data, sampler=train_title_sampler, batch_size=batch_size)
                                                              # dataLoader for train set
val_title_data = TensorDataset(val_title_seq, val_title_mask, val_title_y)            # wrap tensors
val_title_sampler = SequentialSampler(val_title_data)                     # sampler for sampling the data during training
val_title_dataloader = DataLoader(val_title_data, sampler = val_title_sampler, batch_size=batch_size)
                                                              # dataLoader for validation set

In [36]:
train_text_seq.shape, train_text_mask.shape, train_text_y.reshape(-1, 1).shape

(torch.Size([31428, 512]), torch.Size([31428, 512]), torch.Size([31428, 1]))

In [37]:
# Data Loader structure definition
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
batch_size = 32                                               #define a batch size

train_text_data = TensorDataset(train_text_seq, train_text_mask, train_text_y.reshape(-1, 1))    # wrap tensors
train_text_sampler = RandomSampler(train_text_data)                     # sampler for sampling the data during training
train_text_dataloader = DataLoader(train_text_data, sampler=train_text_sampler, batch_size=batch_size)
                                                              # dataLoader for train set
val_text_data = TensorDataset(val_text_seq, val_text_mask, val_text_y)            # wrap tensors
val_text_sampler = SequentialSampler(val_text_data)                     # sampler for sampling the data during training
val_text_dataloader = DataLoader(val_text_data, sampler = val_text_sampler, batch_size=batch_size)
                                                              # dataLoader for validation set

In [38]:
class BERT_Arch(nn.Module):
    def __init__(self, bert1, bert2):
      super(BERT_Arch, self).__init__()
      self.bert1 = bert1
      self.bert2 = bert2
      self.dropout = nn.Dropout(0.1)            # dropout layer
      self.relu =  nn.ReLU()                    # relu activation function
      self.fc1 = nn.Linear(1536,768)             # dense layer 1
      self.fc2 = nn.Linear(768,2)              # dense layer 2 (Output layer)
      self.softmax = nn.LogSoftmax(dim=1)       # softmax activation function

    def forward(self, sent_id1, mask1, sent_id2, mask2):           # define the forward pass
      cls_hs = self.bert1(sent_id1, attention_mask=mask1)['pooler_output'] # headline
      cls_text = self.bert2(sent_id2, attention_mask=mask2)['pooler_output']
      cls = torch.concat((cls_hs, cls_text), dim = 1).to(device)

      x = self.fc1(cls)                         # pass the inputs to the model
      x = self.relu(x)
      x = self.dropout(x)
      x = self.fc2(x)                           # output layer
      x = self.softmax(x)                       # apply softmax activation
      return x

model = BERT_Arch(bert1, bert2).to(device)
# Defining the hyperparameters (optimizer, weights of the classes and the epochs)
# Define the optimizer

optimizer = AdamW(model.parameters(),
                  lr = 1e-5)          # learning rate
# Define the loss function
cross_entropy  = nn.NLLLoss().to(device)
# Number of training epochs
epochs = 2

In [39]:
# Defining training and evaluation functions
def train(train_title_dataloader, train_text_dataloader):
  model.train()
  total_loss, total_accuracy = 0, 0

  for step, (batch1, batch2) in enumerate(zip(train_title_dataloader, train_text_dataloader)):                # iterate over batches
    if step % 50 == 0 and not step == 0:                        # progress update after every 50 batches.
      print('  Batch {:>5,}  of  {:>5,}.'.format(step, len(train_title_dataloader)))
    #batch = [r for r in batch]                                  # push the batch to gpu
    sent_id1, mask1, labels1 = batch1
    sent_id1 = sent_id1.to(device)
    mask1 = mask1.to(device)
    labels1 = labels1.to(device)

    sent_id2, mask2, labels2 = batch2
    sent_id2 = sent_id2.to(device)
    mask2 = mask2.to(device)
    labels2 = labels2.to(device) # label1 == label2 : True

    model.zero_grad()                                           # clear previously calculated gradients
    preds = model(sent_id1, mask1, sent_id2, mask2).to(device)             # get model predictions for current batch
    loss = cross_entropy(preds, labels1)                         # compute loss between actual & predicted values
    total_loss = total_loss + loss.item()                       # add on to the total loss
    loss.backward()                                             # backward pass to calculate the gradients
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)     # clip gradients to 1.0. It helps in preventing exploding gradient problem
    optimizer.step()                                            # update parameters
    preds=preds.detach().cpu().numpy()                          # model predictions are stored on GPU. So, push it to CPU

  avg_loss = total_loss / len(train_title_dataloader)                 # compute training loss of the epoch
                                                                # reshape predictions in form of (# samples, # classes)
  return avg_loss                                 # returns the loss and predictions

def evaluate(val_title_dataloader, val_text_dataloader):
  print("\nEvaluating...")
  model.eval()                                    # Deactivate dropout layers
  total_loss, total_accuracy = 0, 0
  for step, (batch1, batch2) in enumerate(zip(val_title_dataloader, val_text_dataloader)):    # Iterate over batches
    if step % 50 == 0 and not step == 0:          # Progress update every 50 batches.
      print('  Batch {:>5,}  of  {:>5,}.'.format(step, len(val_title_dataloader)))
                                                  # Report progress
    #batch = [t for t in batch]                    # Push the batch to GPU

    sent_id1, mask1, labels1 = batch1
    sent_id1 = sent_id1.to(device)
    mask1 = mask1.to(device)
    labels1 = labels1.to(device)

    sent_id2, mask2, labels2 = batch2
    sent_id2 = sent_id2.to(device)
    mask2 = mask2.to(device)
    labels2 = labels2.to(device)



    with torch.no_grad():                         # Deactivate autograd
      preds = model(sent_id1, mask1, sent_id2, mask2).to(device)                # Model predictions
      loss = cross_entropy(preds, labels1)          # Compute the validation loss between actual and predicted values
      total_loss = total_loss + loss.item()
      preds = preds.detach().cpu().numpy()
  avg_loss = total_loss / len(val_title_dataloader)         # compute the validation loss of the epoch
  return avg_loss

### model fine-tuning??

In [42]:
# Train and predict
# Head line
best_valid_loss = float('inf')
train_losses=[]                   # empty lists to store training and validation loss of each epoch
valid_losses=[]

for epoch in range(epochs):
    print('\n Epoch {:} / {:}'.format(epoch + 1, epochs))
    train_loss = train(train_title_dataloader, train_text_dataloader) # train model
    valid_loss = evaluate(val_title_dataloader, val_text_dataloader)  # evaluate model
    if valid_loss < best_valid_loss:              # save the best model
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), '/kaggle/working/c1_dual_bert_model_weights.pt')
    train_losses.append(train_loss)               # append training and validation loss
    valid_losses.append(valid_loss)

    print(f'\nTraining Loss: {train_loss:.3f}')
    print(f'Validation Loss: {valid_loss:.3f}')


 Epoch 1 / 2
  Batch    50  of    983.
  Batch   100  of    983.
  Batch   150  of    983.
  Batch   200  of    983.
  Batch   250  of    983.
  Batch   300  of    983.
  Batch   350  of    983.
  Batch   400  of    983.
  Batch   450  of    983.
  Batch   500  of    983.
  Batch   550  of    983.
  Batch   600  of    983.
  Batch   650  of    983.
  Batch   700  of    983.
  Batch   750  of    983.
  Batch   800  of    983.
  Batch   850  of    983.
  Batch   900  of    983.
  Batch   950  of    983.

Evaluating...
  Batch    50  of    211.
  Batch   100  of    211.
  Batch   150  of    211.
  Batch   200  of    211.

Training Loss: 0.073
Validation Loss: 0.053

 Epoch 2 / 2
  Batch    50  of    983.
  Batch   100  of    983.
  Batch   150  of    983.
  Batch   200  of    983.
  Batch   250  of    983.
  Batch   300  of    983.
  Batch   350  of    983.
  Batch   400  of    983.
  Batch   450  of    983.
  Batch   500  of    983.
  Batch   550  of    983.
  Batch   600  of    983.
  

In [43]:
# load weights of best model
path = '/kaggle/working/c1_dual_bert_model_weights.pt'
model.load_state_dict(torch.load(path))

<All keys matched successfully>

In [44]:
import gc
gc.collect()
torch.cuda.empty_cache()

In [45]:
len(test_copy_text), len(train_copy_text), len(val_copy_text)

(6735, 31428, 6735)

In [46]:
len(test_title_seq), len(test_title_mask), len(test_text_seq), len(test_text_mask), len(test_title_y)

(6735, 6735, 6735, 6735, 6735)

In [None]:
batch_size = 8  # GPU 메모리 상황에 따라 적절한 배치 크기로 조정
preds_list = []

model.eval()  # 평가 모드로 전환
with torch.no_grad():
    for i in range(0, len(test_title_seq), batch_size):
        # 배치 단위로 데이터를 선택하고 GPU로 이동
        batch_title_seq = test_title_seq[i:i+batch_size].to(device)
        batch_title_mask = test_title_mask[i:i+batch_size].to(device)
        batch_text_seq = test_text_seq[i:i+batch_size].to(device)
        batch_text_mask = test_text_mask[i:i+batch_size].to(device)
        
        # 모델 추론
        batch_preds = model(batch_title_seq, batch_title_mask, batch_text_seq, batch_text_mask)
        preds_list.append(batch_preds.detach().cpu().numpy())

# 모든 배치의 결과 연결
preds = np.concatenate(preds_list, axis=0)
preds = np.argmax(preds, axis=1)
print(classification_report(test_title_y, preds))


              precision    recall  f1-score   support

           0       0.98      0.99      0.99      3523
           1       0.99      0.98      0.98      3212

    accuracy                           0.99      6735
   macro avg       0.99      0.98      0.98      6735
weighted avg       0.99      0.99      0.99      6735



In [55]:
print(classification_report(test_title_y, preds, digits=5))

              precision    recall  f1-score   support

           0    0.98498   0.98638   0.98568      3523
           1    0.98503   0.98350   0.98427      3212

    accuracy                        0.98500      6735
   macro avg    0.98501   0.98494   0.98497      6735
weighted avg    0.98500   0.98500   0.98500      6735



In [None]:
# with torch.no_grad():
#   test_title_seq = test_title_seq.to(device)
#   test_title_mask = test_title_mask.to(device)
#   test_text_seq = test_text_seq.to(device)
#   test_text_mask = test_text_mask.to(device)
#   preds = model(test_title_seq, test_title_mask, test_text_seq, test_text_mask)
#   preds = preds.detach().cpu().numpy()

# preds = np.argmax(preds, axis = 1)
# print(classification_report(test_title_y, preds))