# 0. 데이터 엑셀 파일 만들기

In [None]:
import pandas as pd

# 웹 페이지에서 CSV 데이터를 읽을 때 첫 번째 열을 인덱스로 설정
url = "https://zenodo.org/records/4561253/files/WELFake_Dataset.csv"
df = pd.read_csv(url, sep=",", index_col=0, quoting=1)

# 데이터 프레임 확인
df

Unnamed: 0,title,text,label
0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1
1,,Did they post their votes for Hillary already?,1
2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1
3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0
4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1
...,...,...,...
72129,Russians steal research on Trump in hack of U....,WASHINGTON (Reuters) - Hackers believed to be ...,0
72130,WATCH: Giuliani Demands That Democrats Apolog...,"You know, because in fantasyland Republicans n...",1
72131,Migrants Refuse To Leave Train At Refugee Camp...,Migrants Refuse To Leave Train At Refugee Camp...,0
72132,Trump tussle gives unpopular Mexican leader mu...,MEXICO CITY (Reuters) - Donald Trump’s combati...,0


In [None]:
# 엑셀 파일로 저장
df.to_excel("WELFake_Dataset.xlsx", index=False)

NameError: name 'df' is not defined

# 1. 데이터 전처리

In [None]:
print(df.head())

                                               title  \
0  LAW ENFORCEMENT ON HIGH ALERT Following Threat...   
1                                                NaN   
2  UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...   
3  Bobby Jindal, raised Hindu, uses story of Chri...   
4  SATAN 2: Russia unvelis an image of its terrif...   

                                                text  label  
0  No comment is expected from Barack Obama Membe...      1  
1     Did they post their votes for Hillary already?      1  
2   Now, most of the demonstrators gathered last ...      1  
3  A dozen politically active pastors came here f...      0  
4  The RS-28 Sarmat missile, dubbed Satan 2, will...      1  


In [None]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
Index: 72134 entries, 0 to 72133
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   title   71576 non-null  object
 1   text    72095 non-null  object
 2   label   72134 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 2.2+ MB
None


In [None]:
print(df['title'].isnull().sum())
print(df['text'].isnull().sum())
print(df['label'].isnull().sum())

558
39
0


In [None]:
data=df.dropna(axis=0)

In [None]:
print(data['title'].isnull().sum())
print(data['text'].isnull().sum())
print(data['label'].isnull().sum())

0
0
0


In [None]:
# 텍스트와 타이틀 결합
data['text'] = data['title'] + " " + data['text']
data = data[['text', 'label']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['text'] = data['title'] + " " + data['text']


In [None]:
data

Unnamed: 0,text,label
0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,1
2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,1
3,"Bobby Jindal, raised Hindu, uses story of Chri...",0
4,SATAN 2: Russia unvelis an image of its terrif...,1
5,About Time! Christian Group Sues Amazon and SP...,1
...,...,...
72129,Russians steal research on Trump in hack of U....,0
72130,WATCH: Giuliani Demands That Democrats Apolog...,1
72131,Migrants Refuse To Leave Train At Refugee Camp...,0
72132,Trump tussle gives unpopular Mexican leader mu...,0


In [None]:
# 레이블 카운트
label_counts = df['label'].value_counts()
print(label_counts)

# 클래스별 비율 확인
label_ratios = df['label'].value_counts(normalize=True)
print(label_ratios)

label
1    37106
0    35028
Name: count, dtype: int64
label
1    0.514404
0    0.485596
Name: proportion, dtype: float64


# 토큰 임베딩

In [None]:
!pip install sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl.metadata (10 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl.met

In [None]:
import torch
from transformers import BartTokenizer, BartModel
from sentence_transformers import SentenceTransformer
from torch.utils.data import DataLoader, Dataset
import numpy as np
import pandas as pd

# 최적화 설정
torch.backends.cudnn.benchmark = True  # GPU 최적화
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# 데이터셋 정의
class NewsDataset(Dataset):
    def __init__(self, data, tokenizer, max_length=512):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        text = self.data.iloc[idx, 0]
        encoding = self.tokenizer(text, max_length=self.max_length, padding='max_length', truncation=True, return_tensors="pt")
        encoding = {key: val.squeeze(0) for key, val in encoding.items()}  # 배치 차원 제거
        return encoding

# 데이터 로드 및 전처리
model = SentenceTransformer('all-MiniLM-L6-v2')

# GPU 사용 설정
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
from tqdm import tqdm

# 배치 사이즈 설정
batch_size = 128
total_batches = len(data) // batch_size + 1

# 벡터 초기화
all_vectors = []

# 벡터화 수행 및 진행 상황 출력
for i in tqdm(range(0, len(data), batch_size), desc="Processing batches"):
    batch_texts = data['text'][i:i+batch_size].tolist()
    embeddings = model.encode(batch_texts, batch_size=batch_size, convert_to_tensor=True, device=device)
    all_vectors.extend(embeddings.cpu().numpy())

    # print(f"Processed batch {i//batch_size + 1}/{total_batches}")

Processing batches: 100%|██████████| 559/559 [02:08<00:00,  4.35it/s]


In [None]:
len(all_vectors)

71537

In [None]:
# 벡터를 DataFrame으로 변환
vector_df = pd.DataFrame(all_vectors)
vector_df['label'] = data['label'].values

In [None]:
vector_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,375,376,377,378,379,380,381,382,383,label
0,-0.008818,-0.001956,-0.022637,0.030399,0.096721,0.024922,0.062340,0.002199,-0.009299,-0.026329,...,-0.091296,0.015135,-0.046467,-0.046307,0.015934,0.026134,-0.030865,-0.047281,0.012353,1
1,0.068621,0.043289,-0.059886,0.029838,0.058201,0.012054,0.046609,-0.056433,-0.018829,0.089386,...,-0.056936,0.013663,0.021834,0.013222,0.029766,0.011869,-0.031871,-0.034029,0.031087,1
2,-0.008437,0.067539,-0.085811,-0.008455,0.057599,0.005268,0.017612,-0.037805,0.093543,-0.018007,...,-0.029729,0.006173,0.031705,0.002552,-0.077032,0.027433,-0.085631,-0.052072,-0.045564,0
3,0.036160,0.035061,-0.062981,-0.000053,0.018366,-0.010113,-0.041939,0.068333,-0.022210,-0.018656,...,0.067490,-0.011613,-0.096311,-0.039798,-0.012028,0.017144,-0.146528,-0.050490,0.049469,1
4,-0.055806,0.045611,-0.076916,0.005441,0.057404,0.030541,0.010850,-0.089523,0.103785,0.009073,...,-0.028265,0.011932,0.032236,-0.032224,-0.003333,0.013883,-0.121875,-0.006950,0.012722,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71532,-0.046689,-0.043160,0.005569,-0.026756,0.061967,-0.022007,0.083302,-0.046447,0.080071,0.016410,...,0.023046,0.034673,-0.040586,-0.018765,-0.025881,-0.042462,-0.011427,0.012590,-0.041516,0
71533,-0.037746,0.082985,0.004951,0.018544,0.005917,-0.016058,0.012583,-0.065917,-0.022187,-0.051978,...,-0.015544,0.003241,-0.015267,0.016063,0.066616,-0.009672,-0.027563,-0.017914,-0.025669,1
71534,-0.010403,0.031983,-0.035177,0.065481,0.093416,0.032548,-0.013071,0.062305,-0.033010,-0.063703,...,-0.046014,-0.066694,0.005924,-0.013901,0.011132,-0.045269,0.040030,-0.044760,0.035846,0
71535,-0.012462,0.035592,0.023589,0.000602,0.069685,-0.048896,0.074785,0.062296,0.024901,-0.074493,...,-0.017525,-0.021166,-0.000190,0.003494,-0.020333,0.019231,-0.055103,-0.024711,0.062653,0


In [None]:
import torch
import numpy as np
from tqdm import tqdm

# position embedding 함수
def add_positional_embeddings(embeddings, max_len=384, device="cuda"):
    batch_size, embed_dim = embeddings.size(0), embeddings.size(1)

    position_ids = torch.arange(0, max_len, device=device).unsqueeze(1)
    div_term = torch.exp(torch.arange(0, embed_dim, 2, device=device) * -(np.log(10000.0) / embed_dim))
    position_embeddings = torch.zeros((max_len, embed_dim), device=device)
    position_embeddings[:, 0::2] = torch.sin(position_ids * div_term)
    position_embeddings[:, 1::2] = torch.cos(position_ids * div_term)

    position_embeddings = position_embeddings[:embeddings.size(0), :]
    return embeddings + position_embeddings

# segment embedding 함수
def add_segment_embeddings(embeddings, device="cuda"):
    batch_size, embed_dim = embeddings.size(0), embeddings.size(1)
    segment_embeddings = torch.zeros((batch_size, embed_dim), device=device)

    return embeddings + segment_embeddings

# 최종 임베딩 처리
def process_embeddings(embeddings, max_len=384, device="cuda"):
    embeddings = torch.tensor(np.array(embeddings), device=device)

    # 위치 임베딩 추가
    embeddings = add_positional_embeddings(embeddings, max_len=max_len, device=device)

    # 세그먼트 임베딩 추가
    embeddings = add_segment_embeddings(embeddings, device=device)

    return embeddings.cpu().numpy()

# 벡터화된 데이터프레임에 임베딩 추가
def process_all_embeddings(all_vectors, batch_size=128):
    final_embeddings = []
    num_batches = len(all_vectors) // batch_size + 1

    for i in tqdm(range(0, len(all_vectors), batch_size), desc="Processing batches"):
        batch_vectors = all_vectors[i:i + batch_size]
        batch_embeddings = process_embeddings(batch_vectors)
        final_embeddings.extend(batch_embeddings)

    return np.array(final_embeddings)

In [None]:
# 벡터화된 데이터프레임에 임베딩 추가
final_embeddings = process_all_embeddings(all_vectors)

Processing batches: 100%|██████████| 559/559 [00:00<00:00, 1388.18it/s]


In [None]:
import torch
from torch.utils.data import TensorDataset, DataLoader

# 데이터를 PyTorch Tensor로 변환 및 결합
final_embeddings_tensor = torch.tensor(final_embeddings, dtype=torch.float32)
labels_tensor = torch.tensor(vector_df['label'].values, dtype=torch.long)

# TensorDataset을 이용하여 결합
dataset = TensorDataset(final_embeddings_tensor, labels_tensor)

# DataLoader 생성
train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

In [None]:
dataset

<torch.utils.data.dataset.TensorDataset at 0x7c1f327d2320>

## 데이터 저장

In [None]:
import numpy as np

# 벡터화된 데이터와 레이블을 함께 저장하기 위해 결합
final_embeddings_with_labels = np.hstack((final_embeddings, vector_df['label'].values.reshape(-1, 1)))

# 벡터화된 데이터를 NumPy 배열로 저장
np.save('final_embeddings.npy', final_embeddings)

# 레이블도 함께 저장
np.save('labels.npy', vector_df['label'].values)

# 또는 .csv 형식으로 저장
np.savetxt('final_embeddings_with_labels.csv', final_embeddings_with_labels, delimiter=',')

In [None]:
# 벡터화된 데이터를 DataFrame으로 변환
final_embeddings_df = pd.DataFrame(final_embeddings)
final_embeddings_df['label'] = vector_df['label'].values

# 엑셀 파일로 저장
final_embeddings_df.to_excel('final_embeddings.xlsx', index=False)

# 데이터 불러오기

In [1]:
import numpy as np
import pandas as pd
import torch

from torch.utils.data import TensorDataset, DataLoader

In [5]:
# 저장된 데이터를 불러오기
final_embeddings = np.load('final_embeddings.npy', allow_pickle=True)
labels = np.load('labels.npy')

# PyTorch TensorDataset으로 변환
final_embeddings_tensor = torch.tensor(final_embeddings, dtype=torch.float32)
labels_tensor = torch.tensor(labels, dtype=torch.long)

# TensorDataset을 이용하여 결합
dataset = TensorDataset(final_embeddings_tensor, labels_tensor)

# DataLoader 생성
# train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

In [None]:
from torch.utils.data import TensorDataset, DataLoader

# 엑셀 파일에서 데이터 불러오기
loaded_df = pd.read_excel('final_embeddings.xlsx')

# 데이터와 레이블 분리
final_embeddings = loaded_df.drop('label', axis=1).values
labels = loaded_df['label'].values

# PyTorch Tensor로 변환
final_embeddings_tensor = torch.tensor(final_embeddings, dtype=torch.float32)
labels_tensor = torch.tensor(labels, dtype=torch.long)

# TensorDataset을 이용하여 결합
dataset = TensorDataset(final_embeddings_tensor, labels_tensor)

# DataLoader 생성
train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

In [None]:
from torch.utils.data import TensorDataset, DataLoader


# 또는 .csv 파일 불러오기
loaded_embeddings  = np.loadtxt('final_embeddings_with_labels.csv', delimiter=',')

# 임베딩과 레이블 분리
final_embeddings = loaded_embeddings[:, :-1]  # 임베딩
labels = loaded_embeddings[:, -1].astype(int)  # 레이블

# PyTorch Tensor로 변환
final_embeddings_tensor = torch.tensor(final_embeddings, dtype=torch.float32)
labels_tensor = torch.tensor(labels, dtype=torch.long)

# TensorDataset을 이용하여 결합
dataset = TensorDataset(final_embeddings_tensor, labels_tensor)

# DataLoader 생성
train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# 트랜스포머 모델

## 모델 구조

In [6]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np

### position encoding

In [7]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=700):
        super(PositionalEncoding, self).__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-torch.log(torch.tensor(10000.0)) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        return x + self.pe[:x.size(0), :]


### attetion masking

In [8]:
# 패딩 마스크 생성
def create_padding_mask(seq):
    # 시퀀스의 패딩 위치를 찾고 마스크 생성 (패딩은 0으로 표시)
    seq = seq.eq(0)
    return seq.unsqueeze(1).unsqueeze(2)  # 배치 차원과 헤드 차원을 추가하여 마스크 모양을 맞춤

# Look-ahead 마스크 생성
def create_look_ahead_mask(size):
    # Look-ahead 마스크를 생성하여 미래의 토큰을 가리는 삼각형 형태의 마스크를 만듦
    mask = torch.tril(torch.ones(size, size)).to(torch.bool)
    return mask

### token masking

In [9]:
# 마스킹 레이어 추가
def mask_tokens(inputs, tokenizer, mask_prob=0.15):
    """
    입력 시퀀스의 일부 토큰을 마스킹하는 함수.
    - inputs: 토큰화된 입력 시퀀스
    - tokenizer: 사용 중인 토크나이저
    - mask_prob: 마스킹 확률
    """
    labels = inputs.clone()
    # 확률에 따라 마스크할 위치 결정
    masked_indices = torch.bernoulli(torch.full(labels.shape, mask_prob)).bool()
    labels[~masked_indices] = -100  # 마스크되지 않은 토큰은 손실 계산에서 제외

    # 80%는 [MASK] 토큰으로 대체
    inputs[masked_indices] = tokenizer.mask_token_id

    # 나머지 20%는 10%는 무작위 토큰, 10%는 원래 토큰 유지
    return inputs, labels

In [10]:
import torch

def mask_vectors(inputs, mask_prob=0.15):
    """
    벡터화된 입력 데이터에서 일부 벡터를 마스킹하는 함수.
    - inputs: 벡터화된 입력 데이터 (텐서 형태)
    - mask_prob: 마스킹 확률
    """
    masked_indices = torch.bernoulli(torch.full(inputs.shape[:-1], mask_prob)).bool()
    mask_value = torch.zeros(inputs.size(-1)).to(inputs.device)  # 마스킹 값 (0 벡터)
    labels = inputs.clone()
    labels[~masked_indices] = -100  # 마스크되지 않은 벡터는 손실 계산에서 제외

    # 마스킹된 위치에 대해 0 벡터로 대체
    inputs[masked_indices] = mask_value

    return inputs, labels

### self attetion

In [11]:
import torch.nn as nn

class SelfAttention(nn.Module):
    def __init__(self, d_model):
        super(SelfAttention, self).__init__()
        # Query, Key, Value를 계산하기 위한 선형 변환 레이어
        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)
        # 스케일링 팩터를 설정하여 점곱 계산 시 안정성을 높임
        self.scale = 1 / (d_model ** 0.5)

    def forward(self, x):
        # Query, Key, Value 계산
        Q = self.W_q(x)
        K = self.W_k(x)
        V = self.W_v(x)
        # Query와 Key의 점곱 계산 후 스케일링 적용
        scores = torch.matmul(Q, K.transpose(-2, -1)) * self.scale
        # Softmax를 사용하여 Attention Weight 계산
        attn = torch.nn.functional.softmax(scores, dim=-1)
        # Attention Weight를 사용하여 Value의 가중합을 계산
        context = torch.matmul(attn, V)
        # Residual Connection 적용 후 반환
        return context + x

### Scaled Dot-Product Attention

In [12]:
class ScaledDotProductAttention(nn.Module):
    def __init__(self, d_k):
        super(ScaledDotProductAttention, self).__init__()
        self.d_k = d_k  # 스케일링을 위한 차원 크기

    def forward(self, Q, K, V, mask=None):
        # Query와 Key의 점곱을 스케일링하여 Attention Score 계산
        scores = torch.matmul(Q, K.transpose(-2, -1)) / torch.sqrt(torch.tensor(self.d_k, dtype=torch.float32))
        if mask is not None:
            # 마스크 적용 (마스크된 부분에 -inf를 할당)
            scores = scores.masked_fill(mask == 0, -1e9)
        # Softmax를 적용하여 Attention Weight 계산
        attn = torch.nn.functional.softmax(scores, dim=-1)
        # Attention Weight를 사용하여 최종 Value 계산
        return torch.matmul(attn, V), attn

### Multi-Head Attention

In [13]:
class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        self.num_heads = num_heads  # 어텐션 헤드 수
        self.d_k = d_model // num_heads  # 각 헤드의 차원 크기
        self.d_model = d_model

        # Query, Key, Value 계산을 위한 선형 변환 레이어
        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)
        # 최종 출력 계산을 위한 선형 변환 레이어
        self.fc = nn.Linear(d_model, d_model)

    def forward(self, Q, K, V, mask=None):
        batch_size = Q.size(0)

        # 선형 변환 후 헤드 수에 따라 분리
        Q = self.W_q(Q).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
        K = self.W_k(K).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
        V = self.W_v(V).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)

        # 스케일드 점곱 어텐션 적용
        out, attn = ScaledDotProductAttention(self.d_k)(Q, K, V, mask)

        # 여러 헤드를 결합하고 최종 선형 변환을 적용
        out = out.transpose(1, 2).contiguous().view(batch_size, -1, self.d_model)
        out = self.fc(out)

        return out

### Position-wise Feed Forward Network

In [14]:
class PositionwiseFeedForward(nn.Module):
    def __init__(self, d_model, d_ff, dropout=0.1):
        super(PositionwiseFeedForward, self).__init__()
        # 두 개의 선형 레이어로 구성된 피드 포워드 네트워크
        self.fc1 = nn.Linear(d_model, d_ff)
        self.fc2 = nn.Linear(d_ff, d_model)
        # 드롭아웃 레이어
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # 활성화 함수와 드롭아웃을 적용하여 두 번째 선형 레이어를 통과
        return self.fc2(self.dropout(torch.nn.functional.relu(self.fc1(x))))

### Encoder

In [15]:
class EncoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, dim_feedforward, dropout, activation_function=nn.GELU()):
        super(EncoderLayer, self).__init__()
        # 멀티헤드 어텐션 레이어
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        # 피드 포워드 네트워크
        self.ffn = nn.Sequential(
            nn.Linear(d_model, dim_feedforward),
            activation_function,
            nn.Linear(dim_feedforward, d_model)
        )
        # 레이어 정규화와 드롭아웃 레이어
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, src_mask=None):
        # 멀티헤드 어텐션 통과 후 Residual Connection 적용
        attn_output = self.self_attn(x, x, x, src_mask)
        x = self.norm1(x + self.dropout(attn_output))
        # 피드 포워드 네트워크 통과 후 Residual Connection 적용
        ffn_output = self.ffn(x)
        x = self.norm2(x + self.dropout(ffn_output))
        return x

### Decoder

In [16]:
# Decoder 레이어
class DecoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, dim_feedforward, dropout, activation_function=nn.GELU()):
        super(DecoderLayer, self).__init__()
        # 멀티헤드 어텐션 레이어 (자기 자신)
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        # 멀티헤드 어텐션 레이어 (인코더-디코더)
        self.cross_attn = MultiHeadAttention(d_model, num_heads)
        # 피드 포워드 네트워크
        self.ffn = nn.Sequential(
            nn.Linear(d_model, dim_feedforward),
            activation_function,
            nn.Linear(dim_feedforward, d_model)
        )
        # 레이어 정규화와 드롭아웃 레이어
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, memory, tgt_mask=None, memory_mask=None):
        # 자기 자신에 대한 멀티헤드 어텐션 통과 후 Residual Connection 적용
        attn_output = self.self_attn(x, x, x, tgt_mask)
        x = self.norm1(x + self.dropout(attn_output))
        # 인코더의 출력에 대한 멀티헤드 어텐션 통과 후 Residual Connection 적용
        cross_attn_output = self.cross_attn(x, memory, memory, memory_mask)
        x = self.norm2(x + self.dropout(cross_attn_output))
        # 피드 포워드 네트워크 통과 후 Residual Connection 적용
        ffn_output = self.ffn(x)
        x = self.norm3(x + self.dropout(ffn_output))
        return x

### FND-NS 모델 결합

In [17]:
# FND-NS 모델 결합
class FNDNSModel(nn.Module):
    def __init__(self, input_dim, d_model, num_heads, num_encoder_layers, num_decoder_layers, dim_feedforward, dropout, num_labels, seq_length, activation_function=nn.GELU()):
        super(FNDNSModel, self).__init__()
        self.d_model = d_model

        # 입력 벡터를 d_model 크기로 변환하기 위한 프로젝션 레이어
        self.input_projection = nn.Linear(input_dim, d_model)

        # 위치 인코딩 추가
        self.positional_encoding = PositionalEncoding(d_model, max_len=seq_length)

        # 인코더 레이어들을 생성
        self.encoder_layers = nn.ModuleList([
            EncoderLayer(d_model, num_heads, dim_feedforward, dropout, activation_function)
            for _ in range(num_encoder_layers)
        ])

        # 디코더 레이어들을 생성
        self.decoder_layers = nn.ModuleList([
            DecoderLayer(d_model, num_heads, dim_feedforward, dropout, activation_function)
            for _ in range(num_decoder_layers)
        ])

        # 최종 출력 레이어: 디코더의 마지막 출력에서 최종 클래스를 예측
        self.fc_out = nn.Linear(d_model, num_labels)

    def forward(self, src, tgt, src_mask=None, tgt_mask=None):
        # 입력 벡터를 d_model 크기로 변환
        src = self.input_projection(src)
        tgt = self.input_projection(tgt)

        # 위치 인코딩 적용: 입력 시퀀스에 위치 정보를 추가
        src = self.positional_encoding(src)
        tgt = self.positional_encoding(tgt)

        # 인코더 레이어들을 순차적으로 통과
        for layer in self.encoder_layers:
            src = layer(src, src_mask)

        # 인코더의 출력을 메모리로 저장 (디코더의 입력으로 사용)
        memory = src

        # 디코더 레이어들을 순차적으로 통과
        for layer in self.decoder_layers:
            tgt = layer(tgt, memory, tgt_mask)

        # 디코더의 마지막 출력에서 최종 클래스를 예측
        output = self.fc_out(tgt[:, -1, :])

        # 소프트맥스 함수로 클래스 확률 계산
        return torch.softmax(output, dim=-1)

## 학습

In [18]:
# 하이퍼파라미터 설정
input_dim = 384  # 입력 데이터 크기 (임베딩된 벡터의 차원)
d_model = 512  # 모델의 임베딩 차원
num_heads = 8  # 어텐션 헤드 수
num_encoder_layers = 12  # 인코더 레이어 수
num_decoder_layers = 12  # 디코더 레이어 수
dim_feedforward = 1024  # FFN 차원
dropout = 0.1  # 드롭아웃 확률
num_labels = 2  # 클래스 수
batch_size = 16  # 배치 크기
learning_rate = 1e-3  # 학습률
epochs = 10  # 에포크 수
seq_length = 512  # 시퀀스 길이
warmup_steps = 500  # 워밍업 스텝 수
activation_function = nn.GELU()  # 활성화 함수

In [19]:
import torch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

Using device: cuda


In [20]:
# 데이터를 PyTorch Tensor로 변환 및 결합
#final_embeddings_tensor = torch.tensor(final_embeddings, dtype=torch.float32)
#labels_tensor = torch.tensor(vector_df['label'].values, dtype=torch.long)

# TensorDataset을 이용하여 결합
dataset = TensorDataset(final_embeddings_tensor, labels_tensor)

import torch
from torch.utils.data import TensorDataset, random_split

# 데이터셋의 전체 크기
n_total = len(final_embeddings_tensor)

# 나눌 비율에 맞는 크기 계산
n_test = int(0.15 * n_total)
n_val = int(0.10 * n_total)
n_train = n_total - n_test - n_val

# 데이터를 무작위로 섞음
indices = torch.randperm(n_total)

# 무작위로 섞인 데이터를 비율에 따라 나누기
train_indices = indices[:n_train]
val_indices = indices[n_train:n_train + n_val]
test_indices = indices[n_train + n_val:]

train_embeddings = final_embeddings_tensor[train_indices]
val_embeddings = final_embeddings_tensor[val_indices]
test_embeddings = final_embeddings_tensor[test_indices]

train_labels = labels_tensor[train_indices]
val_labels = labels_tensor[val_indices]
test_labels = labels_tensor[test_indices]

# TensorDataset을 이용하여 결합
train_dataset = TensorDataset(train_embeddings, train_labels)
val_dataset = TensorDataset(val_embeddings, val_labels)
test_dataset = TensorDataset(test_embeddings, test_labels)

# DataLoader 생성
from torch.utils.data import DataLoader

train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=8,  # 코랩에서는 2~4로 설정
    pin_memory=True
)

val_loader = DataLoader(
    val_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=8,  # 코랩에서는 2~4로 설정
    pin_memory=True
)

test_loader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=8,  # 코랩에서는 2~4로 설정
    pin_memory=True
)

In [21]:
from transformers import get_linear_schedule_with_warmup
# import torch_xla.core.xla_model as xm

# 모델 정의
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#device = xm.xla_device()
model = FNDNSModel(input_dim, d_model, num_heads, num_encoder_layers, num_decoder_layers, dim_feedforward, dropout, num_labels, seq_length, activation_function).to(device)

# 옵티마이저 및 손실 함수 정의
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
criterion = torch.nn.CrossEntropyLoss()

# 스케줄러 설정
total_steps = len(train_loader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=total_steps)

In [23]:
from tqdm import tqdm
from sklearn.metrics import accuracy_score

# 학습 루프
# 학습 코드에서 마스킹 적용
for epoch in range(epochs):
    model.train()
    running_loss = 0.0
    correct_predictions = 0
    total_predictions = 0

    for inputs, labels in tqdm(train_loader, desc=f"Epoch {epoch + 1}/{epochs}"):
        # 데이터 타입 및 GPU로 전송
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()

        # 벡터화된 입력 데이터에 마스킹 적용
        inputs, masked_labels = mask_vectors(inputs)

        # 모델에 입력을 넣고 출력 얻기
        outputs = model(inputs, inputs)  # outputs shape: (batch_size, seq_length, num_labels)

        # 마스킹된 레이블을 1D로 변환
        masked_labels = masked_labels.view(-1)  # (batch_size * seq_length)

        # 유효한 위치 (마스킹되지 않은 위치)를 찾기 위한 인덱스
        valid_indices = masked_labels != -100

        # 모델 출력도 동일한 방식으로 1D로 변환
        outputs = outputs.view(-1, num_labels)  # (batch_size * seq_length, num_labels)

        # 유효한 위치의 출력과 레이블만 선택
        valid_outputs = outputs[valid_indices]
        valid_labels = masked_labels[valid_indices]

        # 유효한 위치에서의 손실 계산 및 역전파
        loss = criterion(valid_outputs, valid_labels)
        loss.backward()

        # optimizer step
        optimizer.step()

        running_loss += loss.item()
        _, predicted = torch.max(valid_outputs, 1)
        correct_predictions += (predicted == valid_labels).sum().item()
        total_predictions += valid_labels.size(0)

    epoch_accuracy = correct_predictions / total_predictions
    print(f'Epoch {epoch + 1}/{epochs}, Loss: {running_loss/len(train_loader):.4f}, Accuracy: {epoch_accuracy:.4f}')


Epoch 1/10:   0%|          | 0/3354 [00:00<?, ?it/s]


IndexError: The shape of the mask [6144] at index 0 does not match the shape of the indexed tensor [16, 2] at index 0

In [24]:
for epoch in range(epochs):
    model.train()
    running_loss = 0.0
    correct_predictions = 0
    total_predictions = 0

    for inputs, labels in tqdm(train_loader, desc=f"Epoch {epoch + 1}/{epochs}"):
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()

        # 모델 예측
        outputs = model(inputs, inputs)

        # 손실 계산 및 역전파
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()  # 학습률 스케줄링 업데이트

        running_loss += loss.item()

        # 정확도 계산
        _, predicted = torch.max(outputs, 1)
        correct_predictions += (predicted == labels).sum().item()
        total_predictions += labels.size(0)

    # 에포크별 손실 및 정확도 출력
    epoch_accuracy = correct_predictions / total_predictions
    print(f'Epoch {epoch + 1}/{epochs}, Loss: {running_loss/len(train_loader):.4f}, Accuracy: {epoch_accuracy:.4f}')

Epoch 1/10: 100%|██████████| 3354/3354 [05:54<00:00,  9.47it/s]


Epoch 1/10, Loss: 0.6976, Accuracy: 0.5035


Epoch 2/10: 100%|██████████| 3354/3354 [05:54<00:00,  9.46it/s]


Epoch 2/10, Loss: 0.6941, Accuracy: 0.5036


Epoch 3/10: 100%|██████████| 3354/3354 [05:54<00:00,  9.46it/s]


Epoch 3/10, Loss: 0.6942, Accuracy: 0.5033


Epoch 4/10: 100%|██████████| 3354/3354 [05:55<00:00,  9.44it/s]


Epoch 4/10, Loss: 0.6937, Accuracy: 0.5048


Epoch 5/10: 100%|██████████| 3354/3354 [05:55<00:00,  9.45it/s]


Epoch 5/10, Loss: 0.6936, Accuracy: 0.5049


Epoch 6/10: 100%|██████████| 3354/3354 [05:55<00:00,  9.45it/s]


Epoch 6/10, Loss: 0.6933, Accuracy: 0.5071


Epoch 7/10:  67%|██████▋   | 2246/3354 [03:57<01:56,  9.48it/s]Process Process-65:
Traceback (most recent call last):
  File "/usr/lib/python3.10/multiprocessing/process.py", line 317, in _bootstrap
    util._exit_function()
  File "/usr/lib/python3.10/multiprocessing/util.py", line 360, in _exit_function
    _run_finalizers()
  File "/usr/lib/python3.10/multiprocessing/util.py", line 300, in _run_finalizers
    finalizer()
  File "/usr/lib/python3.10/multiprocessing/util.py", line 224, in __call__
    res = self._callback(*self._args, **self._kwargs)
  File "/usr/lib/python3.10/multiprocessing/util.py", line 133, in _remove_temp_dir
    rmtree(tempdir)
  File "/usr/lib/python3.10/shutil.py", line 725, in rmtree
    _rmtree_safe_fd(fd, path, onerror)
  File "/usr/lib/python3.10/shutil.py", line 630, in _rmtree_safe_fd
    entries = list(scandir_it)
KeyboardInterrupt
Epoch 7/10:  67%|██████▋   | 2246/3354 [03:57<01:57,  9.45it/s]


RuntimeError: DataLoader worker (pid 11021) exited unexpectedly with exit code 1. Details are lost due to multiprocessing. Rerunning with num_workers=0 may give better error trace.

In [95]:
import os
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"


3. Mixed Precision Training
GPU 메모리를 절약하기 위해 혼합 정밀도 학습(Mixed Precision Training)을 사용하는 방법도 있습니다. 이를 위해 PyTorch의 torch.cuda.amp를 사용할 수 있습니다.

In [38]:
from torch.cuda.amp import autocast, GradScaler
from tqdm import tqdm
from sklearn.metrics import accuracy_score

# GradScaler 추가
scaler = GradScaler()

# 학습, 검증, 테스트 루프
for epoch in range(epochs):
    model.train()
    running_loss = 0.0
    correct_predictions = 0
    total_predictions = 0

    # 학습 루프
    for inputs, labels in tqdm(train_loader, desc=f"Epoch {epoch + 1}/{epochs}"):
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()

        with autocast():
            outputs = model(inputs, inputs)
            loss = criterion(outputs, labels)

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        scheduler.step()

        running_loss += loss.item()
        _, predicted = torch.max(outputs, 1)
        correct_predictions += (predicted == labels).sum().item()
        total_predictions += labels.size(0)

    epoch_accuracy = correct_predictions / total_predictions
    print(f'Epoch {epoch + 1}/{epochs}, Loss: {running_loss/len(train_loader):.4f}, Accuracy: {epoch_accuracy:.4f}')

    # 검증 루프
    model.eval()
    val_running_loss = 0.0
    val_correct_predictions = 0
    val_total_predictions = 0

    with torch.no_grad():
        for inputs, labels in tqdm(val_loader, desc=f"Validation Epoch {epoch + 1}/{epochs}"):
            inputs, labels = inputs.to(device), labels.to(device)
            with autocast():
                outputs = model(inputs, inputs)
                loss = criterion(outputs, labels)

            val_running_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            val_correct_predictions += (predicted == labels).sum().item()
            val_total_predictions += labels.size(0)

    val_epoch_accuracy = val_correct_predictions / val_total_predictions
    print(f'Validation Epoch {epoch + 1}/{epochs}, Loss: {val_running_loss/len(val_loader):.4f}, Accuracy: {val_epoch_accuracy:.4f}')



Epoch 1/10: 100%|██████████| 3354/3354 [07:35<00:00,  7.36it/s]


Epoch 1/10, Loss: 0.6978, Accuracy: 0.5017


Validation Epoch 1/10: 100%|██████████| 448/448 [00:17<00:00, 25.43it/s]


Validation Epoch 1/10, Loss: 0.6933, Accuracy: 0.4865


Epoch 2/10:  79%|███████▊  | 2640/3354 [05:58<01:36,  7.36it/s]


KeyboardInterrupt: 

In [None]:
# 테스트 루프
model.eval()
test_running_loss = 0.0
test_correct_predictions = 0
test_total_predictions = 0

with torch.no_grad():
    for inputs, labels in tqdm(test_loader, desc="Testing"):
        inputs, labels = inputs.to(device), labels.to(device)
        with autocast():
            outputs = model(inputs, inputs)
            loss = criterion(outputs, labels)

        test_running_loss += loss.item()
        _, predicted = torch.max(outputs, 1)
        test_correct_predictions += (predicted == labels).sum().item()
        test_total_predictions += labels.size(0)

test_accuracy = test_correct_predictions / test_total_predictions
print(f'Test Loss: {test_running_loss/len(test_loader):.4f}, Test Accuracy: {test_accuracy:.4f}')

4. Gradient Accumulation
배치 크기를 줄이는 대신, 작은 배치로 여러 번의 기울기(gradient)를 계산한 후에 한 번의 역전파를 수행하는 방법입니다.

In [None]:
accumulation_steps = 4  # 실제 배치 크기 = batch_size * accumulation_steps

for epoch in range(epochs):
    model.train()
    running_loss = 0.0
    correct_predictions = 0
    total_predictions = 0

    optimizer.zero_grad()
    for i, (inputs, labels) in enumerate(tqdm(train_loader, desc=f"Epoch {epoch + 1}/{epochs}")):
        inputs, labels = inputs.to(device), labels.to(device)

        with autocast():
            outputs = model(inputs, inputs)
            loss = criterion(outputs, labels)

        scaler.scale(loss).backward()

        if (i + 1) % accumulation_steps == 0:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            scheduler.step()

        running_loss += loss.item()

        _, predicted = torch.max(outputs, 1)
        correct_predictions += (predicted == labels).sum().item()
        total_predictions += labels.size(0)

    epoch_accuracy = correct_predictions / total_predictions
    print(f'Epoch {epoch + 1}/{epochs}, Loss: {running_loss/len(train_loader):.4f}, Accuracy: {epoch_accuracy:.4f}')


Epoch 1/10: 100%|██████████| 2236/2236 [04:35<00:00,  8.11it/s]


Epoch 1/10, Loss: 0.6932, Accuracy: 0.5055


Epoch 2/10: 100%|██████████| 2236/2236 [04:37<00:00,  8.07it/s]


Epoch 2/10, Loss: 0.6931, Accuracy: 0.5069


Epoch 3/10: 100%|██████████| 2236/2236 [04:37<00:00,  8.06it/s]


Epoch 3/10, Loss: 0.6932, Accuracy: 0.5078


Epoch 4/10:  31%|███       | 698/2236 [01:27<03:13,  7.95it/s]


KeyboardInterrupt: 

In [None]:
accumulation_steps = 2  # 기울기 누적 단계 수

for epoch in range(epochs):
    model.train()
    optimizer.zero_grad()
    for i, (inputs, labels) in enumerate(train_loader):
        inputs, labels = inputs.to(device), labels.to(device)
        outputs = model(inputs, inputs)
        loss = criterion(outputs, labels)
        loss = loss / accumulation_steps
        loss.backward()

        if (i + 1) % accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()
    print(f'Epoch {epoch + 1}, Loss: {loss.item()}')


## tpu

In [31]:
# TPU를 사용하려면 런타임 설정에서 TPU를 선택하세요.
# 코드에서 아래와 같이 TPU 장치를 설정할 수 있습니다.
import torch_xla
import torch_xla.core.xla_model as xm
device = xm.xla_device()

print(f'Using device: {device}')


Using device: xla:0


In [43]:
from torch.utils.data import DataLoader

train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=1,  # TPU 환경에서 안정적으로 작동할 수 있도록 설정
    pin_memory=False
)

val_loader = DataLoader(
    val_dataset,
    batch_size=batch_size,
    shuffle=False,
    num_workers=1,  # TPU 환경에서 안정적으로 작동할 수 있도록 설정
    pin_memory=False
)

test_loader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    shuffle=False,
    num_workers=1,  # TPU 환경에서 안정적으로 작동할 수 있도록 설정
    pin_memory=False
)

In [None]:
# TPU 디버깅 로그 활성화
import torch_xla.debug.metrics as met


# 학습 루프 수정
for epoch in range(epochs):
    model.train()
    running_loss = 0.0
    correct_predictions = 0
    total_predictions = 0

    for inputs, labels in tqdm(train_loader, desc=f"Epoch {epoch + 1}/{epochs}"):
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()

        outputs = model(inputs, inputs)
        loss = criterion(outputs, labels)
        loss.backward()

        xm.optimizer_step(optimizer, barrier=True)  # TPU에서 barrier 옵션 추가

        running_loss += loss.item()
        _, predicted = torch.max(outputs, 1)
        correct_predictions += (predicted == labels).sum().item()
        total_predictions += labels.size(0)

        # 학습 코드 내에서 로그 출력
        # print(met.metrics_report())


    epoch_accuracy = correct_predictions / total_predictions
    print(f'Epoch {epoch + 1}/{epochs}, Loss: {running_loss/len(train_loader):.4f}, Accuracy: {epoch_accuracy:.4f}')


Epoch 1/10:  11%|█▏        | 190/1677 [03:24<17:55,  1.38it/s]

# 학습 평가

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix

# 평가 함수 정의
def evaluate_model(model, data_loader):
    model.eval()  # 평가 모드로 전환
    all_labels = []
    all_predictions = []

    with torch.no_grad():
        for inputs, labels in data_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs, inputs)
            _, predicted = torch.max(outputs, 1)
            all_labels.extend(labels.cpu().numpy())
            all_predictions.extend(predicted.cpu().numpy())

    # 성능 지표 계산
    precision = precision_score(all_labels, all_predictions, average='binary')
    recall = recall_score(all_labels, all_predictions, average='binary')
    f1 = f1_score(all_labels, all_predictions, average='binary')
    conf_matrix = confusion_matrix(all_labels, all_predictions)

    # 성능 지표 출력
    print(f'Precision: {precision:.4f}')
    print(f'Recall: {recall:.4f}')
    print(f'F1-score: {f1:.4f}')
    print(f'Confusion Matrix:\n{conf_matrix}')

# 학습 후 평가 함수 호출
evaluate_model(model, test_loader)