<a href="https://colab.research.google.com/github/dohyeongkim97/papers/blob/master/gan_augmentator_en_train_de_faire.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
import pandas as pd
import numpy as np
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [3]:
data_path = './drive/MyDrive/paper_data-master/'
df = pd.read_csv(data_path+'train.csv')
test = pd.read_csv(data_path+'test.csv')

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim

In [4]:
def make_dataset(data, tokenizer, device):
    tokenized = tokenizer(
        text = data.text.tolist(),
        padding= 'longest',
        truncation = True,
        return_tensors = 'pt'
    )
    input_ids = tokenized['input_ids'].to(device)
    attention_mask = tokenized['attention_mask'].to(device)
    if 'label' in data.columns:
        labels = torch.tensor(data['label'].values, dtype=torch.long).to(device)
        return TensorDataset(input_ids, attention_mask, labels)
    else:
        return TensorDataset(input_ids, attention_mask)

In [5]:
def get_dataloader(dataset, sampler, batch_size):
    data_sampler = sampler(dataset)
    dataloader = DataLoader(dataset, sampler = data_sampler, batch_size = batch_size)
    return dataloader

In [27]:
def make_gan_dataset(data, tokenizer, device, max_length=128):
    tokenized_data = tokenizer(
        text=data['text'].tolist(),
        padding='max_length',
        max_length=max_length,
        truncation=True,
        return_tensors='pt'
    )

    input_ids = tokenized_data['input_ids'].to(device)
    attention_mask = tokenized_data['attention_mask'].to(device)

    return TensorDataset(input_ids, attention_mask)

In [6]:
from transformers import BertTokenizer
from torch.utils.data import TensorDataset, DataLoader
from torch.utils.data import RandomSampler, SequentialSampler
from torch import optim
from transformers import BertForSequenceClassification
from torch import nn
import math
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

In [7]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
tokenizer = BertTokenizer.from_pretrained(
    pretrained_model_name_or_path='bert-base-multilingual-cased',
    do_lower_case = False
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]



In [8]:
df

Unnamed: 0,ID,first_party,second_party,facts,first_party_winner
0,TRAIN_0000,Phil A. St. Amant,Herman A. Thompson,"On June 27, 1962, Phil St. Amant, a candidate ...",1
1,TRAIN_0001,Stephen Duncan,Lawrence Owens,Ramon Nelson was riding his bike when he suffe...,0
2,TRAIN_0002,Billy Joe Magwood,"Tony Patterson, Warden, et al.",An Alabama state court convicted Billy Joe Mag...,1
3,TRAIN_0003,Linkletter,Walker,Victor Linkletter was convicted in state court...,0
4,TRAIN_0004,William Earl Fikes,Alabama,"On April 24, 1953 in Selma, Alabama, an intrud...",1
...,...,...,...,...,...
2473,TRAIN_2473,"HollyFrontier Cheyenne Refining, LLC, et al.","Renewable Fuels Association, et al.",Congress amended the Clean Air Act through the...,1
2474,TRAIN_2474,"Grupo Mexicano de Desarrollo, S. A.","Alliance Bond Fund, Inc.","Alliance Bond Fund, Inc., an investment fund, ...",1
2475,TRAIN_2475,Peguero,United States,"In 1992, the District Court sentenced Manuel D...",0
2476,TRAIN_2476,Immigration and Naturalization Service,St. Cyr,"On March 8, 1996, Enrico St. Cyr, a lawful per...",0


In [9]:
def frame_make(df):
    if 'first_party_winner' in df.columns:
        data = pd.DataFrame(columns = ['text', 'target'])
        df['first_party_winner'] = df['first_party_winner'].astype(int)
        data['text'] = 'first_party:' + df['first_party'] + 'second_party:' + df['second_party'] + 'facts:' + df['facts'] + '\nwinner: '
        data['target'] = df['first_party_winner']
        return data
    else:
        data = pd.DataFrame(columns = ['text'])
        data['text'] = 'first_party:' + df['first_party'] + 'second_party:' + df['second_party'] + 'facts:' + df['facts'] + '\nwinner: '
        return data

In [10]:
df_data = frame_make(df)
df_data = make_dataset(df_data, tokenizer, device)
df_data = get_dataloader(df_data, RandomSampler, 16)

In [11]:
df_data

<torch.utils.data.dataloader.DataLoader at 0x7b9ea833a140>

In [12]:
df_data = frame_make(df)

In [13]:
df_data

Unnamed: 0,text,target
0,first_party:Phil A. St. Amantsecond_party:Herm...,1
1,first_party:Stephen Duncansecond_party:Lawrenc...,0
2,first_party:Billy Joe Magwoodsecond_party:Tony...,1
3,first_party:Linklettersecond_party:Walkerfacts...,0
4,first_party:William Earl Fikessecond_party:Ala...,1
...,...,...
2473,"first_party:HollyFrontier Cheyenne Refining, L...",1
2474,"first_party:Grupo Mexicano de Desarrollo, S. A...",1
2475,first_party:Peguerosecond_party:United Statesf...,0
2476,first_party:Immigration and Naturalization Ser...,0


In [20]:
df = frame_make(df)

In [22]:
epochs = 5
batch_size = 16
device = 'cuda' if torch.cuda.is_available() else 'cpu'
tokenizer = BertTokenizer.from_pretrained(
    pretrained_model_name_or_path='bert-base-multilingual-cased',
    do_lower_case = False
)



train_df, valid, test = np.split(
    df.sample(frac = 1, random_state = 42), [int(0.6*len(df)), int(0.8*len(df))]
)

train_gan_dataset = make_gan_dataset(train_df, tokenizer, device)
train_dataloader = get_dataloader(train_gan_dataset, RandomSampler, batch_size)

valid_gan_dataset = make_gan_dataset(valid, tokenizer, device)
valid_dataloader = get_dataloader(valid_gan_dataset, RandomSampler, batch_size)

test_gan_dataset = make_gan_dataset(test, tokenizer, device)
test_dataloader = get_dataloader(test_gan_dataset, RandomSampler, batch_size)

In [29]:
optimizer_G = optim.Adam(G.parameters(), lr=0.001)
optimizer_D = optim.Adam(D.parameters(), lr=0.001)

In [30]:
class ConditionalTransformerGenerator(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super(ConditionalTransformerGenerator, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.transformer = nn.Transformer(d_model=embedding_dim, nhead=4, num_encoder_layers=2, num_decoder_layers=2)
        self.fc = nn.Linear(embedding_dim, vocab_size)

    def forward(self, noise, labels):
        embedded = self.embedding(noise)
        embedded = embedded.permute(1, 0, 2)
        transformer_out = self.transformer(embedded, embedded)
        output = self.fc(transformer_out.permute(1, 0, 2))
        return output

class ConditionalTransformerDiscriminator(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super(ConditionalTransformerDiscriminator, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.transformer = nn.Transformer(d_model=embedding_dim, nhead=4, num_encoder_layers=2)
        self.fc = nn.Linear(embedding_dim, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, sequence, labels):
        embedded = self.embedding(sequence).permute(1, 0, 2)
        transformer_out = self.transformer(embedded, embedded)
        transformer_out = transformer_out[-1, :, :]
        output = self.sigmoid(self.fc(transformer_out))
        return output

In [35]:
vocab_size = 30522 # bert model base vocab size
embedding_dim = 256
hidden_dim = 512

G = ConditionalTransformerGenerator(vocab_size, embedding_dim, hidden_dim)
D = ConditionalTransformerDiscriminator(vocab_size, embedding_dim, hidden_dim)



In [36]:
optimizer_G = optim.Adam(G.parameters(), lr=0.001)
optimizer_D = optim.Adam(D.parameters(), lr=0.001)
criterion = nn.BCELoss()

In [37]:
epochs = 10
batch_size = 16

In [39]:
for epoch in range(epochs):
    for real_data, real_labels in train_dataloader:
        batch_size = real_data.size(0)

        # 5-1. 실 데이터 학습
        real_labels = torch.ones(batch_size, 1)
        fake_labels = torch.zeros(batch_size, 1)

        # 5-2. 생성자 학습
        noise = torch.randint(0, vocab_size, (batch_size, 128), dtype=torch.long)  # 노이즈 생성
        fake_data = G(noise, real_labels).argmax(dim=-1)  # 생성자에서 가짜 데이터 생성

        # 판별자 학습
        real_output = D(real_data, real_labels)
        fake_output = D(fake_data.detach(), fake_labels)
        loss_D = (criterion(real_output, real_labels) + criterion(fake_output, fake_labels)) / 2

        optimizer_D.zero_grad()
        loss_D.backward()
        optimizer_D.step()

        # 생성자 학습
        fake_output = D(fake_data, real_labels)
        loss_G = criterion(fake_output, real_labels)

        optimizer_G.zero_grad()
        loss_G.backward()
        optimizer_G.step()

    if epoch % 100 == 0:
        print(f"Epoch {epoch} - Loss D: {loss_D.item():.4f}, Loss G: {loss_G.item():.4f}")


IndexError: index out of range in self