## TorchText

https://pytorch.org/text/stable/index.html

The torchtext package consists of data processing utilities and popular datasets for natural language.



In [1]:
import torchtext
import torch
SEED = 1234

torch.manual_seed(SEED)
torchtext.__version__

'0.17.2+cpu'

In [2]:
from torch.utils.data.dataset import random_split
from torchtext.data.functional import to_map_style_dataset

from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader
from torch import nn

import torch.optim as optim

import time

In [None]:
!wget https://raw.githubusercontent.com/sanajlee/langcomp/main/clickbait_train.tsv
!wget https://raw.githubusercontent.com/sanajlee/langcomp/main/clickbait_test.tsv

--2024-04-13 03:16:27--  https://raw.githubusercontent.com/sanajlee/langcomp/main/clickbait_train.tsv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1477057 (1.4M) [text/plain]
Saving to: ‘clickbait_train.tsv.4’


2024-04-13 03:16:27 (25.0 MB/s) - ‘clickbait_train.tsv.4’ saved [1477057/1477057]

--2024-04-13 03:16:27--  https://raw.githubusercontent.com/sanajlee/langcomp/main/clickbait_test.tsv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.108.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 341065 (333K) [text/plain]
Saving to: ‘clickbait_test.tsv.4’


2024-04-13 03:16:28 (9.79 

In [3]:
# 파일 열어서 [text, label] 형식의 목록으로 저장하기
def read_file(fname):
    list_iter = []
    with open(fname, 'r', encoding='UTF8') as f:
        lines = f.readlines()
        for line in lines[1:]:
            text, label = line.strip().split('\t')
            list_iter.append([text, label])

    return list_iter

In [4]:
train_iter = read_file('clickbait_train.tsv')
test_iter = read_file('clickbait_test.tsv')

In [5]:
# 데이터 확인
for line in train_iter:
  print(line)

['Bell Canada Enterprises might be taken private', '0']
['Job Losses Push Safer Mortgages to Foreclosure', '0']
["A Monkey In India Has Adopted A Puppy And They're Adorably Inseparable", '1']
['Golf Shoes to Improve Your Style, and  Your Game', '0']
['Iran Claims Gains in Nuclear Program', '0']
['Uplifting Aerial Photos Of Beaches In Italy That Will Remind You Of Summer', '1']
['Venezuela Will Push U.S. to Hand Over Man Tied to Plane Bombing', '0']
['North Korean Leader, Thin and Limping, Returns to Assembly and Gains New Term', '0']
['A.I.G. Sues Government for Return of $306 Million in Tax Payments', '0']
['President Bush nominates John Roberts as Chief Justice of the U.S.', '0']
['Which "Goldbergs" Character Are You', '1']
['Mexican helicopter crash leaves soldier dead', '0']
['British Army soldiers to be court martialled for war crimes', '0']
['Michael Schumacher returns to Formula One to replace injured Massa', '0']
['Tropical Storm Dolores now active', '0']
['Air France to launch

In [7]:
# A map-style dataset is one that implements the __getitem__() and __len__() protocols,
# and represents a map from (possibly non-integral) indices/keys to data samples.
train_dataset = to_map_style_dataset(train_iter)
test_dataset = to_map_style_dataset(test_iter)

In [8]:
# map-style 데이터 확인: __len__(), __getitem__(idx) 메서드 사용 가능
print(train_dataset.__len__())
print(train_dataset.__getitem__(15431))

26000
["'Criminal in uniform': Senior London policeman jailed for attempting to frame Iraqi", '0']


In [9]:
# train : valid = 9 : 1로 split
num_train = int(len(train_dataset) * 0.9)
train_dataset, valid_dataset = random_split(train_dataset, [num_train, len(train_dataset) - num_train])

In [10]:
print(len(train_dataset), len(valid_dataset), len(test_dataset))

23400 2600 6000


In [11]:
# let's use a pre-trained tokenizer from torchtext
tokenizer = get_tokenizer('basic_english')

def yield_tokens(data_iter):
    for text, _ in data_iter:
        yield tokenizer(text.lower()) # yield: return과 유사함. 결과값을 순차적으로 메모리에 할당

In [14]:
# build a vocabulary from training dataset
voc = build_vocab_from_iterator(yield_tokens(train_iter),
                                specials=["<unk>", "<pad>"],
                                max_tokens= 25000)

In [15]:
# set default indices for the special tokens
voc.set_default_index(voc['<unk>'])
# voc.set_default_index(voc['<pad>'])

In [16]:
# check the vocabulary!
voc.get_itos()

['<unk>',
 '<pad>',
 'to',
 'in',
 'the',
 "'",
 'you',
 'of',
 ',',
 'a',
 's',
 'for',
 '.',
 'and',
 'on',
 'your',
 'are',
 'is',
 'that',
 'this',
 'with',
 'at',
 'will',
 'from',
 'what',
 'new',
 'about',
 'who',
 'people',
 'things',
 'how',
 'can',
 'we',
 'which',
 'as',
 'it',
 'know',
 'by',
 'us',
 'make',
 'after',
 'be',
 '17',
 '21',
 'u',
 'do',
 'should',
 'these',
 'have',
 '19',
 'all',
 'based',
 'actually',
 't',
 'times',
 'their',
 'over',
 'an',
 'up',
 'here',
 'if',
 'was',
 'first',
 'most',
 'out',
 'like',
 'or',
 'more',
 'one',
 '2015',
 'world',
 'best',
 'when',
 'need',
 'life',
 'll',
 'i',
 'just',
 'has',
 'his',
 'time',
 '23',
 'dead',
 '15',
 'dies',
 're',
 'killed',
 '18',
 'her',
 'get',
 'every',
 'ever',
 'day',
 'everyone',
 'two',
 'love',
 'president',
 'uk',
 'man',
 'women',
 'says',
 'into',
 'were',
 'real',
 'too',
 'only',
 'zodiac',
 'not',
 'british',
 'photos',
 'pictures',
 '22',
 '13',
 'australian',
 '16',
 'year',
 'now',
 

In [17]:
# 'voc' converts words into indices
voc(['hello', 'my', 'name', 'is', 'mary'])

[2006, 295, 248, 17, 6232]

In [18]:
# '<unk>' processed
voc(['hello', 'my', 'name', 'is', 'sana'])

[2006, 295, 248, 17, 0]

In [19]:
# tokenize a sequence -> convert the words into indices
text_pipeline = lambda x: voc(tokenizer(x))

In [20]:
#collate_function: process the list of samples to form a batch.
# https://androidkt.com/create-dataloader-with-collate_fn-for-variable-length-input-in-pytorch/

def custom_collate_fn(batch):
    text_list, label_list= [], []
    for text, label in batch:
        label_list.append(int(label))
        processed_text = torch.tensor(text_pipeline(text), dtype=torch.int64)
        text_list.append(processed_text)

    label_list = torch.tensor(label_list, dtype=torch.int64).unsqueeze(dim=1)
    text_list = pad_sequence(text_list, batch_first=True, padding_value = 1) #pad_sequence : 문장의 길이 맞춰주기, 빈자리는 pad라는 토큰으로 맞춰라. / padding_value : pad token을 숫자로 바꿀 때 1로 해라
    return text_list, label_list

In [21]:
# construct a DataLoader: making a batch, data shuffling, padding, etc.
train_dataloader = DataLoader(train_dataset, batch_size=16,
                              shuffle=True, collate_fn=custom_collate_fn)

valid_dataloader = DataLoader(valid_dataset, batch_size=16,
                              shuffle=True, collate_fn=custom_collate_fn)

test_dataloader = DataLoader(test_dataset, batch_size=16,
                              shuffle=True, collate_fn=custom_collate_fn)

In [26]:
# constructed batch
for _, (text, label) in enumerate(valid_dataloader):
    print(text, label)
    break

tensor([[    6,     5,   221,   256,  2365,     4,   730,   192,     3,    15,
          7094,    19,  1192,    80],
        [   47,   140,  1351,    20,  8592,    22,    39,     6,  1161,     1,
             1,     1,     1,     1],
        [   49,   110,   404,    28,    22,   149,   127,     1,     1,     1,
             1,     1,     1,     1],
        [   42,   135,   107,     2,  5906,     1,     1,     1,     1,     1,
             1,     1,     1,     1],
        [ 4212,    23,   276,    17,   432,     4,   556,   289,     1,     1,
             1,     1,     1,     1],
        [   72,     6,    95,    15,  2090,    67,   160,    28,     1,     1,
             1,     1,     1,     1],
        [  111,   306,    18,    16,   118,   104,   103,    11,    93,    27,
          1743,     3,    57,  6311],
        [   24,    17,   481,    52,   990,   216,     1,     1,     1,     1,
             1,     1,     1,     1],
        [    4,  2272,     7,    74,     8,   790,     2,    49,

The dataset is prepared.

Now we have to construct a model and its training and evaluation process.

In [27]:
class LSTMModel(nn.Module):
  def __init__(self, input_dim, emb_dim, hidden_dim, layer_dim, output_dim):
    super(LSTMModel, self).__init__()

    self.input_dim = input_dim
    self.emb_dim = emb_dim
    self.hidden_dim = hidden_dim
    self.layer_dim = layer_dim
    self.output_dim = output_dim

    # make the embeddings
    # to transform the input sparse vector into a dense vector
    self.embedding = nn.Embedding(self.input_dim, self.emb_dim) #input_dim으로부터 emb_dim으로 만들어라 input_dim은 vocab의 크기 10, 300 300차원의 임베딩을 10개 만들어라

    # LSTM
    self.lstm = nn.LSTM(self.emb_dim, self.hidden_dim, self.layer_dim, batch_first=True)

    # Readout layer
    self.fc = nn.Linear(self.hidden_dim, self.output_dim)

  def forward(self, x):

    x = self.embedding(x)

    # initialize hidden state with zeros
    h0 = torch.zeros(self.layer_dim, x.size(0), self.hidden_dim).requires_grad_().to(device)

    # initialize cell state
    c0 = torch.zeros(self.layer_dim, x.size(0), self.hidden_dim).requires_grad_().to(device)

    # 17 timesteps here (padded sequence length 17)
    out, (hn, cn) = self.lstm(x, (h0.detach(), c0.detach()))
    output = self.fc(out[:, -1, :].squeeze(0)) #yhat1 = clickbait일때의 logit, yhat2 = clickbait가 아닐때의 logit

    return output


In [28]:
# instantiate the model

input_dim = len(voc)  # vocab size
emb_dim = 300 # any number you want to represent
hidden_dim = 128
layer_dim = 2
output_dim = 2

model = LSTMModel(input_dim, emb_dim, hidden_dim, layer_dim, output_dim)

In [34]:
# take a look at the initialized model parameters

def count_parameters(model):
  return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 6,990,314 trainable parameters


In [39]:
list(model.parameters())

[('embedding.weight',
  Parameter containing:
  tensor([[ 1.2950,  1.1662,  0.0393,  ..., -2.0037,  1.3463,  0.1661],
          [-0.6288,  1.0922,  0.6010,  ..., -0.0423,  1.3690,  1.3866],
          [ 0.3771, -0.5500, -0.9609,  ..., -0.0509,  0.5247, -0.7593],
          ...,
          [ 0.4772, -1.1656,  0.7006,  ...,  0.0259,  1.1535, -0.2858],
          [-0.0655, -0.1562,  0.1893,  ...,  0.4075,  0.0150,  0.3893],
          [ 1.4222,  0.9594,  0.2803,  ..., -0.1113,  0.4366, -0.5970]],
         requires_grad=True)),
 ('lstm.weight_ih_l0',
  Parameter containing:
  tensor([[ 0.0398, -0.0038, -0.0289,  ...,  0.0695,  0.0316, -0.0315],
          [-0.0066, -0.0832, -0.0469,  ..., -0.0414,  0.0520,  0.0558],
          [-0.0797,  0.0560, -0.0091,  ..., -0.0779, -0.0760,  0.0789],
          ...,
          [ 0.0869,  0.0315, -0.0549,  ..., -0.0345,  0.0740, -0.0663],
          [ 0.0217,  0.0319, -0.0336,  ...,  0.0147,  0.0286, -0.0598],
          [ 0.0332, -0.0095, -0.0275,  ..., -0.0602, 

In [35]:
# prepare some elements to train the model

optimizer = optim.SGD(model.parameters(), lr=1e-3) # lr : learning rate, 크면 파러미터 조정이 큰폭으로 이루어짐 근데 불안정, 작으면 파러미터 조정이 작은폭 오래걸림
criterion = nn.CrossEntropyLoss()

In [32]:
# get accuracy

def binary_accuracy(preds, gold):
  # round predictions to the closest integer: 0 or 1
  rounded_preds = torch.round(torch.sigmoid(preds))
  correct = (rounded_preds==gold).float()  # convert into float for division
  acc = correct.sum() / 2 / (len(preds))
  return acc

In [33]:
# if you're using a GPU
# let the model and the loss function use it

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

print(device)

cpu


In [36]:
model = model.to(device)
criterion = criterion.to(device)

In [37]:
# set a function for model training

def train(model, dataloader, optimizer, criterion):
  epoch_loss = 0
  epoch_acc = 0

  model.train() # let the model in a training mode (updating parameters)

  for text, label in dataloader:  # batch: (text, label)
    text = text.to(device)
    # make the label as the form of one-hot float vector (like probability distribution)
    label = torch.nn.functional.one_hot(label, num_classes=2).view(-1, 2).to(torch.float) #label을 binary로
    label = label.to(device)

    optimizer.zero_grad()
    predictions = model(text).to(device)
    loss = criterion(predictions, label)
    acc = binary_accuracy(predictions, label)


    loss.backward()
    optimizer.step()

    epoch_loss += loss.item()
    epoch_acc += acc.item()

  return epoch_loss / len(dataloader), epoch_acc / len(dataloader)


In [38]:
# set a function for model evaluation

def evaluate(model, dataloader, criterion):
  epoch_loss = 0
  epoch_acc = 0

  model.eval()  # let the model in a test mode (not updating parameters)

  with torch.no_grad():
    for text, label in dataloader:
      text = text.to(device)
      # make the label as the form of one-hot float vector (like probability distribution)
      label = torch.nn.functional.one_hot(label, num_classes=2).view(-1, 2).to(torch.float)
      label = label.to(device)

      predictions = model(text).to(device)
      loss = criterion(predictions, label)
      acc = binary_accuracy(predictions, label)

      epoch_loss += loss.item()
      epoch_acc += acc.item()

  return epoch_loss / len(dataloader), epoch_acc / len(dataloader)


In [40]:
# a function to check elapsed time per epoch

def epoch_time(start_time, end_time):
  elapsed_time = end_time - start_time
  elapsed_mins = int(elapsed_time / 60)
  elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
  return elapsed_mins, elapsed_secs

In [41]:
# for each epoch
# run the training and evaluation functions

N_EPOCHS = 20
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
  start_time = time.time()

  train_loss, train_acc = train(model, train_dataloader, optimizer, criterion)
  valid_loss, valid_acc = evaluate(model, valid_dataloader, criterion)

  end_time = time.time()

  epoch_mins, epoch_secs = epoch_time(start_time, end_time)

  # saving the best model with the smallest valid loss
  if valid_loss < best_valid_loss:
    best_valid_loss = valid_loss
    torch.save(model.state_dict(), 'tutorialModel.pt') # 중간 평가 성능이 가장 좋은 모델을 저장하자

  print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
  print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
  print(f'\tValid Loss: {valid_loss:.3f} | Valid Acc: {valid_acc*100:.2f}%')


Epoch: 01 | Epoch Time: 0m 20s
	Train Loss: 0.693 | Train Acc: 49.66%
	Valid Loss: 0.690 | Valid Acc: 50.00%
Epoch: 02 | Epoch Time: 0m 20s
	Train Loss: 0.689 | Train Acc: 50.91%
	Valid Loss: 0.688 | Valid Acc: 51.61%
Epoch: 03 | Epoch Time: 0m 20s
	Train Loss: 0.686 | Train Acc: 52.71%
	Valid Loss: 0.684 | Valid Acc: 55.14%
Epoch: 04 | Epoch Time: 0m 20s
	Train Loss: 0.682 | Train Acc: 57.01%
	Valid Loss: 0.681 | Valid Acc: 56.86%
Epoch: 05 | Epoch Time: 0m 42s
	Train Loss: 0.680 | Train Acc: 57.91%
	Valid Loss: 0.680 | Valid Acc: 56.98%
Epoch: 06 | Epoch Time: 0m 42s
	Train Loss: 0.677 | Train Acc: 58.28%
	Valid Loss: 0.676 | Valid Acc: 58.82%
Epoch: 07 | Epoch Time: 0m 41s
	Train Loss: 0.673 | Train Acc: 59.06%
	Valid Loss: 0.671 | Valid Acc: 59.93%
Epoch: 08 | Epoch Time: 0m 29s
	Train Loss: 0.672 | Train Acc: 59.40%
	Valid Loss: 0.670 | Valid Acc: 59.87%
Epoch: 09 | Epoch Time: 0m 20s
	Train Loss: 0.669 | Train Acc: 59.66%
	Valid Loss: 0.667 | Valid Acc: 60.18%
Epoch: 10 | Epoch T

KeyboardInterrupt: 

In [43]:
# inference on the test set

model.load_state_dict(torch.load('tutorialModel.pt'))

test_loss, test_acc = evaluate(model, test_dataloader, criterion)
print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Test Loss: 0.614 | Test Acc: 67.28%
