In [1]:
%pwd

'/content'

In [2]:
import torch
import torch.nn as nn

torch.manual_seed(1)
rnn_layer = nn.RNN(input_size=5, hidden_size=2,
                   num_layers=1, batch_first=True, bidirectional=False)
w_xh = rnn_layer.weight_ih_l0
w_hh = rnn_layer.weight_hh_l0
b_xh = rnn_layer.bias_ih_l0
b_hh = rnn_layer.bias_hh_l0

print(rnn_layer.state_dict().keys())
print('W_xh shape:', w_xh.shape)
print('W_hh shape:', w_hh.shape)
print('b_xh shape:', b_xh.shape)
print('b_hh shape:', b_hh.shape)

odict_keys(['weight_ih_l0', 'weight_hh_l0', 'bias_ih_l0', 'bias_hh_l0'])
W_xh shape: torch.Size([2, 5])
W_hh shape: torch.Size([2, 2])
b_xh shape: torch.Size([2])
b_hh shape: torch.Size([2])


In [3]:
x_seq = torch.tensor([[1.0]*5, [2.0]*5, [3.0]*5]).float()

output, hn = rnn_layer(torch.reshape(x_seq, (1, 3, 5)))

out_man = []

for t in range(3):
    # xt = torch.reshape(x_seq[t], (1, 5))
    xt = x_seq[t]
    print(f'Time step {t} =>')
    print('    Input        :', xt.numpy())
    if t > 0:
        prev_h = out_man[t-1]
    else:
        # prev_h = torch.zeros((ht.shape))
        prev_h = torch.zeros((hn.shape))
    # torch.transpose(w_hh, 0, 1)
    zt = torch.matmul(xt, w_xh.T) + b_xh + \
         torch.matmul(prev_h, w_hh.T) + b_hh
    ht = torch.tanh(zt)
    print('    Hidden       :', ht.detach().numpy())
    out_man.append(ht)
    print('    RNN output      :', output[:, t].detach().squeeze(0).numpy())
    print()

Time step 0 =>
    Input        : [1. 1. 1. 1. 1.]
    Hidden       : [[[-0.3519801   0.52525216]]]
    RNN output      : [-0.3519801   0.52525216]

Time step 1 =>
    Input        : [2. 2. 2. 2. 2.]
    Hidden       : [[[-0.68424344  0.76074266]]]
    RNN output      : [-0.68424344  0.76074266]

Time step 2 =>
    Input        : [3. 3. 3. 3. 3.]
    Hidden       : [[[-0.8649416  0.9046636]]]
    RNN output      : [-0.8649416  0.9046636]



In [4]:
from datasets import load_dataset

IMDB = load_dataset("stanfordnlp/imdb")
train_dataset = IMDB['train']
test_dataset = IMDB['test']

print(train_dataset[0])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

plain_text/train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

plain_text/test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

plain_text/unsupervised-00000-of-00001.p(…):   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

{'text': 'I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far be

In [5]:
type(train_dataset), len(train_dataset)

(datasets.arrow_dataset.Dataset, 25000)

In [6]:
from torch.utils.data import Dataset

class HFDatasetWrapper(Dataset):
    def __init__(self, hf_dataset):
        self.hf_dataset = hf_dataset

    def __len__(self):
        return len(self.hf_dataset)

    def __getitem__(self, idx):
        return self.hf_dataset[idx]['text'], self.hf_dataset[idx]['label']

train_dataset = HFDatasetWrapper(train_dataset)
test_dataset = HFDatasetWrapper(test_dataset)

In [7]:
from torch.utils.data.dataset import random_split
torch.manual_seed(1)
train_dataset, valid_dataset = random_split(train_dataset, [20000, 5000])

In [8]:
import re
from collections import Counter, OrderedDict

token_counts = Counter()

def tokenizer(text):
    text = re.sub(r'<[^>]*>', '', text)
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text.lower())
    text = re.sub(r'[\W]+', ' ', text.lower()) +\
        ' '.join(emoticons).replace('-', '')
    tokenized = text.split()
    return tokenized


for text, label in train_dataset:
    tokens = tokenizer(text)
    token_counts.update(tokens)

print('Vocab-size:', len(token_counts))

Vocab-size: 69023


In [9]:
class SimpleVocab:
    def __init__(self, token_counts, specials=("<pad>", "<unk>")):
        # 排序
        sorted_items = sorted(token_counts.items(),
                              key=lambda x: x[1],
                              reverse=True)
        self.stoi = {}
        self.itos = []

        # 添加特殊符号
        for special in specials:
            self.stoi[special] = len(self.stoi)
            self.itos.append(special)

        # 添加普通词
        for token, _ in sorted_items:
            if token not in self.stoi:
                self.stoi[token] = len(self.stoi)
                self.itos.append(token)

        # 设定默认索引（<unk> 的位置）
        self.unk_idx = self.stoi["<unk>"]

    def __getitem__(self, token):
        return self.stoi.get(token, self.unk_idx)

    def __len__(self):
        return len(self.itos)

# 使用方式
vocab = SimpleVocab(token_counts)
print([vocab[token] for token in ['this', 'is', 'an', 'example']])
print(len(vocab))

[11, 7, 35, 457]
69025


In [10]:
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = "cpu"

text_pipeline = lambda x: [vocab[token] for token in tokenizer(x)]
# label_pipeline = lambda x: 1. if x == 'pos' else 0.
label_pipeline = lambda x: 1. if x == 1 else 0.

## Step 3-B: wrap the encode and transformation function
def collate_batch(batch):
    label_list, text_list, lengths = [], [], []
    for _text, _label in batch:
        label_list.append(label_pipeline(_label))
        processed_text = torch.tensor(text_pipeline(_text),
                                      dtype=torch.int64)
        text_list.append(processed_text)
        lengths.append(processed_text.size(0))
    label_list = torch.tensor(label_list)
    lengths = torch.tensor(lengths)
    padded_text_list = nn.utils.rnn.pad_sequence(
        text_list, batch_first=True)
    return padded_text_list.to(device), label_list.to(device), lengths.to(device)

In [11]:
## Take a small batch
from torch.utils.data import DataLoader

dataloader = DataLoader(train_dataset, batch_size=4,
                        shuffle=False, collate_fn=collate_batch)
text_batch, label_batch, length_batch = next(iter(dataloader))

print(text_batch)
print(label_batch)
print(length_batch)
print(text_batch.shape)

tensor([[   35,  1739,     7,   449,   721,     6,   301,     4,   787,     9,
             4,    18,    44,     2,  1705,  2460,   186,    25,     7,    24,
           100,  1874,  1739,    25,     7, 34415,  3568,  1103,  7517,   787,
             5,     2,  4991, 12401,    36,     7,   148,   111,   939,     6,
         11598,     2,   172,   135,    62,    25,  3199,  1602,     3,   928,
          1500,     9,     6,  4601,     2,   155,    36,    14,   274,     4,
         42945,     9,  4991,     3,    14, 10296,    34,  3568,     8,    51,
           148,    30,     2,    58,    16,    11,  1893,   125,     6,   420,
          1214,    27, 14542,   940,    11,     7,    29,   951,    18,    17,
         15994,   459,    34,  2480, 15211,  3713,     2,   840,  3200,     9,
          3568,    13,   107,     9,   175,    94,    25,    51, 10297,  1796,
            27,   712,    16,     2,   220,    17,     4,    54,   722,   238,
           395,     2,   787,    32,    27,  5236,  

In [12]:
batch_size = 32

train_dl = DataLoader(train_dataset, batch_size=batch_size,
                      shuffle=True, collate_fn=collate_batch)
valid_dl = DataLoader(valid_dataset, batch_size=batch_size,
                      shuffle=False, collate_fn=collate_batch)
test_dl = DataLoader(test_dataset, batch_size=batch_size,
                     shuffle=False, collate_fn=collate_batch)

In [13]:
class RNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, rnn_hidden_size, fc_hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size,
                                      embed_dim,
                                      padding_idx=0)
        self.rnn = nn.LSTM(embed_dim, rnn_hidden_size,
                           batch_first=True)
        self.fc1 = nn.Linear(rnn_hidden_size, fc_hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(fc_hidden_size, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, text, lengths):
        out = self.embedding(text)
        out = nn.utils.rnn.pack_padded_sequence(out, lengths.cpu().numpy(),
                                                enforce_sorted=False, batch_first=True)
        out, (hidden, cell) = self.rnn(out)
        out = hidden[-1, :, :]
        out = self.fc1(out)
        out = self.relu(out)
        out = self.fc2(out)
        out = self.sigmoid(out)
        return out

vocab_size = len(vocab)
embed_dim = 20
rnn_hidden_size = 64
fc_hidden_size = 64

torch.manual_seed(1)
model = RNN(vocab_size, embed_dim, rnn_hidden_size, fc_hidden_size)
model = model.to(device)
print(model)

RNN(
  (embedding): Embedding(69025, 20, padding_idx=0)
  (rnn): LSTM(20, 64, batch_first=True)
  (fc1): Linear(in_features=64, out_features=64, bias=True)
  (relu): ReLU()
  (fc2): Linear(in_features=64, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)


In [14]:
loss_fn = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

def train(dataloader):
    model.train()
    total_acc, total_loss = 0, 0
    for text_batch, label_batch, lengths in dataloader:
        model.zero_grad()
        pred = model(text_batch, lengths)[:, 0]
        loss = loss_fn(pred, label_batch)
        loss.backward()
        optimizer.step()
        total_acc += ((pred>=0.5).float() == label_batch).float().sum().item()
        total_loss += loss.item()*label_batch.size(0)
    return total_acc/len(dataloader.dataset), total_loss/len(dataloader.dataset)

def evaluate(dataloader):
    model.eval()
    total_acc, total_loss = 0, 0
    with torch.no_grad():
        for text_batch, label_batch, lengths in dataloader:
            pred = model(text_batch, lengths)[:, 0]
            loss = loss_fn(pred, label_batch)
            total_acc += ((pred>=0.5).float() == label_batch).float().sum().item()
            total_loss += loss.item()*label_batch.size(0)
    return total_acc/len(dataloader.dataset), total_loss/len(dataloader.dataset)

In [15]:
num_epochs = 10

torch.manual_seed(1)

for epoch in range(num_epochs):
    acc_train, loss_train = train(train_dl)
    acc_valid, loss_valid = evaluate(valid_dl)
    print(f'Epoch {epoch} accuracy: {acc_train:.4f} val_accuracy: {acc_valid:.4f}')

Epoch 0 accuracy: 0.6096 val_accuracy: 0.6852
Epoch 1 accuracy: 0.7257 val_accuracy: 0.7452
Epoch 2 accuracy: 0.7466 val_accuracy: 0.6284
Epoch 3 accuracy: 0.7253 val_accuracy: 0.5366
Epoch 4 accuracy: 0.7972 val_accuracy: 0.7492
Epoch 5 accuracy: 0.8619 val_accuracy: 0.7784
Epoch 6 accuracy: 0.8911 val_accuracy: 0.8040
Epoch 7 accuracy: 0.9162 val_accuracy: 0.8574
Epoch 8 accuracy: 0.9328 val_accuracy: 0.8598
Epoch 9 accuracy: 0.9504 val_accuracy: 0.8634


In [16]:
acc_test, _ = evaluate(test_dl)
print(f'test_accuracy: {acc_test:.4f}')

test_accuracy: 0.8571
