In [266]:
import torch
from torchtext.vocab import build_vocab_from_iterator
from torchtext.datasets import AG_NEWS
from torchtext.data.utils import get_tokenizer
from torch.utils.data import DataLoader
from torch import nn
import time
import torchtext
import pandas as pd
from torch.utils.data.dataset import random_split

In [267]:
# Load the AG_NEWS training split into a DataFrame to eyeball the raw rows;
# the two columns are named 'class' and 'text'.
train_iter = AG_NEWS(split='train')
df = pd.DataFrame(data=list(train_iter), columns=['class', 'text'])
df

Unnamed: 0,class,text
0,3,Wall St. Bears Claw Back Into the Black (Reute...
1,3,Carlyle Looks Toward Commercial Aerospace (Reu...
2,3,Oil and Economy Cloud Stocks' Outlook (Reuters...
3,3,Iraq Halts Oil Exports from Main Southern Pipe...
4,3,"Oil prices soar to all-time record, posing new..."
...,...,...
119995,1,Pakistan's Musharraf Says Won't Quit as Army C...
119996,2,Renteria signing a top-shelf deal Red Sox gene...
119997,2,Saban not going to Dolphins yet The Miami Dolp...
119998,2,Today's NFL games PITTSBURGH at NY GIANTS Time...


In [268]:
# Take the first column of df (the class labels) and list its distinct values.
df_class = df[df.columns[0]]
df_class.unique()  # four distinct classes

array([3, 4, 2, 1], dtype=int64)

In [269]:
# Handle on the training split that the vocabulary is built from.
train_iter = AG_NEWS(split='train')

# The argument names the tokenizer function to use:
#   * None            -> plain split(), which splits the sentence on spaces;
#   * 'basic_english' -> _basic_english_normalize(), which normalizes the
#                        string first and then splits on spaces.
tokenizer = get_tokenizer('basic_english')


def yield_tokens(data_iter):
    """Lazily yield the token list of each text in ``data_iter``.

    Args:
        data_iter: iterable of two-element samples; the first element is
            ignored and the second is the text to tokenize.

    Yields:
        The result of the module-level ``tokenizer`` applied to each text.
    """
    for sample in data_iter:
        _, text = sample
        yield tokenizer(text)  # tokenize


# Build a Vocab from the tokenized training texts, then place the two special
# tokens at indices 0 and 1.
vocab = build_vocab_from_iterator(iterator=yield_tokens(train_iter))
vocab.insert_token(token="<unk>", index=0)
vocab.insert_token(token="<pad>", index=1)
# Tokens missing from the vocabulary resolve to index 0, i.e. "<unk>".
vocab.set_default_index(index=0)

In [270]:
# Pre-trained word vectors, read from the local file glove.6B.50d.txt.
# The cache directory is given relative to the working directory (this is also
# torchtext's default cache location) instead of an absolute path inside one
# user's home directory, so the notebook can run on any machine that has the
# vectors file in ./.vector_cache.
# NOTE(review): assumes the notebook is started from its own directory, where
# the original absolute path pointed — confirm before running elsewhere.
vec1 = torchtext.vocab.Vectors(name="glove.6B.50d.txt", cache=".vector_cache")

print(vec1.vectors.shape)
print(vec1.vectors)

torch.Size([400000, 50])
tensor([[ 0.4180,  0.2497, -0.4124,  ..., -0.1841, -0.1151, -0.7858],
        [ 0.0134,  0.2368, -0.1690,  ..., -0.5666,  0.0447,  0.3039],
        [ 0.1516,  0.3018, -0.1676,  ..., -0.3565,  0.0164,  0.1022],
        ...,
        [-0.5118,  0.0587,  1.0913,  ..., -0.2500, -1.1250,  1.5863],
        [-0.7590, -0.4743,  0.4737,  ...,  0.7895, -0.0141,  0.6448],
        [ 0.0726, -0.5139,  0.4728,  ..., -0.1891, -0.5902,  0.5556]])


In [271]:
# Look up one pre-trained vector per vocabulary entry, in vocabulary index
# order, so that row i of pre_trained belongs to the token with index i.
pre_trained = vec1.get_vecs_by_tokens(tokens=vocab.get_itos())
pre_trained

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.1516,  0.3018, -0.1676,  ..., -0.3565,  0.0164,  0.1022],
        ...,
        [ 1.1296, -1.0693,  0.1338,  ...,  0.3478, -0.8490,  0.5595],
        [-0.1712, -0.2531,  0.6790,  ...,  0.5299,  0.1299,  0.5768],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]])

In [272]:
def text_pipeline(x):
    """Tokenize ``x`` and map every token to its vocabulary index.

    Args:
        x: raw text string.

    Returns:
        List of integer vocabulary indices, one per token, in token order.
    """
    # A single batched lookup replaces one vocab() call per token; the
    # resulting indices and their order are the same.
    return vocab(tokenizer(x))


def label_pipeline(x):
    """Convert a 1-based class label into a 0-based integer index.

    Args:
        x: label accepted by ``int()`` (e.g. ``3`` or ``"3"``).

    Returns:
        ``int(x) - 1``, so that class labels start from 0.
    """
    return int(x) - 1

In [273]:
# The vocabulary block converts a list of tokens into integers.
text_pipeline('here is the an example')

[476, 22, 3, 31, 5298]

In [274]:
# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

device(type='cuda')

In [275]:
def collate_batch(batch):
    """Collate ``(label, text)`` samples into flat tensors for an EmbeddingBag.

    Each text is turned into token indices via ``text_pipeline``; all texts of
    the batch are concatenated into one 1-D tensor, and ``offsets`` records the
    position at which each sample starts inside that tensor.

    Args:
        batch: iterable of ``(label, text)`` pairs.

    Returns:
        Tuple ``(labels, tokens, offsets)``, each moved to the module-level
        ``device``.
    """
    class_ids = []  # class labels
    token_chunks = []
    chunk_sizes = []
    for raw_label, raw_text in batch:
        class_ids.append(label_pipeline(raw_label))
        chunk = torch.tensor(text_pipeline(raw_text), dtype=torch.int64)
        token_chunks.append(chunk)
        chunk_sizes.append(chunk.size(0))

    labels = torch.tensor(class_ids, dtype=torch.int64)
    # Exclusive running sum of the lengths: sample i starts where samples
    # 0..i-1 end, so the last length is dropped and a leading 0 is added.
    starts = torch.tensor(([0] + chunk_sizes)[:-1]).cumsum(dim=0)
    tokens = torch.cat(token_chunks)
    return labels.to(device), tokens.to(device), starts.to(device)


# Small, unshuffled loader over the training split, batched with collate_batch.
train_iter = AG_NEWS(split='train')
dataloader = DataLoader(dataset=train_iter, collate_fn=collate_batch, batch_size=8, shuffle=False)

In [276]:
# Functional test of collate_batch: print the three tensors of the first batch
# (labels, flat token ids, offsets) and stop.
for labels, token_ids, bag_offsets in dataloader:
    print(labels, token_ids, bag_offsets, sep="\n")
    break

tensor([2, 2, 2, 2, 2, 2, 2, 2], device='cuda:0')
tensor([  432,   426,     2,  1606, 14839,   114,    67,     3,   849,    14,
           28,    15,    28,    16, 50726,     4,   432,   375,    17,    10,
        67508,     7, 52259,     4,    43,  4010,   784,   326,     2, 15875,
         1073,   855,  1311,  4251,    14,    28,    15,    28,    16,   930,
          798,   321, 15875,    99,     4, 27658,    29,     6,  4460,    12,
          565, 52791,     9, 80618,  2126,     8,     3,   526,   242,     4,
           29,  3891, 82815,  6575,    11,   207,   360,     7,     3,   127,
            2,    59,     9,   348,  4583,   152,    17,   739,    14,    28,
           15,    28,    16,  2385,   453,    93,  2060, 27361,     3,   348,
            9,     3,   739,    12,   272,    43,   241, 51954,    39,     3,
          295,   127,   113,    86,   221,     3,  7857,     7, 40067, 15381,
            2,    71,  7377,    59,  1811,    30,   906,   538,  2847,    14,
           28,

In [277]:
def to_map_style_dataset(iter_data):
    """Wrap an iterable-style dataset so it supports ``len()`` and indexing.

    Args:
        iter_data: any iterable of samples; it is consumed once, eagerly.

    Returns:
        A ``torch.utils.data.Dataset`` holding all samples in a list.
    """

    class _MapStyleDataset(torch.utils.data.Dataset):
        """In-memory dataset backed by a list of the source's samples."""

        def __init__(self, source):
            # TODO: avoid materializing the full list (issue #1296).
            self._data = [*source]

        def __getitem__(self, idx):
            return self._data[idx]

        def __len__(self):
            return len(self._data)

    return _MapStyleDataset(iter_data)

In [278]:
BATCH_SIZE = 64  # batch size for training
SPLIT_SEED = 42  # fixes the train/validation partition so reruns use the same split
total_accu = None  # validation accuracy to beat; None until the first epoch finishes
train_iter, test_iter = AG_NEWS()
train_dataset = to_map_style_dataset(train_iter)
test_dataset = to_map_style_dataset(test_iter)

# Hold out 5% of the training data for validation. A dedicated, seeded
# generator keeps the partition identical across kernel restarts.
num_train = int(len(train_dataset) * 0.95)
split_train_, split_valid_ = random_split(
    train_dataset,
    [num_train, len(train_dataset) - num_train],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

train_dataloader = DataLoader(split_train_, batch_size=BATCH_SIZE,
                              shuffle=True, collate_fn=collate_batch)  # training set
# Accuracy does not depend on sample order, so the evaluation loaders are not
# shuffled; this keeps their batches deterministic.
valid_dataloader = DataLoader(split_valid_, batch_size=BATCH_SIZE,
                              shuffle=False, collate_fn=collate_batch)  # validation set
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE,
                             shuffle=False, collate_fn=collate_batch)  # test set

In [279]:
class TextClassificationModel(nn.Module):
    """Bag-of-embeddings text classifier on top of pre-trained word vectors.

    An ``nn.EmbeddingBag`` built from the given weight matrix pools the token
    embeddings of each text into one vector (with PyTorch's ``from_pretrained``
    defaults), and a single linear layer maps that vector to class scores.

    Args:
        pre_trained_embed: 2-D float tensor of embedding weights; its second
            dimension is the embedding size.
        num_class: number of output classes.
    """

    def __init__(self, pre_trained_embed, num_class):
        super().__init__()
        embed_dim = pre_trained_embed.shape[1]
        self.embedding = nn.EmbeddingBag.from_pretrained(pre_trained_embed)
        self.fc = nn.Linear(embed_dim, num_class)
        self.init_weights()

    def init_weights(self):
        """Re-initialise the linear layer: weights uniform in [-0.5, 0.5], bias zero."""
        bound = 0.5
        nn.init.uniform_(self.fc.weight, -bound, bound)
        nn.init.zeros_(self.fc.bias)

    def forward(self, text, offsets):
        """Return class scores for a batch of concatenated texts.

        Args:
            text: 1-D tensor of token indices for all texts, concatenated.
            offsets: 1-D tensor with the start position of each text in ``text``.

        Returns:
            Tensor with one row of ``num_class`` scores per text.
        """
        pooled = self.embedding(text, offsets)
        scores = self.fc(pooled)
        return scores

In [280]:
# Count the distinct labels in the training split to size the output layer,
# then build the classifier on the selected device.
train_iter = AG_NEWS(split='train')
num_class = len({label for label, _ in train_iter})
model = TextClassificationModel(pre_trained, num_class).to(device)

In [281]:
# Freeze the embedding layer: every parameter whose name contains "embedding"
# is excluded from gradient computation.
for param_name, parameter in model.named_parameters():
    if "embedding" not in param_name:
        continue
    parameter.requires_grad = False

In [282]:
# Hyperparameters
EPOCHS = 20  # number of training epochs
LR = 5  # initial learning rate

criterion = torch.nn.CrossEntropyLoss()
# Only parameters that still require gradients are handed to the optimizer.
optimizer = torch.optim.SGD([p for p in model.parameters() if p.requires_grad], lr=LR)
# With a step size of 1, each scheduler.step() scales the learning rate by gamma.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1.0, gamma=0.1)

In [283]:
def train(dataloader):
    """Run one training pass over ``dataloader`` and log running accuracy.

    Uses the module-level ``model``, ``optimizer`` and ``criterion``. The log
    line also reads the module-level ``epoch``, so this is meant to be called
    from inside the epoch loop.

    Args:
        dataloader: yields ``(label, text, offsets)`` batches.
    """
    model.train()
    log_interval = 500
    correct = seen = 0

    for idx, (label, text, offsets) in enumerate(dataloader):
        optimizer.zero_grad()
        predicted_label = model(text, offsets)
        criterion(predicted_label, label).backward()
        # Clip the gradient norm to 0.1 before the update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
        optimizer.step()

        correct += (predicted_label.argmax(1) == label).sum().item()
        seen += label.size(0)

        # Report every `log_interval` batches (never on the very first one),
        # then restart the running counters.
        if idx == 0 or idx % log_interval != 0:
            continue
        print('| epoch {:3d} | {:5d}/{:5d} batches '
              '| accuracy {:8.3f}'.format(epoch, idx, len(dataloader),
                                          correct / seen))
        correct = seen = 0


def evaluate(dataloader):
    """Return the accuracy of the module-level ``model`` over ``dataloader``.

    Args:
        dataloader: yields ``(label, text, offsets)`` batches.

    Returns:
        Fraction of samples whose highest-scoring class equals the label.
    """
    model.eval()
    correct, seen = 0, 0

    with torch.no_grad():
        for label, text, offsets in dataloader:
            winners = model(text, offsets).argmax(1)
            correct += (winners == label).sum().item()
            seen += label.size(0)
    return correct / seen

In [284]:
# Train for EPOCHS epochs, validating after each one.
for epoch in range(1, EPOCHS + 1):
    epoch_start_time = time.time()
    train(train_dataloader)
    accu_val = evaluate(valid_dataloader)
    # Remember the validation accuracy while it does not drop; when it falls
    # below the remembered value, decay the learning rate instead.
    if total_accu is None or total_accu <= accu_val:
        total_accu = accu_val
    else:
        scheduler.step()
    print('-' * 59)
    print('| end of epoch {:3d} | time: {:5.2f}s | '
          'valid accuracy {:8.3f} '.format(epoch,
                                           time.time() - epoch_start_time,
                                           accu_val))
    print('-' * 59)

| epoch   1 |   500/ 1782 batches | accuracy    0.824
| epoch   1 |  1000/ 1782 batches | accuracy    0.856
| epoch   1 |  1500/ 1782 batches | accuracy    0.863
-----------------------------------------------------------
| end of epoch   1 | time: 16.79s | valid accuracy    0.852 
-----------------------------------------------------------
| epoch   2 |   500/ 1782 batches | accuracy    0.862
| epoch   2 |  1000/ 1782 batches | accuracy    0.861
| epoch   2 |  1500/ 1782 batches | accuracy    0.859
-----------------------------------------------------------
| end of epoch   2 | time: 16.68s | valid accuracy    0.878 
-----------------------------------------------------------
| epoch   3 |   500/ 1782 batches | accuracy    0.863
| epoch   3 |  1000/ 1782 batches | accuracy    0.861
| epoch   3 |  1500/ 1782 batches | accuracy    0.862
-----------------------------------------------------------
| end of epoch   3 | time: 16.83s | valid accuracy    0.847 
-------------------------------

In [285]:
# Final check: accuracy of the trained model on the test loader.
print('Checking the results of test dataset.')
accu_test = evaluate(dataloader=test_dataloader)
print('test accuracy {:8.3f}'.format(accu_test))

Checking the results of test dataset.
test accuracy    0.876


In [286]:
# Display names for the 1-based class labels.
ag_news_label = dict(enumerate(("World", "Sports", "Business", "Sci/Tec"), start=1))


def predict(text, text_pipeline):
    """Classify a single text with the module-level ``model``.

    Args:
        text: raw text string.
        text_pipeline: callable that turns ``text`` into a list of integer
            token indices.

    Returns:
        The predicted class as a 1-based label (highest-scoring index + 1).
    """
    # Build the inputs on whatever device the model currently lives on, so the
    # call works whether the model sits on the CPU or on a GPU.
    model_device = next(model.parameters()).device
    with torch.no_grad():
        # An explicit int64 dtype (as in collate_batch) keeps an empty token
        # list a valid index tensor; without it torch.tensor([]) would be
        # float and be rejected by the embedding layer.
        token_ids = torch.tensor(text_pipeline(text), dtype=torch.int64,
                                 device=model_device)
        # A single text forms a single bag that starts at position 0.
        offsets = torch.tensor([0], dtype=torch.int64, device=model_device)
        output = model(token_ids, offsets)
        return output.argmax(1).item() + 1


# Sample news paragraph to classify. The backslash line continuations sit
# inside the string literal, so the leading spaces of each continued line are
# part of the text.
ex_text_str = "MEMPHIS, Tenn. – Four days ago, Jon Rahm was \
    enduring the season’s worst weather conditions on Sunday at The \
    Open on his way to a closing 75 at Royal Portrush, which \
    considering the wind and the rain was a respectable showing. \
    Thursday’s first round at the WGC-FedEx St. Jude Invitational \
    was another story. With temperatures in the mid-80s and hardly any \
    wind, the Spaniard was 13 strokes better in a flawless round. \
    Thanks to his best putting performance on the PGA Tour, Rahm \
    finished with an 8-under 62 for a three-stroke lead, which \
    was even more impressive considering he’d never played the \
    front nine at TPC Southwind."

# Run the demo prediction with the model on the CPU.
model = model.to("cpu")

# Map the predicted 1-based label to its display name and print it.
print("This is a %s news" % ag_news_label[predict(ex_text_str, text_pipeline)])

This is a Sports news
