In [48]:
import pandas as pd
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
import torchtext
import torch
from torch.utils.data import DataLoader
import numpy as np

In [49]:
# Use a CUDA device when PyTorch reports one as available; otherwise use the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

device(type='cuda')

In [50]:
# Read the training split from disk and display it.
train_df = pd.read_csv(filepath_or_buffer='../datasets/train.csv')
train_df

Unnamed: 0,class,text
0,3,Wall St. Bears Claw Back Into the Black (Reute...
1,3,Carlyle Looks Toward Commercial Aerospace (Reu...
2,3,Oil and Economy Cloud Stocks' Outlook (Reuters...
3,3,Iraq Halts Oil Exports from Main Southern Pipe...
4,3,"Oil prices soar to all-time record, posing new..."
...,...,...
119995,1,Pakistan's Musharraf Says Won't Quit as Army C...
119996,2,Renteria signing a top-shelf deal Red Sox gene...
119997,2,Saban not going to Dolphins yet The Miami Dolp...
119998,2,Today's NFL games PITTSBURGH at NY GIANTS Time...


In [51]:
# Read the test split from disk and display it.
test_df = pd.read_csv(filepath_or_buffer='../datasets/test.csv')
test_df

Unnamed: 0,class,text
0,3,Fears for T N pension after talks Unions repre...
1,4,The Race is On: Second Private Team Sets Launc...
2,4,Ky. Company Wins Grant to Study Peptides (AP) ...
3,4,Prediction Unit Helps Forecast Wildfires (AP) ...
4,4,Calif. Aims to Limit Farm-Related Smog (AP) AP...
...,...,...
7595,1,Around the world Ukrainian presidential candid...
7596,2,Void is filled with Clement With the supply of...
7597,2,Martinez leaves bitter Like Roger Clemens did ...
7598,3,5 of arthritis patients in Singapore take Bext...


In [52]:
# get_tokenizer takes the name of the tokenizer function. With None it returns
# split(), which splits the sentence on whitespace. With 'basic_english' it
# returns _basic_english_normalize(), which normalizes the string first and
# then splits it on whitespace.
tokenizer = get_tokenizer('basic_english')


def yield_tokens(data_iter):
    """Lazily tokenize the 'text' field of every row produced by `data_iter`.

    `data_iter` must yield 2-item pairs whose second item supports
    ``row['text']`` (e.g. the pairs produced by ``DataFrame.iterrows()``).
    Yields the result of calling the module-level `tokenizer` on each text.
    """
    for _row_label, row in data_iter:
        text = row['text']
        yield tokenizer(text)  # tokenization


# Build a Vocab from the tokenized training texts.
# The special tokens are passed via `specials` so that they are placed at the
# front of the vocabulary in one step ('<unk>' -> index 0, '<pad>' -> index 1).
# Unlike inserting them afterwards with insert_token, this does not raise when
# the corpus itself happens to contain a literal '<unk>' or '<pad>' token.
vocab = build_vocab_from_iterator(
    yield_tokens(train_df.iterrows()),
    specials=["<unk>", "<pad>"],
    special_first=True,
)

# Tokens that are not in the vocabulary are mapped to the index of '<unk>'.
vocab.set_default_index(vocab['<unk>'])

In [53]:
# Tokenize every training text and collect the token lists into a Series
# (one list of tokens per row).
train_split_sentence = pd.Series([tokens for tokens in yield_tokens(train_df.iterrows())])
train_split_sentence

0         [wall, st, ., bears, claw, back, into, the, bl...
1         [carlyle, looks, toward, commercial, aerospace...
2         [oil, and, economy, cloud, stocks, ', outlook,...
3         [iraq, halts, oil, exports, from, main, southe...
4         [oil, prices, soar, to, all-time, record, ,, p...
                                ...                        
119995    [pakistan, ', s, musharraf, says, won, ', t, q...
119996    [renteria, signing, a, top-shelf, deal, red, s...
119997    [saban, not, going, to, dolphins, yet, the, mi...
119998    [today, ', s, nfl, games, pittsburgh, at, ny, ...
119999    [nets, get, carter, from, raptors, indianapoli...
Length: 120000, dtype: object

In [54]:
def com_sentence_len(text):
    """Return the number of items in `text` (the token count for a token list)."""
    n_items = len(text)
    return n_items


# Token count of each tokenized training text.
train_sentence_len = train_split_sentence.map(com_sentence_len)
train_sentence_len

0         29
1         42
2         40
3         40
4         43
          ..
119995    47
119996    62
119997    47
119998    81
119999    40
Length: 120000, dtype: int64

In [55]:
# Most texts (99.9%) are at most 141 tokens long, so this value can be used as
# the maximum length when truncating texts.
np.percentile(train_sentence_len.to_numpy(), 99.9)

141.0

In [56]:
# Load the pre-trained word-vector file from the local cache directory.
vec1 = torchtext.vocab.Vectors("glove.6B.50d.txt", cache='../../extra/glove_vector/')

# Show the shape of the vector matrix, then the matrix itself.
print(vec1.vectors.shape, vec1.vectors, sep="\n")

torch.Size([400000, 50])
tensor([[ 0.4180,  0.2497, -0.4124,  ..., -0.1841, -0.1151, -0.7858],
        [ 0.0134,  0.2368, -0.1690,  ..., -0.5666,  0.0447,  0.3039],
        [ 0.1516,  0.3018, -0.1676,  ..., -0.3565,  0.0164,  0.1022],
        ...,
        [-0.5118,  0.0587,  1.0913,  ..., -0.2500, -1.1250,  1.5863],
        [-0.7590, -0.4743,  0.4737,  ...,  0.7895, -0.0141,  0.6448],
        [ 0.0726, -0.5139,  0.4728,  ..., -0.1891, -0.5902,  0.5556]])


In [57]:
# Pre-trained embedding matrix for the model: the vector of every vocabulary
# token, looked up in the order returned by vocab.get_itos().
pre_trained = vec1.get_vecs_by_tokens(tokens=vocab.get_itos())
pre_trained

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.1516,  0.3018, -0.1676,  ..., -0.3565,  0.0164,  0.1022],
        ...,
        [ 1.1296, -1.0693,  0.1338,  ...,  0.3478, -0.8490,  0.5595],
        [-0.1712, -0.2531,  0.6790,  ...,  0.5299,  0.1299,  0.5768],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]])

In [58]:
def text_pipeline(line):
    """Convert a raw text string into a list of vocabulary indices.

    The text is split with the module-level `tokenizer`, and the resulting
    tokens are mapped to indices with the module-level `vocab`.

    Args:
        line: the raw text to encode.

    Returns:
        A list with one vocabulary index per token, in token order.
    """
    # Look up all tokens in a single vocab call rather than one call per token.
    return vocab(tokenizer(line))


def truncate_pad(line, text_max_len, padding_token):
    """Truncate or right-pad a token sequence to `text_max_len` items.

    `line` is a list; a new list is returned and `line` is left unchanged.
    """
    missing = text_max_len - len(line)
    if missing < 0:
        # Too long: keep only the first `text_max_len` items.
        return line[:text_max_len]
    # Short enough: append `missing` copies of the padding token (possibly zero).
    return line + [padding_token] * missing


def label_pipeline(label):
    """Convert a class label to a zero-based integer class index."""
    # Subtract 1 so that the class labels start from 0.
    return int(label) - 1

In [59]:
def to_map_style_dataset(df):
    r"""Wrap a DataFrame in a map-style ``torch.utils.data.Dataset``.

    The returned dataset has one item per DataFrame row; indexing it returns
    that row of the DataFrame's underlying NumPy array.
    """

    class _MapStyleDataset(torch.utils.data.Dataset):
        """Map-style dataset backed by the DataFrame's values as a NumPy array."""

        def __init__(self, frame):
            # TODO Avoid list issue #1296
            self._data = frame.to_numpy()

        def __len__(self):
            # Number of rows in the backing array.
            return len(self._data)

        def __getitem__(self, idx):
            return self._data[idx]

    return _MapStyleDataset(df)


test_map_data = to_map_style_dataset(test_df)
# Sanity check on the first test sample only: print its label, its raw text,
# its token indices, and the indices truncated/padded to length 30.
for label, text in test_map_data:
    token_ids = text_pipeline(text)
    print(label, text, token_ids, truncate_pad(token_ids, 30, vocab['<pad>']), sep="\n")
    break

3
Fears for T N pension after talks Unions representing workers at Turner   Newall say they are 'disappointed' after talks with stricken parent firm Federal Mogul.
[870, 12, 84, 138, 1482, 35, 174, 1753, 4059, 401, 21, 6558, 38435, 234, 68, 43, 17, 4478, 17, 35, 174, 19, 11302, 2448, 321, 195, 9840, 2]
[870, 12, 84, 138, 1482, 35, 174, 1753, 4059, 401, 21, 6558, 38435, 234, 68, 43, 17, 4478, 17, 35, 174, 19, 11302, 2448, 321, 195, 9840, 2, 1, 1]


In [60]:
def collate_batch(batch, text_max_len=141):
    """Collate (label, text) samples into a pair of int64 tensors on `device`.

    Args:
        batch: iterable of ``(label, text)`` pairs, as handed to ``collate_fn``
            by a ``DataLoader``.
        text_max_len: length every encoded text is truncated or padded to.
            Defaults to 141, the value used throughout this notebook.

    Returns:
        ``(labels, texts)``: ``labels`` holds one zero-based class index per
        sample; ``texts`` has one row of ``text_max_len`` vocabulary indices
        per sample. Both are moved to the module-level `device`.
    """
    # NOTE(review): tensors are moved to `device` inside the collate function;
    # this is presumably only safe with num_workers=0 when device is CUDA — confirm
    # before enabling DataLoader worker processes.
    pad_index = vocab['<pad>']  # loop-invariant, so looked up once per batch
    label_list = []  # class labels
    text_list = []
    for (_label, _text) in batch:
        label_list.append(label_pipeline(_label))
        token_ids = truncate_pad(text_pipeline(_text), text_max_len, pad_index)
        text_list.append(torch.tensor(token_ids, dtype=torch.int64))
    label_tensor = torch.tensor(label_list, dtype=torch.int64)
    text_tensor = torch.stack(text_list)
    return label_tensor.to(device), text_tensor.to(device)


# Batch the test set in its original order; collate_batch builds the tensors.
test_dataloader = DataLoader(dataset=test_map_data, batch_size=16, shuffle=False, collate_fn=collate_batch)
# Inspect the first batch only: labels, shape of the token-index tensor, then the tensor.
for batch_labels, batch_token_ids in test_dataloader:
    print(batch_labels, batch_token_ids.shape, batch_token_ids, sep="\n")
    break

tensor([2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3], device='cuda:0')
torch.Size([16, 141])
tensor([[  870,    12,    84,  ...,     1,     1,     1],
        [    3,   494,    22,  ...,     1,     1,     1],
        [10971,     2,    55,  ...,     1,     1,     1],
        ...,
        [ 2169, 27755,  7961,  ...,     1,     1,     1],
        [88099,  7745, 14368,  ...,     1,     1,     1],
        [ 5078,    84,     2,  ...,     1,     1,     1]], device='cuda:0')
