In [1]:
import pandas as pd
import re
import collections
import torch
from torch.utils.data import DataLoader
import numpy as np
import jsonlines
from tqdm.notebook import tqdm

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [3]:
train_lst = []

with open('train.jsonl', encoding='utf-8') as fp:
    for item in jsonlines.Reader(fp):
        train_lst.append(item)
train_df = pd.DataFrame(train_lst)
train_df

Unnamed: 0,text,label,label_text
0,Wall St. Bears Claw Back Into the Black (Reute...,2,Business
1,Carlyle Looks Toward Commercial Aerospace (Reu...,2,Business
2,Oil and Economy Cloud Stocks' Outlook (Reuters...,2,Business
3,Iraq Halts Oil Exports from Main Southern Pipe...,2,Business
4,"Oil prices soar to all-time record, posing new...",2,Business
...,...,...,...
119995,Pakistan's Musharraf Says Won't Quit as Army C...,0,World
119996,Renteria signing a top-shelf deal Red Sox gene...,1,Sports
119997,Saban not going to Dolphins yet The Miami Dolp...,1,Sports
119998,Today's NFL games PITTSBURGH at NY GIANTS Time...,1,Sports


In [4]:
test_lst = []

with open('test.jsonl', encoding='utf-8') as fp:
    for item in jsonlines.Reader(fp):
        test_lst.append(item)
test_df = pd.DataFrame(test_lst)
test_df

Unnamed: 0,text,label,label_text
0,Fears for T N pension after talks Unions repre...,2,Business
1,The Race is On: Second Private Team Sets Launc...,3,Sci/Tech
2,Ky. Company Wins Grant to Study Peptides (AP) ...,3,Sci/Tech
3,Prediction Unit Helps Forecast Wildfires (AP) ...,3,Sci/Tech
4,Calif. Aims to Limit Farm-Related Smog (AP) AP...,3,Sci/Tech
...,...,...,...
7595,Around the world Ukrainian presidential candid...,0,World
7596,Void is filled with Clement With the supply of...,1,Sports
7597,Martinez leaves bitter Like Roger Clemens did ...,1,Sports
7598,5 of arthritis patients in Singapore take Bext...,2,Business


In [5]:
def tokenizer(line):
    """基础英文分词器"""
    _patterns = [r"\'", r"\"", r"\.", r"<br \/>", r",", r"\(", r"\)", r"\!", r"\?", r"\;", r"\:", r"\s+"]
    _replacements = [" '  ", "", " . ", " ", " , ", " ( ", " ) ", " ! ", " ? ", " ", " ", " "]
    _patterns_dict = list((re.compile(p), r) for p, r in zip(_patterns, _replacements))
    line = line.lower()
    for pattern_re, replaced_str in _patterns_dict:
        line = pattern_re.sub(replaced_str, line)
    return line.split()


class Vocab:
    """
    Vocabulary for text
    """

    def __init__(self, tokens=None, min_freq=2, reserved_tokens=None):
        # tokens: 单词tokens
        # min_freq: The minimum frequency needed to include a token in the vocabulary.
        # reserved_tokens: 自定义tokens
        if tokens is None:
            tokens = []
        if reserved_tokens is None:
            reserved_tokens = []
        counter = collections.Counter(tokens)
        # Sort according to frequencies
        self._token_freqs = sorted(counter.items(), key=lambda x: x[1], reverse=True)
        # The index for the unknown token is 0
        self.idx_to_token = ['<unk>'] + reserved_tokens
        self.token_to_idx = {token: idx for idx, token in enumerate(self.idx_to_token)}
        for token, freq in self._token_freqs:
            if freq < min_freq:
                break
            if token not in self.token_to_idx:
                self.idx_to_token.append(token)
                self.token_to_idx[token] = len(self.idx_to_token) - 1

    def __len__(self):
        return len(self.idx_to_token)

    def __getitem__(self, tokens):
        if not isinstance(tokens, (list, tuple)):
            return self.token_to_idx.get(tokens, self.unk)  # 未在字典中则返回'<unk>'
        return [self.__getitem__(token) for token in tokens]  # 递归

    def to_tokens(self, indices):
        """第indices位置处的token"""
        if not isinstance(indices, (list, tuple)):
            return self.idx_to_token[indices]
        return [self.idx_to_token[index] for index in indices]

    @property
    def unk(self):
        """Index for the unknown token"""
        return 0

    @property
    def token_freqs(self):
        return self._token_freqs


split_list = []
for _, ser in train_df.iterrows():
    split_list.extend(tokenizer(ser['text']))

vocab = Vocab(split_list, min_freq=1, reserved_tokens=['<pad>'])

In [6]:
train_split_sentence = train_df['text'].map(lambda x: tokenizer(x))
train_split_sentence

0         [wall, st, ., bears, claw, back, into, the, bl...
1         [carlyle, looks, toward, commercial, aerospace...
2         [oil, and, economy, cloud, stocks, ', outlook,...
3         [iraq, halts, oil, exports, from, main, southe...
4         [oil, prices, soar, to, all-time, record, ,, p...
                                ...                        
119995    [pakistan, ', s, musharraf, says, won, ', t, q...
119996    [renteria, signing, a, top-shelf, deal, red, s...
119997    [saban, not, going, to, dolphins, yet, the, mi...
119998    [today, ', s, nfl, games, pittsburgh, at, ny, ...
119999    [nets, get, carter, from, raptors, indianapoli...
Name: text, Length: 120000, dtype: object

In [7]:
def com_sentence_len(text):
    return len(text)


train_sentence_len = train_split_sentence.apply(com_sentence_len)
train_sentence_len

0         29
1         42
2         40
3         40
4         43
          ..
119995    47
119996    62
119997    47
119998    81
119999    40
Name: text, Length: 120000, dtype: int64

In [8]:
# 文本长度大部分(99.9%)141以内(可以以此进行文本最大长度截断)
np.percentile(train_sentence_len.values, q=99.9)

141.0

In [9]:
class Vectors:
    def __init__(self, name, max_vectors=None) -> None:
        self.vectors = None
        self.name = name
        self.max_vectors = max_vectors
        self.itos = None
        self.stoi = None
        self.cache()

    def cache(self):
        with open(self.name, "r", encoding='utf-8') as f:
            read_value = f.readlines()

        all_value, itos = [], []
        for i in tqdm(range(len(read_value))):
            l_split = read_value[i].split(' ')
            itos.append(l_split[0])
            all_value.append([float(i.strip()) for i in l_split[1: ]])
        all_value = torch.tensor(all_value)
        self.vectors = all_value
        self.itos = itos
        num_lines = len(self.vectors)
        if not self.max_vectors or self.max_vectors > num_lines:
            self.max_vectors = num_lines
        self.vectors = self.vectors[:self.max_vectors, :]
        self.itos = self.itos[:self.max_vectors]
        self.stoi = {word: i for i, word in enumerate(self.itos)}

    def __len__(self):
        return len(self.vectors)
    
    def __getitem__(self, token):
        if token in self.stoi:
            return self.vectors[self.stoi[token]]
        else:
            dim = self.vectors.shape[1]
            return torch.Tensor.zero_(torch.Tensor(dim))
        
    def get_vecs_by_tokens(self, tokens):
        indices = [self[token] for token in tokens]
        vecs = torch.stack(indices)
        return vecs

# 预训练词向量
vec1 = Vectors(name="glove.6B.50d.txt")
print(vec1.vectors.shape)
print(vec1.vectors)

  0%|          | 0/400000 [00:00<?, ?it/s]

torch.Size([400000, 50])
tensor([[ 0.4180,  0.2497, -0.4124,  ..., -0.1841, -0.1151, -0.7858],
        [ 0.0134,  0.2368, -0.1690,  ..., -0.5666,  0.0447,  0.3039],
        [ 0.1516,  0.3018, -0.1676,  ..., -0.3565,  0.0164,  0.1022],
        ...,
        [-0.5118,  0.0587,  1.0913,  ..., -0.2500, -1.1250,  1.5863],
        [-0.7590, -0.4743,  0.4737,  ...,  0.7895, -0.0141,  0.6448],
        [ 0.0726, -0.5139,  0.4728,  ..., -0.1891, -0.5902,  0.5556]])


In [10]:
# 模型预训练词向量矩阵
pre_trained = vec1.get_vecs_by_tokens(vocab.idx_to_token)
pre_trained

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.1516,  0.3018, -0.1676,  ..., -0.3565,  0.0164,  0.1022],
        ...,
        [ 0.6241,  0.0100, -0.8427,  ...,  2.2487, -0.5393,  0.3112],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]])

In [11]:
text_pipeline = lambda line: [vocab[token] for token in tokenizer(line)]

def truncate_pad(line, text_max_len, padding_token):
    """截断或填充文本序列"""
    if len(line) > text_max_len:
        return line[:text_max_len]  # 句子截断
    return line + [padding_token] * (text_max_len - len(line))  # 句子填充

In [12]:
def to_map_style_dataset(df):
    r"""Convert DataFrame to map-style dataset.
    """

    class _MapStyleDataset(torch.utils.data.Dataset):

        def __init__(self, df):
            self._data = df.values

        def __len__(self):
            return self._data.shape[0]

        def __getitem__(self, idx):
            return self._data[idx]

    return _MapStyleDataset(df)


test_map_data = to_map_style_dataset(test_df)
for text, label, _ in test_map_data:
    print(label)
    print(text)
    print(text_pipeline(text))
    print(truncate_pad(text_pipeline(text), 30, vocab['<pad>']))
    break

2
Fears for T N pension after talks Unions representing workers at Turner   Newall say they are 'disappointed' after talks with stricken parent firm Federal Mogul.
[870, 12, 84, 138, 1483, 35, 174, 1752, 4058, 401, 21, 6557, 36027, 234, 68, 43, 17, 4486, 17, 35, 174, 19, 11151, 2452, 321, 195, 9804, 2]
[870, 12, 84, 138, 1483, 35, 174, 1752, 4058, 401, 21, 6557, 36027, 234, 68, 43, 17, 4486, 17, 35, 174, 19, 11151, 2452, 321, 195, 9804, 2, 1, 1]


In [13]:
def collate_batch(batch):
    label_list = []  # 分类标签
    text_list = []
    for _text, _label, _ in batch:
        print(_label, _text)
        label_list.append(_label)
        processed_text = torch.tensor(truncate_pad(text_pipeline(_text), 141, vocab['<pad>']), dtype=torch.int64)
        text_list.append(processed_text)
    label_list = torch.tensor(label_list, dtype=torch.int64)
    text_list = torch.stack(text_list)
    return label_list.to(device), text_list.to(device)


test_dataloader = DataLoader(test_map_data, batch_size=16, shuffle=False, collate_fn=collate_batch)
for i in test_dataloader:
    print(i[0])
    print(i[1].shape)
    print(i[1])
    break

2 Fears for T N pension after talks Unions representing workers at Turner   Newall say they are 'disappointed' after talks with stricken parent firm Federal Mogul.
3 The Race is On: Second Private Team Sets Launch Date for Human Spaceflight (SPACE.com) SPACE.com - TORONTO, Canada -- A second\team of rocketeers competing for the  #36;10 million Ansari X Prize, a contest for\privately funded suborbital space flight, has officially announced the first\launch date for its manned rocket.
3 Ky. Company Wins Grant to Study Peptides (AP) AP - A company founded by a chemistry researcher at the University of Louisville won a grant to develop a method of producing better peptides, which are short chains of amino acids, the building blocks of proteins.
3 Prediction Unit Helps Forecast Wildfires (AP) AP - It's barely dawn when Mike Fitzpatrick starts his shift with a blur of colorful maps, figures and endless charts, but already he knows what the day will bring. Lightning will strike in places he e