# 获取Embedding

In [1]:
import bz2
import random
from tqdm import tqdm
from icecream import ic
import torch

In [2]:
WORD_EMBEDDING_FILE = '../../../pt/sgns_weibo/sgns.weibo.word.bz2'
token2embedding = {}

In [3]:
with bz2.open(WORD_EMBEDDING_FILE) as f:
    token_vectors = f.readlines()
    vob_size, dim = token_vectors[0].split()

print('load embedding file: {} end!'.format(WORD_EMBEDDING_FILE))

load embedding file: ../../../pt/sgns_weibo/sgns.weibo.word.bz2 end!


In [4]:
def get_embedding(vocabulary: set):
    for line in token_vectors[1:]:
        tokens = line.split()
        token = tokens[0].decode('utf-8')
        if token in vocabulary:
            token2embedding[token] = list(map(float, tokens[1:]))
            assert len(token2embedding[token]) == int(dim)
            
    UNK, PAD, BOS, EOS = '<unk> <pad> <bos> <eos>'.split()
    special_token_num = 4
    token2id = {token: _id for _id, token in enumerate(token2embedding.keys(), special_token_num)}
    
    token2id[PAD] = 0
    token2id[UNK] = 1
    token2id[BOS] = 2
    token2id[EOS] = 3
    
    id2vec = {token2id[token]: embedding for token, embedding in token2embedding.items()}
    id2vec[0] = [0.] * int(dim)
    id2vec[1] = [0.] * int(dim)
    id2vec[2] = [random.uniform(-1, 1)] * int(dim)
    id2vec[3] = [random.uniform(-1, 1)] * int(dim)
    
    embedding = [id2vec[_id] for _id in range(len(id2vec))]
    
    return torch.tensor(embedding, dtype=torch.float), token2id, len(vocabulary) + 4

# 定义TextCNN

In [5]:
import torch
from torch import nn
import torch.nn.functional as F
import jieba

In [20]:
class TextCNN(nn.Module):
    def __init__(self, word_embedding, each_filter_num, filter_heights, drop_out, num_classes):
        super(TextCNN, self).__init__()
        self.embedding = nn.Embedding.from_pretrained(word_embedding, freeze=True)
        self.convs = nn.ModuleList([
            nn.Conv2d(in_channels = 1, out_channels=each_filter_num, 
                     kernel_size=(h, word_embedding.shape[1]))
            for h in filter_heights
        ])
        
        self.dropout = nn.Dropout(drop_out)
        self.fc = nn.Linear(each_filter_num * len(filter_heights), num_classes)
        
    def conv_and_pool(self, x, conv):
        x = F.relu(conv(x)).squeeze(3)
        x = F.max_pool1d(x, x.size(2)).squeeze(2)
        
        return x
    
    def forward(self, input_ids=None):
        word_embeddings = self.embedding(input_ids)
        sentence_embedding = word_embeddings.unsqueeze(1)
        
        out = torch.cat([self.conv_and_pool(sentence_embedding, conv) for conv in self.convs], 1)
        out = self.dropout(out)
        out = self.fc(out)

        outputs = (out, )

        return outputs

In [23]:
some_text_sentence = '今天股市大跌，明天不知啥情况'
words = list(jieba.cut(some_text_sentence))
embedding, token2id, _ = get_embedding(set(words))

In [24]:
text_cnn_model = TextCNN(embedding, each_filter_num=128, filter_heights=[2, 3, 5], drop_out=0.3,
                         num_classes=15)
ids =[token2id[w] for w in words]

out = text_cnn_model(torch.tensor([ids]))
print(out)

(tensor([[-0.0667, -0.2550, -0.1036,  0.0592, -0.1217, -0.0764,  0.0159,  0.0916,
         -0.2430,  0.1574, -0.0650, -0.0265, -0.0346,  0.0223, -0.0917]],
       grad_fn=<AddmmBackward0>),)


# 定义data_loader

In [28]:
import numpy as np
import pandas as pd
import jieba
from collections import defaultdict
import torch
from operator import add
from functools import reduce
from collections import Counter
from torch.utils.data import DataLoader
from icecream import ic

In [29]:
def add_with_print(all_corpus):
    add_with_print.i = 0

    def _wrap(a, b):
        add_with_print.i += 1
        if add_with_print.i % 2000 == 0:
            print('{}/{}'.format(add_with_print.i, len(all_corpus)), end=' ')
        if add_with_print.i % 10000 == 0:
            print()
        return a + b

    return _wrap



In [42]:
def get_all_vocabulary(train_file_path, vocab_size):
    CUT, SENTENCE = 'cut', 'sentence'

    corpus = pd.read_csv(train_file_path)
    corpus[CUT] = corpus[SENTENCE].apply(lambda s: ' '.join(list(jieba.cut(s))))
    sentence_counters = map(Counter, map(lambda s: s.split(), corpus[CUT].values))
    chose_words = reduce(add_with_print(corpus), sentence_counters).most_common(vocab_size)

    return [w for w, _ in chose_words]

In [44]:
def tokenizer(sentence, vocab: dict):
    UNK = 1
    ids = [vocab.get(word, UNK) for word in jieba.cut(sentence)]

    return ids

In [56]:
def get_train_data(train_file, vocab2ids):
    val_ratio = 0.2
    content = pd.read_csv(train_file)
    num_val = int(len(content) * val_ratio)

    LABEL, SENTENCE = 'label', 'sentence'

    labels = content[LABEL].values
    content['input_ids'] = content[SENTENCE].apply(lambda s: ' '.join([str(id_) for id_ in tokenizer(s, vocab2ids)]))
    sentence_ids = np.array([[int(id_) for id_ in v.split()] for v in content['input_ids'].values])
    
    ids = np.random.choice(range(len(content)), size=len(content))
    
    train_ids = ids[num_val:]
    val_ids = ids[:num_val]
    
    X_train, y_train = sentence_ids[train_ids], labels[train_ids]
    X_val, y_val = sentence_ids[val_ids], labels[val_ids]

    label2id = {label: i for i, label in enumerate(np.unique(y_train))}
    id2label = {i: label for label, i in label2id.items()}
    y_train = torch.tensor([label2id[y] for y in y_train], dtype=torch.long)
    y_val = torch.tensor([label2id[y] for y in y_val], dtype=torch.long)

    return X_train, y_train, X_val, y_val, label2id, id2label

In [57]:
def build_dataloader(X_train, y_train, X_val, y_val, batch_size):
    train_dataloader = DataLoader([(x,y) for x,y in zip(X_train, y_train)],
                                 batch_size=batch_size, num_workers=0, shuffle=True)
    val_dataloader =DataLoader([(x,y) for x, y in zip(X_val, y_val)],
                              batch_size=batch_size, num_workers=0, shuffle=True)
    return train_dataloader, val_dataloader

In [59]:
f = open('../../../data/toutiao_news_cls/vocabulary.txt', 'r')
vocabulary = f.readlines()
vocabulary = [v.strip() for v in vocabulary]
embedding, token2id, vocab_size = get_embedding(set(vocabulary))
X_train, y_train, X_val, y_val, label2id, id2label = get_train_data('../../../data/toutiao_news_cls/train.csv', vocab2ids=token2id)

print(X_train, y_train, X_val, y_val, label2id, id2label)


[list([5710, 5936, 6760, 1, 7, 38, 1921, 12, 1969, 242, 60, 7850, 14])
 list([9054, 8171, 3476, 1909, 404, 3182, 7, 79, 856, 28, 35, 2944, 15, 14, 4913, 19, 1, 144, 14])
 list([1, 49, 1, 40, 22, 1, 1, 1, 1]) ...
 list([1, 16, 1, 149, 1366, 8926, 14, 7137, 819, 1, 1, 4888, 160, 1, 24])
 list([7771, 2596, 17, 836, 1264, 754, 619, 507, 4699, 2591])
 list([1, 1, 1, 1, 2079, 944, 385, 2683, 2661, 7, 2072, 19, 54, 303, 14])] tensor([4, 6, 2,  ..., 3, 4, 7]) [list([1, 1, 2633, 24]) list([1, 1, 17, 49, 1, 40])
 list([1, 1, 188, 4426, 17, 166, 380, 1, 22, 2284, 7250, 4252, 7, 3382, 3663, 1246, 521, 1])
 ...
 list([1, 48, 4558, 1908, 36, 1309, 12, 1, 7, 4957, 20, 2685, 304, 498, 26, 5296, 4558, 24])
 list([3830, 12, 1942, 20, 1355, 1, 4198, 7, 1, 1, 2087, 1])
 list([1458, 25, 1, 1061, 29, 1479, 2477, 1, 1, 221, 2596, 71, 3152, 343, 6800, 7412])] tensor([ 9,  0,  9,  ..., 11, 10,  6]) {100: 0, 101: 1, 102: 2, 103: 3, 104: 4, 106: 5, 107: 6, 108: 7, 109: 8, 110: 9, 112: 10, 113: 11, 114: 12, 115: 

  # Remove the CWD from sys.path while we load stuff.


In [60]:
train_loader, val_loader = build_dataloader(X_train, y_train, X_val, y_val, batch_size=128)
for i, (x, y) in enumerate(train_loader):
    ic(x)
    ic(y)
    if i > 3: break

RuntimeError: each element in list of batch should be of equal size