In [1]:
import math
import numpy as np
from pathlib import Path

from tqdm import tqdm

import torch

import zipfile

import seaborn as sns

from data.data import Downloader, Parser

In [None]:
!wget https://dl.fbaipublicfiles.com/fasttext/vectors-english/wiki-news-300d-1M.vec.zip -P ./data

In [2]:
DATA_PATH = './data'

In [20]:
downloader = Downloader(data_path=DATA_PATH)

In [None]:
downloader.run()

In [3]:
parser = Parser(data_path=DATA_PATH)

In [4]:
unlabeled, train, valid = parser.run()

Reading: 100%|██████████| 38/38 [02:43<00:00,  4.29s/it]


In [49]:
from torch.utils.data import Dataset, DataLoader
from nltk.tokenize import wordpunct_tokenize

class TextClassificationDataset(Dataset):
    
    def __init__(self, texts, targets, vocab, pad_index=0, max_length=32):
        super().__init__()
        
        self.texts = texts
        self.targets = targets
        self.vocab = vocab
        
        self.pad_index = pad_index
        self.max_length = max_length
        
    def __len__(self):
        return len(self.texts)
    
    def tokenization(self, text):
        
        tokens = wordpunct_tokenize(text)
        
        token_indices = [self.vocab[tok] for tok in tokens if tok in self.vocab]
        
        return token_indices
    
    def padding(self, tokenized_text):
        
        tokenized_text = tokenized_text[:self.max_length]
        
        tokenized_text += [self.pad_index] * (self.max_length - len(tokenized_text))
        
        return tokenized_text
    
    def __getitem__(self, index):
        
        text = self.texts[index]        
        target = self.targets[index]
        
        tokenized_text = self.tokenization(text)
        tokenized_text = self.padding(tokenized_text)
        
        tokenized_text = torch.tensor(tokenized_text)
        
        return tokenized_text, torch.tensor(target), tokenized_text.size()[0]

In [6]:
import zipfile
import numpy as np

from tqdm import tqdm

def load_embeddings(zip_path, filename, pad_token='PAD', max_words=100_000, verbose=True):
    
    vocab = dict()
    embeddings = list()

    with zipfile.ZipFile(zip_path) as zipped_file:
        with zipped_file.open(filename) as file_object:

            vocab_size, embedding_dim = file_object.readline().decode('utf-8').strip().split()

            vocab_size = int(vocab_size)
            embedding_dim = int(embedding_dim)
            
            # в файле 1 000 000 слов с векторами, давайте ограничим для простоты этот словарь
            max_words = vocab_size if max_words <= 0 else max_words
            
            # добавим пад токен и эмбеддинг в нашу матрицу эмбеддингов и словарь
            vocab[pad_token] = len(vocab)
            embeddings.append(np.zeros(embedding_dim))

            progress_bar = tqdm(total=max_words, disable=not verbose)

            for line in file_object:
                parts = line.decode('utf-8').strip().split()

                token = ' '.join(parts[:-embedding_dim]).lower()
                
                if token in vocab:
                    continue
                
                word_vector = np.array(list(map(float, parts[-embedding_dim:])))

                vocab[token] = len(vocab)
                embeddings.append(word_vector)

                progress_bar.update()
                
                if len(vocab) == max_words:
                    break

            progress_bar.close()

    embeddings = np.stack(embeddings)
    
    return vocab, embeddings

In [9]:
vocab, embeddings = load_embeddings(Path(DATA_PATH).joinpath('wiki-news-300d-1M.vec.zip'), 'wiki-news-300d-1M.vec', max_words=100_000)

100%|█████████▉| 99999/100000 [00:14<00:00, 6968.97it/s]


# Classes to index

In [10]:
# проверим, что в трейне и валидации одинаковые категории
set(train.category.unique().tolist()) == set(valid.category.unique().tolist())

True

In [11]:
unique_categories = set(train.category.unique().tolist() + valid.category.unique().tolist())

In [12]:
category2index = {category: index for index, category in enumerate(unique_categories)}

In [13]:
train['target'] = train.category.map(category2index)
valid['target'] = valid.category.map(category2index)

In [14]:
train_x = list(train.question)
train_y = list(train.target)

valid_x = list(valid.question)
valid_y = list(valid.target)

In [50]:
train_dataset = TextClassificationDataset(texts=train_x, targets=train_y, vocab=vocab)
val_dataset = TextClassificationDataset(texts=valid_x, targets=valid_y, vocab=vocab)

# Neural network

In [16]:
from torch import nn

In [17]:
embedding_layer = nn.Embedding(num_embeddings=len(vocab), 
                               embedding_dim=embeddings.shape[-1],
                               padding_idx=0)

In [51]:
embeddings = torch.tensor(embeddings).float()

In [47]:
def train_model(model, train_dl, val_dl, epochs=10, lr=0.001):
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = torch.optim.Adam(parameters, lr=lr)
    for i in range(epochs):
        model.train()
        sum_loss = 0.0
        total = 0
        for x, y, l in train_dl:
            x = x.long()
            y = y.long()
            y_pred = model(x, l)
            optimizer.zero_grad()
            loss = F.cross_entropy(y_pred, y)
            loss.backward()
            optimizer.step()
            sum_loss += loss.item() * y.shape[0]
            total += y.shape[0]
        val_loss, val_acc, val_rmse = validation_metrics(model, val_dl)
        if i % 5 == 1:
            print("train loss %.3f, val loss %.3f, val accuracy %.3f, and val rmse %.3f" % (sum_loss/total, val_loss, val_acc, val_rmse))

def validation_metrics (model, valid_dl):
    model.eval()
    correct = 0
    total = 0
    sum_loss = 0.0
    sum_rmse = 0.0
    for x, y, l in valid_dl:
        x = x.long()
        y = y.long()
        y_hat = model(x, l)
        loss = F.cross_entropy(y_hat, y)
        pred = torch.max(y_hat, 1)[1]
        correct += (pred == y).float().sum()
        total += y.shape[0]
        sum_loss += loss.item()*y.shape[0]
        sum_rmse += np.sqrt(mean_squared_error(pred, y.unsqueeze(-1)))*y.shape[0]
    return sum_loss/total, correct/total, sum_rmse/total

In [55]:
class TextClassifier(nn.Module):
    
    def __init__(self, embeddings, hidden_dim, n_classes):
        super().__init__()
        
        self.embeddings = nn.Embedding.from_pretrained(embeddings, padding_idx=0)
        self.batch_norm = nn.BatchNorm1d(num_features=embeddings.shape[-1])
        self.lstm = nn.LSTM(embeddings.shape[-1], hidden_dim, batch_first=True)
        self.linear = nn.Linear(hidden_dim, n_classes)
        self.dropout = nn.Dropout(0.2)
        
    def forward(self, x, l):
        x = self.embeddings(x)
        x = self.dropout(x)
        lstm_out, (ht, ct) = self.lstm(x)
        return self.linear(ht[-1])

In [56]:
model = TextClassifier(torch.from_numpy(embeddings), 100, len(category2index))

In [57]:
train_model(model, train_dataset, val_dataset, epochs=30, lr=0.05)

RuntimeError: input must have 3 dimensions, got 2