**Что попробовал:**
1. поменять размер вектора (100, 200, 300) - на 300 было лучше - другие модели не подходили по памяти
2. Использовать разницу среднего по векторам (типа разница между оригинальным постом и комментариями) - сильно хуже, чем просто комментарий
3. использовать склеенные тексты - вроде чуть лучше, но чуть-чуть
4. поменять токенайзер, чтобы вернуть знаки препинания (только это побило бейзлайн)

In [1]:
import pandas as pd
import numpy as np
from sklearn.externals import joblib
import nltk
import gensim
import spacy
from tqdm import tqdm_notebook

from sklearn import metrics

import torch as tt
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torchtext.data import Field, LabelField, BucketIterator, ReversibleField, TabularDataset



SEED = 42
np.random.seed(SEED)

In [2]:
import spacy


spacy_en = spacy.load('en')
spacy_en.remove_pipe('tagger')
spacy_en.remove_pipe('ner')

def tokenizer(text): # create a tokenizer function
    return [tok.lemma_ for tok in spacy_en.tokenizer(text)] #if tok.text.isalpha()]            

In [3]:
device = tt.device('cuda:0' if tt.cuda.is_available() else 'cpu')
device

device(type='cuda', index=0)

In [4]:
#classes={
#    0:0,
#    '1':1
#}

TEXT = Field(include_lengths=True, batch_first=True, 
             tokenize=tokenizer,
             eos_token='<eos>',
             lower=True,
             stop_words=nltk.corpus.stopwords.words('english')
            )
LABEL = LabelField(dtype=tt.int64)

dataset = TabularDataset('../input/train-balanced-sarcasm.csv', format='csv', 
                         fields=[('label', LABEL), ('text', TEXT),
                                 (None, None),(None, None),(None, None),(None, None),
                                 (None, None),(None, None),(None, None),
                                 ('parental', TEXT)], 
                                 #(None, None)], 
                         skip_header=True)

In [5]:
TEXT.build_vocab(dataset, min_freq=5, vectors="glove.6B.300d")
#TEXT.build_vocab(dataset, min_freq=5)
len(TEXT.vocab.itos)

.vector_cache/glove.6B.zip: 862MB [03:42, 3.88MB/s]                           
100%|█████████▉| 399506/400000 [00:50<00:00, 8093.24it/s]

68344

In [6]:
TEXT.vocab.itos[:2]

['<unk>', '<pad>']

In [7]:
LABEL.build_vocab(dataset)

In [8]:
LABEL.vocab.itos[:3]

['0', '1']

In [9]:
train, test = dataset.split(0.8, stratified=True)
train, valid = train.split(0.9, stratified=True)

In [10]:
np.unique([x.label for x in train.examples], return_counts=True)

100%|█████████▉| 399506/400000 [01:10<00:00, 8093.24it/s]

(array(['0', '1'], dtype='<U1'), array([363897, 363897]))

In [11]:
np.unique([x.label for x in valid.examples], return_counts=True)

(array(['0', '1'], dtype='<U1'), array([40433, 40433]))

In [12]:
np.unique([x.label for x in test.examples], return_counts=True)

(array(['0', '1'], dtype='<U1'), array([101083, 101083]))

In [13]:
class MyModel(nn.Module):
    
    def __init__(self, vocab_size, embed_size, hidden_size, weights_matrix):
        super(MyModel, self).__init__()
        #self.embedding = nn.Embedding(vocab_size, embed_size)
        self.embedding = nn.Embedding.from_pretrained(weights_matrix)
        self.embedding.requires_grad = False
        self.device = device
        
        self.rnn = nn.LSTM(input_size=embed_size,
                           hidden_size=hidden_size,
                           bidirectional=True,
                           batch_first=True,
                          )
        
        self.fc = nn.Linear(hidden_size * 2 *2, 2)
        
    def forward(self, batch):
        #batch = batch.to(self.device)
        x, x_lengths = batch.text
        x = x.to(self.device)
        x = self.embedding(x)
        #y, y_lengths = batch.parental
        #y = y.to(self.device)
        #y = self.embedding(y)
        #s = [y.shape[0], 1, y.shape[1]]
        #y = y.resize_(*s)
        #s = [x.shape[0], 1, x.shape[1]]
        #x = x.resize_(*s)
        #print (x.shape, y.shape)
        #x = tt.cat([x,y], dim=1)
        #print (x.shape, x)
        #if x_lengths is not None:
        #    x_lengths = x_lengths.view(-1).tolist()
        #    x = nn.utils.rnn.pack_padded_sequence(x, x_lengths, batch_first=True)
        batch.label = batch.label.to(self.device)
            
        _, (hidden, cell) = self.rnn(x)
        x = x.detach()
        hidden = hidden.transpose(0,1)
        cell = cell.transpose(0,1)
        hidden = hidden.contiguous().view(hidden.size(0),-1)
        cell = cell.contiguous().view(cell.size(0),-1)
        x = tt.cat([hidden, cell], dim=1).squeeze(1)
        x = self.fc(x)
        return x

In [14]:
tt.cuda.empty_cache()
#tt.set_default_tensor_type('torch.cuda.FloatTensor')
batch_size = 32

model = MyModel(len(TEXT.vocab.itos),
                embed_size=300,
                hidden_size=128,
                weights_matrix = TEXT.vocab.vectors
               )
#device = tt.device('cuda')
model = model.to(device)

train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train, valid, test),
    batch_sizes=(batch_size, batch_size, 100),
    shuffle=True,
    sort_key=lambda x: len(x.text)+len(x.parental),
    sort_within_batch=True,
)

optimizer = optim.Adam(model.parameters())
# scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, verbose=True, cooldown=5)
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=5)
criterion = nn.CrossEntropyLoss()

In [15]:
from sklearn.metrics import accuracy_score
def count_accuracy(model, iterator):
    model.eval()
    acc_sum = 0

    n_batches = len(iterator)
    with tt.no_grad():
        for batch in tqdm_notebook(iterator, leave=False):
            #pred = model(batch)
            A = tt.nn.functional.softmax(model(batch), dim=1).cpu().numpy().argmax(axis=1)
            B = batch.label.cpu()
            acc = accuracy_score(A, B)
            acc_sum += acc

    return acc_sum / n_batches

In [16]:
def _train_epoch(model, iterator, optimizer, criterion, curr_epoch):

    model.train()

    running_loss = 0

    n_batches = len(iterator)
    iterator = tqdm_notebook(iterator, total=n_batches, desc='epoch %d' % (curr_epoch), leave=True)

    for i, batch in enumerate(iterator):
        optimizer.zero_grad()

        pred = model(batch)
        loss = criterion(pred, batch.label.detach())
        loss.backward()
        optimizer.step()

        curr_loss = loss.data.cpu().detach().item()
        
        loss_smoothing = i / (i+1)
        running_loss = loss_smoothing * running_loss + (1 - loss_smoothing) * curr_loss

        iterator.set_postfix(loss='%.5f' % running_loss)

    return running_loss

def _test_epoch(model, iterator, criterion):
    model.eval()
    epoch_loss = 0

    n_batches = len(iterator)
    with tt.no_grad():
        for batch in iterator:
            pred = model(batch)
            loss = criterion(pred, batch.label)
            epoch_loss += loss.data.item()

    return epoch_loss / n_batches


def nn_train(model, train_iterator, valid_iterator, criterion, optimizer, n_epochs=100,
          scheduler=None, early_stopping=0, test_iterator=None):

    prev_loss = 100500
    es_epochs = 0
    best_epoch = None
    history = pd.DataFrame()

    for epoch in range(n_epochs):
        train_loss = _train_epoch(model, train_iterator, optimizer, criterion, epoch)
        valid_loss = _test_epoch(model, valid_iterator, criterion)

        valid_loss = valid_loss
        print('validation loss %.5f' % valid_loss)

        record = {'epoch': epoch, 'train_loss': train_loss, 'valid_loss': valid_loss}
        history = history.append(record, ignore_index=True)

        if early_stopping > 0:
            if valid_loss > prev_loss:
                es_epochs += 1
            else:
                es_epochs = 0

            if es_epochs >= early_stopping:
                best_epoch = history[history.valid_loss == history.valid_loss.min()].iloc[0]
                print('Early stopping! best epoch: %d val %.5f' % (best_epoch['epoch'], best_epoch['valid_loss']))
                break

            prev_loss = min(prev_loss, valid_loss)
        print (count_accuracy(model, test_iterator))

Лучше всего получилось на 2 эпохах, это и есть итог

In [17]:
nn_train(model, train_iterator, valid_iterator, criterion, optimizer, scheduler=scheduler, 
        n_epochs=2, early_stopping=6, test_iterator=test_iterator)

HBox(children=(IntProgress(value=0, description='epoch 0', max=22744, style=ProgressStyle(description_width='i…

In [18]:
count_accuracy(model, test_iterator)

HBox(children=(IntProgress(value=0, max=2022), HTML(value='')))

0.7099169738932407