In [0]:
# Mount Google Drive at /content/drive so that files stored there are reachable
# from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
#!cp -r drive/My\ Drive/Non-sequential\ Text\ Generation/data data

In [0]:
import torch
import numpy as np
import pandas as pd
import random
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error


# One shared seed for every random number generator the notebook relies on
# (NumPy, Python's `random`, and PyTorch on CPU and on all CUDA devices).
SEED = 43
for _seed_fn in (np.random.seed, random.seed,
                 torch.manual_seed, torch.cuda.manual_seed_all):
    _seed_fn(SEED)
del _seed_fn

In [0]:
# Read only the three columns that are used, from both comment dumps, and stack
# them into one frame. na_filter=False stops pandas from converting values such
# as empty strings into NaN.
columns = ["text", "parent_text", "score"]
df = pd.concat(
    [
        pd.read_csv(csv_path, usecols=columns, na_filter=False)
        for csv_path in ("data/comments_positive.csv", "data/comments_negative.csv")
    ],
    ignore_index=True,
)

In [0]:
# Separate the regression target from the input features.
# A non-inplace drop keeps `df` intact, so this cell can be re-run safely: with
# `inplace=True`, a second run fails with KeyError because 'score' is already gone.
y = df['score']
X = df.drop(columns='score')

In [0]:
# Hold out 5% of the rows as the test split; SEED makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.05, random_state=SEED
)

# To be sure we don't use indices to predict something
for _split_part in (X_train, X_test, y_train, y_test):
    _split_part.reset_index(drop=True, inplace=True)
del _split_part

print("Train shape: {}".format(X_train.shape))
print("Test shape: {}".format(X_test.shape))

Train shape: (3800000, 2)
Test shape: (200000, 2)


In [0]:
# Peek at the first five rows of the training features.
X_train.head(5)

Unnamed: 0,text,parent_text
0,"I was a whole bunch of folks on Recess, yup. :...",Your IMDB just blew my mind. You were Upside D...
1,"I'm not quite sure how that was relevant, but ...",http://i.imgur.com/GKLI7.jpg
2,Sounds like you weren't paying attention.,Sadly high school history doesn't teach you ab...
3,&gt;Polygamy doesn't mean that there's just wo...,&gt; from a feminist perspective \n\nSo what i...
4,The New Orleans Gargle-Glub-Glubs.,My high school's mascot was water...


### Simple baseline code

In [0]:
# Constant baseline: predict the training-set mean score for every test row.
y_pred = np.full(y_test.shape, y_train.mean())

### Metric

In [0]:
# Mean squared error of the constant baseline on the test split.
mean_squared_error(y_test, y_pred)

44239.11158943143

### Imports

In [0]:
!pip install -q torch transformers

In [0]:
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam 
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Model, GPT2Tokenizer, DistilBertTokenizer, DistilBertModel
from transformers.optimization import AdamW
from tqdm.auto import tqdm
from collections import namedtuple

from time import perf_counter
from google.colab import files

In [0]:
def build_tokenizer(pretrained_model):
    """Return the DistilBERT tokenizer loaded from ``pretrained_model``."""
    return DistilBertTokenizer.from_pretrained(pretrained_model)

def build_bert(pretrained_model):
    """Return the DistilBERT model loaded from ``pretrained_model``."""
    return DistilBertModel.from_pretrained(pretrained_model)


In [0]:
class RegressionHead(nn.Module):
    """Three-layer perceptron that maps a feature vector to a single scalar.

    Layout: Linear(input_size, hidden_size) -> leaky ReLU ->
    Linear(hidden_size, hidden_size) -> leaky ReLU -> Linear(hidden_size, 1).
    """

    def __init__(self, input_size, hidden_size=512):
        super().__init__()
        self.hidden_size = hidden_size
        # Attribute names are part of the saved state_dict keys; keep them stable.
        self.fc1 = nn.Linear(in_features=input_size, out_features=hidden_size)
        self.fc2 = nn.Linear(in_features=hidden_size, out_features=hidden_size)
        self.fc3 = nn.Linear(in_features=hidden_size, out_features=1)

    def forward(self, x):
        """Return the head's output for ``x``; the last dimension has size 1."""
        first_hidden = F.leaky_relu(self.fc1(x))
        second_hidden = F.leaky_relu(self.fc2(first_hidden))
        return self.fc3(second_hidden)

class Regressor(torch.nn.Module):
    """Pretrained DistilBERT encoder followed by a trainable regression head.

    The tokenizer and encoder are loaded from ``pretrained_model``; the head
    maps the encoder's hidden size to one scalar. The whole module is moved to
    CUDA when it is available, otherwise it stays on the CPU.
    """

    def __init__(self, pretrained_model='distilbert-base-cased', hidden_size=512):
        super().__init__()
        self.tokenizer = build_tokenizer(pretrained_model)
        self.bert = build_bert(pretrained_model)
        self.embed_size = self.bert.config.hidden_size
        self.regression_head = RegressionHead(self.embed_size, hidden_size)
        self.device = torch.device('cuda' if torch.cuda.is_available()
                                          else 'cpu')
        self.to(self.device)

    def train_custom(self):
        """Freeze every encoder parameter and put only the head in training mode."""
        for param in self.bert.parameters():
            param.requires_grad = False
        self.regression_head.train()

    def forward(self, ids, attention_mask=None):
        """Run the encoder on ``ids`` and regress from the hidden state at position 0.

        The first element of the encoder output is indexed as
        (batch, position, feature); position 0 is presumably the [CLS] token,
        which should be verified against the tokenizer.
        """
        hidden = self.bert(ids, attention_mask=attention_mask)[0]
        hidden_class = hidden[:, 0, :]

        return self.regression_head(hidden_class)

    def save(self, path='temp_weights'):
        """Write the regression head's state_dict to ``path`` (the encoder is not saved)."""
        torch.save(self.regression_head.state_dict(),
                   path)

    def load(self, path='temp_weights'):
        """Restore the regression head's weights from ``path``.

        ``map_location`` remaps the stored tensors onto this model's device, so
        a checkpoint written on a GPU can also be loaded on a CPU-only runtime.
        """
        reg_state = torch.load(path, map_location=self.device)
        self.regression_head.load_state_dict(reg_state)


In [0]:
class RedditDataset(Dataset):
    """Dataset that yields ``(token ids, target)`` pairs for comment-score regression.

    ``X`` is indexed by position with ``.iloc`` and must expose ``text`` and
    ``parent_text`` per row; ``y`` is indexed the same way. ``self_length_rate``
    is the share of ``max_length`` reserved for the comment's own text.
    """

    def __init__(self, X, y, tokenizer, max_length, self_length_rate=0.65):
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.self_length_rate = self_length_rate
        self.X, self.y = X, y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        row = self.X.iloc[idx]
        target = self.y.iloc[idx]

        # The comment itself is encoded first, capped at its share of the budget ...
        own_budget = int(self.max_length * self.self_length_rate)
        own_ids = self.tokenizer.encode(row.text, max_length=own_budget)

        # ... and the parent receives whatever length is left over.
        parent_budget = self.max_length - len(own_ids)
        parent_ids = self.tokenizer.encode(row.parent_text, max_length=parent_budget)

        # Parent first, then the comment without its first id
        # (presumably a leading special token - TODO confirm).
        return parent_ids + own_ids[1:], target


In [0]:
def get_dataloader(dataset, batch_size, pad_elem, shuffle=True):
    """Build a DataLoader that pads each batch to the length of its longest sequence.

    Args:
        dataset: yields ``(ids, target)`` pairs, where ``ids`` is a list of ints.
        batch_size: number of samples per batch.
        pad_elem: id used to pad short sequences; positions equal to it are
            masked out in the returned mask.
        shuffle: whether to reshuffle the samples on every pass. Defaults to
            True, which is the previous hard-coded behaviour; pass False for a
            stable order.

    Returns:
        A DataLoader whose batches are ``(ids, mask, target)``: int64 ids of
        shape (batch, longest), an int32 mask that is 1 where ``ids != pad_elem``,
        and float32 targets.
    """
    def pad(seq, max_len):
        return seq + [pad_elem] * max(0, max_len - len(seq))

    def collate_fn(batch_data):
        batch_ids, batch_target = zip(*batch_data)
        batch_len = max(map(len, batch_ids))
        batch_ids = torch.tensor(
            [pad(ids, batch_len) for ids in batch_ids]
        ).long()
        batch_mask = batch_ids.ne(pad_elem).int()
        batch_target = torch.tensor(batch_target).float()

        return batch_ids, batch_mask, batch_target

    return DataLoader(
        dataset=dataset, batch_size=batch_size, shuffle=shuffle,
        collate_fn=collate_fn,
        # Pinned host memory is only useful (and only available) with CUDA.
        pin_memory=torch.cuda.is_available(),
        num_workers=0
    )

In [0]:
def train_epoch(dataloader, model, optimizer):
    """Run one optimisation pass over ``dataloader`` with an MSE objective.

    The current loss is shown in the progress bar. ``model.save()`` is called
    after every 200th batch and once more when the pass is complete.
    """
    torch.cuda.empty_cache()

    pbar = tqdm(dataloader, total=len(dataloader), leave=False)
    for step, batch in enumerate(pbar, start=1):
        ids, mask, targets = (tensor.to(model.device) for tensor in batch)

        prediction = model(ids, attention_mask=mask)

        # Targets arrive flat; give them the same shape as the model output.
        loss = F.mse_loss(prediction, targets.view(prediction.shape))
        loss.backward()

        pbar.set_description(f'loss:{loss.item():.4f}')

        optimizer.step()
        optimizer.zero_grad()

        # Periodic checkpoint of the head's weights.
        if step % 200 == 0:
            model.save()

    model.save()


In [0]:
def train(model, X, y, n_epochs=5, batch_size=16, max_seq_length=32, lr=1e-2):
    """Train the model's regression head on ``(X, y)`` for ``n_epochs`` epochs.

    ``model.train_custom()`` is called first, and Adam optimises only the
    parameters of ``model.regression_head``. Sequences are built by
    RedditDataset with ``max_seq_length`` and padded with id 0.
    """
    model.train_custom()

    head_params = model.regression_head.parameters()
    optimizer = Adam(head_params, lr=lr)

    dataloader = get_dataloader(
        RedditDataset(X, y, model.tokenizer, max_seq_length),
        batch_size,
        0,
    )

    for epoch in range(n_epochs):
        train_epoch(dataloader, model, optimizer)
        print(f'Epoch #{epoch} finished')


In [0]:
# Build the regressor with its defaults ('distilbert-base-cased', hidden_size=512).
model = Regressor()

In [21]:
# Fit the regression head on the training split.
# NOTE: the recorded run below was stopped by a KeyboardInterrupt before it finished.
train(model, X_train, y_train, batch_size=512, max_seq_length=32)

HBox(children=(IntProgress(value=0, max=7422), HTML(value='')))

KeyboardInterrupt: ignored

In [0]:
def eval(model, X, y, batch_size=512, max_seq_length=32):
    """Return the mean squared error of ``model`` on ``(X, y)``.

    The model is switched to eval mode and every batch is scored with
    autograd disabled, so no computation graph is kept for the trainable head.
    Each batch loss is weighted by its number of samples, which makes the
    result the per-sample MSE over the whole dataset even when the last batch
    is smaller.

    NOTE: this function's name shadows the built-in ``eval`` within the notebook.
    """
    model.eval()

    reddit_dataset = RedditDataset(X, y, model.tokenizer, max_seq_length)
    dataloader = get_dataloader(reddit_dataset, batch_size, 0)

    loss_value = 0
    torch.cuda.empty_cache()

    pbar = tqdm(dataloader, total=len(dataloader), leave=False)
    with torch.no_grad():
        for batch in pbar:
            ids, mask, targets = \
                [x.to(model.device) for x in batch]

            y_hat = model(ids, attention_mask=mask)

            loss = F.mse_loss(y_hat, targets.view(y_hat.shape))

            # Undo the per-batch averaging so that batches of any size mix correctly.
            loss_value += loss.item() * len(ids)
            pbar.set_description(f'loss:{loss.item():.4f}')

    return loss_value / len(dataloader.dataset)


In [23]:
# Mean squared error of the model on the held-out test split.
mse = eval(model, X_test, y_test)

HBox(children=(IntProgress(value=0, max=391), HTML(value='')))

In [24]:
# Show the test-set MSE returned by eval().
mse

41605.34202125

К сожалению, обучить модель до конца не удалось: Colab постоянно вылетал и сбрасывал прогресс. Поэтому значение MSE выше получено для модели, обученной лишь частично.