In [1]:
from src.data_utils import make_clean_data, tokenize_data, train_test_val_split
from src.next_token_dataset import TweetsDataset
from src.eval_transformer_pipeline import evaluate_pipeline
from src.lstm_model import LSTM
from src.lstm_train import lstm_train
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch
from torch.utils.data import DataLoader

  from .autonotebook import tqdm as notebook_tqdm


# LSTM

In [None]:
make_clean_data('data/tweets.txt')

In [3]:
tokenize_data('data/tweets_processed.txt')

In [2]:
train, test, val = train_test_val_split()

In [3]:
max_len = 128
train_ds = TweetsDataset(train[:10000], max_len)
test_ds = TweetsDataset(test[:1000], max_len)
val_ds = TweetsDataset(val[:1000], max_len)

train_dl = DataLoader(train_ds, batch_size=50, shuffle=False)
test_dl = DataLoader(test_ds, batch_size=50, shuffle=False)
val_dl = DataLoader(val_ds, batch_size=50, shuffle=False)

In [4]:
tokenizer = AutoTokenizer.from_pretrained('distilbert/distilgpt2')
vocab_size = tokenizer.vocab_size

In [5]:
model = LSTM(vocab_size)

lstm_train(model, train_dl, val_dl)

  return torch.tensor(x), torch.tensor(y)
 10%|█         | 1/10 [01:57<17:41, 117.94s/it]

Epoch 1 | Train Loss: 1.0362 | Val Loss: 0.797 | Val Accuracy: 89.01% | ROUGE-1 SCORE: 14.55% | ROUGE-2 SCORE: 1.30%


 20%|██        | 2/10 [05:21<22:25, 168.13s/it]

Epoch 2 | Train Loss: 0.7602 | Val Loss: 0.757 | Val Accuracy: 89.47% | ROUGE-1 SCORE: 25.98% | ROUGE-2 SCORE: 2.81%


 30%|███       | 3/10 [08:45<21:30, 184.42s/it]

Epoch 3 | Train Loss: 0.7049 | Val Loss: 0.746 | Val Accuracy: 89.63% | ROUGE-1 SCORE: 28.62% | ROUGE-2 SCORE: 3.68%


 40%|████      | 4/10 [12:07<19:08, 191.45s/it]

Epoch 4 | Train Loss: 0.6638 | Val Loss: 0.738 | Val Accuracy: 89.71% | ROUGE-1 SCORE: 31.25% | ROUGE-2 SCORE: 4.29%


 50%|█████     | 5/10 [15:35<16:27, 197.40s/it]

Epoch 5 | Train Loss: 0.6306 | Val Loss: 0.748 | Val Accuracy: 89.68% | ROUGE-1 SCORE: 33.04% | ROUGE-2 SCORE: 4.24%


 60%|██████    | 6/10 [19:03<13:25, 201.27s/it]

Epoch 6 | Train Loss: 0.6035 | Val Loss: 0.758 | Val Accuracy: 89.71% | ROUGE-1 SCORE: 34.67% | ROUGE-2 SCORE: 4.49%


 70%|███████   | 7/10 [22:39<10:17, 205.87s/it]

Epoch 7 | Train Loss: 0.5785 | Val Loss: 0.767 | Val Accuracy: 89.69% | ROUGE-1 SCORE: 34.22% | ROUGE-2 SCORE: 4.46%


 80%|████████  | 8/10 [26:05<06:51, 205.88s/it]

Epoch 8 | Train Loss: 0.5589 | Val Loss: 0.779 | Val Accuracy: 89.67% | ROUGE-1 SCORE: 35.32% | ROUGE-2 SCORE: 4.70%


 90%|█████████ | 9/10 [29:41<03:29, 209.02s/it]

Epoch 9 | Train Loss: 0.5391 | Val Loss: 0.786 | Val Accuracy: 89.65% | ROUGE-1 SCORE: 36.21% | ROUGE-2 SCORE: 4.77%


100%|██████████| 10/10 [33:16<00:00, 199.68s/it]

Epoch 10 | Train Loss: 0.5213 | Val Loss: 0.791 | Val Accuracy: 89.69% | ROUGE-1 SCORE: 36.56% | ROUGE-2 SCORE: 4.50%





In [6]:
tokenizer.decode(model.generate(tokenizer.encode('hi how'), 10))

' i m sorry i ll be able to go to'

In [7]:
torch.save(model.state_dict(), 'models/lstm_model')

# Transformers pipeline

In [8]:
pretrained_model = AutoModelForCausalLM.from_pretrained('../distilgpt2').to('cuda')
generator = pipeline("text-generation", model=pretrained_model, tokenizer=tokenizer)

Device set to use cuda:0


In [9]:
results, rouge = evaluate_pipeline(generator)

  0%|          | 0/10000 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  0%|          | 1/10000 [00:00<44:31,  3.74it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  0%|          | 4/10000 [00:00<14:57, 11.14it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  0%|          | 6/10000 [00:00<12:14, 13.60it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  0%|          | 9/10000 [00:00<09:18, 17.89it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
You seem to be using the pipelines sequenti

In [10]:
results[0:10]

['awww that s a bummer you shoulda got david carr of thay.\n\nMitts\nNathan\nMy thoughts',
 'is upset that he can t update his facebook by texting it and might cry as a resounding voice.',
 'i dived many times for the ball managed to save 50 the reels.\n\n\n\n\nSo I thought I could help the',
 'my whole body feels itchy and like a cat.”\nAs I',
 'no it s not behaving at all i m mad why am i here because i can t ive been on a plane and i am not on a plane and i am not',
 'not the whole ㅠ�',
 'need a hunch on',
 'hey long time no see yes rains a bit only a bit lol i m fi m u cant use a light bulb to light a lot more then I',
 'nope they didn t h.\n\nIt',
 'que me mu, p']

In [11]:
print(rouge)

{'rouge1': np.float64(0.6375416956534943), 'rouge2': np.float64(0.5839876818800434), 'rougeL': np.float64(0.6368535018799146), 'rougeLsum': np.float64(0.6360259566412538)}


# Итоги

Не удалось подключиться к ВМ, поэтому для экономии времени обучение проводилось на 10000 примерах и валидация - на 1000. Метрики ROUGE-1 и ROUGE-2 гораздо выше у предобученной модели трансформера (36.56% ROUGE-1 у LSTM против 63.75% у трансформера и 4.5% ROUGE-2 у LSTM против 58.4% у трансформера). Качество генерации текста (если сравнивать "на глаз") при этом у моделей примерно одинаковое. Однако стоит учесть, что distilgpt2 на текстах из sentiment140-датасета не обучалась (либо, если обучалась, то не только на них), и если нам необходима генерация текстов, специфичных для конкретного домена (в нашем случае, twitter), то лучше использовать собственноручно обученную LSTM (но обучить на полном датасете).