In [None]:
from src.data_utils import make_clean_data, tokenize_data, train_test_val_split
from src.next_token_dataset import TweetsDataset
from src.eval_transformer_pipeline import evaluate_pipeline
from src.lstm_model import LSTM
from src.lstm_train import lstm_train
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch
from torch.utils.data import DataLoader

  from .autonotebook import tqdm as notebook_tqdm


# LSTM

In [2]:
make_clean_data('data/tweets.txt')

In [3]:
tokenize_data('data/tweets_processed.txt')

In [50]:
train, test, val = train_test_val_split()

In [51]:
max_len = 128
train_ds = TweetsDataset(train[:10000], max_len)
test_ds = TweetsDataset(test[:200], max_len)
val_ds = TweetsDataset(val[:200], max_len)

train_dl = DataLoader(train_ds, batch_size=32, shuffle=False)
test_dl = DataLoader(test_ds, batch_size=32, shuffle=False)
val_dl = DataLoader(val_ds, batch_size=32, shuffle=False)

In [2]:
tokenizer = AutoTokenizer.from_pretrained('distilbert/distilgpt2')
vocab_size = tokenizer.vocab_size

In [None]:
model = LSTM(vocab_size)

lstm_train(model, train_dl, val_dl)

  return torch.tensor(x), torch.tensor(y)
 10%|█         | 1/10 [00:56<08:27, 56.36s/it]

Epoch 1 | Train Loss: 1.1809 | Val Loss: 0.896 | Val Accuracy: 88.13% | ROUGE-1 SCORE: 7.02% | ROUGE-2 SCORE: 0.00%


 20%|██        | 2/10 [01:54<07:37, 57.14s/it]

Epoch 2 | Train Loss: 0.8161 | Val Loss: 0.866 | Val Accuracy: 88.45% | ROUGE-1 SCORE: 16.96% | ROUGE-2 SCORE: 0.81%


 30%|███       | 3/10 [02:51<06:42, 57.51s/it]

Epoch 3 | Train Loss: 0.7794 | Val Loss: 0.840 | Val Accuracy: 88.67% | ROUGE-1 SCORE: 19.22% | ROUGE-2 SCORE: 1.59%


 40%|████      | 4/10 [03:49<05:46, 57.70s/it]

Epoch 4 | Train Loss: 0.7487 | Val Loss: 0.824 | Val Accuracy: 88.77% | ROUGE-1 SCORE: 22.25% | ROUGE-2 SCORE: 1.90%


 50%|█████     | 5/10 [04:46<04:46, 57.25s/it]

Epoch 5 | Train Loss: 0.7226 | Val Loss: 0.814 | Val Accuracy: 88.84% | ROUGE-1 SCORE: 23.72% | ROUGE-2 SCORE: 2.08%


 60%|██████    | 6/10 [05:44<03:49, 57.50s/it]

Epoch 6 | Train Loss: 0.6982 | Val Loss: 0.806 | Val Accuracy: 88.92% | ROUGE-1 SCORE: 26.65% | ROUGE-2 SCORE: 2.52%


 70%|███████   | 7/10 [06:42<02:52, 57.58s/it]

Epoch 7 | Train Loss: 0.6734 | Val Loss: 0.805 | Val Accuracy: 89.00% | ROUGE-1 SCORE: 27.90% | ROUGE-2 SCORE: 2.81%


 80%|████████  | 8/10 [07:39<01:54, 57.35s/it]

Epoch 8 | Train Loss: 0.6489 | Val Loss: 0.803 | Val Accuracy: 89.05% | ROUGE-1 SCORE: 29.52% | ROUGE-2 SCORE: 2.85%


 90%|█████████ | 9/10 [08:36<00:57, 57.26s/it]

Epoch 9 | Train Loss: 0.6250 | Val Loss: 0.800 | Val Accuracy: 88.99% | ROUGE-1 SCORE: 29.96% | ROUGE-2 SCORE: 3.08%


100%|██████████| 10/10 [09:33<00:00, 57.38s/it]

Epoch 10 | Train Loss: 0.6007 | Val Loss: 0.803 | Val Accuracy: 88.98% | ROUGE-1 SCORE: 30.64% | ROUGE-2 SCORE: 3.53%





In [16]:
tokenizer.decode(model.generate(tokenizer.encode('hi what'), 10))

' i had to go to bed i m not a'

In [15]:
torch.save(model.state_dict(), 'models/lstm_model')

# Transformers pipeline

In [None]:
pretrained_model = AutoModelForCausalLM.from_pretrained('../repos/distilgpt2').to('cuda')
generator = pipeline("text-generation", model=pretrained_model, tokenizer=tokenizer)

In [None]:
results, rouge = evaluate_pipeline(generator)

In [13]:
results[0:5]

['sampleThe following steps are needed to enable the use of the API.',
 "sample, I don't want to be a part of the problem. It's just a matter of time",
 'sampleThe basic information provided by the authors is as follows:\n\n\n',
 'sample the following.\n\nThe following is',
 'sample" : "I want to ask you what you are saying about me. I']

In [None]:
print(rouge)

{'rouge1': np.float64(0.039633381636651664), 'rouge2': np.float64(0.0012389934628330677), 'rougeL': np.float64(0.03606547663266661), 'rougeLsum': np.float64(0.036081814541972346)}
