In [1]:
import os
import pickle

import torch

import dataset_handler
from sentence_embedding_model import SentenceEmbeddingModel
from tweet import Tweet
from tweet_processor import TweetProcessor
from word_embedding_model import WordEmbeddingModel

# Path (relative to the notebook's working directory) of the JSON-lines tweet dataset
# that the preprocessing cells below read.
dataset_file_path = 'datasets/hashtags-en-tweets.jsonl'

# Single shared processor instance used by the preprocessing cells.
tweet_processor = TweetProcessor()

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the trained word-embedding model from its cache file when it exists; otherwise
# train it from the dataset and persist it. This keeps "Restart & Run All" working on a
# fresh checkout instead of requiring cells to be un-commented by hand.
word_emb_model_path = 'save_files/word_emb_model.pkl'

if os.path.exists(word_emb_model_path):
    # NOTE: pickle.load can execute arbitrary code embedded in the file; only load
    # pickles that were produced by this project.
    with open(word_emb_model_path, 'rb') as model_file:
        word_embedding_model = pickle.load(model_file)
else:
    word_emb_tweets = tweet_processor.preprocess_tweets_for_word_embedding(dataset_file_path)

    word_embedding_model = WordEmbeddingModel()
    word_embedding_model.train(word_emb_tweets)

    # Make sure the target directory exists before writing the cache file.
    os.makedirs(os.path.dirname(word_emb_model_path), exist_ok=True)
    with open(word_emb_model_path, 'wb') as model_file:
        pickle.dump(word_embedding_model, model_file)

In [3]:
# Tweets preprocessed for the sentence-embedding model; later cells read `.hashtags`
# and `.text` from each element.
sent_emb_tweets = tweet_processor.preprocess_tweets_for_sentence_embedding(dataset_file_path)

In [4]:
from functools import lru_cache

sent_emb_model = SentenceEmbeddingModel()

@lru_cache(maxsize=None)
def get_sent_emb_for_tweet_text(tweet_text: str):
    """Return the sentence embedding for ``tweet_text``, memoised per distinct text.

    The embedding is only ever used as a fixed regression target, so it is computed
    without autograd tracking and detached before it is cached. Otherwise every
    cached tensor would keep the embedding model's computation graph alive for the
    lifetime of the (unbounded) cache, and reusing such a tensor across training
    steps can trigger "Trying to backward through the graph a second time".

    Args:
        tweet_text: Text of the tweet; must be hashable because it is the cache key.

    Returns:
        The tensor produced by ``sent_emb_model._generate_embedding``, detached
        from any autograd graph.
    """
    with torch.no_grad():
        embedding = sent_emb_model._generate_embedding(tweet_text)
    # detach() also covers the case where the model re-enables grad internally.
    return embedding.detach()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [5]:
from collections import defaultdict
from typing import List

from tweet import Tweet

# Index built once up front: hashtag -> list of tweets that contain it.
hashtag_to_tweets = defaultdict(list)
for tweet in sent_emb_tweets:
    for hashtag in tweet.hashtags:
        hashtag_to_tweets[hashtag].append(tweet)

def get_tweets_by_hashtag(hashtag: str) -> List[Tweet]:
    """Return all tweets that contain ``hashtag``.

    Uses ``.get`` rather than subscripting: subscripting a ``defaultdict`` inserts
    an empty list for every unknown key, so plain lookups would silently grow the
    index with hashtags that never occurred in the corpus.

    Args:
        hashtag: Hashtag to look up, spelled exactly as in ``tweet.hashtags``.

    Returns:
        The list of matching tweets, or an empty list if the hashtag is unknown.
    """
    return hashtag_to_tweets.get(hashtag, [])

In [20]:
import pytorch_lightning as pl
import torch
from torch.utils.data import DataLoader

class Dataset(torch.utils.data.Dataset):
    """Map-style dataset with one sample per hashtag.

    Each sample pairs the hashtag's word embedding (``'x'``) with the mean sentence
    embedding of all tweets containing that hashtag (``'y'``).
    """

    def __init__(self, hashtags: List[str], word_emb_model: WordEmbeddingModel):
        """Store the hashtags to serve and the model used to embed them.

        Args:
            hashtags: Hashtags that make up this dataset, one sample each.
            word_emb_model: Model queried via ``get_embedding(hashtag)``.
        """
        self.hashtags = hashtags
        self.word_emb_model = word_emb_model

    def __len__(self):
        """Return the number of hashtags (= samples)."""
        return len(self.hashtags)

    def __getitem__(self, idx):
        """Build the sample for the hashtag at position ``idx``.

        Returns:
            ``{'x': hashtag_emb, 'y': avg_sent_emb}``; neither tensor tracks gradients.

        Raises:
            RuntimeError: If no tweets are indexed for the hashtag, so no mean
                sentence embedding can be computed.
        """
        hashtag = self.hashtags[idx]

        hashtag_emb = torch.tensor(self.word_emb_model.get_embedding(hashtag))
        tweets_containing_hashtag = get_tweets_by_hashtag(hashtag)
        if not tweets_containing_hashtag:
            # torch.stack([]) would raise a RuntimeError as well, just a far less helpful one.
            raise RuntimeError(f'no tweets found for hashtag {hashtag!r}')

        sent_embs = [get_sent_emb_for_tweet_text(tweet.text) for tweet in tweets_containing_hashtag]
        # The target must not carry autograd history: if it did, every training step
        # would backpropagate into the (already freed) graph of the sentence embeddings
        # and fail with "RuntimeError: Trying to backward through the graph a second time".
        # Computing the mean under no_grad yields a graph-free tensor without the
        # copy-construct warning that torch.tensor(<tensor>) emits.
        with torch.no_grad():
            avg_sent_emb = torch.stack(sent_embs).mean(dim=0)

        return {'x': hashtag_emb, 'y': avg_sent_emb}

class DataModule(pl.LightningDataModule):
    """Lightning data module serving hashtag datasets for the train/val/test splits."""

    def __init__(
        self, train_hashtags: List[str], val_hashtags: List[str], test_hashtags: List[str], 
        word_emb_model: WordEmbeddingModel, batch_size: int = 2
    ):
        """Store the split definitions; the datasets themselves are built in ``setup``.

        Args:
            train_hashtags: Hashtags of the training split.
            val_hashtags: Hashtags of the validation split.
            test_hashtags: Hashtags of the test split.
            word_emb_model: Word-embedding model handed to every ``Dataset``.
            batch_size: Batch size used by all three dataloaders.
        """
        # Initialise the Lightning base class so its internal state exists
        # (the original `super.__init__()` lacked the call parentheses on `super`).
        super().__init__()
        self.prepare_data_per_node = False
        
        self.train_hashtags = train_hashtags
        self.val_hashtags = val_hashtags
        self.test_hashtags = test_hashtags
        self.word_emb_model = word_emb_model
        self.batch_size = batch_size

        # Populated by setup(); declared here so all three attributes always exist.
        self.train_dataset = None
        self.val_dataset = None
        self.test_dataset = None

    def prepare_data(self):
        """Nothing to download or prepare."""
        pass
    
    def setup(self, stage: str = None):
        """Create the datasets for all splits; ``stage`` is ignored."""
        self.train_dataset = Dataset(self.train_hashtags, self.word_emb_model)
        self.val_dataset = Dataset(self.val_hashtags, self.word_emb_model)
        self.test_dataset = Dataset(self.test_hashtags, self.word_emb_model)

    @property
    def in_features(self) -> int:
        """Size of the first dimension of ``'x'`` in the first training sample (requires ``setup``)."""
        return self.train_dataset[0]['x'].shape[0]

    @property
    def out_features(self) -> int:
        """Size of the first dimension of ``'y'`` in the first training sample (requires ``setup``)."""
        return self.train_dataset[0]['y'].shape[0]

    def train_dataloader(self):
        """Return a dataloader over the training dataset."""
        return DataLoader(self.train_dataset, batch_size=self.batch_size)

    def val_dataloader(self):
        """Return a dataloader over the validation dataset."""
        return DataLoader(self.val_dataset, batch_size=self.batch_size)

    def test_dataloader(self):
        """Return a dataloader over the test dataset."""
        return DataLoader(self.test_dataset, batch_size=self.batch_size)

# Collect every distinct hashtag in the corpus. sorted() replaces list(set(...)) because
# the iteration order of a set of strings changes between kernel restarts, which made the
# slices below pick different hashtags on every run.
unique_hashtags = sorted({hashtag for tweet in sent_emb_tweets for hashtag in tweet.hashtags})
# NOTE(review): assumes the filter keeps the input order — confirm in WordEmbeddingModel.
unique_hashtags = word_embedding_model.remove_hashtags_not_part_of_the_vocab(unique_hashtags)

# TODO split into train, val, test
# Placeholder split for debugging: two hashtags per split.
train_hashtags = unique_hashtags[:2]
val_hashtags = unique_hashtags[2:4]
test_hashtags = unique_hashtags[4:6]

data_module = DataModule(train_hashtags, val_hashtags, test_hashtags, word_embedding_model)
data_module.prepare_data()
data_module.setup()

In [21]:
from pytorch_lightning.callbacks.early_stopping import EarlyStopping

from hashtag_to_sent_mapper import Hashtag2SentMapper

# Globally enable autograd anomaly detection while the backward error is being debugged.
torch.autograd.set_detect_anomaly(True)

# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Sanity-check experiment on MNIST, kept for reference:
# from torchvision import datasets, transforms
#
# transform=transforms.ToTensor()
# dataset1 = datasets.MNIST('../data', train=True, download=True,
#                     transform=transform)
# dataset2 = datasets.MNIST('../data', train=False,
#                     transform=transform)
# train_loader = torch.utils.data.DataLoader(dataset1)
# test_loader = torch.utils.data.DataLoader(dataset2)
# hashtag2SentMapper = Hashtag2SentMapper(28*28, 10, learning_rate=0.01)

# Model mapping a hashtag embedding to a sentence embedding; layer sizes come from the data.
hashtag2SentMapper = Hashtag2SentMapper(
    data_module.in_features,
    data_module.out_features,
    learning_rate=0.01,
)
# .to(device)

# NOTE(review): patience (100) is larger than max_epochs (50), so with one validation
# run per epoch this callback can never stop training early — confirm this is intended.
early_stopping = EarlyStopping(monitor='val_loss', patience=100)

trainer = pl.Trainer(
    gpus=0,
    max_epochs=50,
    callbacks=[early_stopping],
    track_grad_norm=2,
    detect_anomaly=True,
    log_every_n_steps=1,
)

# Dataloaders are passed explicitly (instead of the MNIST train_loader / test_loader above).
fit_train_dataloader = data_module.train_dataloader()
# So it does not seem to be only the val dataloader that causes the error.
fit_val_dataloader = data_module.val_dataloader()
trainer.fit(hashtag2SentMapper, fit_train_dataloader, fit_val_dataloader)

trainer.test(dataloaders=[data_module.test_dataloader()])

# TODO fill the dataloader with dummy data (e.g. all ones or random values), then test again
# TODO 1) detach/repackage the hidden state in between batches. There are (at least) three ways to do this (and I chose this solution):

#  hidden.detach_()
#  hidden = hidden.detach()
# ? Although that advice seems to apply to an LSTM

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name                  | Type                | Params
--------------------------------------------------------------
0 | _model                | Sequential          | 333 K 
1 | _cosine_distance_loss | CosineEmbeddingLoss | 0     
--------------------------------------------------------------
333 K     Trainable params
0         Non-trainable params
333 K     Total params
1.333     Total estimated model params size (MB)


Epoch 1:   0%|          | 0/2 [00:00<?, ?it/s, loss=0.993, v_num=2]         

  File "/usr/local/lib/python3.7/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "/usr/local/lib/python3.7/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/home/vscode/.local/lib/python3.7/site-packages/ipykernel_launcher.py", line 17, in <module>
    app.launch_new_instance()
  File "/home/vscode/.local/lib/python3.7/site-packages/traitlets/config/application.py", line 976, in launch_instance
    app.start()
  File "/home/vscode/.local/lib/python3.7/site-packages/ipykernel/kernelapp.py", line 712, in start
    self.io_loop.start()
  File "/home/vscode/.local/lib/python3.7/site-packages/tornado/platform/asyncio.py", line 199, in start
    self.asyncio_loop.run_forever()
  File "/usr/local/lib/python3.7/asyncio/base_events.py", line 541, in run_forever
    self._run_once()
  File "/usr/local/lib/python3.7/asyncio/base_events.py", line 1786, in _run_once
    handle._run()
  File "/usr/local/lib/python3.7/asyncio/events.py", line 88, in _r

RuntimeError: Trying to backward through the graph a second time (or directly access saved tensors after they have already been freed). Saved intermediate values of the graph are freed when you call .backward() or autograd.grad(). Specify retain_graph=True if you need to backward through the graph a second time or if you need to access saved tensors after calling backward.

In [7]:
# Peek at the first validation batch only: print the tensor shapes and whether the
# target tracks gradients. Nothing is printed if the dataloader yields no batch.
batch = next(iter(data_module.val_dataloader()), None)
if batch is not None:
    x = batch['x']
    y = batch['y']
    print(x.shape, y.shape)
    print(y.requires_grad)

torch.Size([2, 10]) torch.Size([2, 15])
False


In [None]:
# Dump every named parameter of the wrapped model (name followed by its tensor).
named_params = hashtag2SentMapper._model.named_parameters()
for tag, param in named_params:
    print(tag, param)

0.weight Parameter containing:
tensor([[-0.0045,  0.0196, -0.0028,  ..., -0.0244,  0.0312,  0.0542],
        [ 0.0257, -0.0047, -0.0458,  ...,  0.0614,  0.0132,  0.0039],
        [-0.0043, -0.0533,  0.0198,  ..., -0.0481, -0.0789,  0.0670],
        ...,
        [ 0.0027, -0.0346, -0.0622,  ...,  0.0242, -0.0706, -0.0575],
        [ 0.0003,  0.0006, -0.0041,  ..., -0.0195,  0.0237, -0.0774],
        [-0.0577, -0.0724, -0.0240,  ...,  0.0478, -0.0706, -0.0425]],
       requires_grad=True)
0.bias Parameter containing:
tensor([-0.0369, -0.0804,  0.0767, -0.0175, -0.0580,  0.0681, -0.0563,  0.0226,
         0.0728,  0.0215,  0.0420,  0.0791,  0.0812,  0.0374,  0.0699,  0.0184,
        -0.0467,  0.0095, -0.0084,  0.0237,  0.0787, -0.0406, -0.0173, -0.0424,
        -0.0663, -0.0303,  0.0167, -0.0173,  0.0482,  0.0023, -0.0567,  0.0556,
         0.0502,  0.0344, -0.0250,  0.0357,  0.0455,  0.0016,  0.0223, -0.0622,
         0.0579, -0.0343, -0.0765, -0.0330,  0.0740, -0.0232, -0.0173, -0.0165,

In [None]:
# Split the preprocessed tweets, then derive model inputs and targets for both parts.
train_tweets, test_tweets = dataset_handler.split_into_train_test(sent_emb_tweets)

prepared = dataset_handler.prepare_model_inputs_and_targets(train_tweets, test_tweets, word_embedding_model)
# NOTE: this rebinds `test_hashtags`, which an earlier cell also assigns.
sentences_train, sentences_test, targets_train, targets_test, test_hashtags = prepared

# Cell output: shapes of the train and test targets.
(targets_train.shape, targets_test.shape)

some hashtags were filtered out since they are not contained in the Word2Vec-Vocab tweet-hashtags: ['#Kyiv', '#UkraineUnderAttack', '#Russian_Ukrainian_War'] | hashtags after filtering: ['#Kyiv', '#UkraineUnderAttack']
some hashtags were filtered out since they are not contained in the Word2Vec-Vocab tweet-hashtags: ['#SecondWorldWar', '#WWII', '#refugees', '#Estonia', '#Sweden', '#Australia', '#UK', '#US', '#Canada', '#Belgium', '#Germany'] | hashtags after filtering: ['#SecondWorldWar', '#WWII', '#refugees', '#Estonia', '#Sweden', '#Australia', '#UK', '#Canada', '#Belgium', '#Germany']
some hashtags were filtered out since they are not contained in the Word2Vec-Vocab tweet-hashtags: ['#Russian_Ukrainian_War', '#RussiaUkraineWar'] | hashtags after filtering: ['#RussiaUkraineWar']
some hashtags were filtered out since they are not contained in the Word2Vec-Vocab tweet-hashtags: ['#UkraineUnderAttaсk'] | hashtags after filtering: []
some hashtags were filtered out since they are not con

((135144,), (45057,))