In [7]:
from typing import List, Optional

import dataset_handler
from sentence_embedding_model import SentenceEmbeddingModel
from tweet import Tweet
from tweet_processor import TweetProcessor
from word_embedding_model import WordEmbeddingModel

# from hashtag_to_sentence_space_mapper import HashtagToSentenceSpaceMapper

# Location of the .jsonl tweet file; handed to both preprocessing calls in the cells below.
dataset_file_path = 'datasets/hashtags-en-tweets.jsonl'

# One TweetProcessor instance, reused by the word-embedding and sentence-embedding preprocessing cells.
tweet_processor = TweetProcessor()

In [11]:
# Preprocess the dataset for the word-embedding model. The recorded output of this cell
# shows the step removing stopwords and lemmatizing with spaCy, logging progress every
# 2000 of 200526 items.
word_emb_tweets = tweet_processor.preprocess_tweets_for_word_embedding(dataset_file_path)

# Train the word-embedding model on the preprocessed tweets; later cells use it to
# filter hashtags by vocabulary and to look up hashtag embeddings.
word_embedding_model = WordEmbeddingModel()
word_embedding_model.train(word_emb_tweets)

Removing stopwords and lemmatizing with spacy: [0 / 200526]
Removing stopwords and lemmatizing with spacy: [2000 / 200526]
Removing stopwords and lemmatizing with spacy: [4000 / 200526]
Removing stopwords and lemmatizing with spacy: [6000 / 200526]
Removing stopwords and lemmatizing with spacy: [8000 / 200526]
Removing stopwords and lemmatizing with spacy: [10000 / 200526]
Removing stopwords and lemmatizing with spacy: [12000 / 200526]
Removing stopwords and lemmatizing with spacy: [14000 / 200526]
Removing stopwords and lemmatizing with spacy: [16000 / 200526]
Removing stopwords and lemmatizing with spacy: [18000 / 200526]
Removing stopwords and lemmatizing with spacy: [20000 / 200526]
Removing stopwords and lemmatizing with spacy: [22000 / 200526]
Removing stopwords and lemmatizing with spacy: [24000 / 200526]
Removing stopwords and lemmatizing with spacy: [26000 / 200526]
Removing stopwords and lemmatizing with spacy: [28000 / 200526]
Removing stopwords and lemmatizing with spacy: [

In [3]:
# Second preprocessing pass over the same file, this time for the sentence-embedding model.
# Later cells iterate the result and read `.id`, `.text` and `.hashtags` from each element.
sent_emb_tweets = tweet_processor.preprocess_tweets_for_sentence_embedding(dataset_file_path)

In [13]:
from functools import lru_cache

sent_emb_model = SentenceEmbeddingModel()


@lru_cache(maxsize=None)
def get_sent_emb_for_tweet_text(tweet_text: str):
    """Return the sentence embedding of ``tweet_text``, memoised per distinct text.

    The cache is unbounded (``maxsize=None``), so each distinct text is embedded
    at most once per kernel session and its result is kept for the rest of the
    session.

    ``sent_emb_model`` is looked up as a global at call time: if a later cell
    rebinds that name, uncached texts are embedded with the new model while
    already-cached results are returned unchanged.
    """
    embedding = sent_emb_model._generate_embedding(tweet_text)
    return embedding

In [1]:
# Eagerly embed every tweet, keyed by tweet id. This bypasses the memoised
# get_sent_emb_for_tweet_text helper, so each tweet text is embedded here directly.
# NOTE(review): the recorded run of this cell (In [1]) failed with a NameError on
# SentenceEmbeddingModel - presumably it was executed before the import cell in a
# fresh kernel.
sent_emb_model = SentenceEmbeddingModel()

# Single-tweet smoke test of the embedding call; `res` is not used afterwards in this cell.
res = sent_emb_model._generate_embedding(sent_emb_tweets[0].text)

tweet_id_to_sent_emb = dict(
    (tw.id, sent_emb_model._generate_embedding(tw.text))
    for tw in sent_emb_tweets
)

NameError: name 'SentenceEmbeddingModel' is not defined

In [10]:
from collections import defaultdict
from typing import List

from tweet import Tweet

# Inverted index: hashtag -> list of tweets that carry it. A tweet with several
# hashtags is appended under each of them.
hashtag_to_tweets = defaultdict(list)
for tweet in sent_emb_tweets:
    for hashtag in tweet.hashtags:
        hashtag_to_tweets[hashtag].append(tweet)

def get_tweets_by_hashtag(hashtag: str) -> List[Tweet]:
    """Return the tweets indexed under ``hashtag``.

    Args:
        hashtag: Key to look up in the notebook-level ``hashtag_to_tweets`` index.

    Returns:
        The list stored for ``hashtag``, or a new empty list when the hashtag is
        not in the index.
    """
    # Use .get instead of subscripting: indexing a defaultdict with a missing key
    # inserts that key, so a plain lookup of an unknown hashtag would silently
    # grow the shared index with an empty entry.
    return hashtag_to_tweets.get(hashtag, [])

In [18]:
# Spot check of the hashtag index: number of tweets carrying this hashtag (recorded output: 50).
len(get_tweets_by_hashtag('#LiverpoolInter'))

50

In [39]:
import torch

from sentence_embedding_model import SentenceEmbeddingModel

# Every hashtag that occurs in the preprocessed tweets, restricted to those the
# word-embedding model can embed.
unique_hashtags = list(set(hashtag for tweet in sent_emb_tweets for hashtag in tweet.hashtags))
unique_hashtags = word_embedding_model.remove_hashtags_not_part_of_the_vocab(unique_hashtags)

# TODO split into train, val, test
# NOTE(review): the order of list(set(...)) over strings is not stable across
# interpreter restarts (str hashes are randomised per process), so these slices
# pick different hashtags on each fresh kernel.
train_hashtags = unique_hashtags[:10]
test_hashtags = unique_hashtags[10:15]

# Rebinds the global used by get_sent_emb_for_tweet_text; texts that are not yet
# cached will be embedded with this new instance.
sent_emb_model = SentenceEmbeddingModel()


def _collect_hashtag_embeddings(hashtags):
    """Build the per-hashtag inputs and targets for a list of hashtags.

    For each hashtag, in order, this produces
    * the hashtag's word embedding as a tensor, and
    * the mean over the sentence embeddings of all tweets containing the hashtag.

    Args:
        hashtags: Hashtags to process; each one must have at least one tweet in
            the index, because ``torch.stack`` raises on an empty list.

    Returns:
        ``(word_embs, avg_sent_embs)`` - two lists of tensors aligned with ``hashtags``.
    """
    word_embs = []
    avg_sent_embs = []
    for hashtag in hashtags:
        word_embs.append(torch.tensor(word_embedding_model.get_embedding(hashtag)))
        sent_embs = [
            get_sent_emb_for_tweet_text(tweet.text)
            for tweet in get_tweets_by_hashtag(hashtag)
        ]
        avg_sent_embs.append(torch.mean(torch.stack(sent_embs), dim=0))
    return word_embs, avg_sent_embs


words_embs_train, avg_sent_embs_train = _collect_hashtag_embeddings(train_hashtags)
x_train = torch.stack(tuple(words_embs_train))
y_train = torch.stack(tuple(avg_sent_embs_train))

words_embs_test, avg_sent_embs_test = _collect_hashtag_embeddings(test_hashtags)
x_test = torch.stack(tuple(words_embs_test))
y_test = torch.stack(tuple(avg_sent_embs_test))

# Recorded shapes: x is (n_hashtags, 150), y is (n_hashtags, 768).
x_train.shape, y_train.shape, x_test.shape, y_test.shape

(torch.Size([10, 150]),
 torch.Size([10, 768]),
 torch.Size([5, 150]),
 torch.Size([5, 768]))

In [42]:
import pytorch_lightning as pl
import torch
from torch.utils.data import DataLoader

class Dataset(torch.utils.data.Dataset):
    """Map-style dataset with one sample per hashtag.

    Each sample is a dict with
    * ``'x'``: the hashtag's word embedding as a tensor, and
    * ``'y'``: the mean of the sentence embeddings of every tweet that contains
      the hashtag.

    Tweets and sentence embeddings come from the notebook-level helpers
    ``get_tweets_by_hashtag`` and ``get_sent_emb_for_tweet_text``.
    """

    def __init__(self, hashtags: List[str], word_emb_model: WordEmbeddingModel):
        self.word_emb_model = word_emb_model
        self.hashtags = hashtags

    def __len__(self):
        # One item per hashtag.
        return len(self.hashtags)

    def __getitem__(self, idx):
        tag = self.hashtags[idx]

        # Input: word embedding of the hashtag itself.
        x = torch.tensor(self.word_emb_model.get_embedding(tag))

        # Target: sentence embeddings of all tweets with this hashtag, averaged
        # over the tweet axis.
        stacked = torch.stack(
            [get_sent_emb_for_tweet_text(t.text) for t in get_tweets_by_hashtag(tag)]
        )
        y = stacked.mean(dim=0)

        return {'x': x, 'y': y}

class DataModule(pl.LightningDataModule):
    """Lightning data module serving hashtag-embedding datasets for train and test.

    Args:
        train_hashtags: Hashtags used to build the training dataset.
        test_hashtags: Hashtags used to build the test dataset.
        word_emb_model: Model that provides ``get_embedding(hashtag)``.
        batch_size: Batch size for both dataloaders.
    """

    def __init__(self, train_hashtags: List[str], test_hashtags: List[str], word_emb_model: WordEmbeddingModel, batch_size: int = 1):
        # LightningDataModule keeps its own internal state, so the base
        # initialiser has to run before the module is handed to a Trainer.
        super().__init__()

        self.train_hashtags = train_hashtags
        self.test_hashtags = test_hashtags
        self.word_emb_model = word_emb_model
        self.batch_size = batch_size

        # Populated in setup().
        self.train_dataset = None
        self.test_dataset = None

    def setup(self, stage: Optional[str] = None):
        """Create the train and test datasets from the hashtags given to ``__init__``."""
        # Read from the instance, not from the notebook globals of the same name,
        # so the module uses exactly the lists it was constructed with.
        self.train_dataset = Dataset(self.train_hashtags, self.word_emb_model)
        self.test_dataset = Dataset(self.test_hashtags, self.word_emb_model)

    def train_dataloader(self):
        """Return a DataLoader over the training dataset."""
        return DataLoader(self.train_dataset, batch_size=self.batch_size)

    def val_dataloader(self):
        """Return ``None``: no validation split exists yet.

        NOTE(review): confirm the installed Lightning version accepts ``None`` here.
        """
        return None

    def test_dataloader(self):
        """Return a DataLoader over the test dataset."""
        return DataLoader(self.test_dataset, batch_size=self.batch_size)

# Same hashtag selection as in the earlier tensor-building cell, repeated here so
# this cell can construct the data module on its own.
unique_hashtags = list(set(hashtag for tweet in sent_emb_tweets for hashtag in tweet.hashtags))
unique_hashtags = word_embedding_model.remove_hashtags_not_part_of_the_vocab(unique_hashtags)

# TODO split into train, val, test
# NOTE(review): the order of list(set(...)) over strings changes between interpreter
# restarts (str hash randomisation), so these slices are not a reproducible split.
train_hashtags = unique_hashtags[:10]
test_hashtags = unique_hashtags[10:15]

# batch_size is left at the DataModule default (1).
data_module = DataModule(train_hashtags, test_hashtags, word_embedding_model)

ValueError: transformers.__spec__ is None

In [None]:
from hashtag_to_sent_mapper import Hashtag2SentMapper



# Fall back to the CPU when no CUDA device is available.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# The mapper is sized from the feature dimensions of x_train (word embedding)
# and y_train (averaged sentence embedding).
hashtag2SentMapper = Hashtag2SentMapper(x_train.shape[1], y_train.shape[1]).to(device)  # TODO is this the right shape?

# Request a GPU only when one exists; a hard-coded gpus=1 makes the Trainer fail
# on CPU-only machines even though `device` above already falls back to the CPU.
trainer = pl.Trainer(gpus=1 if device.type == 'cuda' else 0, max_epochs=50)
# callbacks=[EarlyStopping(monitor='val_loss', patience=100)])

# NOTE(review): no dataloaders or datamodule are passed here, so this only works if
# Hashtag2SentMapper defines its own dataloaders - otherwise pass `data_module`.
trainer.fit(hashtag2SentMapper)
trainer.test()

In [5]:
# Split the sentence-embedding tweets into a train and a test portion.
train_tweets, test_tweets = dataset_handler.split_into_train_test(sent_emb_tweets)

# Build model inputs and targets for both portions. The recorded output shows hashtags
# that are missing from the Word2Vec vocabulary being filtered out during this call.
# Note that this rebinds `test_hashtags`, which earlier cells set to a slice of
# `unique_hashtags`.
sentences_train, sentences_test, targets_train, targets_test, test_hashtags = dataset_handler.prepare_model_inputs_and_targets(
    train_tweets, test_tweets, word_embedding_model
)

# Recorded shapes: (135144,) and (45057,) - both one-dimensional.
targets_train.shape, targets_test.shape

some hashtags were filtered out since they are not contained in the Word2Vec-Vocab tweet-hashtags: ['#Kyiv', '#UkraineUnderAttack', '#Russian_Ukrainian_War'] | hashtags after filtering: ['#Kyiv', '#UkraineUnderAttack']
some hashtags were filtered out since they are not contained in the Word2Vec-Vocab tweet-hashtags: ['#SecondWorldWar', '#WWII', '#refugees', '#Estonia', '#Sweden', '#Australia', '#UK', '#US', '#Canada', '#Belgium', '#Germany'] | hashtags after filtering: ['#SecondWorldWar', '#WWII', '#refugees', '#Estonia', '#Sweden', '#Australia', '#UK', '#Canada', '#Belgium', '#Germany']
some hashtags were filtered out since they are not contained in the Word2Vec-Vocab tweet-hashtags: ['#Russian_Ukrainian_War', '#RussiaUkraineWar'] | hashtags after filtering: ['#RussiaUkraineWar']
some hashtags were filtered out since they are not contained in the Word2Vec-Vocab tweet-hashtags: ['#UkraineUnderAttaсk'] | hashtags after filtering: []
some hashtags were filtered out since they are not con

((135144,), (45057,))

In [1]:
# Embed the train and test sentences in bulk with a fresh SentenceEmbeddingModel.
# NOTE(review): the recorded run of this cell (In [1]) failed with a NameError on
# SentenceEmbeddingModel - presumably it ran before the import cell in a fresh kernel.
sent_emb_model = SentenceEmbeddingModel()
sent_embs_train = sent_emb_model.generate_embeddings(sentences_train)
sent_embs_test = sent_emb_model.generate_embeddings(sentences_test)
sent_embs_train.shape, sent_embs_test.shape

NameError: name 'SentenceEmbeddingModel' is not defined

In [None]:
from hashtag_to_sent_mapper import Hashtag2SentMapper
import pytorch_lightning as pl
import torch

# Fall back to the CPU when no CUDA device is available.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# NOTE(review): an earlier cell recorded targets_train.shape as (135144,), i.e.
# one-dimensional; with that shape `targets_train.shape[1]` raises IndexError.
# Confirm which dimension the mapper's input size should come from.
hashtag2SentMapper = Hashtag2SentMapper(targets_train.shape[1], sent_embs_train.shape[1]).to(device)  # TODO is this the right shape?

# Request a GPU only when one exists; a hard-coded gpus=1 makes the Trainer fail
# on CPU-only machines even though `device` above already falls back to the CPU.
trainer = pl.Trainer(gpus=1 if device.type == 'cuda' else 0, max_epochs=50)
# callbacks=[EarlyStopping(monitor='val_loss', patience=100)])

# NOTE(review): `data` is not defined in any cell visible here - it must be bound
# to the dataloaders or datamodule for this run before the cell can execute.
trainer.fit(hashtag2SentMapper, data)
trainer.test()