In [1]:
import pickle

from tweet_processor import TweetProcessor
from word_embedding_model import WordEmbeddingModel
from sentence_embedding_model import SentenceEmbeddingModel
import dataset_handler

# Tweet dump that both preprocessing variants below read from.
dataset_file_path = 'datasets/hashtags-en-tweets.jsonl'

tweet_processor = TweetProcessor()

# The word-embedding model is restored from disk instead of being retrained on
# every run. The commented-out lines show how the pickle was produced.
# word_emb_tweets = tweet_processor.preprocess_tweets_for_word_embedding(dataset_file_path)

# word_embedding_model = WordEmbeddingModel()
# word_embedding_model.train(word_emb_tweets)

# pickle.dump(word_embedding_model, open('save_files/word_emb_model.pkl', 'wb'))

# NOTE(review): pickle.load can execute arbitrary code stored in the file;
# only load model files that were written by this project.
# The context manager guarantees the file handle is closed after loading.
with open('save_files/word_emb_model.pkl', 'rb') as model_file:
    word_emb_model = pickle.load(model_file)

sent_emb_tweets = tweet_processor.preprocess_tweets_for_sentence_embedding(dataset_file_path)

# sent_emb_model = SentenceEmbeddingModel()

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from dataset import DataModule
from hashtag_to_sent_mapper import Hashtag2SentMapper

import torch

# Choose the compute device first: CUDA when torch reports it, CPU otherwise.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Restore the data module via its restore_from_file() constructor and override
# the loader settings used by this notebook.
data_module = DataModule.restore_from_file()
data_module.batch_size = 500
data_module.num_workers = 1

# Build the hashtag -> sentence-embedding mapper from the data module's feature
# sizes and move it to the selected device.
hashtag_to_sent_mapper = Hashtag2SentMapper(
    data_module.in_features,
    data_module.out_features,
    learning_rate=0.001,
    hidden_layer1_size=300,
    hidden_layer2_size=500,
).to(device)

In [3]:
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
import pytorch_lightning as pl

# torch.autograd.set_detect_anomaly(True)

# Train on CPU (gpus=0) for at most 500 epochs; early stopping monitors
# 'val_loss' with patience=200. The commented-out arguments are alternative
# debugging/monitoring options that were tried.
trainer = pl.Trainer(
    gpus=0, max_epochs=500, 
    callbacks=[EarlyStopping(monitor='val_loss', patience=200)],
# callbacks=[EarlyStopping(monitor='val_cosine_distance', patience=200)],  
#  track_grad_norm=2, detect_anomaly=True, 
    log_every_n_steps=1
) 
trainer.fit(
    hashtag_to_sent_mapper, 
    data_module.train_dataloader(),
    data_module.val_dataloader()
)
# Ask for the best checkpoint of the preceding fit() explicitly. Calling test()
# with neither a model nor ckpt_path falls back to the same checkpoint but
# emits a "`.test(ckpt_path=None)` was called without a model" warning.
trainer.test(dataloaders=[data_module.test_dataloader()], ckpt_path='best')
# Persist the weights currently held by the trainer (weights only, no optimizer state).
trainer.save_checkpoint('save_files/best_model.ckpt', weights_only=True)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name                  | Type                | Params
--------------------------------------------------------------
0 | _model                | Sequential          | 580 K 
1 | _cosine_distance_loss | CosineEmbeddingLoss | 0     
2 | _cosine_similarity    | CosineSimilarity    | 0     
--------------------------------------------------------------
580 K     Trainable params
0         Non-trainable params
580 K     Total params
2.322     Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]



                                                                           



Epoch 499: 100%|██████████| 7/7 [00:00<00:00,  8.62it/s, loss=0.00881, v_num=20]

  f"`.{fn}(ckpt_path=None)` was called without a model."
Restoring states from the checkpoint path at /workspaces/HASHET/lightning_logs/version_20/checkpoints/epoch=499-step=3000.ckpt





Loaded model weights from checkpoint at /workspaces/HASHET/lightning_logs/version_20/checkpoints/epoch=499-step=3000.ckpt


Testing DataLoader 0: 100%|██████████| 2/2 [00:00<00:00, 38.60it/s]
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
  test_cosine_distance      0.29528093338012695
        test_loss          0.022327067330479622
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


In [5]:
from torch import Tensor

# Flatten the per-tweet hashtag collections into one list (tweet order kept, duplicates kept).
hashtags = [tag for tw in sent_emb_tweets for tag in tw.hashtags]

def get_transformed_hashtag_embedding(
    hashtag: str,
    word_emb_model: WordEmbeddingModel,
    hashtag2SentMapper: Hashtag2SentMapper,
) -> Tensor:
    """Map a hashtag's word embedding through the mapper without tracking gradients.

    Args:
        hashtag: Hashtag to look up via ``word_emb_model.get_embedding``.
        word_emb_model: Object providing ``get_embedding(hashtag)``.
        hashtag2SentMapper: Callable applied to the looked-up embedding.

    Returns:
        Whatever ``hashtag2SentMapper`` returns for the embedding; both the
        lookup and the mapping run inside ``torch.no_grad()``.
    """
    with torch.no_grad():
        hashtag_embedding = word_emb_model.get_embedding(hashtag)
        transformed_embedding = hashtag2SentMapper(hashtag_embedding)
    return transformed_embedding

# Inspect the mapped embedding of the first hashtag. The model instance created
# in this notebook is named `hashtag_to_sent_mapper`; `hashtag2SentMapper` is
# only the parameter name inside get_transformed_hashtag_embedding and is not
# defined at notebook level, so using it here fails on a fresh kernel.
print('before')
get_transformed_hashtag_embedding(hashtags[0], word_emb_model, hashtag_to_sent_mapper)

before


tensor([ 1.9311e-02, -3.6307e-02,  4.0846e-02,  8.5354e-02, -6.7750e-02,
        -2.3759e-02, -2.2721e-03, -7.6248e-02, -1.9770e-02, -7.8939e-02,
         3.0294e-03, -5.2640e-02,  1.2315e-02,  5.8993e-02, -1.9416e-02,
        -5.0486e-02,  3.3549e-03,  3.5711e-02, -5.1872e-02, -5.3912e-02,
         4.8903e-02,  5.5136e-02,  3.5773e-02,  5.8461e-02,  9.6186e-04,
         7.2198e-02, -1.5755e-01, -6.0633e-02, -5.9500e-02, -1.2687e-02,
         2.6658e-02,  9.5729e-02,  4.4948e-02,  7.9280e-02, -7.0561e-03,
         2.7932e-02,  1.0857e-01, -1.1617e-02,  2.7340e-02,  1.3276e-01,
         7.6379e-02, -6.0530e-03,  1.2843e-02, -7.6485e-03, -6.1671e-02,
         2.0695e-03,  5.8715e-03,  1.3272e-01,  1.5155e-02,  9.1497e-02,
        -2.4424e-02, -5.4722e-02,  1.5701e-02,  9.9927e-03,  4.3658e-02,
         1.8088e-02,  8.9851e-02,  5.3792e-02, -4.5926e-02,  4.6862e-02,
        -1.2422e-01, -1.4561e-01, -4.1013e-02, -8.4233e-02,  7.3806e-02,
         7.9305e-02,  5.7702e-02, -4.8380e-02,  2.5

In [7]:
# Same probe as the 'before' cell. Pass the notebook-level model
# `hashtag_to_sent_mapper`; the name `hashtag2SentMapper` is not defined
# outside the helper's parameter list.
print('after')
get_transformed_hashtag_embedding(hashtags[0], word_emb_model, hashtag_to_sent_mapper)

after


tensor([-2.4266e+00,  1.5253e+00, -1.5048e+00, -3.7962e+00, -4.8223e+00,
        -1.9657e+00,  6.1075e+00,  1.5758e+00, -3.0358e-02, -2.9325e+00,
        -4.0804e+00, -1.1042e+00, -3.0132e+00,  9.5930e-01,  6.6828e+00,
         2.0320e+00, -1.3290e+00,  3.2539e+00,  1.3742e+00, -1.0440e+00,
         8.1541e-01, -2.2690e+00,  1.0848e+00, -2.8071e+00,  1.3118e+00,
        -1.2969e+00, -7.1366e-01, -8.3527e-01, -1.1823e+00,  1.2938e+00,
         1.6526e-01,  2.7041e+00, -3.1776e+00,  1.0766e-01,  2.5182e+00,
        -7.0047e-01,  2.6078e+00, -3.5728e-01,  2.6673e+00,  2.9091e+00,
        -7.1166e-01, -1.2255e+00,  2.8295e+00, -5.2520e-01, -3.5899e-01,
        -4.1030e+00, -2.7239e+01, -9.7231e-01, -3.1204e+00, -2.5185e+00,
         7.2273e-01,  2.3606e-01,  1.5567e+00,  4.8183e-01, -2.0953e+00,
         5.3699e+00, -8.9445e-01,  1.6214e+00,  1.4679e-01,  9.9692e-02,
         7.7356e-01,  4.5966e-01, -1.4301e+00,  1.6263e+00, -1.4973e+00,
         3.6232e+00, -1.0655e+00,  4.0679e+00, -3.1

In [8]:
import torch

# Placeholder targets: ids 0..3999, each mapped to a fresh random vector of length 768.
test_id_to_y = dict((sample_id, torch.rand(768)) for sample_id in range(4000))

In [None]:
# Split the preprocessed tweets into a train and a test part.
train_tweets, test_tweets = dataset_handler.split_into_train_test(sent_emb_tweets)

# Pass the word-embedding model that this notebook actually loads
# (`word_emb_model`). `word_embedding_model` is only assigned in commented-out
# training code, so referencing it raises NameError on a fresh kernel.
sentences_train, sentences_test, targets_train, targets_test, test_hashtags = dataset_handler.prepare_model_inputs_and_targets(
    train_tweets, test_tweets, word_emb_model
)

# Show the sizes of both target arrays.
targets_train.shape, targets_test.shape

some hashtags were filtered out since they are not contained in the Word2Vec-Vocab tweet-hashtags: ['#Kyiv', '#UkraineUnderAttack', '#Russian_Ukrainian_War'] | hashtags after filtering: ['#Kyiv', '#UkraineUnderAttack']
some hashtags were filtered out since they are not contained in the Word2Vec-Vocab tweet-hashtags: ['#SecondWorldWar', '#WWII', '#refugees', '#Estonia', '#Sweden', '#Australia', '#UK', '#US', '#Canada', '#Belgium', '#Germany'] | hashtags after filtering: ['#SecondWorldWar', '#WWII', '#refugees', '#Estonia', '#Sweden', '#Australia', '#UK', '#Canada', '#Belgium', '#Germany']
some hashtags were filtered out since they are not contained in the Word2Vec-Vocab tweet-hashtags: ['#Russian_Ukrainian_War', '#RussiaUkraineWar'] | hashtags after filtering: ['#RussiaUkraineWar']
some hashtags were filtered out since they are not contained in the Word2Vec-Vocab tweet-hashtags: ['#UkraineUnderAttaсk'] | hashtags after filtering: []
some hashtags were filtered out since they are not con

((135144,), (45057,))

In [6]:
from dataset import CachedDataset
import torch

# Quick manual check of CachedDataset.memory_efficient_mean using an identity
# transform and two small integer tensors.
func = lambda x: x
t1 = torch.tensor([1, 2])
t2 = torch.tensor([3, 4])

# Recorded result for these inputs: tensor([2., 3.])
CachedDataset.memory_efficient_mean([t1, t2], func, 2)

tensor([2., 3.])

In [4]:
from dataset import DataModule

# Instantiate the sentence-embedding model that the data module needs.
sent_emb_model = SentenceEmbeddingModel()

# Build the data module from the preprocessed tweets plus both embedding
# models, then run its setup step.
# NOTE(review): the recorded run of this cell ended in a KeyboardInterrupt
# right after a dataset-caching message, so setup() appears to be
# long-running — confirm before re-running.
data_module = DataModule(sent_emb_tweets, word_emb_model, sent_emb_model)
data_module.setup()



Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Cashing Dataset: save_files/train_dataset.pkl ...


KeyboardInterrupt: 

In [3]:
from dataset import DataModule

def get_all_samples():
    data_module = DataModule.restore_from_file()
    data_module.batch_size = 1

    all_samples = []
    all_samples.extend(data_module.train_dataloader())
    all_samples.extend(data_module.val_dataloader())
    all_samples.extend(data_module.test_dataloader())

    return all_samples

# Materialize the train, val and test samples once so later cells can reuse the list.
all_samples = get_all_samples()

In [4]:
# Shape of the 'x' entry of the first sample. `all_samples` is already a list
# (see get_all_samples), so it is indexed directly instead of being copied first.
all_samples[0]['x'].shape

torch.Size([1, 150])

In [7]:
# Total number of collected samples (train + val + test combined).
len(all_samples)

3913

In [3]:
import random
import numpy as np
import pickle

def time_saving_k_fold_cross_validation(k: int, percent_of_train_set_used_for_val: float = 1 / 8):
    """Shuffle all samples, cut them into ``k`` folds and pickle the splits per fold.

    For every fold index ``split_idx`` the fold itself becomes the test set.
    The remaining folds are concatenated; the leading
    ``percent_of_train_set_used_for_val`` share of them becomes the validation
    set and the rest the training set. Each of the three sets is written as a
    pickled ``{position: sample}`` dict to
    ``'<train|val|test>_dataset_split_<split_idx>'`` in the current working
    directory, and the three set sizes are printed.

    Args:
        k: Number of folds; must be at least 1.
        percent_of_train_set_used_for_val: Fraction (0..1) of the non-test
            samples used for validation.

    Raises:
        ValueError: If ``k`` is smaller than 1.

    Note:
        Shuffling uses the global ``random`` state; seed it beforehand for
        reproducible folds.
    """
    if k < 1:
        raise ValueError('k must be a positive integer')

    all_samples = get_all_samples()

    random.shuffle(all_samples)

    # Partition with plain list slicing so the samples are never coerced into
    # a numpy array. Fold sizes match np.array_split: the first `remainder`
    # folds receive one extra sample.
    fold_size, remainder = divmod(len(all_samples), k)
    partitions = []
    start = 0
    for fold_idx in range(k):
        end = start + fold_size + (1 if fold_idx < remainder else 0)
        partitions.append(all_samples[start:end])
        start = end

    for split_idx in range(k):
        test_set = partitions[split_idx]
        train_val_set = [
            sample
            for fold_idx, partition in enumerate(partitions)
            if fold_idx != split_idx
            for sample in partition
        ]

        val_size = int(len(train_val_set) * percent_of_train_set_used_for_val)
        val_set = train_val_set[:val_size]
        train_set = train_val_set[val_size:]

        for split, split_name in [(train_set, 'train'), (val_set, 'val'), (test_set, 'test')]:
            # Key every sample by its position within the split.
            cache = dict(enumerate(split))

            # pickle needs a binary, writable file handle.
            with open(f'{split_name}_dataset_split_{split_idx}', 'wb') as file:
                pickle.dump(cache, file)

        print(len(train_set), len(val_set), len(test_set))

# Run the split for 5 folds; the function prints the train/val/test sizes of each fold.
time_saving_k_fold_cross_validation(k=5)

2739 391 783
2739 391 783
2739 391 783
2740 391 782
2740 391 782


In [None]:
# Take the leading 20 % of every split as its validation part.
for split in splits:
    # Slice bounds must be integers; a bare `len(split) * 0.2` is a float and
    # raises TypeError, so truncate it with int().
    val = split[:int(len(split) * 0.2)]

In [17]:
# Draw 20 % of the first split without replacement, then print the size of the
# draw and of the full split on separate lines.
val = np.random.choice(splits[0], size=int(len(splits[0]) * 0.2), replace=False)
print(len(val), len(splits[0]), sep='\n')

156
783
