In [1]:
import os
import re
import sys
import argparse
import random
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch_geometric.transforms as T
from torch_cluster import random_walk
from torch.utils.data import Dataset, DataLoader, random_split
from tqdm.auto import tqdm
#from collections import defaultdict

sys.path.append("../")
from model.graph_sage import GraphSAGE, GraphSAGE_Dataset
from utils.loss import ListNet, MythNet
from utils.data_processing import get_process_data
from utils.data_loader import load_document
from utils.eval import retrieval_normalized_dcg_all, retrieval_precision_all
from utils.toolbox import get_free_gpu, same_seeds, show_settings, record_settings, get_preprocess_document, get_preprocess_document_embs, get_preprocess_document_labels, get_word_embs, split_data, doc_filter, generate_graph



In [2]:
# Experiment settings shared by every cell below.
config = {
    # model / data selection
    'model': 'ZTM',
    'dataset': 'tweet',
    'dataset_name': 'tweet',
    'vocabulary_size': 100,
    'encoder': 'roberta',   # selects the document encoder and the embedding width
    'target': 'tf-idf',     # key used to pick the label matrix and vocabulary
    # optimisation
    'lr': 1e-4,
    'optimizer': 'adam',
    'momentum': 0.99,
    'seed': 123,
    'epochs': 10,
    'batch_size': 16,
    # train/validation split and evaluation
    'ratio': 0.8,           # fraction of the dataset used for training
    'topk': [5, 10, 15],
    'visualize': True,
    'threshold': 0.7,
}

# Print the settings, then seed with config['seed'].
show_settings(config)
same_seeds(config['seed'])

-------- Info ---------
model: ZTM
dataset: tweet
dataset_name: tweet
vocabulary_size: 100
encoder: roberta
target: tf-idf
lr: 0.0001
optimizer: adam
momentum: 0.99
seed: 123
epochs: 10
batch_size: 16
ratio: 0.8
topk: [5, 10, 15]
visualize: True
threshold: 0.7

-----------------------


In [3]:
# Data preprocessing: every config entry is forwarded as a keyword argument,
# and the helper returns the raw corpus together with its preprocessed version.
unpreprocessed_corpus, preprocessed_corpus = get_preprocess_document(**config)

Getting preprocess documents: tweet
min_df: 1 max_df: 1.0 vocabulary_size: 100 min_doc_word: 15


Reusing dataset tweet_eval (/dhome/casimir0304/.cache/huggingface/datasets/tweet_eval/emotion/1.1.0/12aee5282b8784f3e95459466db4cdf45c6bf49719c25cdb0743d71ed0410343)


In [4]:
# Generate document embeddings for the preprocessed corpus with the encoder
# named in config['encoder'].
# NOTE(review): the second return value is presumably the encoder model itself
# -- confirm against utils.toolbox.get_preprocess_document_embs.
doc_embs, doc_model = get_preprocess_document_embs(preprocessed_corpus, config['encoder'])

Getting preprocess documents embeddings
Using cuda 0 for training...


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

In [5]:
# Decode target & Vocabulary
# `labels` and `vocabularys` are both indexed by target name (config['target']).
labels, vocabularys = get_preprocess_document_labels(preprocessed_corpus)

# Vocabulary for the configured target; looked up once and reused below.
target_vocabulary = vocabularys[config['target']]

# Bidirectional index <-> token maps. enumerate() yields the same (position, token)
# pairs as the previous zip(range(0, len(...)), ...) construction.
id2token = dict(enumerate(target_vocabulary))
token2id = {token: idx for idx, token in id2token.items()}

Getting preprocess documents labels


In [None]:
# Word embedding preparation: look up an embedding for every token of the
# target vocabulary, requesting the result with data_type='tensor'.
word_embeddings = get_word_embs(
    vocabularys[config['target']],
    id2token=id2token,
    data_type='tensor',
)

0it [00:00, ?it/s]

In [None]:
# Set of every token in the target vocabulary (the keys of token2id).
vocab_set = set(token2id.keys())

# Run doc_filter over each preprocessed document against that vocabulary,
# with a progress bar over the corpus.
doc_list = [
    doc_filter(document, vocab_set)
    for document in tqdm(preprocessed_corpus, desc="Delete word from raw document:")
]

In [None]:
# Build Graph
# `doc_list` (and `vocab_set`) are produced by the preceding cell; they are reused
# here rather than re-filtering the whole corpus a second time with identical inputs.
# The edge list returned by generate_graph is converted to an int64 tensor and
# transposed -- presumably into the (2, num_edges) layout GraphSAGE expects; TODO confirm.
edge_index = (
    torch.tensor(generate_graph(doc_list, token2id, id2token), dtype=torch.long)
    .t()
    .contiguous()
)

In [None]:
# Prepare dataset: pair the raw documents with their embeddings and target labels.
dataset = GraphSAGE_Dataset(
    unpreprocessed_corpus,
    doc_embs,
    labels[config['target']],
)

# Split sizes: config['ratio'] of the samples go to training (rounded down),
# the remainder to validation.
training_length = int(len(dataset) * config['ratio'])
validation_length = len(dataset) - training_length

# NOTE(review): the split generator is seeded with a fixed 42, independent of
# config['seed'] -- confirm this is intentional.
training_set, validation_set = random_split(
    dataset,
    lengths=[training_length, validation_length],
    generator=torch.Generator().manual_seed(42),
)

In [None]:
# Define document embeddings dimension.
# 'doc2vec' -> 200, 'average' -> 300; any other encoder name falls back to 768.
contextual_size = {'doc2vec': 200, 'average': 300}.get(config['encoder'], 768)

In [None]:
# Build the GraphSAGE model from the graph, the target vocabulary and its
# word embeddings; vocab_size is the length of that vocabulary.
model = GraphSAGE(
    config=config,
    edge_index=edge_index,
    vocabulary=vocabularys[config['target']],
    id2token=id2token,
    contextual_size=contextual_size,
    vocab_size=len(vocabularys[config['target']]),
    word_embeddings=word_embeddings,
)

In [None]:
# Fit the model, passing the training split and the validation split.
# NOTE(review): epochs / batch size / lr are presumably read from the config
# given to GraphSAGE at construction -- confirm in model.graph_sage.
model.fit(training_set, validation_set)