In [None]:
!pip install torch torch_geometric transformers tokenizers

In [2]:
# Colab-only setup: mount Google Drive, then change the working directory to the
# project's `src` folder on the mounted drive.
# Presumably this is what lets the later `from data import ...`, `from encoder import ...`
# style imports resolve — confirm the modules live in that folder.
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/MyDrive/co-citation-prediction/src

Mounted at /content/drive
/content/drive/MyDrive/co-citation-prediction/src


In [3]:
import torch
from tokenizers import Tokenizer
from torch.utils.data import DataLoader
import numpy as np
import pandas as pd

from data import EncoderDataset, GraphDataset, RegressionDataset
from encoder import EncoderModel
from graph import GraphModel
from model import Model
from regression import RegressionModel

In [4]:
# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# Show whether CUDA was detected so the reader knows which device is in use.
print(torch.cuda.is_available())

True


In [None]:
# Ask PyTorch to release cached GPU memory that is not currently in use.
torch.cuda.empty_cache()

In [None]:
import gc

# Manual memory reset: drop the references held by `embeddings` and `test`,
# run a garbage-collection pass, then release PyTorch's cached GPU memory.
embeddings = test = None
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Location of the input table, relative to the working directory.
data_path = "../data/data.feather"
# Tokenizer identifier; the same string is used later as the encoder model name.
tokenizer_name = "sentence-transformers/all-MiniLM-L6-v2"
# Project dataset built from the data file, the tokenizer name and the target device.
encoder_dataset = EncoderDataset(data_path, tokenizer_name, device)
# Batches of 64 items in dataset order (shuffle=False), assembled by the dataset's
# own collate_fn. The embedding rows produced from this loader are later used
# positionally, so the fixed order presumably matters — confirm before enabling shuffle.
encoder_loader = DataLoader(
    encoder_dataset,
    batch_size=64,
    shuffle=False,
    collate_fn=encoder_dataset.collate_fn
)

In [None]:
# BERT-style configuration for the encoder: 6 hidden layers, hidden size 384,
# 12 attention heads, vocabulary of 30522 tokens. The hidden size equals the
# width (384) of the embedding array allocated in the encoding cell.
encoder_config = {
    "_name_or_path": "nreimers/MiniLM-L6-H384-uncased",
    "architectures": ["BertModel"],
    "attention_probs_dropout_prob": 0.1,
    "gradient_checkpointing": False,
    "hidden_act": "gelu",
    "hidden_dropout_prob": 0.1,
    "hidden_size": 384,
    "initializer_range": 0.02,
    "intermediate_size": 1536,
    "layer_norm_eps": 1e-12,
    "max_position_embeddings": 512,
    "model_type": "bert",
    "num_attention_heads": 12,
    "num_hidden_layers": 6,
    "pad_token_id": 0,
    "position_embedding_type": "absolute",
    "transformers_version": "4.8.2",
    "type_vocab_size": 2,
    "use_cache": True,
    "vocab_size": 30522,
}
# Model identifier handed to EncoderModel; it is the same string as `tokenizer_name`,
# so tokenizer and encoder refer to the same checkpoint name.
encoder_model_name = "sentence-transformers/all-MiniLM-L6-v2"
# Build the project's encoder from the name and the config dict, then move it to `device`.
# NOTE(review): how EncoderModel combines the name with the config is not visible here.
encoder_model = EncoderModel(encoder_model_name, encoder_config).to(device)

In [None]:
import time  # only used by an ad-hoc debugging throttle; kept so the name stays available

# Encode every batch and keep one 384-wide vector per input row.
#
# * The forward passes run under torch.no_grad(): this cell only reads the outputs,
#   so no autograd state needs to be kept alive on the GPU.
# * Per-batch results are gathered in a list and concatenated once at the end,
#   instead of re-copying the whole array on every iteration.
# * The list is seeded with an empty float64 (0, 384) array so the result has the
#   same shape/dtype as before, including when the loader yields no batches.
batch_embeddings = [np.empty((0, 384))]
with torch.no_grad():
    for batch in encoder_loader:
        # The collated batch is transposed before being fed to the model.
        # NOTE(review): presumably collate_fn yields (seq, batch) — confirm.
        batch = batch.T
        infer = encoder_model(batch)
        # First model output, position 0 along the second axis for every row;
        # presumably the first-token ([CLS]) hidden state — confirm against EncoderModel.
        cls_rows = infer[0][:, 0, :]
        batch_embeddings.append(cls_rows.detach().cpu().numpy())
embeddings = np.concatenate(batch_embeddings)
# Release cached GPU memory once, after all batches are done.
torch.cuda.empty_cache()

In [None]:
# Wrap the encoder embeddings and the dataset's table (`encoder_dataset.data`)
# in the project's GraphDataset, and iterate it in order with DataLoader defaults.
# NOTE(review): neither graph_dataset nor graph_loader is referenced by the later
# cells visible in this notebook — confirm they are still needed.
graph_dataset = GraphDataset(embeddings, encoder_dataset.data)
graph_loader = DataLoader(graph_dataset, shuffle=False)

In [8]:
import pandas as pd
from itertools import combinations
from torch_geometric.data import Dataset as GeoDataset
from torch_geometric.data import Data as GeoData
from torch.nn.utils.rnn import pad_sequence

In [9]:
# One (id, reference) pair per exploded row, transposed into a 2 x E array:
# row 0 holds the "id" column, row 1 the matching "references" entry.
edge_index = (
    encoder_dataset.data[["id", "references"]]
    .explode("references")
    .to_numpy()
    .T
)

In [10]:
# Map every paper identifier in `edge_index` to a consecutive integer node id and
# build the re-indexed edge array.
#
# Ids are handed out in first-seen order while scanning row 0 completely and then
# row 1, so every paper in the "id" row is numbered before any paper that appears
# only as a reference.
#
# graph_ei is allocated as int64: it holds node indices, and a float array (the
# np.empty default) cannot be used as an index tensor downstream.
paper_to_id = {}
id_to_paper = {}
graph_ei = np.empty(edge_index.shape, dtype=np.int64)
for i in range(edge_index.shape[0]):
    for j in range(edge_index.shape[1]):
        paper = edge_index[i, j]
        # A new paper receives the next free id (= number of papers seen so far);
        # a known paper keeps the id it already has.
        node_id = paper_to_id.setdefault(paper, len(paper_to_id))
        id_to_paper.setdefault(node_id, paper)
        graph_ei[i, j] = node_id
# Number of distinct papers, i.e. the next unused id.
curr_id = len(paper_to_id)

In [17]:
# Build a feature matrix with one row per graph node.
#
# Node ids run from 0 to len(id_to_paper) - 1, so the node count is
# len(id_to_paper); using max(id) here would leave the matrix one row short and
# the highest-numbered node without features.
# NOTE(review): this assumes the first embeddings.shape[0] node ids correspond to
# the rows of `embeddings` in order — confirm that dataset ids are unique.
num_nodes = len(id_to_paper)
remaining_papers = num_nodes - embeddings.shape[0]
if remaining_papers < 0:
    raise ValueError(
        f"Found {embeddings.shape[0]} embeddings but only {num_nodes} graph nodes"
    )
# Nodes without an encoder embedding get the mean embedding as a placeholder.
average_embed = np.mean(embeddings, axis=0, keepdims=True)
remaining_papers_embed = np.repeat(average_embed, remaining_papers, axis=0)
final_embed = np.concatenate((embeddings, remaining_papers_embed), axis=0)

In [None]:
# Package node features and edges as a torch_geometric Data object.
# The NumPy arrays are converted to tensors first: node features as float32 and
# edge indices as int64 (torch.long), the dtypes PyG layers operate on.
data = GeoData(
    x=torch.as_tensor(final_embed, dtype=torch.float),
    edge_index=torch.as_tensor(graph_ei, dtype=torch.long),
    # Stored as an extra attribute on the Data object, as in the original call.
    is_directed=True,
)

In [None]:
# Instantiate the project's graph model with its default constructor arguments.
# NOTE(review): the GeoData object `data` is not passed in here or in any later
# visible cell — confirm how the graph model receives the graph.
graph_model = GraphModel()

In [None]:
# Regression dataset built from the dataset's table and `graph_model.embeddings`,
# iterated in order with DataLoader defaults.
# NOTE(review): graph_model has only just been constructed at this point — confirm
# that `.embeddings` is already populated.
regression_dataset = RegressionDataset(encoder_dataset.data, graph_model.embeddings)
regression_loader = DataLoader(regression_dataset, shuffle=False)

In [None]:
# Instantiate the project's regression model with its default constructor arguments.
regression_model = RegressionModel()

In [None]:
# Combine the encoder, graph and regression models into the project's top-level Model.
model = Model(encoder_model, graph_model, regression_model)

In [None]:
# Run the model's setup step with the encoder data loader.
# NOTE(review): what setup() does with the loader is defined in model.py, not shown here.
model.setup(encoder_loader)

In [None]:
# Call the model's train() method.
# NOTE(review): if Model subclasses torch.nn.Module and does not override train(),
# this only switches the module to training mode and does not run a training
# loop — confirm against model.py.
model.train()