# ALTeGraD 2023 Data Challenge
## Molecule Retrieval with Natural Language Queries
### École Polytechnique

MLP

## Drive and GitHub utils

### Retrieve data

We unzip the data.zip file directly into the virtual file system that Colab creates for each notebook. This file system is temporary and specific to each notebook, so the operation must be repeated every time the notebook is started (see below). It is the only way I found to avoid corrupting the data.

In [None]:
# Mount Google Drive at /content/drive so that the files stored there
# (the git repository and data.zip) are reachable from the notebook.
# force_remount=True asks Colab to mount again even if a mount already exists.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

In [None]:
#################
## TODO:
## Replace the placeholder path below with the location of the git
## repository on your Google Drive.
## data.zip must be inside that git folder!

# Even: ./drive/MyDrive/ENPC/MVA/ALTEGRAD/Project/Code/Public/data.zip
# Joy: ./drive/MyDrive/MVA/Altegrad_project/data.zip
#################

# No -d option is given, so the archive is extracted into the current
# working directory (presumably /content at this point; the relative path
# above is resolved from it).
!unzip ./drive/MyDrive/.../altegrad_challenge_2024/data.zip

### Set the working directory to the git folder


In [None]:
#################
## TODO:
## Replace the placeholder path below with the location of the git
## repository on your Google Drive.

# Even: /content/drive/MyDrive/ENPC/MVA/ALTEGRAD/Project/Code/Public/
#################

# Make the git repository the working directory, then print it to confirm.
# Presumably needed so that the project's local modules can be imported
# and relative paths resolve against the repository.
%cd /content/drive/MyDrive/.../altegrad_challenge_2024
!pwd

### GitHub operations?

In [None]:
# Optional git commands: uncomment and adapt the ones you need.
#!git status
#!git checkout <branch>
#!git checkout -b <new_branch>

# Fetch and merge the latest changes into the repository in the current
# working directory.
!git pull

## Imports

In [None]:
!pip install -r /content/drive/MyDrive/ENPC/MVA/ALTEGRAD/Project/Code/Public/requirements.txt
# Joy: !pip install -r /content/requirements.txt

## *-----Beginning of the run locally-----*

## Data path

The data path must lead from the git repository folder to the data in the virtual file system, so it must have the following format:

In [None]:
# Folder holding the extracted challenge data; the cells below build the
# paths of the embedding dictionary and of the datasets from it.
# In Colab
data_path = '/content/data'

## Main script

In [None]:
from dataloader import GraphTextDataset, GraphDataset, TextDataset
from Model import Model
from train_val_test import train, test
import LossFunctions

from torch_geometric.loader import DataLoader
import numpy as np
from transformers import AutoTokenizer
import torch
from torch import optim
import pandas as pd
import os

In [None]:
##################################################
## TRAINING
##
## Builds everything the training call needs: tokenizer, model, datasets,
## data loaders, optimizer, loss, the hyper-parameter record and the
## checkpoint directory.

# Select model
model_name = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# nout = bert model hidden dim
model = Model(
    model_name=model_name,
    num_node_features=300,
    nout=768,
    nhid=300,
    graph_hidden_channels=300,
    graph_gnnlayers=4,
)
model.to(device)

# Load data
# The .npy file stores a pickled object inside a 0-d array; indexing with
# [()] unwraps it. NOTE: allow_pickle=True runs the pickle machinery, so
# only load this file from a trusted source.
gt = np.load(f"{data_path}/token_embedding_dict.npy", allow_pickle=True)[()]
val_dataset = GraphTextDataset(root=f'{data_path}/', gt=gt, split='val', tokenizer=tokenizer)
train_dataset = GraphTextDataset(root=f'{data_path}/', gt=gt, split='train', tokenizer=tokenizer)

# Hyper-parameters
nb_epochs = 40
batch_size = 64
learning_rate = 3e-5
optimizer = optim.AdamW(model.parameters(), lr=learning_rate, betas=(0.9, 0.999), weight_decay=0.01)
# Optional learning-rate scheduler (pass scheduler=scheduler to train to use it):
# scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=2, factor=0.1, threshold=0.1, threshold_mode='rel', verbose=True)

# NOTE(review): the validation loader also shuffles and drops the last
# incomplete batch. Presumably drop_last is required because the loss is
# built for a fixed batch_size -- confirm before changing either flag.
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True, drop_last=True)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, drop_last=True)

# Name of the similarity measure; any value containing 'cosine' selects
# cosine similarity, both in the loss here and when scoring the test set.
similarity = 'cosine'
loss = LossFunctions.NTXent(device, batch_size, 0.1, use_cosine_similarity='cosine' in similarity)

# Plain-value record of the run configuration, handed to train().
hyper_param = {
    "nb_epochs": nb_epochs,
    "batch_size": batch_size,
    "loss": "NTXent",
    "learning_rate": learning_rate,
    "LRAP": similarity,
    "optimizer": str(optimizer),
    "model": str(model),
}

# Save path
save_path = './model_checkpoints/test'
# exist_ok=True creates the directory if needed without a separate
# existence check (no check-then-create race).
os.makedirs(save_path, exist_ok=True)

In [None]:
# Launch training with the objects prepared in the previous cell.
train(
    nb_epochs,
    optimizer,
    loss,
    model,
    train_loader,
    val_loader,
    save_path,
    device,
    hyper_param,
    print_every=50,
)
# Variant with a learning-rate scheduler (requires `scheduler` to be defined):
# train(nb_epochs, optimizer, loss, model, train_loader, val_loader, save_path, device, hyper_param, scheduler=scheduler, print_every=1)

In [None]:
## TESTING
##
## Builds the test datasets and computes text and graph embeddings with the
## checkpoint stored under save_path.

# Presumably the file written by train() into save_path -- confirm the name.
model_path = os.path.join(save_path, 'model.pt')

# Read the test files from data_path, like the training and validation
# data, instead of a hard-coded './data/' relative to the working directory.
# NOTE(review): nrows=10 looks like a debugging limit on the number of test
# samples -- confirm and remove it before generating a real submission.
test_cids_dataset = GraphDataset(root=f'{data_path}/', gt=gt, split='test_cids', nrows=10)
test_text_dataset = TextDataset(file_path=f'{data_path}/test_text.txt', tokenizer=tokenizer, nrows=10)

text_embeddings, graph_embeddings = test(model_path, model, test_cids_dataset, test_text_dataset, device)

In [None]:
## GENERATE OUTPUT
##
## Scores every text embedding against every graph embedding and writes the
## score matrix to submission.csv with a leading 'ID' column.

from sklearn.metrics.pairwise import cosine_similarity, safe_sparse_dot

# Keep the scores in their own variable: overwriting the `similarity`
# string with the matrix would break the test below when the cell is re-run.
if 'cosine' in similarity:
    similarity_matrix = cosine_similarity(text_embeddings, graph_embeddings)
else:
    similarity_matrix = np.dot(text_embeddings, np.transpose(graph_embeddings))

# One row per text embedding, one column per graph embedding.
solution = pd.DataFrame(similarity_matrix)
# Put the row index first as the 'ID' column.
solution.insert(0, 'ID', solution.index)
solution.to_csv('submission.csv', index=False)