In [None]:
import torch
import pandas as pd
from pathlib import Path
from tqdm.auto import tqdm
import numpy as np
import wandb

In [None]:
# Short name used as the prefix of the exported ONNX file and the saved model directory.
model_name_base = 'graphormer-base-pcqm4mv1'
# Checkpoint identifier handed to from_pretrained() when the model is loaded.
model_name = 'clefourrier/graphormer-base-pcqm4mv1'

In [None]:
# Number of graphs per batch; shared by the train, validation and test DataLoaders.
batch_size = 2

### Load Dataset

In [None]:
# Read the dataset from a CSV file; the path is relative to the current working directory.
dataframe = pd.read_csv("data_10k_graph.csv")

In [None]:
# Show how much of each column is missing.
# isna().mean() yields a fraction between 0.0 and 1.0 per column, not a 0-100 percentage.
print('Percentage on NaNs:')
dataframe.isna().mean()

In [None]:
# Drop four columns that the remaining cells of this notebook never reference.
dataframe = dataframe.drop(columns=['Smiles', 'ecfp1', 'ecfp2', 'ecfp3'])

In [None]:
def preprocess_array_column(df, column):
    """Replace stringified Python expressions in one column by their values, in place.

    Every cell of ``df[column]`` is expected to hold the text of a Python
    expression (for example ``"[[0, 1], [1, 0]]"``). Each cell is evaluated
    and the resulting object is stored back in the same row.

    Args:
        df: DataFrame to modify in place. Any index is supported; rows are
            matched by position, never by label.
        column: Label of the column whose cells are parsed.

    Returns:
        None. ``df[column]`` is overwritten with the parsed objects.
    """
    # SECURITY: eval() executes arbitrary code, so only run this on CSV files
    # you trust. If every cell is a plain literal (lists, numbers, strings),
    # ast.literal_eval is a safer drop-in.
    parsed = [eval(cell) for cell in tqdm(df[column])]
    # Re-use the frame's own index so the values line up row by row. The
    # previous version read rows by position (iloc) but wrote them back by
    # label (at), which corrupted frames whose index was not 0..n-1.
    # dtype=object keeps each parsed list as a single cell.
    df[column] = pd.Series(parsed, index=df.index, dtype=object)

In [None]:
# Parse every column that was stored in the CSV as a stringified Python object.
for _array_column in ('node_feat', 'edge_index', 'edge_attr', 'y'):
    preprocess_array_column(dataframe, _array_column)

### Normalize target

In [None]:
# Each parsed target is a sequence; keep only its first element so the column holds one value per row.
# Not idempotent: re-running this cell on already unwrapped values will fail.
dataframe['y'] = dataframe['y'].apply(lambda x: x[0])

In [None]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler() # maps the data into the interval [0, 1]
# The target is reshaped into a single column before fitting, and the scaled values replace the raw ones.
# NOTE(review): the scaler is fitted on all rows before the train/validation/test split is made,
# so held-out targets influence the scaling - confirm this is acceptable.
dataframe['y'] = scaler.fit_transform(dataframe['y'].to_numpy().reshape(-1, 1))

In [None]:
# Wrap every scaled value back into a one-element list (presumably the label format expected downstream - TODO confirm).
dataframe['y'] = dataframe['y'].apply(lambda x: [x])

In [None]:
# Inspect the transformed target column.
dataframe['y']

### Create Dataset Splits

In [None]:
from datasets import Dataset, DatasetDict

# Convert the frame to a Hugging Face Dataset and hold out 20% of the rows.
dataset = Dataset.from_pandas(dataframe)
train_testvalid = dataset.train_test_split(test_size=0.2, seed=15)

# Split the held-out 20% in half: one half becomes the test set, the other the validation set.
test_valid = train_testvalid['test'].train_test_split(test_size=0.5, seed=15)

# 80% for train, 10% for validation, 10% for test
dataset = DatasetDict({
    'train': train_testvalid['train'],
    'test': test_valid['test'],
    'validation': test_valid['train']})

dataset

In [None]:
from transformers.models.graphormer.collating_graphormer import preprocess_item, GraphormerDataCollator

# Apply preprocess_item to every example of every split ahead of time (one example per call).
dataset_processed = dataset.map(preprocess_item, batched=False)
# data_loader = GraphormerDataCollator(on_the_fly_processing=True) # either preprocess up front (as above) or use the collator with on_the_fly_processing

In [None]:
# Write the preprocessed splits to disk so they can be reloaded with load_from_disk.
# NOTE(review): the commented-out load cell below points to 'dataset_10k_graphormer_preprocessed',
# which is a different directory from the one written here.
dataset_processed.save_to_disk('dataset_10k_graphormer_preprocessed_normilized')

In [None]:
# from datasets import load_from_disk
# dataset_processed = load_from_disk('dataset_10k_graphormer_preprocessed')

### Create Model

In [None]:
# from transformers import AutoModel, AutoConfig

# config = AutoConfig.from_pretrained(model_name)
# AutoModel.from_pretrained(model_name, config=config)

In [None]:
# from transformers import AutoModel, AutoConfig

# class MolecularPropertiesRegression(torch.nn.Module):
#     def __init__(self, model_name, num_properties):
#         super(MolecularPropertiesRegression, self).__init__()
#         self.num_properties = num_properties

#         config = AutoConfig.from_pretrained(model_name)
#         self.transformer = AutoModel.from_pretrained(model_name, config=config)
#         # removing last layer of transformer
#         self.transformer.pooler = torch.nn.Identity()
#         # freezing transformer weights
#         for param in self.transformer.parameters():
#             param.requires_grad = False
#         self.regressor = torch.nn.Linear(768, num_properties)

#     def forward(self, input_ids = None, attention_mask=None):
#         outputs = self.transformer(input_ids=input_ids, attention_mask=attention_mask)
        
#         last_hidden_state = outputs[0]
#         # last_hidden_state has shape (batch_size=32, input_sequence_length=512, hidden_size=768),
#         # so we take only the hidden embedding of the [CLS] token (first position), as it contains the entire context
#         # and should be sufficient for simple downstream tasks such as classification/regression
#         predicted_property_values = self.regressor(last_hidden_state[:, 0, : ].view(-1, 768))

#         return predicted_property_values
        

### Create DataLoader

In [None]:
from transformers.models.graphormer.collating_graphormer import GraphormerDataCollator

class GraphormerDataCollator_():
    """Collator that drops single-node graphs before delegating to GraphormerDataCollator."""

    def __init__(self):
        # The wrapped collator does the actual batching.
        self.data_collator = GraphormerDataCollator()

    def __call__(self, features):
        """Collate ``features`` after discarding every graph with exactly one node.

        Args:
            features: List of per-graph mappings, each with a ``'num_nodes'``
                entry. The list itself is left untouched.

        Returns:
            Whatever the wrapped collator returns for the remaining graphs.
        """
        # Build a new list rather than calling features.remove() inside the
        # loop: removing while iterating skips the element that follows each
        # removal, so consecutive single-node graphs slipped through, and the
        # caller's list was modified as a side effect.
        multi_node = [mol for mol in features if mol['num_nodes'] != 1]
        # NOTE(review): if every graph in the batch has a single node, the
        # wrapped collator receives an empty list - confirm how it handles that.
        return self.data_collator(multi_node)

In [None]:
from torch.utils.data import DataLoader

data_collator = GraphormerDataCollator_()

# Settings shared by the training and validation loaders.
# NOTE(review): shuffle=False also applies to the training loader, so every
# epoch visits the batches in the same order - confirm this is intended.
_loader_options = dict(shuffle=False, batch_size=batch_size, collate_fn=data_collator)

train_dataloader = DataLoader(dataset_processed['train'], **_loader_options)

eval_dataloader = DataLoader(dataset_processed['validation'], **_loader_options)

In [None]:
# Preferred GPU ordinal on the training machine.
CUDA_DEVICE_INDEX = 5

if torch.cuda.is_available():
    # Fall back to GPU 0 when the preferred ordinal does not exist on this
    # host; otherwise the first .to(device) call fails with an invalid
    # device ordinal on machines with fewer GPUs.
    _cuda_index = CUDA_DEVICE_INDEX if CUDA_DEVICE_INDEX < torch.cuda.device_count() else 0
    device = torch.device("cuda", index=_cuda_index)
else:
    device = torch.device('cpu')

In [None]:
from transformers import GraphormerForGraphClassification

# Load the pretrained checkpoint and move the model to the selected device.
# num_classes=1 requests a single output value per graph
# (presumably used as a regression output for the scaled target - TODO confirm).
model = GraphormerForGraphClassification.from_pretrained(
    model_name, 
    num_classes=1,
    ignore_mismatched_sizes = True, # provide this in case you're planning to fine-tune an already fine-tuned checkpoint
).to(device)

In [None]:
# Display the model's module structure.
model

In [None]:
from transformers import AdamW, get_scheduler

# Use PyTorch's AdamW: the AdamW class shipped with transformers is deprecated
# in favour of torch.optim.AdamW. eps and weight_decay are set explicitly to
# the deprecated class's defaults (1e-6 and 0.0) so the optimisation settings
# stay the same; torch's own defaults are 1e-8 and 0.01.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)

num_epoch = 100

# The scheduler is stepped once per training batch, so its horizon is the
# total number of batches over all epochs.
num_training_steps = num_epoch * len(train_dataloader)

# Linear decay of the learning rate over the whole run, without warm-up.
lr_scheduler = get_scheduler(
    'linear',
    optimizer = optimizer,
    num_warmup_steps = 0,
    num_training_steps = num_training_steps,
)

# Not referenced by the loops visible in this notebook, which read the loss
# from the model outputs; kept because other cells may rely on the name.
loss_func = torch.nn.MSELoss()

In [None]:
# Start a Weights & Biases run that the training and evaluation cells log to.
# config is empty, so no hyperparameters are recorded with the run.
wandb.init(
    project="graphormer",
    name="Graphormer Simple Classification on MolecularWeight 10k 100_epochs",
    config={}
)

### Training

In [None]:
from tqdm.auto import tqdm

# One progress bar per phase, each sized for the complete run (all epochs).
progress_bar_train = tqdm(range(num_training_steps))
progress_bar_eval = tqdm(range(len(eval_dataloader) * num_epoch))

for epoch in range(num_epoch):
    # ---- training pass: one optimizer and scheduler step per batch ----
    train_epoch_loss = 0
    model.train()
    for batch in train_dataloader:
        # Move every tensor of the batch to the model's device.
        input_batch = dict((k, v.to(device)) for k, v in batch.items())

        outputs = model(**input_batch)

        # The loss is read from the model outputs: by key for mapping-style
        # outputs, otherwise as the first element.
        if isinstance(outputs, dict):
            loss = outputs["loss"]
        else:
            loss = outputs[0]

        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

        train_epoch_loss += loss.item()
        progress_bar_train.update(1)

    # ---- validation pass: no gradients, no parameter updates ----
    eval_epoch_loss = 0
    model.eval()
    with torch.no_grad():
        for batch in eval_dataloader:
            input_batch = dict((k, v.to(device)) for k, v in batch.items())

            outputs = model(**input_batch)

            if isinstance(outputs, dict):
                loss = outputs["loss"]
            else:
                loss = outputs[0]

            eval_epoch_loss += loss.item()
            progress_bar_eval.update(1)

    # Log the mean per-batch loss of both phases once per epoch.
    wandb.log({
        "loss/train": train_epoch_loss / len(train_dataloader),
        "loss/validation": eval_epoch_loss / len(eval_dataloader),
    })

In [None]:
# File name shared by the ONNX export and the W&B upload.
onnx_path = model_name_base + "_10k_100_epochs.onnx"

# input_batch is whatever the previous cell left behind (the final validation
# batch when the cells are run in order); it serves as the example input.
# NOTE(review): input_batch is a dict, while torch.onnx.export documents
# `args` as a tensor or a tuple - confirm this call form works on the
# installed torch version.
torch.onnx.export(model, input_batch, onnx_path)
wandb.save(onnx_path)

## Post Training Evaluation

In [None]:
# Loader over the held-out test split, batched and collated like the other splits.
test_dataloader = DataLoader(
    dataset_processed['test'],
    collate_fn = data_collator,
    batch_size = batch_size,
)

# Accumulate the per-batch loss over the test split without tracking gradients.
epoch_loss = 0
model.eval()
with torch.no_grad():
    for batch in tqdm(test_dataloader):
        input_batch = dict((k, v.to(device)) for k, v in batch.items())

        outputs = model(**input_batch)

        if isinstance(outputs, dict):
            loss = outputs["loss"]
        else:
            loss = outputs[0]

        epoch_loss += loss.item()

# Log the mean per-batch test loss.
wandb.log({"loss/test": epoch_loss / len(test_dataloader)})

In [None]:
# Close the W&B run started earlier in the notebook.
wandb.finish()

In [None]:
# Persist the fine-tuned model with save_pretrained.
# NOTE(review): the directory name says "unnormalized", but this notebook min-max scales
# the target before training - confirm the name matches the run.
model.save_pretrained(model_name_base + '_10k_100epochs unnormalized')

In [None]:
# torch.save(model, model_name_base + '_10k_10epochs.pt')

In [None]:
# Ask PyTorch to release the GPU memory it is caching but no longer using.
torch.cuda.empty_cache()