In [1]:
import torch
import torch.nn as nn 
import torch.nn.functional as F
import torch.nn.utils.rnn as rnn_utils
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import numpy as np

In [2]:
# Pick the compute device once for the whole notebook: the GPU when CUDA is
# usable, the CPU otherwise. The bare name on the last line displays the choice.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device

device(type='cuda')

# Importar o modelo contrastivo desenvolvido anteriormente

In [3]:
# Built without a bidirectional LSTM; unclear whether one is needed for this simple test.
class SMILESEncoder(nn.Module):
    """LSTM encoder that summarises a batch of padded, already-embedded sequences.

    The module owns a single unidirectional ``nn.LSTM`` (``batch_first=True``)
    and returns the final hidden state of its last layer for each sequence.

    Args:
        vocab_size: Stored on the instance; not used by this class itself.
        max_len: Stored on the instance; not used by this class itself.
        padding_idx: Stored on the instance; not used by this class itself.
        embedding_dim: Size of each input feature vector fed to the LSTM.
        dim: Hidden size of the LSTM (and width of the returned vectors).
        num_layers: Number of stacked LSTM layers.
    """

    def __init__(self, vocab_size, max_len, padding_idx, embedding_dim=64, dim=128, num_layers=1):
        super().__init__()

        # Keep the hyperparameters around as plain attributes.
        self.vocab_size = vocab_size
        self.max_len = max_len
        self.padding_idx = padding_idx
        self.embedding_dim = embedding_dim
        self.dim = dim
        self.num_layers = num_layers

        # The attribute must stay named `encoder`: saved state_dict keys depend on it.
        self.encoder = nn.LSTM(
            input_size=self.embedding_dim,
            hidden_size=self.dim,
            num_layers=self.num_layers,
            batch_first=True,
        )

    def forward(self, x, lengths):
        """Encode a padded batch and return the last layer's final hidden state.

        Args:
            x: Embedded sequences, batch-first; dimension 1 is read as the padded
                sequence length and the last dimension must match ``embedding_dim``.
            lengths: Tensor with the length of each sequence. Values are clamped
                into ``[1, x.size(1)]`` before packing, so zero or oversized
                lengths do not raise.

        Returns:
            ``hidden[-1]`` of the LSTM: one vector of width ``dim`` per sequence,
            in the same order as the input batch.
        """
        padded_len = x.size(1)

        # Packing needs the lengths on the CPU, whichever device `x` lives on.
        safe_lengths = lengths.clamp(min=1, max=padded_len).cpu()

        # enforce_sorted=False lets callers pass batches in arbitrary length order.
        packed = rnn_utils.pack_padded_sequence(
            x,
            lengths=safe_lengths,
            batch_first=True,
            enforce_sorted=False,
        )

        _, (h_n, _c_n) = self.encoder(packed)
        return h_n[-1]
    
class CLRNet(nn.Module):
    """Two-branch contrastive network: shared embedding, two LSTM encoders, shared MLP head.

    Both inputs go through the same ``nn.Embedding``. Each input then has its
    own ``SMILESEncoder`` (hidden size ``2 * dim``), and both encodings pass
    through the same projection head, which maps ``2 * dim -> 4 * dim -> 6 * dim``.

    Args:
        vocab_size: Number of rows in the embedding table.
        max_len: Stored and forwarded to the encoders.
        padding_idx: Embedding index treated as padding.
        embedding_dim: Width of the token embeddings.
        dim: Base width; encoder and projection sizes are multiples of it.
        num_layers: Number of LSTM layers in each encoder.
    """

    def __init__(self, vocab_size, max_len, padding_idx, embedding_dim=64, dim=128, num_layers=1):
        super().__init__()

        self.vocab_size = vocab_size
        self.max_len = max_len
        self.padding_idx = padding_idx
        self.embedding_dim = embedding_dim
        self.dim = dim
        self.num_layers = num_layers

        # Token embedding shared by both branches.
        self.emb = nn.Embedding(
            self.vocab_size,
            self.embedding_dim,
            padding_idx=self.padding_idx,
        )

        # The two LSTM encoders used for contrastive learning; they are built
        # with identical settings but hold separate weights.
        encoder_settings = dict(
            vocab_size=self.vocab_size,
            max_len=self.max_len,
            padding_idx=self.padding_idx,
            embedding_dim=self.embedding_dim,
            dim=2 * self.dim,
            num_layers=self.num_layers,
        )
        self.SMILESEnc1 = SMILESEncoder(**encoder_settings)
        self.SMILESEnc2 = SMILESEncoder(**encoder_settings)

        # MLP projection head, applied to the output of both encoders.
        head_in, head_hidden, head_out = 2 * self.dim, 4 * self.dim, 6 * self.dim
        self.projection_head = nn.Sequential(
            nn.Linear(head_in, head_hidden),
            nn.BatchNorm1d(head_hidden),
            nn.ReLU(inplace=True),
            nn.Linear(head_hidden, head_out, bias=False),
        )

    def forward(self, smi, random_smi, smi_len, random_smi_len):
        """Return L2-normalised projections for the two inputs.

        Args:
            smi: Token-index tensor for the first branch (presumably a SMILES
                string and ``random_smi`` a randomised variant of it; this is
                inferred from the parameter names, confirm against the caller).
            random_smi: Token-index tensor for the second branch.
            smi_len: Sequence lengths matching ``smi``.
            random_smi_len: Sequence lengths matching ``random_smi``.

        Returns:
            A tuple ``(z1, z2)``; each is the projection head output for one
            branch, normalised along dimension 1.
        """
        tokens_first = self.emb(smi)
        tokens_second = self.emb(random_smi)

        encoded_first = self.SMILESEnc1(tokens_first, smi_len)
        encoded_second = self.SMILESEnc2(tokens_second, random_smi_len)

        # The head is run on branch 1 before branch 2, as in the original;
        # the order matters for BatchNorm running statistics in training mode.
        z_first = F.normalize(self.projection_head(encoded_first), dim=1)
        z_second = F.normalize(self.projection_head(encoded_second), dim=1)
        return z_first, z_second

In [4]:
# Rebuild the contrastive model and restore its pre-trained weights.
# NOTE(review): the hyperparameters below presumably mirror the ones used when
# "clr_model.pth" was produced; confirm against the pre-training notebook.
clr_model = CLRNet(vocab_size=82, max_len=24, padding_idx=81)

# map_location=device lets a checkpoint saved on a GPU load on a CPU-only machine.
# weights_only=True restricts unpickling to tensors and plain containers, which
# is all a state_dict needs; it avoids arbitrary code execution from the file
# and silences the torch.load FutureWarning.
clr_model.load_state_dict(
    torch.load("clr_model.pth", map_location=device, weights_only=True)
)

# Last expression of the cell: moves the model to the device and shows its layout.
clr_model.to(device)

  clr_model.load_state_dict(torch.load("clr_model.pth"))


CLRNet(
  (emb): Embedding(82, 64, padding_idx=81)
  (SMILESEnc1): SMILESEncoder(
    (encoder): LSTM(64, 256, batch_first=True)
  )
  (SMILESEnc2): SMILESEncoder(
    (encoder): LSTM(64, 256, batch_first=True)
  )
  (projection_head): Sequential(
    (0): Linear(in_features=256, out_features=512, bias=True)
    (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
    (3): Linear(in_features=512, out_features=768, bias=False)
  )
)

# Definindo os dados

In [5]:
# Load the QM9 table from disk, using the first CSV column as the row index.
qm9 = pd.read_csv(
    filepath_or_buffer="../supervised-learning/QM9.csv",
    index_col=0,
)
# Bare name as the last expression so the notebook renders the frame.
qm9

Unnamed: 0,SMILES_1,SMILES_2,INCHI_1,INCHI_2,Property_0
0,OC1COC(=N)C1=O,O[C@H]1CO[C]([NH])C1=O,"InChI=1S/C4H5NO3/c5-4-3(7)2(6)1-8-4/h2,5-6H,1H2","InChI=1S/C4H5NO3/c5-4-3(7)2(6)1-8-4/h2,5-6H,1H...",3.49649
1,CN(C)C(=O)C=O,CN(C)C(=O)C=O,"InChI=1S/C4H7NO2/c1-5(2)4(7)3-6/h3H,1-2H3","InChI=1S/C4H7NO2/c1-5(2)4(7)3-6/h3H,1-2H3",3.92544
2,C1C2CC3N=COCC123,C1[C@H]2C[C@@H]3N=COC[C@]123,InChI=1S/C7H9NO/c1-5-2-7(5)3-9-4-8-6(1)7/h4-6H...,InChI=1S/C7H9NO/c1-5-2-7(5)3-9-4-8-6(1)7/h4-6H...,3.02103
3,OC1C2OC3CC2C13O,O[C@H]1[C@H]2O[C@@H]3C[C@H]2[C@]13O,"InChI=1S/C6H8O3/c7-5-4-2-1-3(9-4)6(2,5)8/h2-5,...","InChI=1S/C6H8O3/c7-5-4-2-1-3(9-4)6(2,5)8/h2-5,...",2.52006
4,OC1C2C(=N)OCC12O,O[C@H]1[C@H]2[C]([NH])OC[C@@]12O,"InChI=1S/C5H7NO3/c6-4-2-3(7)5(2,8)1-9-4/h2-3,6...","InChI=1S/C5H7NO3/c6-4-2-3(7)5(2,8)1-9-4/h2-3,6...",2.48587
...,...,...,...,...,...
133880,CC1=C(N)C(=NN1)C#C,Cc1c(N)c(n[nH]1)C#C,"InChI=1S/C6H7N3/c1-3-5-6(7)4(2)8-9-5/h1H,7H2,2...","InChI=1S/C6H7N3/c1-3-5-6(7)4(2)8-9-5/h1H,7H2,2...",3.37027
133881,CC(C)(C)C(O)C(N)=O,CC(C)(C)[C@@H](O)C(=O)N,"InChI=1S/C6H13NO2/c1-6(2,3)4(8)5(7)9/h4,8H,1-3...","InChI=1S/C6H13NO2/c1-6(2,3)4(8)5(7)9/h4,8H,1-3...",2.23826
133882,CC1C2C3C(N12)C3(C)O,C[C@H]1[C@H]2[C@H]3[C@@H](N12)[C@]3(C)O,"InChI=1S/C7H11NO/c1-3-5-4-6(8(3)5)7(4,2)9/h3-6...","InChI=1S/C7H11NO/c1-3-5-4-6(8(3)5)7(4,2)9/h3-6...",3.77036
133883,C1CN1C1C2CC1C2,C1CN1[C@H]1[C@H]2C[C@@H]1C2,"InChI=1S/C7H11N/c1-2-8(1)7-5-3-6(7)4-5/h5-7H,1...","InChI=1S/C7H11N/c1-2-8(1)7-5-3-6(7)4-5/h5-7H,1...",5.11764


# Definição da rede MLP preditora

In [6]:
class MLP(nn.Module):
    def __init__(self, ):

SyntaxError: incomplete input (326970354.py, line 2)