# DATA reading and exploration

In [1]:
import pandas as pd
from ast import literal_eval

In [2]:
df = pd.read_csv("data.csv",converters={"mutations": literal_eval, "mut_positions": literal_eval}, index_col=0)

# Pytorch models

In [3]:
import torch.nn as nn
from transformers import EsmTokenizer, EsmModel
import torch


device = torch.device("cuda")

class Regressor_with_Convlayer(nn.Module):
    def __init__(self, mode="transfert_learning"):
        super(Regressor_with_Convlayer, self).__init__()
        hidden_channels_dim = 32
        dropout_rate = 0.0
        self.mode = mode
        
        self.tokenizer = EsmTokenizer.from_pretrained("facebook/esm2_t6_8M_UR50D", device=device)
        self.embedder = EsmModel.from_pretrained("facebook/esm2_t6_8M_UR50D").to(device).eval()
        if self.mode=="transfert_learning":
            for param in self.embedder.parameters():
                param.require_grad = False
        self.classifier = nn.Sequential(
                                        nn.Conv2d(
                                                    1, hidden_channels_dim, kernel_size=(7, 1), padding=(3, 0)
                                                ),  # 7x32
                                        nn.ReLU(),
                                        nn.Dropout(dropout_rate),
                                        nn.Flatten(),
                                        nn.LazyLinear(1),
                                    )
        self.to(device)

    def forward(self, x):
        """
        L = protein length
        B = batch-size
        F = number of features (320 for embeddings)
        N = 1 (regression)
        """
        tokens = self.tokenizer(x, return_tensors="pt").to(device)
        x = self.embedder(**tokens).last_hidden_state
        # IN: X = (B x L x F); OUT: (B x F x L, 1)
        x = x.unsqueeze(dim=1)
        Yhat_consurf = self.classifier(x)  # OUT: Yhat_consurf = (B x N x L x 1)
        # IN: (B x N x L x 1); OUT: ( B x L x N )
        Yhat_consurf = Yhat_consurf.squeeze(dim=-1)
        return Yhat_consurf

class Regressor_with_MLP(nn.Module):
    def __init__(self, mode="transfert_learning"):
        super(Regressor_with_MLP, self).__init__()
        hidden_size = 1024
        dropout_rate = 0.0
        self.mode = mode
        
        self.tokenizer = EsmTokenizer.from_pretrained("facebook/esm2_t6_8M_UR50D", device=device)
        self.embedder = EsmModel.from_pretrained("facebook/esm2_t6_8M_UR50D").to(device).eval()
        if self.mode=="transfert_learning":
            for param in self.embedder.parameters():
                param.require_grad = False
        self.classifier = nn.Sequential(
                                        nn.Flatten(),
                                        nn.LazyLinear(hidden_size),
                                        nn.ReLU(),
                                        nn.Dropout(dropout_rate),
                                        nn.Linear(hidden_size, 1),
                                    )
        self.to(device)

    def forward(self, x):
        tokens = self.tokenizer(x, return_tensors="pt").to(device)
        x = self.embedder(**tokens).last_hidden_state
        x = x.unsqueeze(dim=1)
        x = self.classifier(x) 
        x = x.squeeze(dim=-1)
        return x





# Training and evaluation functions

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
import numpy as np

X, y = shuffle(df["primary"], df.target, random_state=42)
offset = int(X.shape[0] * 0.7)
X_train, y_train = X[:offset], y[:offset]
X_test, y_test = list(X[offset:]), list(y[offset:])

train_dl = torch.utils.data.DataLoader(np.stack((X_train.values, y_train.values), axis=1).tolist(), 8, shuffle=True, drop_last=False)

from sklearn.metrics import mean_squared_error, r2_score
def eval_model(model, Xtest, ytest):
    with torch.no_grad():
        ypred = model(Xtest).cpu()
        mse = mean_squared_error(ytest, ypred)
        r2 = r2_score(ytest, ypred)
        return {"R2": r2, "MSE": mse, "RMSE": np.sqrt(mse)}

  X_train, y_train = X[:offset], y[:offset]
  X_test, y_test = list(X[offset:]), list(y[offset:])


In [17]:
import torch.optim as optim
loss = nn.MSELoss().to(device)
def train(model, epoch, learning_rate, train_dl, Xtest, ytest, Xtrain, ytrain):
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    
    for epoch in range(epoch):
        model.train()
        for sequences, targets in train_dl:
            model.zero_grad()
            ypred = model(sequences)
            lossbatch = loss(ypred, targets.to(device).float())
            lossbatch.backward()
            optimizer.step()
        model.eval()
        if epoch%10==0:
            print("Evaluation on training:")
            print(eval_model(model, Xtrain, ytrain))
            print("__")
        metrics = eval_model(model, Xtest, ytest)
        metrics["epoch"] = epoch+1
        print(metrics)
    return model, metrics

# Training and metrics

In [18]:
model = Regressor_with_Convlayer()
model, metrics = train(model, 50, 0.01, train_dl, X_test, y_test, list(X_train), list(y_train))

Some weights of the model checkpoint at facebook/esm2_t6_8M_UR50D were not used when initializing EsmModel: ['lm_head.dense.weight', 'esm.contact_head.regression.bias', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.decoder.weight', 'esm.contact_head.regression.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing EsmModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing EsmModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of EsmModel were not initialized from the model checkpoint at facebook/esm2_t6_8M_UR50D and are newly initialized: ['esm.pooler.dense.bias', 'esm.pooler.dense.weight']
You should probably TRAIN this model on a

Evaluation on training:
{'R2': -6464.362094606112, 'MSE': 164541.00195281793, 'RMSE': 405.63653922300676}
__
{'R2': -4999.217766777125, 'MSE': 163954.05044722522, 'RMSE': 404.91239848543194, 'epoch': 1}
{'R2': -1363.5886682697692, 'MSE': 44744.01911927488, 'RMSE': 211.52782114718357, 'epoch': 2}
{'R2': -224.4785653913348, 'MSE': 7393.302813842559, 'RMSE': 85.9843172551981, 'epoch': 3}
{'R2': -6.692799597790016, 'MSE': 252.2421446755114, 'RMSE': 15.882132875514907, 'epoch': 4}
{'R2': -6.177155595200391, 'MSE': 235.33449649764424, 'RMSE': 15.340615909983674, 'epoch': 5}
{'R2': -8.852609310170093, 'MSE': 323.0609703860185, 'RMSE': 17.97389691708558, 'epoch': 6}
{'R2': -1.1999671711777657, 'MSE': 72.1355639672475, 'RMSE': 8.493265801047764, 'epoch': 7}
{'R2': -0.25095813280418766, 'MSE': 41.01814408482069, 'RMSE': 6.40454089570991, 'epoch': 8}
{'R2': -0.07483392065224548, 'MSE': 35.24314001279816, 'RMSE': 5.936593300268948, 'epoch': 9}
{'R2': 0.0005547903953735389, 'MSE': 32.7711907676317,

In [20]:
model = Regressor_with_MLP()
model, metrics = train(model, 50, 0.01, train_dl, X_test, y_test, list(X_train), list(y_train))

Some weights of the model checkpoint at facebook/esm2_t6_8M_UR50D were not used when initializing EsmModel: ['lm_head.dense.weight', 'esm.contact_head.regression.bias', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.decoder.weight', 'esm.contact_head.regression.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing EsmModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing EsmModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of EsmModel were not initialized from the model checkpoint at facebook/esm2_t6_8M_UR50D and are newly initialized: ['esm.pooler.dense.bias', 'esm.pooler.dense.weight']
You should probably TRAIN this model on a

Evaluation on training:
{'R2': -120.60507055290095, 'MSE': 3094.802713062344, 'RMSE': 55.630951035033945}
__
{'R2': -91.16000279217167, 'MSE': 3021.8695368428403, 'RMSE': 54.971533877479175, 'epoch': 1}
{'R2': -16.185958001533752, 'MSE': 563.5169419798096, 'RMSE': 23.738511789491135, 'epoch': 2}
{'R2': -14.079632565191707, 'MSE': 494.45183262740534, 'RMSE': 22.236272903240895, 'epoch': 3}
{'R2': -4.797225218419677, 'MSE': 190.08743223744315, 'RMSE': 13.787219887905, 'epoch': 4}
{'R2': -1.9908812865722787, 'MSE': 98.06914902756283, 'RMSE': 9.90298687404779, 'epoch': 5}
{'R2': -0.6993943382881349, 'MSE': 55.72209012988791, 'RMSE': 7.464723044419525, 'epoch': 6}
{'R2': -2.516094938642697, 'MSE': 115.29058009787946, 'RMSE': 10.737345114034449, 'epoch': 7}
{'R2': -0.35797788507200345, 'MSE': 44.52725562367201, 'RMSE': 6.672874614712314, 'epoch': 8}
{'R2': -0.45505504429175625, 'MSE': 47.7103556809816, 'RMSE': 6.907268322642577, 'epoch': 9}
{'R2': -0.03376780733365914, 'MSE': 33.896607535864

In [21]:
model = Regressor_with_Convlayer(mode="fine_tuning")
model, metrics = train(model, 50, 0.01, train_dl, X_test, y_test, list(X_train), list(y_train))

Some weights of the model checkpoint at facebook/esm2_t6_8M_UR50D were not used when initializing EsmModel: ['lm_head.dense.weight', 'esm.contact_head.regression.bias', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.decoder.weight', 'esm.contact_head.regression.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing EsmModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing EsmModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of EsmModel were not initialized from the model checkpoint at facebook/esm2_t6_8M_UR50D and are newly initialized: ['esm.pooler.dense.bias', 'esm.pooler.dense.weight']
You should probably TRAIN this model on a

Evaluation on training:
{'R2': -277.08255063613984, 'MSE': 7077.094962003556, 'RMSE': 84.12547154104728}
__
{'R2': -211.3381499878147, 'MSE': 6962.436713513722, 'RMSE': 83.44121711428784, 'epoch': 1}
{'R2': -62.16537402953803, 'MSE': 2071.1535783434083, 'RMSE': 45.50992834913507, 'epoch': 2}
{'R2': -0.3244885215131914, 'MSE': 43.429160088943746, 'RMSE': 6.590080431143746, 'epoch': 3}
{'R2': -15.022489058278452, 'MSE': 525.3675143521385, 'RMSE': 22.920896892402325, 'epoch': 4}
{'R2': -1.644036725082247, 'MSE': 86.6963302056067, 'RMSE': 9.311086413819103, 'epoch': 5}
{'R2': -0.4515712359517774, 'MSE': 47.59612375850085, 'RMSE': 6.8989944019763385, 'epoch': 6}
{'R2': -0.7001637475747631, 'MSE': 55.74731858490293, 'RMSE': 7.4664126985389, 'epoch': 7}
{'R2': -2.373214724702959, 'MSE': 110.60562618250525, 'RMSE': 10.516920945909275, 'epoch': 8}
{'R2': -0.31011997062694086, 'MSE': 42.958024185121644, 'RMSE': 6.5542371169436375, 'epoch': 9}
{'R2': -0.011091719111133713, 'MSE': 33.1530726168285

In [22]:
model = Regressor_with_MLP(mode="finetuning")
model, metrics = train(model, 50, 0.01, train_dl, X_test, y_test, list(X_train), list(y_train))

Some weights of the model checkpoint at facebook/esm2_t6_8M_UR50D were not used when initializing EsmModel: ['lm_head.dense.weight', 'esm.contact_head.regression.bias', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.decoder.weight', 'esm.contact_head.regression.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing EsmModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing EsmModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of EsmModel were not initialized from the model checkpoint at facebook/esm2_t6_8M_UR50D and are newly initialized: ['esm.pooler.dense.bias', 'esm.pooler.dense.weight']
You should probably TRAIN this model on a

Evaluation on training:
{'R2': -14861.06255919385, 'MSE': 378233.8295662308, 'RMSE': 615.0071784672361}
__
{'R2': -11561.99257404459, 'MSE': 379143.38059474743, 'RMSE': 615.7461981975589, 'epoch': 1}
{'R2': -87.2535649182396, 'MSE': 2893.7798531280396, 'RMSE': 53.79386445616302, 'epoch': 2}
{'R2': -30.596130065112852, 'MSE': 1036.017578484698, 'RMSE': 32.18722694617693, 'epoch': 3}
{'R2': -3.6577626434062562, 'MSE': 152.72515858853177, 'RMSE': 12.358202077508352, 'epoch': 4}
{'R2': -0.09318927425462786, 'MSE': 35.84500071384546, 'RMSE': 5.987069459580828, 'epoch': 5}
{'R2': -0.6599803450400428, 'MSE': 54.42972964905841, 'RMSE': 7.377650686299698, 'epoch': 6}
{'R2': -0.13294499219684663, 'MSE': 37.148566136210135, 'RMSE': 6.094962357243081, 'epoch': 7}
{'R2': -7.264005909779492e-05, 'MSE': 32.79176382448294, 'RMSE': 5.7264093308532305, 'epoch': 8}
{'R2': -0.22876161498925862, 'MSE': 40.290333983076415, 'RMSE': 6.347466737453329, 'epoch': 9}
{'R2': -0.06586165070726469, 'MSE': 34.9489448