In [2]:
import sys, os
sys.path.insert(0, os.path.abspath('..'))

from lib.pipeline import Pipeline
import numpy as np
import matplotlib.pyplot as plt
import torch
from torchdrug import utils, data
from lib.disable_logger import DisableLogger

GPU = 1
device = f'cuda:{GPU}'
pipeline = Pipeline(
    model='lm-gearnet',
    dataset='atpbind3d',
    gpus=[GPU],
    model_kwargs={
        'gpu': GPU,
        'gearnet_hidden_dim_size': 512,
        'gearnet_hidden_dim_count': 4,
        'bert_freeze': False,
        'bert_freeze_layer_count': 29,
    },
    batch_size=16,
)


get dataset atpbind3d
Split num:  [337, 41, 41]
train samples: 337, valid samples: 41, test samples: 41


In [3]:
sample_weight = 'rus_5_0_0.6151.pth'

pipeline.task.load_state_dict(torch.load(sample_weight, map_location=device), strict=False)
pipeline.evaluate()

15:03:55   >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
15:03:55   Evaluate on test
15:03:58   ------------------------------
15:03:58   accuracy: 0.961755
15:03:58   mcc: 0.602686
15:03:58   micro_auroc: 0.918718
15:03:58   precision: 0.636667
15:03:58   sensitivity: 0.60925
15:03:58   specificity: 0.981009


{'sensitivity': 0.6092504262924194,
 'specificity': 0.9810088276863098,
 'accuracy': 0.9617545008659363,
 'precision': 0.6366666555404663,
 'mcc': 0.6026855116250789,
 'micro_auroc': 0.9187183380126953}

In [33]:
train_set = pipeline.train_set
pipeline.task.eval()
dataloader = data.DataLoader(train_set, batch_size=1, shuffle=False)
first_batch = utils.cuda(next(iter(dataloader)), device=device)

pred = pipeline.task.predict(first_batch)
pred.flatten()

tensor([-17.3298, -13.4945, -18.2039, -18.6461, -21.5239, -16.4441, -18.9187,
        -13.5679, -20.7809, -17.0364, -19.8048, -23.1246, -13.9346, -18.5860,
        -21.9744, -13.0965, -17.6263, -18.4483, -12.7355,  -7.1648, -14.3571,
        -14.7658, -19.6489, -14.5633, -16.9808,  -9.8912,  -9.9540, -10.7494,
        -13.9948, -16.5773, -14.5978, -10.6935, -17.8283, -15.3989, -24.0926,
        -24.9417, -14.6402, -19.4090, -14.0754, -17.2853, -18.2780, -13.2628,
        -10.0945, -19.3690, -13.4677,  -7.4248,  -9.1188, -15.4757,  -7.8211,
         -1.2059, -17.9829, -18.0884, -19.3422, -13.7677, -17.3163, -19.6624,
        -15.1668, -14.9212, -19.8480, -12.7032, -16.1524, -13.5441, -15.8736,
        -23.1825, -16.7432, -21.6519, -20.3948, -19.5762, -18.5159, -21.4604,
        -13.4027, -23.6826, -20.0617, -13.5578, -15.3785, -20.7205, -15.1174,
        -14.9235, -20.8893, -24.3910, -19.8693, -19.2163, -18.1511, -18.7869,
        -21.7641, -14.3211, -19.3652, -21.8818, -22.0049, -24.30

In [35]:
target = pipeline.task.target(first_batch)
label = target['label'].flatten()

In [37]:
for i in label:
    print(i)
    break

tensor(0, device='cuda:1')


In [36]:
len(label)

350

In [80]:
import pandas as pd
df = pd.DataFrame()



def create_pred_dataframe(pipeline, dataset, weights):
    weights_loaded = [
        torch.load(weight, map_location='cpu') for weight in weights
    ]
    df = pd.DataFrame()
    pipeline.task.eval()
    for protein_index, batch in enumerate(data.DataLoader(dataset, batch_size=1, shuffle=False)):
        batch = utils.cuda(batch, device=device)
        if protein_index % 10 == 9:
            print(f'processing protein {protein_index + 1} / {len(dataset)}')
        label = pipeline.task.target(batch)['label'].flatten()
        
        new_data = {
            'protein_index': protein_index,
            'residue_index': list(range(len(label))),
            'target': label.tolist(),
        }
        for i, weight in enumerate(weights_loaded):
            pipeline.task.load_state_dict(utils.cuda(weight, device=device) , strict=False)
            pred = pipeline.task.predict(batch).flatten()
            assert(len(label) == len(pred))
            new_data[f'pred_{i}'] = [round(t, 5) for t in pred.tolist()]
        new_data = pd.DataFrame(new_data)
        df = pd.concat([df, new_data])
    
    return df

In [81]:
weights = [
    'rus_5_0_0.6151.pth',
    'rus_5_1_0.6221.pth',
    'rus_5_2_0.6193.pth',
    'rus_5_3_0.6266.pth',
    'rus_5_4_0.6052.pth',
    'rus_5_5_0.6085.pth',
    'rus_5_6_0.5986.pth',
    'rus_5_7_0.6108.pth',
    'rus_5_8_0.6046.pth',
    'rus_5_9_0.6080.pth',
]

df_valid = create_pred_dataframe(
    pipeline, 
    dataset=pipeline.valid_set,
    weights=weights,
)

processing protein 10 / 41
processing protein 20 / 41
processing protein 30 / 41
processing protein 40 / 41


In [96]:
df_valid

Unnamed: 0,protein_index,residue_index,target,pred_0,pred_1,pred_2,pred_3,pred_4,pred_5,pred_6,pred_7,pred_8,pred_9
0,0,0,0,-16.97174,-7.91394,-16.05915,-12.74630,-11.08699,-7.19023,-13.97244,-6.04126,-9.91471,-12.43732
1,0,1,0,-3.93672,-9.47368,-10.20510,-11.30398,-4.25183,-7.08744,-14.01429,-3.18227,-3.96673,-11.00722
2,0,2,0,-21.56091,-15.97469,-15.18720,-21.67062,-15.45335,-13.68504,-18.13901,-13.80445,-13.61160,-16.70308
3,0,3,0,-18.38626,-16.23427,-12.79903,-14.91800,-13.64695,-13.11994,-15.92172,-9.01989,-11.00765,-15.36380
4,0,4,0,-14.65767,-15.40561,-16.41979,-15.94792,-18.58216,-14.66701,-21.26510,-7.42877,-15.56406,-14.34236
...,...,...,...,...,...,...,...,...,...,...,...,...,...
278,40,278,0,-18.46217,-14.70327,-20.71081,-20.23520,-17.61584,-17.37701,-28.19762,-27.83505,-18.98898,-15.21086
279,40,279,0,-10.57654,-3.02167,-7.41655,-16.22624,-7.21401,-10.21475,-6.51724,-15.95897,-10.69379,-8.39563
280,40,280,0,-20.50487,-15.59969,-10.36148,-22.97976,-14.24386,-13.29326,-14.06623,-21.75292,-16.84909,-14.99062
281,40,281,0,-10.51467,-3.77928,-9.50453,-19.51342,-14.90506,-11.95329,-14.86140,-24.65895,-17.68014,-11.50816


In [93]:
df_valid.query('target == 0 and pred_0 + pred_1 + pred_2 + pred_3 + pred_4 + pred_5 +pred_6 +pred_7 + pred_8 + pred_9 > 0')

Unnamed: 0,protein_index,residue_index,target,pred_0,pred_1,pred_2,pred_3,pred_4,pred_5,pred_6,pred_7,pred_8,pred_9
36,1,36,0,0.16160,-1.21214,-2.43465,2.80233,5.37596,3.12021,-1.18887,2.23289,3.14200,3.93285
162,1,162,0,3.86974,2.06259,4.75878,0.59927,3.70603,2.16602,2.13296,1.97036,1.53800,0.38363
195,1,195,0,5.07940,1.53355,10.49345,6.98770,7.38976,5.20673,5.27371,6.02002,2.30119,6.67236
64,2,64,0,3.55344,3.48452,-0.22793,0.26693,3.30486,-1.30311,-1.15516,1.65353,1.59179,3.57754
89,2,89,0,-0.82517,-0.47990,6.02652,2.64298,6.06695,-2.05129,0.88586,1.81419,4.90267,4.34896
...,...,...,...,...,...,...,...,...,...,...,...,...,...
52,37,52,0,6.16719,6.63751,8.22852,11.81406,9.99815,11.91978,6.21871,5.66318,6.85949,11.58592
105,37,105,0,-1.32284,-0.26843,-0.26857,0.04339,2.97709,3.10297,-0.33467,4.16424,-1.19000,1.47427
138,38,138,0,4.85641,3.63775,3.75041,4.84171,2.83292,2.84886,6.25854,8.96903,7.12968,6.84633
169,38,169,0,3.91629,3.24369,-2.35969,5.31365,2.63783,3.91696,0.75238,6.73494,3.73079,3.00311


In [95]:
df_valid.to_csv('test.csv', index=False)

In [98]:
df_test = create_pred_dataframe(
    pipeline, 
    dataset=pipeline.test_set,
    weights=weights,
)

processing protein 10 / 41
processing protein 20 / 41
processing protein 30 / 41
processing protein 40 / 41


Create Dataset for learning:

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader

class CustomDataset(Dataset):
    def __init__(self, dataframe):
        self.X = torch.tensor(dataframe[['pred_0', 'pred_1', 'pred_2', 'pred_3', 'pred_4', 'pred_5', 'pred_6', 'pred_7', 'pred_8', 'pred_9']].values, dtype=torch.float32)
        self.y = torch.tensor(dataframe['target'].values, dtype=torch.float32)
        
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

batch_size = 64
valid_dataset = CustomDataset(df_valid)
test_dataset = CustomDataset(df_test)
valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


# Mean Ensemble

In [110]:
from sklearn.metrics import matthews_corrcoef, recall_score, accuracy_score, precision_score, confusion_matrix

sum_preds = df_test[['pred_0', 'pred_1', 'pred_2', 'pred_3', 'pred_4', 'pred_5', 'pred_6', 'pred_7', 'pred_8', 'pred_9']].sum(axis=1)
final_prediction = (sum_preds > 0).astype(int)
# Sensitivity (Recall)
sensitivity = recall_score(df_test['target'], final_prediction)

# Specificity
tn, fp, fn, tp = confusion_matrix(df_test['target'], final_prediction).ravel()
specificity = tn / (tn + fp)

# Accuracy
accuracy = accuracy_score(df_test['target'], final_prediction)

# Precision
precision = precision_score(df_test['target'], final_prediction)
mcc = matthews_corrcoef(df_test['target'], final_prediction)

print(f'Sensitivity (Recall): {sensitivity}')
print(f'Specificity: {specificity}')
print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print('MCC:', mcc)



Sensitivity (Recall): 0.5422647527910686
Specificity: 0.9911142085547522
Accuracy: 0.9678671733024946
Precision: 0.7692307692307693
MCC: 0.630212175762319


# MLP

In [173]:
import torch.nn as nn
import torch.nn.functional as F

class MLP(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(MLP, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, output_size)
        
    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = torch.sigmoid(self.fc2(x))
        return x.squeeze()

input_size = 10
hidden_size = 32
output_size = 1

model = MLP(input_size, hidden_size, output_size)

learning_rate = 0.0001
num_epochs = 10

criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Training
for epoch in range(num_epochs):
    model.train()
    sum_loss = 0.0
    for inputs, targets in valid_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        sum_loss += loss.item()
        loss.backward()
        optimizer.step()
    print('Epoch [{}/{}], Loss: {:.4f}'.format(epoch+1, num_epochs, sum_loss))


model.eval()
all_preds = []
all_targets = []

with torch.no_grad():
    for inputs, targets in test_loader:
        outputs = model(inputs)
        predictions = (outputs > 0.5).float()
        all_preds.extend(predictions.numpy())
        all_targets.extend(targets.numpy())

mcc = matthews_corrcoef(all_targets, all_preds)
print('Matthews Correlation Coefficient on Test Set:', mcc)


Epoch [1/10], Loss: 17.7515
Epoch [2/10], Loss: 16.5698
Epoch [3/10], Loss: 16.2776
Epoch [4/10], Loss: 16.0787
Epoch [5/10], Loss: 15.9375
Epoch [6/10], Loss: 15.8504
Epoch [7/10], Loss: 15.7574
Epoch [8/10], Loss: 15.6544
Epoch [9/10], Loss: 15.5815
Epoch [10/10], Loss: 15.5368
Matthews Correlation Coefficient on Test Set: 0.6202251588586377


# XGBoost

In [176]:
import xgboost as xgb



X_valid = df_valid[['pred_0', 'pred_1', 'pred_2', 'pred_3', 'pred_4', 'pred_5', 'pred_6', 'pred_7', 'pred_8', 'pred_9']]
y_valid = df_valid['target']

X_test = df_test[['pred_0', 'pred_1', 'pred_2', 'pred_3', 'pred_4', 'pred_5', 'pred_6', 'pred_7', 'pred_8', 'pred_9']]
y_test = df_test['target']

def create_xgb_classifier():
    return xgb.XGBClassifier(
        booster='gbtree',
        learning_rate=0.1,
        gamma=0.1,
        max_depth=10,
        min_child_weight=1,
        subsample=0.8,
        colsample_bytree=0.8,
        colsample_bylevel=0.8,
        colsample_bynode=0.8,
        reg_lambda=1,
        reg_alpha=0,
        scale_pos_weight=1,
        n_estimators=500,
        objective='binary:logistic',
        eval_metric='logloss',
        use_label_encoder=False,
    )
clf = create_xgb_classifier()
clf.fit(X_valid, y_valid)

y_pred = clf.predict(X_test)
mcc = matthews_corrcoef(y_test, y_pred)

print('Matthews Correlation Coefficient on Test Set:', mcc)

# How about applying sigmoid:
def sigmoid(x):
    return 1 / (1 + np.exp(-x))
X_valid_sigmoid = X_valid.applymap(sigmoid)
X_test_sigmoid = X_test.applymap(sigmoid)

clf = create_xgb_classifier()
clf.fit(X_valid_sigmoid, y_valid)
y_pred = clf.predict(X_test_sigmoid)
mcc = matthews_corrcoef(y_test, y_pred)

print('Matthews Correlation Coefficient on Test Set:', mcc)




Matthews Correlation Coefficient on Test Set: 0.5868540518431951




Matthews Correlation Coefficient on Test Set: 0.6021520755539236


# Linear Regression

In [194]:
from sklearn.linear_model import LogisticRegression


model = LogisticRegression(solver='lbfgs', max_iter=10)
model.fit(X_valid, y_valid)
coefficients = model.coef_[0]
y_pred_probabilities = model.predict_proba(X_test)[:,1]
y_pred = (y_pred_probabilities > 0.5).astype(int)
mcc = matthews_corrcoef(y_test, y_pred)
print('Matthews Correlation Coefficient:', mcc)


model.coef_[0] = np.array([1] * 10)
y_pred_probabilities = model.predict_proba(X_test)[:,1]
y_pred = (y_pred_probabilities > 0.5).astype(int)
mcc = matthews_corrcoef(y_test, y_pred)
print('Assuming linear regression of coefficient all 1, then Matthews Correlation Coefficient:', mcc)


Matthews Correlation Coefficient: 0.6346424533239282
Assuming linear regression of coefficient all 1, then Matthews Correlation Coefficient: 0.630212175762319


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
