In [3]:
import itertools
import numpy as np
import pandas as pd
import statsmodels.api as sm
from matplotlib import pyplot as plt
import seaborn as sns
import torch

from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem
# Render the figure in a notebook:
%matplotlib inline  

from scipy import stats

from sklearn.model_selection import train_test_split

from sklearn.model_selection import RandomizedSearchCV
from sklearn.tree import plot_tree

from sklearn import metrics

import torch
from torch.utils.data import DataLoader
from torch.utils.data import TensorDataset
import torch.nn as nn
import torch.nn.functional as F

from torchmetrics.classification import Accuracy
from torchmetrics.classification import Recall
from torch.utils.data import DataLoader
from torch.utils.data import TensorDataset

from torchmetrics import R2Score
from torchmetrics import MeanAbsoluteError

from sklearn.preprocessing import StandardScaler

In [4]:
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay

In [5]:
x_train = pd.read_csv('../no_nontoxic_nans_X_train.csv',index_col=0)
y_train = pd.read_csv('../no_nontoxic_nans_y_train.csv',index_col=0)

x_val = pd.read_csv('../no_nontoxic_nans_x_val.csv',index_col=0)
y_val = pd.read_csv('../no_nontoxic_nans_y_val.csv',index_col=0)

x_test = pd.read_csv('../no_nontoxic_nans_x_test.csv',index_col=0)
y_test = pd.read_csv('../no_nontoxic_nans_y_test.csv',index_col=0)

In [6]:
#Remove all nan values from the df
mask = x_train.notna().all(axis=1)

x_train_clean = x_train[mask]
y_train_clean = y_train[mask]

mask2 = x_test.notna().all(axis=1)

x_test_clean = x_test[mask2]
y_test_clean = y_test[mask2]

mask3 = x_val.notna().all(axis=1)

x_val_clean = x_val[mask3]
y_val_clean = y_val[mask3]

In [7]:
### You have been using DataFrames because these are easy to deal with but here we need the data as an array

x_train_array = x_train_clean.to_numpy()
x_train_tensor = torch.Tensor(x_train_array)

### Now some properties will be printed so you can see how this is similar to a NumPy array

print(f'X_train_tensor = {x_train_tensor}')

print(f'Size of X_train_tensor = {x_train_tensor.size()}') 

print(f'Data type of X_train_tensor = {x_train_tensor.dtype}')

X_train_tensor = tensor([[ 0.7823,  0.7823, -0.4119,  ..., -0.0907, -0.2697, -0.1562],
        [ 0.5145,  0.5145, -0.7155,  ..., -0.0907, -0.2697, -0.1562],
        [-1.5689, -1.5689,  0.6186,  ..., -0.0907, -0.2697, -0.1562],
        ...,
        [ 0.2563,  0.2563, -0.5645,  ..., -0.0907, -0.2697, -0.1562],
        [ 0.2330,  0.2330, -0.1187,  ..., -0.0907, -0.2697, -0.1562],
        [ 0.7342,  0.7342, -0.7105,  ..., -0.0907, -0.2697, -0.1562]])
Size of X_train_tensor = torch.Size([3607, 217])
Data type of X_train_tensor = torch.float32


In [8]:
x_val_array = x_val_clean.to_numpy()
x_val_tensor = torch.Tensor(x_val_array)

y_train_array = y_train_clean['toxic'].to_numpy()
y_train_tensor = torch.Tensor(y_train_array)

y_val_array = y_val_clean['toxic'].to_numpy()
y_val_tensor = torch.Tensor(y_val_array)

x_test_array = x_test_clean.to_numpy()
x_test_tensor = torch.Tensor(x_test_array)

y_test_array = y_test_clean.to_numpy()
y_test_tensor = torch.Tensor(y_test_array)

In [9]:
train_dataset = TensorDataset(x_train_tensor, y_train_tensor)
trainloader = DataLoader(train_dataset, batch_size=128, shuffle=True)
val_dataset = TensorDataset(x_val_tensor, y_val_tensor)
test_dataset = TensorDataset(x_test_tensor, y_test_tensor)

In [10]:
class DNN_3_layer(nn.Module):
    def __init__(self):
        super(DNN_3_layer, self).__init__()
        self.fc1 = nn.Linear(217, 108)        
        self.fc2 = nn.Linear(108, 54)
        self.fc3 = nn.Linear(54, 27)
        self.fc4 = nn.Linear(27,2)
           
    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        x = self.fc4(x) # For the final layer we use raw logits for the cross entropy loss
        return x

In [19]:
#test the SGD model with optimised params
from sklearn.metrics import accuracy_score, make_scorer
from sklearn.inspection import permutation_importance
from sklearn.metrics import balanced_accuracy_score

def test_SGD_metrics(network, learning_rate, best_epochs, trainloader, x_train_tensor, y_train_tensor, x_val_tensor, y_val_tensor, x_test_tensor, y_test_tensor):
    epoch_list = []
    avg_train_loss_list =[]
    
    network.train()
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(network.parameters(), lr=learning_rate)

    num_epochs = best_epochs  
    torch.manual_seed(0)

    for epoch in range(1,num_epochs+1):
        running_loss = 0.0
        for i, data in enumerate(trainloader):
            inputs, labels = data 
            optimizer.zero_grad() 
            outputs = network(inputs)
            loss = criterion(outputs, labels.long()) 
            loss.backward() 
            optimizer.step() 
            running_loss += loss.item() 
                
        avg_train_loss = running_loss / len(trainloader) 
        avg_train_loss_list.append(avg_train_loss)
        epoch_list.append(epoch)

    network.eval()
    with torch.no_grad():  # stops gradient tracking

        y_val_logits = network(x_val_tensor).detach() 
        y_val_pred = torch.argmax(y_val_logits, dim=1) 
        
        baseline = balanced_accuracy_score(
            y_val_tensor,
            y_val_pred #calculates a balanced accuracy score using the optimised model
        ) #baseline balanced accuracy

        importances = [] #empty list generation
        
        for col in x_val.columns:
            X_perm = x_val.copy() #makes a copy of the df
            X_perm[col] = np.random.permutation(X_perm[col]) #goes through the columns in the copied df and shuffles the features
            
            x_perm_array = X_perm.to_numpy()
            x_perm_tensor = torch.Tensor(x_perm_array)
            y_val_logits = network(x_perm_tensor).detach() 
            
            y_perm_pred = torch.argmax(y_val_logits, dim=1)
            score = balanced_accuracy_score(
                y_val,
                y_perm_pred
            ) #calculates a balanced accuracy score with shuffled features

            importances.append(baseline - score)
    return importances #the list of all the feature importances
network = DNN_3_layer()
importances = pd.DataFrame(test_SGD_metrics(network, 0.14, 100, trainloader, x_train_tensor, y_train_tensor, x_val_tensor, y_val_tensor, x_test_tensor, y_test_tensor))

In [26]:
df_importances = pd.DataFrame({
    "feature": x_val.columns,
    "importance": importances[0]
})
fr_features = df_importances[df_importances["feature"].str.startswith("fr_")]

# Get top 25
top25_fr = fr_features.nlargest(25, "importance").reset_index(drop=True)

In [28]:
top25_fr

Unnamed: 0,feature,importance
0,fr_halogen,0.016814
1,fr_aniline,0.016641
2,fr_bicyclic,0.014103
3,fr_phenol,0.011912
4,fr_Ar_N,0.011566
5,fr_hdrzone,0.01122
6,fr_alkyl_halide,0.010557
7,fr_Ndealkylation1,0.010384
8,fr_imidazole,0.010384
9,fr_sulfide,0.010211


let's try permutation feature importance.
Shuffle each column, if this increases the model error then this is a significant feature.