In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.io import loadmat

from src.utils import get_cc_mat, get_dataset

### Get data

In [None]:
df = get_dataset(
    labels_file="./data/class_vector_train_ref.mat", 
    params_file="./data/training_set_ref.mat",
    names_file="./data/paremeterNames.mat"
)

print(f'***Dataset shape: {df.shape}')

# Value counts of the stability label
print('***Value counts of the stability label:')
print(df['label'].value_counts(normalize=True).round(4))

df.head()

### Feature Selection

In [4]:
# from mrmr import mrmr_classif

# K = int(df.drop('label', axis=1).shape[1] / 3)
# print(f'***Number of features to select: {K}')
# selected_features = mrmr_classif(X=df.drop('label', axis=1), y=df['label'], K=K)

### ML

### FFC

In [None]:
enzyme, commonEnz, allEnzymes, \
    commonConCoeff, allConCoeff = get_cc_mat('./data/ccXTR_ref.mat')


In [None]:
# Change plot style
plt.style.use('ggplot')

# Calculate the 1st and 3rd quartiles of commonConCoeff values
q1 = commonConCoeff.quantile(0.25)
q3 = commonConCoeff.quantile(0.75)

# Plot the mean of commonConCoeff values
plt.figure(figsize=(10, 3))
plt.barh(commonConCoeff.columns[::-1], commonConCoeff.mean()[::-1])
plt.axvline(x=0, color='black', linestyle='-')
plt.plot(q1, commonConCoeff.columns, '|', color='black', label='Q1', alpha=1)
plt.plot(q3, commonConCoeff.columns, '|', color='black', label='Q3', alpha=1)
for i, enzyme in enumerate(commonConCoeff.columns):
    plt.plot([q1[i], q3[i]], [enzyme, enzyme], color='black')
plt.title("CC_XTR")
plt.grid()
plt.show()

In [None]:
idx_HXK = commonConCoeff[commonConCoeff['HXK'] < 0].index
df_HXK = df.drop('label', axis=1)
df_HXK['label'] = [1 if i in idx_HXK else 0 for i in df_HXK.index]
df_HXK['label'].value_counts(normalize=True).round(4)

In [8]:
# # Keep only the features selected by mRMR
# df_HXK = df_HXK[selected_features + ['label']]
# print(f'***Dataset shape: {df_HXK.shape}')

In [None]:
from src.machinelearning import train_xgboost
from sklearn.model_selection import train_test_split

X = df_HXK.drop('label', axis=1)
y = df_HXK['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.99, random_state=42)

print(f'Traininig set shape: {X_train.shape}')
print(f'Test set shape: {X_test.shape}')

xgb_model = train_xgboost(X_train, y_train)

In [11]:
stop

NameError: name 'stop' is not defined

In [16]:
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

lr = XGBClassifier()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_test)

from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, matthews_corrcoef

print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
print(f'F1 score: {f1_score(y_test, y_pred)}')
print(f'ROC AUC score: {roc_auc_score(y_test, y_pred)}')
print(f'Matthews correlation coefficient: {matthews_corrcoef(y_test, y_pred)}')


Accuracy: 0.6312272727272727
F1 score: 0.5980545967994979
ROC AUC score: 0.6285600085944588
Matthews correlation coefficient: 0.2581927271115636


In [None]:
from sklearn.model_selection import train_test_split

X = df_HXK.drop('label', axis=1)
y = df_HXK['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.8, random_state=42)

print(f'Traininig set shape: {X_train.shape}')
print(f'Test set shape: {X_test.shape}')

# Train ANNClassifier
import torch
from src.machinelearning import ANNClassifier, train, evaluate
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

input_dim = X_train.shape[1]
hidden_dim = 2048
output_dim = 1
hidden_layers = 3

model = ANNClassifier(input_dim, hidden_dim, output_dim, hidden_layers).to(device)

model = train(model, X_train, y_train, num_epochs=1000, learning_rate=0.001, batch_size=2048)

In [47]:
# Save trained model
torch.save(model.state_dict(), './ann_classifier.pth')

In [None]:
# Evaluate model
evaluate(model, X_test, y_test)

In [None]:
aaa

In [66]:
import shap

# Calculate shap values for the ANN model
def f(x):
    return model(torch.tensor(x, dtype=torch.float32)).detach().numpy()

X_shap = shap.utils.sample(X_train, 100)
explainer = shap.KernelExplainer(f, X_shap)

In [None]:
shap_values = explainer.shap_values(X_train.sample(300)) 

In [None]:
fig = plt.figure()
shap.summary_plot(shap_values[0], X_shap, plot_type='dot', show=False)
plt.gcf().set_size_inches(10,4)
plt.show()
