In [5]:
import pandas as pd
import numpy as np

ukb_data_path = '/Users/fjosesala/Library/CloudStorage/GoogleDrive-fsalamancar@unal.edu.co/Shared drives/UKB_Data/Raw_data'

with open('/Users/fjosesala/Documents/GitHub/IBD_GNN-NN/NN/data/raw/proteins.txt', 'r') as txt_file:
    columns = [line.strip() for line in txt_file.readlines()]

protein_values_df = pd.read_csv(f'{ukb_data_path}/ibd0.tsv', sep='\t')
protein_values_df['class'] = np.where(protein_values_df['Disease'] == 'Control', 0, 1)
weight_matrix_df = pd.read_csv('/Users/fjosesala/Documents/GitHub/IBD_GNN-NN/NN/data/raw/protein_scores.csv', index_col=0)

  protein_values_df = pd.read_csv(f'{ukb_data_path}/ibd0.tsv', sep='\t')


In [6]:
# Count people per class
print(protein_values_df['class'].value_counts())

class
0    18999
1     3626
Name: count, dtype: int64


In [7]:
protein_values_matrix = protein_values_df[columns].values
# Replace nan with 0
protein_values_matrix = np.nan_to_num(protein_values_matrix)
weight_matrix = weight_matrix_df.values

print(protein_values_matrix.shape)
print(protein_values_matrix[:5])
print(weight_matrix.shape)
print(weight_matrix[:5])

assert protein_values_matrix.shape[1] == weight_matrix.shape[0], "Dimensions don't match for multiplication!"

# Multiply the matrices
result_matrix = np.dot(protein_values_matrix, weight_matrix)
print(result_matrix.shape)

(22625, 1983)
[[-0.1808   1.12915 -0.75435 ...  0.       0.       0.     ]
 [ 0.0956  -0.67875 -0.6113  ...  0.       0.       0.     ]
 [ 0.0952   0.01855 -0.66255 ...  0.       0.       0.     ]
 [ 0.       0.      -0.26505 ...  0.       0.       0.     ]
 [ 0.0414  -0.25105 -0.7799  ...  0.       0.       0.     ]]
(1983, 1983)
[[1. 0. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 [0. 0. 1. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
(22625, 1983)


In [8]:
# Print first rows of result matrix
print(result_matrix[:5])

[[ 1.87458735  1.12915    -0.6959844  ...  0.          1.2309898
   2.4537142 ]
 [ 0.431761   -0.67875    -0.41851385 ...  0.          0.390574
  -0.362337  ]
 [ 1.23370855  0.01855    -0.8137977  ...  0.         -0.2483794
  -1.6756953 ]
 [-1.1338386   0.         -0.26505    ...  0.         -0.1827758
  -0.24352635]
 [-2.4618103  -0.25105    -0.99157145 ...  0.         -0.2804594
  -1.3356045 ]]


In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from imblearn.over_sampling import BorderlineSMOTE
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
import tensorflow as tf

labels = protein_values_df['class'].values
X = result_matrix

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.2, random_state=42)

# Focal loss definition
def focal_loss(gamma=2., alpha=0.25):
    def loss(y_true, y_pred):
        y_pred = tf.clip_by_value(y_pred, 1e-7, 1 - 1e-7)
        cross_entropy = -y_true * tf.math.log(y_pred) - (1 - y_true) * tf.math.log(1 - y_pred)
        weight = alpha * y_true * tf.pow(1 - y_pred, gamma) + (1 - alpha) * (1 - y_true) * tf.pow(y_pred, gamma)
        return tf.reduce_mean(weight * cross_entropy)
    return loss

# Model builder function
def create_model(learning_rate=0.0005, neurons=512):
    model = Sequential()
    model.add(Input(shape=(X_train.shape[1],)))
    model.add(Dense(neurons, activation='relu'))
    model.add(BatchNormalization())
    model.add(Dropout(0.5))

    model.add(Dense(128, activation='relu'))
    model.add(BatchNormalization())
    model.add(Dropout(0.5))

    model.add(Dense(64, activation='relu'))
    model.add(BatchNormalization())
    model.add(Dropout(0.5))

    model.add(Dense(32, activation='relu'))
    model.add(BatchNormalization())

    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer=Adam(learning_rate=learning_rate), loss=focal_loss(), metrics=['accuracy'])
    return model

# Hyperparameters
learning_rates = [0.0001, 0.0005, 0.001]
neurons_list = [256, 512, 1024]
batch_sizes = [32, 64]
epochs_list = [20, 40]
threshold = 0.5

# Grid search
for lr in learning_rates:
    for neurons in neurons_list:
        for batch_size in batch_sizes:
            for epochs in epochs_list:
                print(f"\nTesting: lr={lr}, neurons={neurons}, batch_size={batch_size}, epochs={epochs}")

                # Apply BorderlineSMOTE
                smote = BorderlineSMOTE(random_state=42)
                X_train_bal, y_train_bal = smote.fit_resample(X_train, y_train)

                # Build model
                model = create_model(learning_rate=lr, neurons=neurons)

                # Early stopping
                early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

                # Train
                history = model.fit(
                    X_train_bal, y_train_bal,
                    epochs=epochs,
                    batch_size=batch_size,
                    validation_data=(X_test, y_test),
                    callbacks=[early_stop],
                    verbose=0
                )

                # Evaluate
                loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
                print(f"Test Loss: {loss:.4f} | Test Accuracy: {accuracy:.4f}")

                # Predict with threshold tuning
                y_pred_probs = model.predict(X_test)
                y_pred = (y_pred_probs > threshold).astype(int)

                # Report
                print(classification_report(y_test, y_pred, zero_division=0))



Testing: lr=0.0001, neurons=256, batch_size=32, epochs=20




Test Loss: 0.0511 | Test Accuracy: 0.8391
[1m142/142[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step  
              precision    recall  f1-score   support

           0       0.84      1.00      0.91      3797
           1       0.00      0.00      0.00       728

    accuracy                           0.84      4525
   macro avg       0.42      0.50      0.46      4525
weighted avg       0.70      0.84      0.77      4525


Testing: lr=0.0001, neurons=256, batch_size=32, epochs=40


