# **Neural Networks**

In [4]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.preprocessing import LabelEncoder

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout, Bidirectional, LSTM
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

In [5]:
# Sanity check: list the GPUs TensorFlow has detected on this machine.
gpus = tf.config.list_physical_devices('GPU')
print(f"Available GPU devices: {gpus}")

# Second view of the same information: the GPU device name reported by tf.test.
print(f"GPU Device: {tf.test.gpu_device_name()}")

# Turn on device-placement logging for the TensorFlow operations that follow.
tf.debugging.set_log_device_placement(True)


Available GPU devices: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
GPU Device: /device:GPU:0


In [6]:
# Relative path to the cleaned, merged protein CSV used throughout the notebook.
data_path = r"../data/prototype/merged_protein_data_cleaned.csv"

# Read the whole table into memory and preview the first five rows.
df = pd.read_csv(data_path)
df.head(5)


Unnamed: 0,sequence,classification
0,MVLSEGEWQLVLHVWAKVEADVAGHGQDILIRLFKSHPETLEKFDR...,OXYGEN TRANSPORT
1,MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAAKSE...,HYDROLASE(O-GLYCOSYL)
2,MVLSEGEWQLVLHVWAKVEADVAGHGQDILIRLFKSHPETLEKFDR...,OXYGEN TRANSPORT
3,MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNSLDAAK...,HYDROLASE(O-GLYCOSYL)
4,MVLSEGEWQLVLHVWAKVEADVAGHGQDILIRLFKSHPETLEKFDR...,OXYGEN TRANSPORT


In [7]:
# Empirical class priors: rows per classification divided by the total row count.
total_rows = len(df)
class_counts = df['classification'].value_counts()
class_priors = class_counts / total_rows

# Header (preceded by a blank line) followed by the priors Series.
print("\nTarget class priors:", class_priors, sep="\n")



Target class priors:
classification
HYDROLASE                            0.133795
TRANSFERASE                          0.105174
OXIDOREDUCTASE                       0.099102
IMMUNE SYSTEM                        0.045088
LYASE                                0.033732
                                       ...   
STRUCTURAL PROTEIN, CELL CYCLE       0.000003
KINASE (GLYCOGEN METABOLISM)         0.000003
SURFACE GLYCOPROTEIN                 0.000003
IMMUNE SYSTEM,HYDROLASE INHIBITOR    0.000003
antimicrobial                        0.000003
Name: count, Length: 4468, dtype: float64


In [8]:
# Number of distinct classification labels: value_counts() has one entry per label.
label_frequencies = df['classification'].value_counts()
len(label_frequencies)

4468

In [9]:
# Map every classification string to an integer id and store it in a 'label' column.
le = LabelEncoder()
df['label'] = le.fit_transform(df['classification'])

# Show the class-name -> integer-id mapping; ids follow the order of le.classes_.
print("Label encoding mapping:")
print({class_name: class_id for class_id, class_name in enumerate(le.classes_)})

Label encoding mapping:
{'3-EPIMERASE': 0, "5'-3' EXO/ENDO NUCLEASE": 1, 'ACARBOSE': 2, 'ACETYLATION': 3, 'ACETYLATION AND ACTIN-BINDING': 4, 'ACETYLCHOLINE BINDING PROTEIN': 5, 'ACETYLCHOLINE RECEPTOR': 6, 'ACETYLCHOLINE RECEPTOR ANTAGONIST': 7, 'ACETYLCHOLINE-BINDING PROTEIN': 8, 'ACETYLTRANSFERASE': 9, 'ACID ANHYDRIDE HYDROLASE': 10, 'ACID PROTEASE': 11, 'ACTIN BINDING': 12, 'ACTIN BINDING PEPTIDE': 13, 'ACTIN BINDING PROTEIN': 14, 'ACTIN BINDING, STRUCTURAL PROTEIN': 15, 'ACTIN CYTOSKELETON': 16, 'ACTIN DEPOLYMERIZATION FACTOR': 17, 'ACTIN-BINDING': 18, 'ACTIN-BINDING PROTEIN': 19, 'ACTIN-BINDING PROTEIN/PEPTIDE': 20, 'ACTIN-BINDING PROTEIN/PROTEIN BINDING': 21, 'ACTIVATION DOMAIN': 22, 'ACTIVE-SITE CARBOXYMETHYLATION': 23, 'ACUTE-PHASE PROTEIN': 24, 'ACYL-COENZYME A BINDING PROTEIN': 25, 'ACYLPHOSPHATASE': 26, 'ACYLTRANSFERASE': 27, 'ADAPTOR PROTEIN CONTAINING SH2 AND SH3': 28, 'ADENOSINE BINDING PROTEIN': 29, 'ADENOVIRUS': 30, 'ADENYLATE KINASE': 31, 'ADHESIN': 32, 'ADHESION': 33

In [10]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable.
# No stratify argument is passed here.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
print("Train size: {}, Test size: {}".format(len(train_df), len(test_df)))

Train size: 277056, Test size: 69265


In [None]:
# Character-level tokenizer: each character of a sequence becomes one token.
# The vocabulary is fitted on the training split only.
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts(train_df['sequence'])

# One extra slot (the +1) is reserved for the padding token.
vocab_size = 1 + len(tokenizer.word_index)
print(f"Vocabulary size: {vocab_size}")

In [None]:
# Convert each sequence string into a list of integer token ids.
X_train_seq = tokenizer.texts_to_sequences(train_df['sequence'])
X_test_seq = tokenizer.texts_to_sequences(test_df['sequence'])

# Longest tokenized sequence found in the training split.
raw_max_length = max(map(len, X_train_seq))

# Never use more than 1000 positions, to avoid GPU memory issues.
max_length = min(raw_max_length, 1000)
if max_length < raw_max_length:
    print(f"Truncating sequences from max length {raw_max_length} to {max_length}")

Truncating sequences from max length 5037 to 1000


In [None]:
# Both splits are padded/truncated identically: fixed length max_length,
# with padding and truncation both applied at the end ("post") of each sequence.
pad_kwargs = dict(maxlen=max_length, padding="post", truncating="post")
X_train = pad_sequences(X_train_seq, **pad_kwargs)
X_test = pad_sequences(X_test_seq, **pad_kwargs)
print(f"Final sequence length used: {max_length}")

Final sequence length used: 1000


In [None]:
# Integer targets for each split, taken from the encoded 'label' column.
y_train, y_test = train_df['label'].values, test_df['label'].values

# The output layer needs one unit per class known to the label encoder.
num_classes = len(le.classes_)
print(f"Number of classes: {num_classes}")

Number of classes: 4468


In [None]:
# Size of the learned per-token embedding vectors (increased embedding dimension).
embedding_dim = 100

# Layers in execution order: embedding -> convolution -> BiLSTM -> pooling -> classifier head.
layer_stack = [
    # Look up a trainable embedding_dim-sized vector for every token id.
    Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length),
    # 'same' padding keeps the sequence length unchanged after the convolution.
    Conv1D(filters=256, kernel_size=5, activation='relu', padding='same'),
    Dropout(0.3),
    # return_sequences=True keeps one output per position for the pooling layer below.
    Bidirectional(LSTM(128, return_sequences=True)),
    # Collapse the sequence axis by taking the maximum of each feature.
    GlobalMaxPooling1D(),
    Dense(256, activation='relu'),
    Dropout(0.5),
    # One softmax output per class.
    Dense(num_classes, activation='softmax'),
]
model = Sequential(layer_stack)

In [None]:
# Metrics tracked during training: plain accuracy and top-5 accuracy.
# The explicit names become the keys of history.history ('accuracy', 'top_5_accuracy').
tracked_metrics = [
    tf.keras.metrics.SparseCategoricalAccuracy(name='accuracy'),
    tf.keras.metrics.SparseTopKCategoricalAccuracy(k=5, name='top_5_accuracy'),
]

# The sparse loss matches the integer (not one-hot) labels in y_train / y_test.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=tracked_metrics,
)
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 1000, 100)         2600      
                                                                 
 conv1d (Conv1D)             (None, 1000, 256)         128256    
                                                                 
 dropout (Dropout)           (None, 1000, 256)         0         
                                                                 
 bidirectional (Bidirectiona  (None, 1000, 256)        394240    
 l)                                                              
                                                                 
 global_max_pooling1d (Globa  (None, 256)              0         
 lMaxPooling1D)                                                  
                                                                 
 dense (Dense)               (None, 256)               6

In [None]:
# All artefacts of this architecture live under models/<model_name>/.
model_name = "Arc-1-NN-BiLSTM"
checkpoint_dir = os.path.join("models", model_name)
os.makedirs(checkpoint_dir, exist_ok=True)

# File-name template; the {epoch} and {val_loss} placeholders are filled in by ModelCheckpoint.
checkpoint_path = os.path.join(checkpoint_dir, "model_epoch{epoch:02d}_val_loss{val_loss:.2f}.h5")

# Stop after 5 epochs without val_loss improvement and roll back to the best weights.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
)

# Write a checkpoint only when val_loss improves.
model_checkpoint = ModelCheckpoint(
    checkpoint_path,
    monitor='val_loss',
    save_best_only=True,
    verbose=1,
)


In [None]:
# Train the network, holding out 10% of the training arrays for validation.
#
# The epoch budget must be larger than the EarlyStopping patience (5 epochs):
# with a budget of only 5 epochs the callback can never stop training early,
# because at most 4 non-improving epochs can follow the first one. 50 epochs
# matches the recorded run ("Epoch 1/50") and leaves the stopping decision to
# the early_stopping callback.
history = model.fit(
    X_train, y_train,
    epochs=50,
    batch_size=128,
    validation_split=0.1,  # using 10% of training data for validation
    callbacks=[early_stopping, model_checkpoint]
)

Epoch 1/50

In [None]:
# Save the model as it stands after training, next to the per-epoch checkpoints.
# The file name is derived from model_name so it cannot drift from the checkpoint
# directory name if the model is renamed.
final_model_path = os.path.join(checkpoint_dir, f"{model_name}.h5")
model.save(final_model_path)
print("Final model saved at:", final_model_path)

In [None]:
# Two side-by-side panels: loss on the left, accuracy metrics on the right.
fig, (loss_ax, acc_ax) = plt.subplots(1, 2, figsize=(12, 5))

# Plot Loss Curves.
for history_key, curve_label in (
    ('loss', 'Train Loss'),
    ('val_loss', 'Validation Loss'),
):
    loss_ax.plot(history.history[history_key], label=curve_label)
loss_ax.set_xlabel("Epoch")
loss_ax.set_ylabel("Loss")
loss_ax.legend()
loss_ax.set_title("Loss Curves")

# Plot Accuracy Curves (including Top-5 Accuracy).
for history_key, curve_label in (
    ('accuracy', 'Train Accuracy'),
    ('val_accuracy', 'Validation Accuracy'),
    ('top_5_accuracy', 'Train Top-5 Accuracy'),
    ('val_top_5_accuracy', 'Validation Top-5 Accuracy'),
):
    acc_ax.plot(history.history[history_key], label=curve_label)
acc_ax.set_xlabel("Epoch")
acc_ax.set_ylabel("Accuracy")
acc_ax.legend()
acc_ax.set_title("Accuracy Curves")

fig.tight_layout()
plt.show()

In [None]:
# Class probabilities for every test sequence; the predicted class is the arg-max.
y_pred_probs = model.predict(X_test)
y_pred = y_pred_probs.argmax(axis=1)

# Precision/recall/F1 are support-weighted averages over classes; zero_division=0
# scores undefined per-class ratios as 0.
weighted_kwargs = dict(average='weighted', zero_division=0)
accuracy_val = accuracy_score(y_test, y_pred)
precision_val = precision_score(y_test, y_pred, **weighted_kwargs)
recall_val = recall_score(y_test, y_pred, **weighted_kwargs)
f1_val = f1_score(y_test, y_pred, **weighted_kwargs)

print("Evaluation Metrics on Test Set:")
# Labels are left-aligned to 15 characters so the colons line up.
for metric_label, metric_value in (
    ("Accuracy", accuracy_val),
    ("Precision (W)", precision_val),
    ("Recall (W)", recall_val),
    ("F1 Score (W)", f1_val),
):
    print(f"{metric_label:<15}: {metric_value:.4f}")

In [None]:
# Per-class report as a dict, so the weighted averages can be read out below.
#
# `labels` must be passed explicitly: target_names has one entry per encoder class,
# but without `labels` scikit-learn only considers classes that occur in
# y_test / y_pred. Very rare classes can be missing from the test split, and the
# resulting length mismatch raises a ValueError.
all_label_ids = np.arange(len(le.classes_))
report = classification_report(
    y_test,
    y_pred,
    labels=all_label_ids,
    target_names=[str(cls) for cls in le.classes_],
    zero_division=0,
    output_dict=True,
)

weighted_avg = report['weighted avg']
print("Detailed Weighted Avg - Precision: {:.4f}, Recall: {:.4f}, F1: {:.4f}".format(
    weighted_avg['precision'],
    weighted_avg['recall'],
    weighted_avg['f1-score']
))