<a href="https://colab.research.google.com/github/gab892/gab892/blob/main/eghosa_enzymedemo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
!pip install biopython tensorflow scikit-learn matplotlib numpy
import numpy as np
from Bio.Seq import Seq
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Conv1D, MaxPooling1D, Flatten, Embedding
import matplotlib.pyplot as plt

Collecting biopython
  Downloading biopython-1.85-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Downloading biopython-1.85-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m27.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: biopython
Successfully installed biopython-1.85


In [5]:
# ======================
# STEP 0: Install & Import
# ======================
!pip install biopython tensorflow scikit-learn matplotlib numpy pandas
import numpy as np
from Bio import SeqIO
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Conv1D, MaxPooling1D, Flatten, Embedding
import matplotlib.pyplot as plt

# ======================
# STEP 1: Load and Filter Data
# ======================
def load_fasta(filename):
    """Sort the records of a FASTA file into LDH and malate-dehydrogenase lists.

    Each record is classified by a case-insensitive substring match on its
    description line. "lactate dehydrogenase" is checked first, so a record
    mentioning both enzymes is counted as LDH. Records matching neither
    phrase are dropped.

    Args:
        filename: Path to the FASTA file to parse.

    Returns:
        A ``(ldh_seqs, other_seqs)`` tuple of lists of sequence strings, where
        ``other_seqs`` holds the malate-dehydrogenase records.
    """
    ldh_seqs, other_seqs = [], []

    for record in SeqIO.parse(filename, "fasta"):
        description = record.description.lower()

        # Pick the destination list first; skip records of neither enzyme.
        if "lactate dehydrogenase" in description:
            bucket = ldh_seqs
        elif "malate dehydrogenase" in description:  # Common similar enzyme
            bucket = other_seqs
        else:
            continue

        bucket.append(str(record.seq))

    return ldh_seqs, other_seqs

# The FASTA file must already be present in the working directory
# (in Colab: upload it to the session before running this cell).
fasta_path = "uniprotkb_lactate_dehydrogenase_AND_mod_2025_05_27.fasta"
ldh_seqs, other_seqs = load_fasta(fasta_path)

# Report the size of each class so an empty or tiny class is noticed early.
print(f"Found {len(ldh_seqs)} LDH sequences")
print(f"Found {len(other_seqs)} non-LDH sequences")

# ======================
# STEP 2: Prepare Dataset
# ======================
# Top up the negative class with synthetic proteins until it has at least 50
# members. A dedicated seeded generator keeps the synthetic sequences identical
# from run to run, matching the fixed random_state used for the split below.
# NOTE(review): every synthetic negative is exactly 300 residues long, so the
# classifier may be able to separate them by length alone - consider sampling
# lengths from the real sequences.
n_missing_negatives = 50 - len(other_seqs)
if n_missing_negatives > 0:
    amino_acids = list('ACDEFGHIKLMNPQRSTVWY')
    rng = np.random.default_rng(42)
    other_seqs.extend(
        ''.join(rng.choice(amino_acids, size=300))
        for _ in range(n_missing_negatives)
    )

# Combine and label data; positives come first, then negatives.
sequences = ldh_seqs + other_seqs
labels = [1]*len(ldh_seqs) + [0]*len(other_seqs)  # 1=LDH, 0=non-LDH

# ======================
# STEP 3: Encode Sequences
# ======================
max_length = 500  # Pad/truncate every encoded sequence to this length
# Map the 20 standard amino-acid letters to 1..20; 0 is reserved for padding.
aa_to_int = {aa: i + 1 for i, aa in enumerate('ACDEFGHIKLMNPQRSTVWY')}

def encode_seq(seq):
    """Convert a protein sequence into a fixed-length list of integer codes.

    Args:
        seq: Amino-acid sequence as a string of one-letter codes. Letter case
            is ignored.

    Returns:
        A list of exactly ``max_length`` ints. Standard residues map to 1-20;
        unrecognised symbols and padding positions are 0. Sequences longer
        than ``max_length`` are truncated.
    """
    # Upper-case residue by residue (not the whole string) so the length is
    # preserved and lower-case input is not silently encoded as padding.
    encoded = [aa_to_int.get(aa.upper(), 0) for aa in seq[:max_length]]
    # Right-pad with zeros; the multiplier is 0 for full-length sequences.
    encoded.extend([0] * (max_length - len(encoded)))
    return encoded

# Encode every sequence into a row of integer codes and pair it with its label.
X = np.array([encode_seq(seq) for seq in sequences])
y = np.array(labels)

# Split data. Stratify on the label so train and test keep the same LDH /
# non-LDH ratio; fall back to a plain split when a class has fewer than two
# members, because stratification would raise in that case.
class_counts = np.bincount(y.astype(int), minlength=2)
stratify_labels = y if class_counts.min() >= 2 else None
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=stratify_labels
)

# ======================
# STEP 4: Build Model
# ======================
# 1D-CNN binary classifier over integer-encoded residues.
# The input shape is declared with an explicit Input layer instead of the
# Embedding `input_length` argument, which recent Keras releases deprecate;
# this also builds the model up front so summary() can report shapes.
model = Sequential([
    tf.keras.Input(shape=(max_length,)),
    # Vocabulary = 20 residues + index 0 (padding / unknown symbol).
    Embedding(input_dim=len(aa_to_int)+1, output_dim=32),
    Conv1D(64, kernel_size=5, activation='relu'),
    MaxPooling1D(pool_size=2),
    Conv1D(128, kernel_size=3, activation='relu'),
    MaxPooling1D(pool_size=2),
    Flatten(),
    Dense(64, activation='relu'),
    Dropout(0.5),
    # Single sigmoid unit: output is the predicted probability of class 1 (LDH).
    Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

# ======================
# STEP 5: Train Model
# ======================
# Training hyper-parameters. The held-out split doubles as the validation set,
# so the reported validation accuracy is not an independent test estimate.
training_options = {
    "epochs": 20,
    "batch_size": 32,
    "validation_data": (X_test, y_test),
}
history = model.fit(X_train, y_train, **training_options)

# ======================
# STEP 6: Evaluate
# ======================
# Plot per-epoch accuracy for the training and validation data on one figure.
plt.figure(figsize=(10, 4))
for metric_key, curve_label in (('accuracy', 'Train'), ('val_accuracy', 'Validation')):
    plt.plot(history.history[metric_key], label=curve_label)
plt.title('Model Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

# Prediction function
def predict_ldh(sequence):
    """Classify one protein sequence with the trained model.

    Args:
        sequence: Amino-acid sequence as a string of one-letter codes.

    Returns:
        A ``(label, confidence)`` tuple: ``label`` is "LDH" when the predicted
        probability exceeds 0.5 and "Not LDH" otherwise; ``confidence`` is a
        string giving the probability of the chosen label as a percentage.
    """
    # The model expects a batch, so wrap the single encoded sequence.
    batch = np.array([encode_seq(sequence)])
    prob = model.predict(batch)[0][0]

    if prob > 0.5:
        verdict = "LDH"
    else:
        verdict = "Not LDH"
    return verdict, f"{max(prob, 1-prob)*100:.1f}% confidence"

# Test cases: two hard-coded sequences used as a quick sanity check of the
# trained model, one expected to be classified as LDH and one as non-LDH.
# NOTE(review): presumably a real lactate dehydrogenase and a real non-LDH
# (possibly malate dehydrogenase) sequence - confirm their provenance.
test_ldh = "MATLKDQLIYNLLKEEQTPQNKITVVGVGAVGMACAISILMKDLADELALVDVIEDKLKGEMMDLQHGSLFLRTPKIVSGKDYNVTANSKLVIITAGARQQEGESRLNLVQRNVNIFKFIIPNVVKYSPNCKLLIVSNPVDILTYVAWKISGFPKNRVIGSGCNLDSARFRYLMGERLGVHPLSCHGWVLGEHGDSSVPVWSGMNVAGVSLKTLHPDLGTDKDKEQWKEVHKQVVESAYEVIKLKGYTSWAIGLSVADLAESIMKNLRRVHPVSTMIKGLYGIKDDVFLSVPCILGQNGISDLVKVTLTSEEEARLKKSADTLWGIQKELQF"
test_non_ldh = "MSEPIRVLVTGAAGQIAYSLLYSIGNGSVFGKDQPIILVLLDITPMMGVLDGVLMELQDCALPLLKDVIATDKEDVAFKDLDVAILVGSMPRREGMERKDLLKANVKIFKSQGAALDKYAKKSVKVIVVGNPANTNCLTASKSAPSIPKENFSCLTRLDHNRAKAQIALKLGVTANDVKNVIIWGNHSSTQYPDVNHAKVKLQGKEVGVYEALKDDSWLKGEFVTTVQQRGAAVIKARKLSSAMSAAKAICDHVRDIWFGTPEGEFVSMGVISDGNSYGVPDDLLYSFPVVIKNKTWKFVEGLPINDFSREKMDLTAKELTEEKESAFEFLSSA"

# Each call prints a (label, confidence-string) tuple returned by predict_ldh.
print("LDH Test:", predict_ldh(test_ldh))
print("Non-LDH Test:", predict_ldh(test_non_ldh))

# Save model to the working directory.
# NOTE(review): the ".h5" extension selects the HDF5 format, which newer Keras
# releases appear to treat as legacy - consider the native ".keras" format.
model.save("ldh_predictor.h5")
print("Model saved as 'ldh_predictor.h5'")



FileNotFoundError: [Errno 2] No such file or directory: 'uniprotkb_lactate_dehydrogenase_AND_mod_2025_05_27.fasta'

# New section

In [None]:
# ======================
# STEP 0: Install & Import
# ======================
!pip install biopython tensorflow scikit-learn matplotlib numpy pandas
import numpy as np
from Bio import SeqIO
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Conv1D, MaxPooling1D, Flatten, Embedding
import matplotlib.pyplot as plt

# ======================
# STEP 1: Load and Filter Data
# ======================
def load_fasta(filename):
    """Sort the records of a FASTA file into LDH and malate-dehydrogenase lists.

    Each record is classified by a case-insensitive substring match on its
    description line. "lactate dehydrogenase" is checked first, so a record
    mentioning both enzymes is counted as LDH. Records matching neither
    phrase are dropped.

    Args:
        filename: Path to the FASTA file to parse.

    Returns:
        A ``(ldh_seqs, other_seqs)`` tuple of lists of sequence strings, where
        ``other_seqs`` holds the malate-dehydrogenase records.
    """
    ldh_seqs, other_seqs = [], []

    for record in SeqIO.parse(filename, "fasta"):
        description = record.description.lower()

        # Pick the destination list first; skip records of neither enzyme.
        if "lactate dehydrogenase" in description:
            bucket = ldh_seqs
        elif "malate dehydrogenase" in description:  # Common similar enzyme
            bucket = other_seqs
        else:
            continue

        bucket.append(str(record.seq))

    return ldh_seqs, other_seqs

# The FASTA file must already be present in the working directory
# (in Colab: upload it to the session before running this cell).
fasta_path = "uniprotkb_lactate_dehydrogenase_AND_mod_2025_05_27.fasta"
ldh_seqs, other_seqs = load_fasta(fasta_path)

# Report the size of each class so an empty or tiny class is noticed early.
print(f"Found {len(ldh_seqs)} LDH sequences")
print(f"Found {len(other_seqs)} non-LDH sequences")

# ======================
# STEP 2: Prepare Dataset
# ======================
# Top up the negative class with synthetic proteins until it has at least 50
# members. A dedicated seeded generator keeps the synthetic sequences identical
# from run to run, matching the fixed random_state used for the split below.
# NOTE(review): every synthetic negative is exactly 300 residues long, so the
# classifier may be able to separate them by length alone - consider sampling
# lengths from the real sequences.
n_missing_negatives = 50 - len(other_seqs)
if n_missing_negatives > 0:
    amino_acids = list('ACDEFGHIKLMNPQRSTVWY')
    rng = np.random.default_rng(42)
    other_seqs.extend(
        ''.join(rng.choice(amino_acids, size=300))
        for _ in range(n_missing_negatives)
    )

# Combine and label data; positives come first, then negatives.
sequences = ldh_seqs + other_seqs
labels = [1]*len(ldh_seqs) + [0]*len(other_seqs)  # 1=LDH, 0=non-LDH

# ======================
# STEP 3: Encode Sequences
# ======================
max_length = 500  # Pad/truncate every encoded sequence to this length
# Map the 20 standard amino-acid letters to 1..20; 0 is reserved for padding.
aa_to_int = {aa: i + 1 for i, aa in enumerate('ACDEFGHIKLMNPQRSTVWY')}

def encode_seq(seq):
    """Convert a protein sequence into a fixed-length list of integer codes.

    Args:
        seq: Amino-acid sequence as a string of one-letter codes. Letter case
            is ignored.

    Returns:
        A list of exactly ``max_length`` ints. Standard residues map to 1-20;
        unrecognised symbols and padding positions are 0. Sequences longer
        than ``max_length`` are truncated.
    """
    # Upper-case residue by residue (not the whole string) so the length is
    # preserved and lower-case input is not silently encoded as padding.
    encoded = [aa_to_int.get(aa.upper(), 0) for aa in seq[:max_length]]
    # Right-pad with zeros; the multiplier is 0 for full-length sequences.
    encoded.extend([0] * (max_length - len(encoded)))
    return encoded

# Encode every sequence into a row of integer codes and pair it with its label.
X = np.array([encode_seq(seq) for seq in sequences])
y = np.array(labels)

# Split data. Stratify on the label so train and test keep the same LDH /
# non-LDH ratio; fall back to a plain split when a class has fewer than two
# members, because stratification would raise in that case.
class_counts = np.bincount(y.astype(int), minlength=2)
stratify_labels = y if class_counts.min() >= 2 else None
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=stratify_labels
)

# ======================
# STEP 4: Build Model
# ======================
# 1D-CNN binary classifier over integer-encoded residues.
# The input shape is declared with an explicit Input layer instead of the
# Embedding `input_length` argument, which recent Keras releases deprecate;
# this also builds the model up front so summary() can report shapes.
model = Sequential([
    tf.keras.Input(shape=(max_length,)),
    # Vocabulary = 20 residues + index 0 (padding / unknown symbol).
    Embedding(input_dim=len(aa_to_int)+1, output_dim=32),
    Conv1D(64, kernel_size=5, activation='relu'),
    MaxPooling1D(pool_size=2),
    Conv1D(128, kernel_size=3, activation='relu'),
    MaxPooling1D(pool_size=2),
    Flatten(),
    Dense(64, activation='relu'),
    Dropout(0.5),
    # Single sigmoid unit: output is the predicted probability of class 1 (LDH).
    Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

# ======================
# STEP 5: Train Model
# ======================
# Training hyper-parameters. The held-out split doubles as the validation set,
# so the reported validation accuracy is not an independent test estimate.
training_options = {
    "epochs": 20,
    "batch_size": 32,
    "validation_data": (X_test, y_test),
}
history = model.fit(X_train, y_train, **training_options)

# ======================
# STEP 6: Evaluate
# ======================
# Plot per-epoch accuracy for the training and validation data on one figure.
plt.figure(figsize=(10, 4))
for metric_key, curve_label in (('accuracy', 'Train'), ('val_accuracy', 'Validation')):
    plt.plot(history.history[metric_key], label=curve_label)
plt.title('Model Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

# Prediction function
def predict_ldh(sequence):
    """Classify one protein sequence with the trained model.

    Args:
        sequence: Amino-acid sequence as a string of one-letter codes.

    Returns:
        A ``(label, confidence)`` tuple: ``label`` is "LDH" when the predicted
        probability exceeds 0.5 and "Not LDH" otherwise; ``confidence`` is a
        string giving the probability of the chosen label as a percentage.
    """
    # The model expects a batch, so wrap the single encoded sequence.
    batch = np.array([encode_seq(sequence)])
    prob = model.predict(batch)[0][0]

    if prob > 0.5:
        verdict = "LDH"
    else:
        verdict = "Not LDH"
    return verdict, f"{max(prob, 1-prob)*100:.1f}% confidence"

# Test cases: two hard-coded sequences used as a quick sanity check of the
# trained model, one expected to be classified as LDH and one as non-LDH.
# NOTE(review): presumably a real lactate dehydrogenase and a real non-LDH
# (possibly malate dehydrogenase) sequence - confirm their provenance.
test_ldh = "MATLKDQLIYNLLKEEQTPQNKITVVGVGAVGMACAISILMKDLADELALVDVIEDKLKGEMMDLQHGSLFLRTPKIVSGKDYNVTANSKLVIITAGARQQEGESRLNLVQRNVNIFKFIIPNVVKYSPNCKLLIVSNPVDILTYVAWKISGFPKNRVIGSGCNLDSARFRYLMGERLGVHPLSCHGWVLGEHGDSSVPVWSGMNVAGVSLKTLHPDLGTDKDKEQWKEVHKQVVESAYEVIKLKGYTSWAIGLSVADLAESIMKNLRRVHPVSTMIKGLYGIKDDVFLSVPCILGQNGISDLVKVTLTSEEEARLKKSADTLWGIQKELQF"
test_non_ldh = "MSEPIRVLVTGAAGQIAYSLLYSIGNGSVFGKDQPIILVLLDITPMMGVLDGVLMELQDCALPLLKDVIATDKEDVAFKDLDVAILVGSMPRREGMERKDLLKANVKIFKSQGAALDKYAKKSVKVIVVGNPANTNCLTASKSAPSIPKENFSCLTRLDHNRAKAQIALKLGVTANDVKNVIIWGNHSSTQYPDVNHAKVKLQGKEVGVYEALKDDSWLKGEFVTTVQQRGAAVIKARKLSSAMSAAKAICDHVRDIWFGTPEGEFVSMGVISDGNSYGVPDDLLYSFPVVIKNKTWKFVEGLPINDFSREKMDLTAKELTEEKESAFEFLSSA"

# Each call prints a (label, confidence-string) tuple returned by predict_ldh.
print("LDH Test:", predict_ldh(test_ldh))
print("Non-LDH Test:", predict_ldh(test_non_ldh))

# Save model to the working directory.
# NOTE(review): the ".h5" extension selects the HDF5 format, which newer Keras
# releases appear to treat as legacy - consider the native ".keras" format.
model.save("ldh_predictor.h5")
print("Model saved as 'ldh_predictor.h5'")