In [None]:
!pip install tensorflow
!pip install pandas
!pip install matplotlib
!pip install seaborn
!pip install numpy



In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Conv1D, MaxPooling1D, Flatten, Dense, Dropout, concatenate
from tensorflow.keras.layers import GlobalMaxPooling1D
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
!pip install gdown

import gdown
import pandas as pd

file_id = '1mRsbSSICz0_WrVJsfrA1Kp14Oxx_FSE8'
url = f'https://drive.google.com/uc?id={file_id}'

output = 'scop_hs_for_cnn.csv'
gdown.download(url, output, quiet=False)

data = pd.read_csv(output)
print(data.head())




Downloading...
From: https://drive.google.com/uc?id=1mRsbSSICz0_WrVJsfrA1Kp14Oxx_FSE8
To: /content/scop_hs_for_cnn.csv
100%|██████████| 1.54M/1.54M [00:00<00:00, 126MB/s]

                                            Sequence       CL       CF  \
0  DKLPYKVADIGLAAWGRKALDIAENEMPGLMRMRERYSASKPLKGA...  1000002  2000005   
1  KIIVKHVTVIGGGLMGAGIAQVAAATGHTVVLVDQTEDILAKSKKG...  1000002  2000154   
2  VASYDYLVIGGGSGGLASARRAAELGARAAVVESHKLGGTCVNVGC...  1000002  2000021   
3  NKCDVVVVGGGISGMAAAKLLHDSGLNVVVLEARDRVGGRTYTLRN...  1000002  2000021   
4  PTKKTGKVIIIGSGVSGLAAARQLQSFGMDVTLLEARDRVGGRVAT...  1000002  2000021   

        FA                        CL_NAME                            CF_NAME  \
0  4000098  Alpha and beta proteins (a/b)            Canonical Rossmann fold   
1  4000107  Alpha and beta proteins (a/b)  6PGDH-type extended Rossmann fold   
2  4000121  Alpha and beta proteins (a/b)          FAD/NAD(P)-binding domain   
3  4000128  Alpha and beta proteins (a/b)          FAD/NAD(P)-binding domain   
4  4000128  Alpha and beta proteins (a/b)          FAD/NAD(P)-binding domain   

                                         FA_NAME  
0           S-adenosylh




In [None]:
# Number of characters in every entry of the 'Sequence' column.
sequence_lengths = data['Sequence'].map(len)

# Distribution summary of the lengths at a fixed set of quantiles.
percentiles = sequence_lengths.quantile([0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99])

max_len = sequence_lengths.max()
print(f"Panjang maksimum sequence: {max_len}")
print(f"Rata-rata panjang sequence: {sequence_lengths.mean():.2f}")

# The 95th-percentile length (truncated to an int) is the fixed input length
# used later for padding and for the model's input shape.
max_len_95th = int(percentiles.loc[0.95])
print(f"\nPanjang maksimum (persentil ke-95): {max_len_95th}")

Panjang maksimum sequence: 1564
Rata-rata panjang sequence: 171.10

Panjang maksimum (persentil ke-95): 390


In [None]:
# One-letter codes of the 20 amino acids recognised by the encoder.
amino_acids = 'ACDEFGHIKLMNPQRSTVWY'

# Map each letter to a 1-based integer id; 0 is deliberately left unassigned.
aa_to_int = dict(zip(amino_acids, range(1, len(amino_acids) + 1)))

def sequence_to_int(sequence):
    """Encode a sequence string as a list of integer ids.

    Each character is looked up in ``aa_to_int``; any character that is not
    one of the letters in ``amino_acids`` (lookup is case-sensitive) is
    encoded as 0.

    Args:
        sequence: iterable of single-character residue codes (typically a str).

    Returns:
        A list of ints with one entry per character of ``sequence``.
    """
    encoded = []
    for residue in sequence:
        encoded.append(aa_to_int.get(residue, 0))
    return encoded


In [None]:
# Integer-encode every sequence, then force them all to length max_len_95th:
# shorter sequences are padded at the end, longer ones are cut at the end.
X_int = list(map(sequence_to_int, data['Sequence']))
X_padded = pad_sequences(
    X_int,
    maxlen=max_len_95th,
    padding='post',
    truncating='post',
)

print(X_padded.shape)

(5703, 390)


In [None]:
# One encoder per label column; each maps the raw ids in that column to
# contiguous integer class indices. The fitted encoders are kept so that
# predictions can be mapped back to the original ids.
le_cl = LabelEncoder()
y_cl = le_cl.fit_transform(data['CL'])

le_cf = LabelEncoder()
y_cf = le_cf.fit_transform(data['CF'])

le_fa = LabelEncoder()
y_fa = le_fa.fit_transform(data['FA'])

# Report how many distinct classes each level has.
print(f"Jumlah kelas Class (CL): {len(le_cl.classes_)}")
print(f"Jumlah kelas Fold (CF): {len(le_cf.classes_)}")
print(f"Jumlah kelas Family (FA): {len(le_fa.classes_)}")


Jumlah kelas Class (CL): 5
Jumlah kelas Fold (CF): 587
Jumlah kelas Family (FA): 1667


In [None]:
# Embedding table size: one row per id in aa_to_int plus one for id 0,
# which aa_to_int never assigns.
vocab_size = 1 + len(aa_to_int)
embedding_dim = 128

# Fixed-length integer sequence input.
input_seq = Input(shape=(max_len_95th,), name='input_sequence')
embedded = Embedding(input_dim=vocab_size, output_dim=embedding_dim)(input_seq)

# Two conv + pool stages (kernel size 5), each pooling step halving the length.
conv_stage_1 = Conv1D(filters=128, kernel_size=5, activation='relu', padding='same')(embedded)
pooled_stage_1 = MaxPooling1D(pool_size=2)(conv_stage_1)
conv_stage_2 = Conv1D(filters=128, kernel_size=5, activation='relu', padding='same')(pooled_stage_1)
pooled_stage_2 = MaxPooling1D(pool_size=2)(conv_stage_2)

# Final conv (kernel size 3), collapsed over the sequence axis by a global max.
conv_stage_3 = Conv1D(filters=128, kernel_size=3, activation='relu', padding='same')(pooled_stage_2)
sequence_features = GlobalMaxPooling1D()(conv_stage_3)

# `x` is the feature tensor that the model-definition cells build on.
x = Dropout(0.5)(sequence_features)

In [None]:
# Model 1 (Class)
# Softmax head with one unit per encoded CL class, attached directly to `x`.
class_head = Dense(len(le_cl.classes_), activation='softmax', name='class_output')
class_output = class_head(x)

model_class = Model(inputs=input_seq, outputs=class_output, name='model_class')

# The sparse loss takes integer class indices (y_cl) rather than one-hot rows.
model_class.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Model 2 (Fold)

# Give the fold model its own feature extractor instead of building on the
# shared tensor `x`. Layers reachable through `x` are shared by every model
# built on it, so fitting one of those models silently changes the weights
# (and therefore the predictions) of the others. The architecture below
# mirrors the one used to produce `x`, but with freshly created layers.
fold_features = Embedding(input_dim=vocab_size, output_dim=embedding_dim)(input_seq)
fold_features = Conv1D(filters=128, kernel_size=5, activation='relu', padding='same')(fold_features)
fold_features = MaxPooling1D(pool_size=2)(fold_features)
fold_features = Conv1D(filters=128, kernel_size=5, activation='relu', padding='same')(fold_features)
fold_features = MaxPooling1D(pool_size=2)(fold_features)
fold_features = Conv1D(filters=128, kernel_size=3, activation='relu', padding='same')(fold_features)
fold_features = GlobalMaxPooling1D()(fold_features)

fold_input = Dropout(0.5)(fold_features)

# Classification head: one hidden layer, then a softmax over the encoded CF classes.
fold_x = Dense(64, activation='relu')(fold_input)
fold_x = Dropout(0.5)(fold_x)
fold_output = Dense(len(le_cf.classes_), activation='softmax', name='fold_output')(fold_x)

model_fold = Model(inputs=input_seq, outputs=fold_output, name='model_fold')

# The sparse loss takes integer class indices (y_cf) rather than one-hot rows.
model_fold.compile(optimizer='adam',
                   loss='sparse_categorical_crossentropy',
                   metrics=['accuracy'])

In [None]:
# Model 3 (Family)

# Give the family model its own feature extractor instead of building on the
# shared tensor `x`. Layers reachable through `x` are shared by every model
# built on it, so fitting one of those models silently changes the weights
# (and therefore the predictions) of the others. The architecture below
# mirrors the one used to produce `x`, but with freshly created layers.
family_features = Embedding(input_dim=vocab_size, output_dim=embedding_dim)(input_seq)
family_features = Conv1D(filters=128, kernel_size=5, activation='relu', padding='same')(family_features)
family_features = MaxPooling1D(pool_size=2)(family_features)
family_features = Conv1D(filters=128, kernel_size=5, activation='relu', padding='same')(family_features)
family_features = MaxPooling1D(pool_size=2)(family_features)
family_features = Conv1D(filters=128, kernel_size=3, activation='relu', padding='same')(family_features)
family_features = GlobalMaxPooling1D()(family_features)

family_input = Dropout(0.5)(family_features)

# Classification head: one hidden layer, then a softmax over the encoded FA classes.
family_x = Dense(64, activation='relu')(family_input)
family_x = Dropout(0.5)(family_x)
family_output = Dense(len(le_fa.classes_), activation='softmax', name='family_output')(family_x)

model_family = Model(inputs=input_seq, outputs=family_output, name='model_family')

# The sparse loss takes integer class indices (y_fa) rather than one-hot rows.
model_family.compile(optimizer='adam',
                    loss='sparse_categorical_crossentropy',
                    metrics=['accuracy'])


In [None]:
# Train the class model
# Stop once validation loss has not improved by at least 0.001 for 50
# consecutive epochs, then roll back to the best weights seen.
early_stopping_cl = EarlyStopping(
    monitor='val_loss',
    min_delta=0.001,
    patience=50,
    restore_best_weights=True,
)

# NOTE(review): validation_split holds out the trailing 20% of the rows as
# given; if the CSV is ordered by class, that slice may not be representative
# — confirm, and shuffle beforehand if needed.
history_cl = model_class.fit(
    x=X_padded,
    y=y_cl,
    batch_size=32,
    epochs=1000,
    validation_split=0.2,
    callbacks=[early_stopping_cl],
)


Epoch 1/1000
[1m143/143[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 35ms/step - accuracy: 0.2916 - loss: 1.5308 - val_accuracy: 0.5022 - val_loss: 1.2303
Epoch 2/1000
[1m143/143[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 8ms/step - accuracy: 0.4983 - loss: 1.1956 - val_accuracy: 0.5723 - val_loss: 1.0258
Epoch 3/1000
[1m143/143[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.6029 - loss: 0.9947 - val_accuracy: 0.6503 - val_loss: 0.8941
Epoch 4/1000
[1m143/143[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.6420 - loss: 0.9071 - val_accuracy: 0.5986 - val_loss: 0.9969
Epoch 5/1000
[1m143/143[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.6985 - loss: 0.7835 - val_accuracy: 0.6801 - val_loss: 0.8721
Epoch 6/1000
[1m143/143[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.7226 - loss: 0.7185 - val_accuracy: 0.6573 - val_loss: 0.8884
Epoch 7/1000


In [None]:
# Train the fold model
# Early stopping must watch a *validation* metric. Monitoring training
# 'accuracy' keeps improving while the model overfits, so training would never
# stop early and restore_best_weights would bring back the most overfit epoch.
# 'val_loss' matches the criterion used for the class model.
early_stopping_cf = EarlyStopping(monitor='val_loss', patience=100, min_delta=0.001, restore_best_weights=True)
history_cf = model_fold.fit(
    X_padded,
    y_cf,
    epochs=1000,
    batch_size=32,
    validation_split=0.2,
    callbacks=[early_stopping_cf]
)

Epoch 1/1000
[1m143/143[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 33ms/step - accuracy: 0.0336 - loss: 5.9922 - val_accuracy: 0.0973 - val_loss: 5.3409
Epoch 2/1000
[1m143/143[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 11ms/step - accuracy: 0.1105 - loss: 5.0308 - val_accuracy: 0.1420 - val_loss: 5.0575
Epoch 3/1000
[1m143/143[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - accuracy: 0.1536 - loss: 4.6170 - val_accuracy: 0.1534 - val_loss: 4.8810
Epoch 4/1000
[1m143/143[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.1801 - loss: 4.3539 - val_accuracy: 0.1569 - val_loss: 4.8138
Epoch 5/1000
[1m143/143[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.1849 - loss: 4.2524 - val_accuracy: 0.1840 - val_loss: 4.7097
Epoch 6/1000
[1m143/143[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.1990 - loss: 4.1035 - val_accuracy: 0.1876 - val_loss: 4.7175
Epoch 7/1000


In [None]:
# Train the family model
# Early stopping must watch a *validation* metric. Monitoring training
# 'accuracy' keeps improving while the model overfits, so training would never
# stop early and restore_best_weights would bring back the most overfit epoch.
# 'val_loss' matches the criterion used for the class model.
early_stopping_fa = EarlyStopping(monitor='val_loss', patience=100, min_delta=0.001, restore_best_weights=True)
history_fa = model_family.fit(
    X_padded,
    y_fa,
    epochs=1000,
    batch_size=32,
    validation_split=0.2,
    callbacks=[early_stopping_fa]
)

Epoch 1/1000
[1m143/143[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 33ms/step - accuracy: 0.0527 - loss: 7.0013 - val_accuracy: 0.0824 - val_loss: 6.7390
Epoch 2/1000
[1m143/143[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 8ms/step - accuracy: 0.1555 - loss: 5.5860 - val_accuracy: 0.1078 - val_loss: 6.5707
Epoch 3/1000
[1m143/143[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.1970 - loss: 5.0886 - val_accuracy: 0.1122 - val_loss: 6.5161
Epoch 4/1000
[1m143/143[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.2129 - loss: 4.7943 - val_accuracy: 0.1253 - val_loss: 6.4821
Epoch 5/1000
[1m143/143[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.2226 - loss: 4.5675 - val_accuracy: 0.1350 - val_loss: 6.4963
Epoch 6/1000
[1m143/143[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.2589 - loss: 4.2812 - val_accuracy: 0.1253 - val_loss: 6.4916
Epoch 7/1000
[

In [None]:
# Model evaluation
# Each model is scored on the full padded dataset; evaluate() yields the loss
# followed by the accuracy metric configured at compile time.
# NOTE(review): X_padded includes the rows the models were fitted on, so these
# figures are not held-out estimates — confirm this is intended.

print("\nEvaluasi Model Class:")
class_scores = model_class.evaluate(x=X_padded, y=y_cl)
loss_cl, acc_cl = class_scores
print(f"Loss Class: {loss_cl}, Accuracy Class: {acc_cl}")

print("\nEvaluasi Model Fold:")
fold_scores = model_fold.evaluate(x=X_padded, y=y_cf)
loss_cf, acc_cf = fold_scores
print(f"Loss Fold: {loss_cf}, Accuracy Fold: {acc_cf}")

print("\nEvaluasi Model Family:")
family_scores = model_family.evaluate(x=X_padded, y=y_fa)
loss_fa, acc_fa = family_scores
print(f"Loss Family: {loss_fa}, Accuracy Family: {acc_fa}")



Evaluasi Model Class:
[1m179/179[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.6175 - loss: 1.8728
Loss Class: 2.09381365776062, Accuracy Class: 0.6151148676872253

Evaluasi Model Fold:
[1m179/179[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.7122 - loss: 1.7363
Loss Fold: 3.7019731998443604, Accuracy Fold: 0.6440470218658447

Evaluasi Model Family:
[1m179/179[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.9178 - loss: 0.7012
Loss Family: 2.9025824069976807, Accuracy Family: 0.7971243262290955


In [None]:
# Run all three models on the first sequence of the dataset as a smoke test.
sample_seq = data['Sequence'].iloc[0]

# Preprocess exactly as the training data was: pad AND truncate at the end.
# Without truncating='post', pad_sequences falls back to truncating from the
# front, so any sequence longer than max_len_95th would be cut differently at
# inference time than during training.
sample_int = pad_sequences(
    [sequence_to_int(sample_seq)],
    maxlen=max_len_95th,
    padding='post',
    truncating='post',
)

# Class prediction: take the most probable index and map it back to the raw id.
pred_cl_prob = model_class.predict(sample_int)
predicted_cl_index = np.argmax(pred_cl_prob[0])
predicted_cl_label = le_cl.inverse_transform([predicted_cl_index])
print("\nPrediksi Class:", predicted_cl_label)

# Fold prediction
pred_cf_prob = model_fold.predict(sample_int)
predicted_cf_index = np.argmax(pred_cf_prob[0])
predicted_cf_label = le_cf.inverse_transform([predicted_cf_index])
print("Prediksi Fold:", predicted_cf_label)

# Family prediction
pred_fa_prob = model_family.predict(sample_int)
predicted_fa_index = np.argmax(pred_fa_prob[0])
predicted_fa_label = le_fa.inverse_transform([predicted_fa_index])
print("Prediksi Family:", predicted_fa_label)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 489ms/step

Prediksi Class: [1000002]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 421ms/step
Prediksi Fold: [2000088]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 396ms/step
Prediksi Family: [4000098]


In [None]:
import pickle

# Persist the three trained models, one HDF5 file each.
for trained_model, model_path in (
    (model_class, 'model_class.h5'),
    (model_fold, 'model_fold.h5'),
    (model_family, 'model_family.h5'),
):
    trained_model.save(model_path)

# Persist the fitted label encoders so predicted indices can be mapped back
# to the original ids when the models are reloaded elsewhere.
for encoder_path, encoder in (
    ('label_encoder_cl.pkl', le_cl),
    ('label_encoder_cf.pkl', le_cf),
    ('label_encoder_fa.pkl', le_fa),
):
    with open(encoder_path, 'wb') as f:
        pickle.dump(encoder, f)

# Preprocessing settings needed to encode new sequences the same way:
# the fixed input length and the residue-to-integer mapping.
model_params = dict(max_len_95th=max_len_95th, aa_to_int=aa_to_int)
with open('model_params.pkl', 'wb') as f:
    pickle.dump(model_params, f)

