<a href="https://colab.research.google.com/github/fronre/Mini-Datathon-NLP-DNA-Sequence-Challenge/blob/main/NLP-DNA-Sequence.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [47]:
!pip install biopython

import pandas as pd
import numpy as np
from Bio import SeqIO
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, Embedding, Bidirectional, GRU, Dense, Dropout, BatchNormalization
from tensorflow.keras.models import Model
from tensorflow.keras.regularizers import l2
import tensorflow as tf




In [48]:
def load_data(file_path, is_train=True):
    """Read a FASTA file into a DataFrame of ids, sequences and (optionally) labels.

    The parser looks at each record's description line:

    * the text before the first ``|`` is treated as the id field, and anything
      up to and including ``id_sequence_`` is dropped from it;
    * for training files, the label is the text following the last
      ``label=`` marker, up to the next ``|`` (or the end of the header).

    Args:
        file_path: Passed unchanged to ``SeqIO.parse(..., "fasta")``.
        is_train: When True, a ``label`` column is parsed from every header.

    Returns:
        pandas.DataFrame with columns ``id_sequence`` and ``sequence``, plus
        ``label`` when ``is_train`` is True.

    Raises:
        ValueError: If ``is_train`` is True and a header has no ``label=``
            marker. Previously the whole header was silently used as the label.
    """
    sequences, ids, labels = [], [], []

    for record in SeqIO.parse(file_path, "fasta"):
        description = record.description
        sequences.append(str(record.seq))

        # Extract the sequence id from the first "|"-separated header field.
        # Headers without any "|" keep the original "unknown" placeholder.
        header_parts = description.split("|")
        if len(header_parts) > 1:
            id_part = header_parts[0].strip()
            ids.append(id_part.split("id_sequence_")[-1].strip())
        else:
            ids.append("unknown")

        if is_train:
            # rpartition keeps the "last occurrence wins" behaviour of the
            # original split("label=")[-1], but lets us detect a missing marker.
            _, marker, tail = description.rpartition("label=")
            if not marker:
                raise ValueError(
                    f"No 'label=' field in FASTA header: {description!r}"
                )
            # Stop at the next "|" so trailing header fields do not leak into
            # the label (e.g. "label=promoter|source=x" -> "promoter").
            labels.append(tail.split("|")[0].strip())

    if is_train:
        return pd.DataFrame({"id_sequence": ids, "sequence": sequences, "label": labels})
    return pd.DataFrame({"id_sequence": ids, "sequence": sequences})

In [49]:
# The training file carries labels (load_data's is_train defaults to True);
# the test file is parsed without a label column.
train_data = load_data("train_dna.fasta")
test_data = load_data("test_dna.fasta", False)

In [50]:
# Vocabulary: one integer code per nucleotide (A=0, C=1, G=2, T=3).
vocab = "ACGT"
char_to_int = {c: i for i, c in enumerate(vocab)}

# Encode every training sequence as a plain Python list of integer codes.
# The lists are deliberately NOT wrapped in np.array(): sequences of different
# lengths form a ragged structure, which NumPy >= 1.24 rejects with a
# ValueError. pad_sequences below accepts a list of lists directly, and the
# test set is encoded the same way.
# Indexing char_to_int directly is strict on purpose: an unexpected character
# in the training data raises KeyError instead of being silently re-coded.
X = [[char_to_int[char] for char in seq] for seq in train_data["sequence"]]
y = np.array([1 if label == "promoter" else 0 for label in train_data["label"]])

# Pad every sequence to the length of the longest one.
# NOTE(review): pad_sequences pads with 0 by default, which is also the code
# for 'A', so padding is indistinguishable from a run of 'A'. Reserving 0 for
# padding would require changing the Embedding input_dim as well.
max_len = max(len(seq) for seq in X)
X = pad_sequences(X, maxlen=max_len, padding='post')

# Hold out 10% of the training data for validation.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=42)

In [51]:
# Model definition: embedding -> two stacked bidirectional GRUs -> dense head.
inputs = Input(shape=(max_len,))
# The sequence length is already fixed by the Input layer above, so the
# deprecated `input_length` argument of Embedding is not passed (Keras 3 warns
# about it and asks for it to be removed).
x = Embedding(input_dim=len(vocab), output_dim=512)(inputs)
# First recurrent layer returns the full sequence so the second GRU can consume it.
x = Bidirectional(GRU(512, return_sequences=True, kernel_regularizer=l2(0.005)))(x)
x = Dropout(0.3)(x)
# Second recurrent layer collapses the sequence into a single vector.
x = Bidirectional(GRU(256, kernel_regularizer=l2(0.005)))(x)
x = BatchNormalization()(x)
x = Dropout(0.4)(x)
x = Dense(128, activation='relu')(x)
x = Dropout(0.5)(x)
# Single sigmoid unit: probability of the positive ("promoter") class.
outputs = Dense(1, activation='sigmoid')(x)

# Build the model.
model = Model(inputs, outputs)

# Compile the model for binary classification.
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.0005),
              loss='binary_crossentropy',
              metrics=['accuracy'])

# Print the layer-by-layer summary.
model.summary()



In [52]:
# Callbacks, both driven by the validation loss:
#   - reduce_lr multiplies the learning rate by 0.2 after 3 epochs without
#     improvement, never going below 1e-4;
#   - early_stopping ends training after 5 epochs without improvement and
#     restores the best weights seen.
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=3, min_lr=0.0001)
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Train the model, evaluating on the held-out validation split after each epoch.
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=128,
    epochs=15,
    validation_data=(X_val, y_val),
    callbacks=[early_stopping, reduce_lr],
)

Epoch 1/15
[1m128/128[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m74s[0m 506ms/step - accuracy: 0.5939 - loss: 10.5962 - val_accuracy: 0.6908 - val_loss: 2.5897 - learning_rate: 5.0000e-04
Epoch 2/15
[1m128/128[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m78s[0m 478ms/step - accuracy: 0.7318 - loss: 2.0344 - val_accuracy: 0.7434 - val_loss: 1.2186 - learning_rate: 5.0000e-04
Epoch 3/15
[1m128/128[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 479ms/step - accuracy: 0.7963 - loss: 1.0174 - val_accuracy: 0.4906 - val_loss: 3.7285 - learning_rate: 5.0000e-04
Epoch 4/15
[1m128/128[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 477ms/step - accuracy: 0.8348 - loss: 0.6614 - val_accuracy: 0.5149 - val_loss: 1.2540 - learning_rate: 5.0000e-04
Epoch 5/15
[1m128/128[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m60s[0m 473ms/step - accuracy: 0.8983 - loss: 0.4229 - val_accuracy: 0.5133 - val_loss: 1.7257 - learning_rate: 5.0000e-04
Epoch 6/15
[1m128/128[0m [32m━━

In [53]:
# Encode the test sequences with the training vocabulary. Characters that are
# not in the vocabulary fall back to code 0.
X_test = [[char_to_int.get(char, 0) for char in seq] for seq in test_data["sequence"]]

# Pad to the training length so the input shape matches the model.
# Note: pad_sequences truncates from the front by default (truncating='pre')
# when a test sequence is longer than max_len.
X_test = pad_sequences(X_test, maxlen=max_len, padding='post')

# Predict. The model has a single sigmoid output, so predict() returns an
# (n, 1) array; flatten it to one probability per row before thresholding.
probabilities = model.predict(X_test).ravel()
test_data["type"] = np.where(probabilities > 0.5, "promoter", "non_promoter")

# Save the submission (written once; the original cell wrote the same file twice).
# NOTE(review): the recorded output of this cell shows a first row with
# id_sequence "id" and sequence "sequence", which looks like a header line
# parsed as a FASTA record. Confirm whether it should be excluded from the
# submission.
test_data[["id_sequence", "type"]].to_csv("submission.csv", index=False)

# Show the first few predictions.
print(test_data.head())

[1m142/142[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 63ms/step
  id_sequence                                           sequence          type
0          id                                           sequence      promoter
1       17042  TCTATTTGCGCGGTGCGGGACCGCTTACCCCCTACTCCATTGCTCT...      promoter
2       14466  CTCGTACACCCTCAGCGAGGCTAACATAATTCGGGTAAAGCTCAGT...  non_promoter
3       19734  TTAAGGTCCCCAGAGCGCCCAGTTCGCTCCCCGCTCGTCTAGCTTC...  non_promoter
4       17415  TGCCTCTGTGCGACATGCCTTCGCTCCACTCATCAGGATAAGAGGG...  non_promoter
