<a href="https://colab.research.google.com/github/elangbijak4/Riset-Bioinformatika/blob/main/ML_for_Allignment_using_EAFSA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
# Import libraries
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

# Toy data set: each RNA sequence is paired by position with one label.
rna_sequences = [
    "AGCUAGUCA",
    "GCUAGUC",
    "AGUCA",
    "AAGCUAGU",
    "GGCUAGCUA",
]
# Binary labels indicating biological relevance, one per sequence above.
labels = [
    1,
    0,
    1,
    1,
    0,
]

# Step 1: Build suffix array
def build_suffix_array(sequence):
    """Return every suffix of ``sequence`` in ascending sorted order.

    Despite the name, the result holds the suffix values themselves, not
    their starting offsets.

    Args:
        sequence: A sliceable sequence; this file passes RNA strings.

    Returns:
        A new list with one suffix per start position (empty for empty
        input).
    """
    ordered = [sequence[start:] for start in range(len(sequence))]
    # All suffixes of one sequence differ in length, so no two compare equal
    # and a plain sort gives the same order as sorting (suffix, offset) pairs
    # by their suffix component.
    ordered.sort()
    return ordered

# Step 2: Extract suffix features
def extract_suffix_features(sequences):
    """Turn each sequence into one space-separated string of its sorted suffixes.

    Args:
        sequences: Iterable of sequences accepted by ``build_suffix_array``.

    Returns:
        A list with one string per input sequence, in input order.
    """
    # One whitespace-delimited "document" per sequence.
    return [" ".join(build_suffix_array(item)) for item in sequences]

# Build one suffix "document" per training sequence.
suffix_features = extract_suffix_features(rna_sequences)

# Step 3: Turn the suffix documents into a numerical count matrix.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(suffix_features)

# Step 4: Hold out 30% of the samples, stratifying on the labels and fixing
# the seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    labels,
    test_size=0.3,
    random_state=42,
    stratify=labels,
)

# Step 5: Train Logistic Regression for adaptive filtering.
model = LogisticRegression().fit(X_train, y_train)

# Step 6: Filtering suffix array using ML
def filter_with_model(sequence, model, vectorizer):
    """Return the sorted suffixes of ``sequence`` if the model accepts it.

    The suffixes are joined into a single document, vectorized, and classified
    once. The verdict therefore applies to the sequence as a whole: the result
    is either every suffix or an empty list, never a partial selection.

    Args:
        sequence: Sequence whose suffixes are the candidates.
        model: Object exposing ``predict(X)``; it receives the vectorized
            document.
        vectorizer: Object exposing ``transform(docs)``; it receives a
            one-element list holding the joined suffix string.

    Returns:
        A list of all sorted suffixes when the first prediction equals 1,
        otherwise an empty list.
    """
    suffixes = build_suffix_array(sequence)
    document = " ".join(suffixes)
    prediction = model.predict(vectorizer.transform([document]))
    # There is exactly one prediction for the whole sequence, so decide once
    # instead of re-testing the same condition for every suffix.
    if prediction[0] == 1:
        return list(suffixes)
    return []

# Step 7: Alignment using filtered suffixes
def align_sequences(seq1, seq2, filtered_suffixes):
    """Build a demonstration "alignment" from the suffixes found in ``seq2``.

    Each suffix contained in ``seq2`` (a substring test when ``seq2`` is a
    string) is emitted verbatim; every other suffix contributes a single "-".
    The pieces are concatenated in the order given.

    Args:
        seq1: Accepted for interface symmetry; not consulted by this function.
        seq2: Sequence the suffixes are looked up in.
        filtered_suffixes: Iterable of candidate suffix strings.

    Returns:
        A 2-tuple of strings. Both elements are always identical because the
        same piece is emitted for each side.
    """
    pieces = [
        candidate if candidate in seq2 else "-"
        for candidate in filtered_suffixes
    ]
    merged = "".join(pieces)
    return merged, merged

# Demo: filter the suffixes of the first sequence with the trained model,
# then align them against the second sequence.
sequence1 = "AGCUAGUCA"
sequence2 = "GCUAGUCA"
filtered_suffixes = filter_with_model(
    sequence1,
    model=model,
    vectorizer=vectorizer,
)
aligned_seq1, aligned_seq2 = align_sequences(
    sequence1,
    sequence2,
    filtered_suffixes=filtered_suffixes,
)

# Report the surviving suffixes and both aligned strings.
print(f"Filtered Suffixes: {filtered_suffixes}")
print(f"Aligned Sequences:\nSeq1: {aligned_seq1}\nSeq2: {aligned_seq2}")

Filtered Suffixes: ['A', 'AGCUAGUCA', 'AGUCA', 'CA', 'CUAGUCA', 'GCUAGUCA', 'GUCA', 'UAGUCA', 'UCA']
Aligned Sequences:
Seq1: A-AGUCACACUAGUCAGCUAGUCAGUCAUAGUCAUCA
Seq2: A-AGUCACACUAGUCAGCUAGUCAGUCAUAGUCAUCA
