<a href="https://colab.research.google.com/github/elangbijak4/Riset-Bioinformatika/blob/main/ML_for_Filtering_using_EAFSA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
# Import libraries
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

# Step 1: Toy training data, written as (RNA sequence, label) pairs and then
# transposed into the two parallel lists used by the rest of the script.
# Labels are example binary classes (e.g., functional vs non-functional RNA).
rna_sequences, labels = (
    list(column)
    for column in zip(
        ("AGCUAGUCA", 1),
        ("GCUAGUC", 0),
        ("AGUCA", 1),
        ("AAGCUAGU", 1),
        ("GGCUAGCUA", 0),
    )
)

# Step 2: Build the sorted suffix list
def build_suffix_array(sequence):
    """Return every non-empty suffix of ``sequence`` in ascending sorted order.

    Despite the name, the result holds the suffixes themselves rather than
    their start positions. An empty ``sequence`` yields an empty list.
    """
    return sorted(sequence[start:] for start in range(len(sequence)))

# Step 3: Extract suffix features
def extract_suffix_features(sequences):
    """Turn each sequence into one space-separated string of its sorted suffixes.

    Returns a list with one such string per input sequence, in input order.
    """
    return [" ".join(build_suffix_array(seq)) for seq in sequences]

# Build one suffix "document" per training sequence.
suffix_features = extract_suffix_features(rna_sequences)

# Step 4: Convert features to numerical form (token counts per document).
# NOTE(review): the vectorizer is fitted on every sequence before the split
# below, so held-out rows also contribute to the vocabulary.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(suffix_features)

# Step 5: Train Logistic Regression for adaptive filtering.
# The split is stratified on the labels so both classes are represented in
# the train and test parts, and it is seeded via random_state.
# NOTE(review): X_test / y_test are not used anywhere in the visible code.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    labels,
    stratify=labels,
    test_size=0.3,
    random_state=42,
)

# fit() returns the estimator itself, so construction and training are chained.
model = LogisticRegression().fit(X_train, y_train)

# Step 6: Use the model as an adaptive filter
def filter_with_model(sequence, model, vectorizer):
    """Predict a label for one sequence using its sorted-suffix representation.

    The sequence is converted to a single space-joined string of its sorted
    suffixes, passed through ``vectorizer.transform`` and then
    ``model.predict``; whatever ``predict`` returns is handed back unchanged.
    """
    document = " ".join(build_suffix_array(sequence))
    return model.predict(vectorizer.transform([document]))

# Example: run the trained filter on a sequence outside the training list
test_sequence = "AGCUAG"
filtered_result = filter_with_model(
    sequence=test_sequence,
    model=model,
    vectorizer=vectorizer,
)
print(f"Filtered result for {test_sequence}: {filtered_result}")

Filtered result for AGCUAG: [1]
