In [33]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Load the MIBiG DNA classification splits.
# The local CSV files are a cache of the Hugging Face dataset. On a fresh
# checkout they do not exist yet, so fall back to downloading the parquet
# splits instead of failing; this keeps "Restart Kernel -> Run All" working.
# NOTE(review): the hf:// fallback presumably needs `huggingface_hub` (fsspec
# backend) and network access -- confirm for your environment.
try:
    df_train = pd.read_csv("mibig_classification_dna_train.csv")
    df_test = pd.read_csv("mibig_classification_dna_test.csv")
except FileNotFoundError:
    splits = {'train': 'data/train-00000-of-00001.parquet', 'test': 'data/test-00000-of-00001.parquet'}
    df_train = pd.read_parquet("hf://datasets/tattabio/mibig_classification_dna/" + splits["train"])
    df_test = pd.read_parquet("hf://datasets/tattabio/mibig_classification_dna/" + splits["test"])
display(df_train)

Unnamed: 0,Entry,Sequence,bgc,class,simple_class
0,BGC0000476,CCAACCAACATATGAACAAGAAGAACATTCTACCCCAACAAGGCCA...,BGC0000476,RiPP,RiPP
1,BGC0001412,CAGCTGCAATTGTTCGTAACCAAACGAAACGACTGCCATCACTACC...,BGC0001412,Saccharide,Saccharide
2,BGC0001669,AGCTCCGTCGCGACGGCCGCGGCGCTGACGACCAGAACCGCTACTC...,BGC0001669,RiPP,RiPP
3,BGC0002337,TCAGGTGACGAGGCCGAGCCGCGGTCCGCCCCCATGGCGCAGCCGT...,BGC0002337,RiPP,RiPP
4,BGC0001769,GTGAGCGCGATCCGGCAACGCACCCGTGTCGGTGGTGGGCCTCTGG...,BGC0001769,Polyketide,Polyketide
...,...,...,...,...,...
1758,BGC0001728,TTGTCGGAAACAGAAAAGAAAGATGCATTGCAAGTGTTAAGAAGGA...,BGC0001728,"NRP, Polyketide",NRP
1759,BGC0001608,ATGAACAGATACGAAACCGTGAGGCGAGTCGAGTCGAATGCCAGAA...,BGC0001608,NRP,NRP
1760,BGC0000958,TCAGGCGGCGGTGGGCTGCCATGCCTCCTGGAAGCGGCGGCGGGCC...,BGC0000958,"NRP, Polyketide",NRP
1761,BGC0002232,TCTGTCCCTTTTCTTCTCCGTTTGGCTTCTTGTATCGCTGACGGTT...,BGC0002232,Alkaloid,Alkaloid


In [5]:
# Class balance of the simplified label, followed by a peek at the first rows.
print(df_train.loc[:, 'simple_class'].value_counts())
display(df_train.iloc[:5])

simple_class
NRP           652
Polyketide    551
RiPP          266
Terpene       133
Saccharide    113
Alkaloid       48
Name: count, dtype: int64


Unnamed: 0,Entry,Sequence,bgc,class,simple_class
0,BGC0000476,CCAACCAACATATGAACAAGAAGAACATTCTACCCCAACAAGGCCA...,BGC0000476,RiPP,RiPP
1,BGC0001412,CAGCTGCAATTGTTCGTAACCAAACGAAACGACTGCCATCACTACC...,BGC0001412,Saccharide,Saccharide
2,BGC0001669,AGCTCCGTCGCGACGGCCGCGGCGCTGACGACCAGAACCGCTACTC...,BGC0001669,RiPP,RiPP
3,BGC0002337,TCAGGTGACGAGGCCGAGCCGCGGTCCGCCCCCATGGCGCAGCCGT...,BGC0002337,RiPP,RiPP
4,BGC0001769,GTGAGCGCGATCCGGCAACGCACCCGTGTCGGTGGTGGGCCTCTGG...,BGC0001769,Polyketide,Polyketide


In [3]:
# Persist both splits as CSV next to the notebook; index=False keeps the
# DataFrame index out of the written files.
df_train.to_csv(path_or_buf="mibig_classification_dna_train.csv", index=False)
df_test.to_csv(path_or_buf="mibig_classification_dna_test.csv", index=False)

In [None]:
from sklearn.metrics import f1_score, accuracy_score, classification_report

def extract_basic_seq_features(df, seq_col='Sequence'):
    """
    Extracts basic sequence features from a DNA sequence column in a DataFrame.
    Features include length, GC content, AT content, skews, N content, CpG ratio.

    The feature rows are matched to the input rows by position, so the input
    may carry any index (e.g. after filtering or shuffling); the returned
    frame always has a fresh 0..n-1 index.

    Parameters:
        df (pd.DataFrame): DataFrame containing the sequences.
        seq_col (str): Name of the column containing the sequences.

    Returns:
        pd.DataFrame: The original columns plus 'seq_length', 'gc_content',
        'at_content', 'n_content', 'gc_skew', 'at_skew' and 'cpg_ratio'
        (all stored as floats).
    """
    feature_names = [
        'seq_length', 'gc_content', 'at_content', 'n_content',
        'gc_skew', 'at_skew', 'cpg_ratio',
    ]

    def calc_features(seq):
        # Case-insensitive counting: lower-case bases are folded to upper-case.
        seq = seq.upper()
        length = len(seq)
        a = seq.count('A')
        t = seq.count('T')
        g = seq.count('G')
        c = seq.count('C')
        n = seq.count('N')

        # Avoid division by zero: an empty denominator is replaced by 1, which
        # yields 0 for the corresponding ratio because its numerator is 0 too.
        gc_total = g + c if (g + c) > 0 else 1
        at_total = a + t if (a + t) > 0 else 1
        length_safe = length if length > 0 else 1

        # Observed 'CG' dinucleotides over the count expected from the C and G
        # frequencies; defined as 0 when either base is absent.
        cpg_obs = seq.count('CG')
        cpg_exp = (c * g) / length_safe
        cpg_ratio = cpg_obs / cpg_exp if cpg_exp > 0 else 0

        return {
            'seq_length': length,
            'gc_content': (g + c) / length_safe,
            'at_content': (a + t) / length_safe,
            'n_content': n / length_safe,
            'gc_skew': (g - c) / gc_total,
            'at_skew': (a - t) / at_total,
            'cpg_ratio': cpg_ratio
        }

    # Build the feature frame from a list of records so it gets a positional
    # RangeIndex. Keeping the caller's index here would make the axis=1 concat
    # below align on mismatched labels and emit extra NaN-filled rows.
    features_df = pd.DataFrame(
        [calc_features(seq) for seq in df[seq_col]],
        columns=feature_names,
        dtype=float,
    )
    return pd.concat([df.reset_index(drop=True), features_df], axis=1)

def seq_to_kmers(seq, k=3):
    """
    Turn a sequence into a space-separated string of its overlapping k-mers.

    Args:
        seq (str): Input sequence.
        k (int, optional): k-mer length. Defaults to 3.

    Returns:
        str: All len(seq) - k + 1 overlapping k-mers joined by single spaces;
        the empty string when the sequence is shorter than k.

    Raises:
        ValueError: If k is smaller than 1.
    """
    if k < 1:
        raise ValueError(f"k must be a positive integer, got {k}")
    return ' '.join(seq[i:i + k] for i in range(len(seq) - k + 1))


def _kmer_percent_frame(vectorizer, sequences, k, fit=False):
    """Vectorize sequences into per-row k-mer percentages plus a 'seq_length' column."""
    docs = [seq_to_kmers(seq, k) for seq in sequences]
    counts = vectorizer.fit_transform(docs) if fit else vectorizer.transform(docs)
    frame = pd.DataFrame(counts.toarray(), columns=vectorizer.get_feature_names_out())

    # Normalize counts to percentages of the row total. A row without any
    # counted k-mer has total 0; divide it by 1 instead so it stays all-zero
    # rather than turning into NaN (0/0).
    row_totals = frame.sum(axis=1)
    frame = frame.div(row_totals.where(row_totals > 0, 1), axis=0) * 100

    # Assign lengths positionally; assigning a Series here would align on the
    # caller's index labels and silently insert NaN for a non-default index.
    frame['seq_length'] = [len(seq) for seq in sequences]
    return frame


def get_kmer_vectorizer(df_train, df_test, k=3):
    """
    Converts sequences to k-mers and creates a k-mer vectorizer for training and testing datasets.

    The vocabulary is fitted on the training sequences only; test k-mers that
    are not in that vocabulary are ignored by the vectorizer. Each row holds
    k-mer frequencies as percentages of the row's counted k-mers, plus the raw
    sequence length in 'seq_length'.

    Args:
        df_train (pd.DataFrame): Training dataset with a 'Sequence' column.
        df_test (pd.DataFrame): Testing dataset with a 'Sequence' column.
        k (int, optional): k value for k-mers. Defaults to 3.

    Returns:
        tuple: Tuple containing the k-mer vectorized training and testing datasets.
    """
    # token_pattern=r'\S+' keeps every whitespace-delimited k-mer as one token.
    vectorizer = CountVectorizer(analyzer='word', token_pattern=r'\S+')

    X_train_df = _kmer_percent_frame(vectorizer, df_train['Sequence'], k, fit=True)
    X_test_df = _kmer_percent_frame(vectorizer, df_test['Sequence'], k)

    return X_train_df, X_test_df

# Baseline sweep: reload both splits, add the basic sequence features, then
# train a default random forest on k-mer frequencies for k = 3..7 and report
# test-set metrics for each k.
df_train = pd.read_csv("mibig_classification_dna_train.csv").pipe(extract_basic_seq_features)
df_test = pd.read_csv("mibig_classification_dna_test.csv").pipe(extract_basic_seq_features)

# The labels do not depend on k, so they are taken once up front.
y_train, y_test = df_train['simple_class'], df_test['simple_class']

# NOTE(review): keep the loop variable named `k` -- seq_to_kmers may resolve `k`
# from the notebook's global namespace rather than from an argument; confirm
# before renaming.
for k in range(3, 8):
    print(f"Processing k={k}...")
    X_train_kmer, X_test_kmer = get_kmer_vectorizer(df_train, df_test, k=k)
    print(f"Shape of train k-mer features: {X_train_kmer.shape}")

    # fit() returns the estimator itself, so construction and fitting chain.
    rfc = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train_kmer, y_train)
    y_pred = rfc.predict(X_test_kmer)

    print(classification_report(y_test, y_pred, zero_division=0))
    print("F1 Score:", f1_score(y_test, y_pred, average='weighted', zero_division=0))

Processing k=3...
Shape of train k-mer features: (1763, 126)
              precision    recall  f1-score   support

    Alkaloid       0.00      0.00      0.00        12
         NRP       0.62      0.70      0.66       163
  Polyketide       0.60      0.73      0.66       138
        RiPP       0.74      0.78      0.76        67
  Saccharide       0.90      0.32      0.47        28
     Terpene       0.60      0.18      0.28        33

    accuracy                           0.64       441
   macro avg       0.58      0.45      0.47       441
weighted avg       0.63      0.64      0.62       441

F1 Score: 0.6170995465412341
Processing k=4...
Shape of train k-mer features: (1763, 598)
              precision    recall  f1-score   support

    Alkaloid       0.00      0.00      0.00        12
         NRP       0.59      0.71      0.64       163
  Polyketide       0.62      0.74      0.67       138
        RiPP       0.83      0.72      0.77        67
  Saccharide       1.00      0.32  

In [35]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import f1_score, accuracy_score, classification_report
import pandas as pd
import numpy as np

# Define hyperparameter search space
param_dist = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2', None],
    'bootstrap': [True, False]
}

# Load and process data
df_train = pd.read_csv("mibig_classification_dna_train.csv")
df_test = pd.read_csv("mibig_classification_dna_test.csv")
df_train = extract_basic_seq_features(df_train)
df_test = extract_basic_seq_features(df_test)

# Labels are independent of k, so take them once instead of on every iteration.
y_train = df_train['simple_class']
y_test = df_test['simple_class']

results = []

# NOTE(review): keep the loop variable named `k` -- seq_to_kmers may resolve `k`
# from the notebook's global namespace rather than from an argument; confirm
# before renaming.
for k in range(3, 8):
    print(f"\n===== Processing k={k} =====")

    # Extract features
    X_train_kmer, X_test_kmer = get_kmer_vectorizer(df_train, df_test, k=k)

    print(f"Feature shape: {X_train_kmer.shape}")

    # Initialize and run RandomizedSearchCV (20 sampled configurations,
    # 3-fold CV, selected by weighted F1).
    base_rfc = RandomForestClassifier(random_state=42)
    search = RandomizedSearchCV(
        base_rfc,
        param_distributions=param_dist,
        n_iter=20,
        scoring='f1_weighted',
        cv=3,
        verbose=1,
        n_jobs=-1,
        random_state=42
    )

    search.fit(X_train_kmer, y_train)
    best_rfc = search.best_estimator_
    y_pred = best_rfc.predict(X_test_kmer)

    # Evaluate
    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)
    acc = accuracy_score(y_test, y_pred)

    print("Best Parameters:", search.best_params_)
    # Concatenate rather than pass two print() arguments: the default
    # separator would put a space in front of the report's header row and
    # shift it out of alignment with the table body.
    print("Classification Report:\n" + classification_report(y_test, y_pred, zero_division=0))
    print(f"F1 Score: {f1:.4f}")
    print(f"Accuracy: {acc:.4f}")

    # Log results. The cross-validated score is recorded next to the test
    # metrics so that k can be compared without relying on the test set.
    results.append({
        'k': k,
        'f1_score': f1,
        'accuracy': acc,
        'cv_f1_weighted': search.best_score_,
        'best_params': search.best_params_
    })

# Optional: Convert results to DataFrame for analysis
results_df = pd.DataFrame(results)
print("\nSummary of Results Across k:")
print(results_df)



===== Processing k=3 =====
Feature shape: (1763, 126)
Fitting 3 folds for each of 20 candidates, totalling 60 fits
Best Parameters: {'n_estimators': 200, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': None, 'max_depth': 30, 'bootstrap': True}
Classification Report:
               precision    recall  f1-score   support

    Alkaloid       0.00      0.00      0.00        12
         NRP       0.65      0.70      0.67       163
  Polyketide       0.61      0.65      0.63       138
        RiPP       0.64      0.84      0.73        67
  Saccharide       0.69      0.32      0.44        28
     Terpene       0.53      0.27      0.36        33

    accuracy                           0.63       441
   macro avg       0.52      0.46      0.47       441
weighted avg       0.61      0.63      0.61       441

F1 Score: 0.6116
Accuracy: 0.6304

===== Processing k=4 =====
Feature shape: (1763, 598)
Fitting 3 folds for each of 20 candidates, totalling 60 fits
Best Parameters: {'n_es

In [None]:
# random forest
from sklearn.ensemble import RandomForestClassifier

# NOTE(review): X_train_kmer / y_train are whatever an earlier cell left in the
# kernel (presumably the features of the last k it processed) -- confirm.
rfc = RandomForestClassifier(random_state=42, n_estimators=100)
rfc.fit(X=X_train_kmer, y=y_train)

In [None]:
# Evaluate the fitted random forest on the held-out test features:
# per-class precision/recall/F1 first, then overall accuracy.

y_pred = rfc.predict(X=X_test_kmer)
print(classification_report(y_true=y_test, y_pred=y_pred, zero_division=0))
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

    Alkaloid       0.00      0.00      0.00        12
         NRP       0.59      0.74      0.66       163
  Polyketide       0.62      0.70      0.66       138
        RiPP       0.69      0.73      0.71        67
  Saccharide       1.00      0.21      0.35        28
     Terpene       0.80      0.12      0.21        33

    accuracy                           0.63       441
   macro avg       0.62      0.42      0.43       441
weighted avg       0.64      0.63      0.60       441

Accuracy: 0.6258503401360545
