In [7]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import f1_score, accuracy_score, classification_report

# Read both dataset splits from the Hugging Face Hub via the hf:// filesystem
# (this cell needs network access).
splits = {
    'train': 'data/train-00000-of-00001.parquet',
    'test': 'data/test-00000-of-00001.parquet',
}
df_train, df_test = (
    pd.read_parquet("hf://datasets/tattabio/mibig_classification_dna/" + splits[split_name])
    for split_name in ('train', 'test')
)
display(df_train.head())

Unnamed: 0,Entry,Sequence,bgc,class,simple_class
0,BGC0000476,CCAACCAACATATGAACAAGAAGAACATTCTACCCCAACAAGGCCA...,BGC0000476,RiPP,RiPP
1,BGC0001412,CAGCTGCAATTGTTCGTAACCAAACGAAACGACTGCCATCACTACC...,BGC0001412,Saccharide,Saccharide
2,BGC0001669,AGCTCCGTCGCGACGGCCGCGGCGCTGACGACCAGAACCGCTACTC...,BGC0001669,RiPP,RiPP
3,BGC0002337,TCAGGTGACGAGGCCGAGCCGCGGTCCGCCCCCATGGCGCAGCCGT...,BGC0002337,RiPP,RiPP
4,BGC0001769,GTGAGCGCGATCCGGCAACGCACCCGTGTCGGTGGTGGGCCTCTGG...,BGC0001769,Polyketide,Polyketide


In [8]:
# Class balance of the training labels, followed by a preview of the frame.
class_counts = df_train['simple_class'].value_counts()
print(class_counts)
display(df_train.head())

NRP           652
Polyketide    551
RiPP          266
Terpene       133
Saccharide    113
Alkaloid       48
Name: simple_class, dtype: int64


Unnamed: 0,Entry,Sequence,bgc,class,simple_class
0,BGC0000476,CCAACCAACATATGAACAAGAAGAACATTCTACCCCAACAAGGCCA...,BGC0000476,RiPP,RiPP
1,BGC0001412,CAGCTGCAATTGTTCGTAACCAAACGAAACGACTGCCATCACTACC...,BGC0001412,Saccharide,Saccharide
2,BGC0001669,AGCTCCGTCGCGACGGCCGCGGCGCTGACGACCAGAACCGCTACTC...,BGC0001669,RiPP,RiPP
3,BGC0002337,TCAGGTGACGAGGCCGAGCCGCGGTCCGCCCCCATGGCGCAGCCGT...,BGC0002337,RiPP,RiPP
4,BGC0001769,GTGAGCGCGATCCGGCAACGCACCCGTGTCGGTGGTGGGCCTCTGG...,BGC0001769,Polyketide,Polyketide


In [None]:
def extract_basic_seq_features(df, seq_col='Sequence'):
    """
    Extracts basic composition features from a DNA sequence column in a DataFrame.

    Sequences are upper-cased before counting. The features are:
    ``seq_length`` (number of characters), the per-base fractions
    ``a_percent``, ``c_percent``, ``g_percent``, ``t_percent``, ``n_percent``,
    and the combined ``gc_percent`` and ``at_percent``. Despite the
    ``*_percent`` names, the values are fractions of the sequence length
    (0-1), not values scaled to 0-100.

    Parameters:
        df (pd.DataFrame): DataFrame containing the sequences. It is not modified.
        seq_col (str): Name of the column containing the sequences.

    Returns:
        pd.DataFrame: Copy of ``df`` with a fresh 0..n-1 index and the feature
        columns appended. Columns in ``df`` that already carry one of the
        feature names are replaced, so calling the function again on its own
        output does not create duplicate columns.
    """
    feature_cols = [
        'seq_length',
        'a_percent', 'c_percent', 'g_percent', 't_percent', 'n_percent',
        'gc_percent', 'at_percent',
    ]

    def calc_features(seq):
        seq = seq.upper()
        length = len(seq)
        # Report the true length, but divide by 1 for empty sequences so the
        # fractions come out as 0.0 instead of raising ZeroDivisionError.
        denom = length if length > 0 else 1
        a = seq.count('A')
        c = seq.count('C')
        g = seq.count('G')
        t = seq.count('T')
        n = seq.count('N')  # unknowns

        return {
            'seq_length': length,
            'a_percent': a / denom,
            'c_percent': c / denom,
            'g_percent': g / denom,
            't_percent': t / denom,
            'n_percent': n / denom,
            'gc_percent': (g + c) / denom,
            'at_percent': (a + t) / denom,
        }

    records = [calc_features(seq) for seq in df[seq_col]]

    # Drop stale feature columns first so a repeated call replaces them.
    base_df = df.reset_index(drop=True).drop(columns=feature_cols, errors='ignore')

    # Build the feature frame on the same reset index as base_df; otherwise
    # concat(axis=1) aligns on mismatched labels and produces NaN-padded rows.
    features_df = pd.DataFrame(records, columns=feature_cols, index=base_df.index)
    return pd.concat([base_df, features_df], axis=1)

# Append the composition features to both splits (train first, then test).
df_train, df_test = (
    extract_basic_seq_features(frame) for frame in (df_train, df_test)
)
display(df_train.head())

Unnamed: 0,Entry,Sequence,bgc,class,simple_class,seq_length,a_percent,c_percent,g_percent,t_percent,n_percent,gc_percent,at_percent
0,BGC0000476,CCAACCAACATATGAACAAGAAGAACATTCTACCCCAACAAGGCCA...,BGC0000476,RiPP,RiPP,242.0,0.256198,0.252066,0.22314,0.268595,0.0,0.475207,0.524793
1,BGC0001412,CAGCTGCAATTGTTCGTAACCAAACGAAACGACTGCCATCACTACC...,BGC0001412,Saccharide,Saccharide,24454.0,0.294144,0.154494,0.206715,0.344647,0.0,0.361209,0.638791
2,BGC0001669,AGCTCCGTCGCGACGGCCGCGGCGCTGACGACCAGAACCGCTACTC...,BGC0001669,RiPP,RiPP,40001.0,0.163121,0.349491,0.336817,0.150571,0.0,0.686308,0.313692
3,BGC0002337,TCAGGTGACGAGGCCGAGCCGCGGTCCGCCCCCATGGCGCAGCCGT...,BGC0002337,RiPP,RiPP,2741.0,0.124042,0.352061,0.394382,0.129515,0.0,0.746443,0.253557
4,BGC0001769,GTGAGCGCGATCCGGCAACGCACCCGTGTCGGTGGTGGGCCTCTGG...,BGC0001769,Polyketide,Polyketide,44740.0,0.15266,0.361779,0.341641,0.14392,0.0,0.70342,0.29658


In [19]:
def seq_to_kmers(seq, k=3):
    """
    Slide a window of width k across a sequence, one position at a time,
    and join the overlapping k-mers into a single string.

    Args:
        seq (str): Input sequence.
        k (int, optional): Length of k-mers. Defaults to 3.

    Returns:
        str: Space-separated k-mers in order of appearance; an empty string
        when the sequence is shorter than k.
    """
    window_count = len(seq) - k + 1
    kmers = (seq[start:start + k] for start in range(window_count))
    return ' '.join(kmers)

def _kmer_percent_frame(count_matrix, feature_names, sequences):
    """Convert a sparse k-mer count matrix into per-row percentages plus a seq_length column."""
    counts_df = pd.DataFrame(count_matrix.toarray(), columns=feature_names)
    # A sequence shorter than k has no k-mers, so its row sums to 0; divide by 1
    # there to get an all-zero row instead of NaN from 0/0.
    row_totals = counts_df.sum(axis=1).replace(0, 1)
    percent_df = counts_df.div(row_totals, axis=0) * 100
    # Assign positionally (.to_numpy()) so the lengths line up with the rows of
    # the count matrix even when `sequences` does not have a 0..n-1 index.
    percent_df['seq_length'] = sequences.apply(len).to_numpy()
    return percent_df


def get_kmer_vectorizer(df_train, df_test, k=3):
    """
    Converts sequences to k-mers and builds k-mer frequency features for the
    training and testing datasets.

    The k-mer vocabulary is fitted on the training sequences only and then
    applied to the test sequences, so both outputs share the same columns.
    Each k-mer column holds that k-mer's share of the row's counted k-mers,
    scaled to 0-100. Rows with no counted k-mers are all zeros. A final
    ``seq_length`` column holds the raw sequence length.

    Args:
        df_train (pd.DataFrame): Training dataset with a 'Sequence' column.
        df_test (pd.DataFrame): Testing dataset with a 'Sequence' column.
        k (int, optional): k value for k-mers. Defaults to 3.

    Returns:
        tuple: (X_train_df, X_test_df), the k-mer feature DataFrames for the
        training and testing datasets, each indexed 0..n-1 in input row order.
    """
    train_sequences = df_train['Sequence']
    test_sequences = df_test['Sequence']
    train_kmers = train_sequences.apply(seq_to_kmers, k=k)
    test_kmers = test_sequences.apply(seq_to_kmers, k=k)

    # token_pattern=r'\S+' keeps every whitespace-delimited k-mer as one token.
    vectorizer = CountVectorizer(analyzer='word', token_pattern=r'\S+')
    train_counts = vectorizer.fit_transform(train_kmers)
    test_counts = vectorizer.transform(test_kmers)
    feature_names = vectorizer.get_feature_names_out()

    X_train_df = _kmer_percent_frame(train_counts, feature_names, train_sequences)
    X_test_df = _kmer_percent_frame(test_counts, feature_names, test_sequences)
    return X_train_df, X_test_df

In [20]:
# Hyper-parameter space sampled by the randomized search for every k.
param_dist = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2', None],
    'bootstrap': [True, False],
}
results = []

# The labels do not depend on k, so they are looked up once.
y_train = df_train['simple_class']
y_test = df_test['simple_class']

for k in range(3, 8):
    print(f"\n===== Processing k={k} =====")
    X_train_kmer, X_test_kmer = get_kmer_vectorizer(df_train, df_test, k=k)
    print(f"Feature shape: {X_train_kmer.shape}")

    # Tune a random forest with 3-fold CV on the training split, selecting by weighted F1.
    base_rfc = RandomForestClassifier(random_state=42)
    search = RandomizedSearchCV(
        estimator=base_rfc,
        param_distributions=param_dist,
        n_iter=20,
        scoring='f1_weighted',
        cv=3,
        verbose=1,
        n_jobs=-1,
        random_state=42,
    )
    search.fit(X_train_kmer, y_train)

    # Score the refit best model on the test split.
    best_rfc = search.best_estimator_
    y_pred = best_rfc.predict(X_test_kmer)
    f1_macro = f1_score(y_test, y_pred, average='macro', zero_division=0)
    f1_weighted = f1_score(y_test, y_pred, average='weighted', zero_division=0)
    acc = accuracy_score(y_test, y_pred)

    print("Best Parameters:", search.best_params_)
    print("Classification Report:\n", classification_report(y_test, y_pred, zero_division=0))
    print(f"F1 Score (Macro): {f1_macro:.4f}")
    print(f"F1 Score (Weighted): {f1_weighted:.4f}")
    print(f"Accuracy: {acc:.4f}")

    results.append(dict(
        k=k,
        best_params=search.best_params_,
        f1_macro=f1_macro,
        f1_weighted=f1_weighted,
        accuracy=acc,
    ))

# One row per k with the chosen hyper-parameters and test-set scores.
results_df = pd.DataFrame(results)


===== Processing k=3 =====
Feature shape: (1763, 126)
Fitting 3 folds for each of 20 candidates, totalling 60 fits
Best Parameters: {'n_estimators': 200, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': None, 'max_depth': 30, 'bootstrap': True}
Classification Report:
               precision    recall  f1-score   support

    Alkaloid       0.00      0.00      0.00        12
         NRP       0.65      0.70      0.67       163
  Polyketide       0.61      0.65      0.63       138
        RiPP       0.64      0.84      0.73        67
  Saccharide       0.69      0.32      0.44        28
     Terpene       0.53      0.27      0.36        33

    accuracy                           0.63       441
   macro avg       0.52      0.46      0.47       441
weighted avg       0.61      0.63      0.61       441

F1 Score (Macro): 0.4717
F1 Score (Weighted): 0.6116
Accuracy: 0.6304

===== Processing k=4 =====
Feature shape: (1763, 598)
Fitting 3 folds for each of 20 candidates, total