In [31]:
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report

# Load version 1 of the OpenML "leukemia" dataset (as_frame=True requests
# pandas containers rather than raw arrays).
dataset = fetch_openml(name="leukemia", version=1, as_frame=True)
X, y = dataset.data, dataset.target

# Turn the string class labels into integer codes (ALL -> 0, AML -> 1).
# The fitted encoder is kept in `le` so the original class names remain
# available through `le.classes_`.
le = LabelEncoder().fit(y)
y_encoded = le.transform(y)

# Hold out 20% of the samples for testing. Stratifying on the encoded labels
# keeps the class proportions the same in both partitions, and the fixed
# seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    stratify=y_encoded,
    random_state=42,
)

# Two-stage model: univariate feature selection feeding a random forest.
# Because the selector is a pipeline step, it is fitted only on the data
# passed to `pipeline.fit`, never on held-out samples.
anova_selector = SelectKBest(score_func=f_classif, k=100)  # keep the 100 best genes by ANOVA F-score
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
pipeline = Pipeline(steps=[("select", anova_selector), ("clf", rf_classifier)])

# Fit both stages on the training partition only.
pipeline.fit(X_train, y_train)

# Score the fitted pipeline on the held-out partition.
y_pred = pipeline.predict(X_test)
print("Random Forest Classification Report:")
holdout_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", holdout_accuracy)
holdout_report = classification_report(
    y_test,
    y_pred,
    target_names=le.classes_,  # label rows with the original class names, not the integer codes
)
print(holdout_report)

# 5-fold cross-validation over the full dataset (train and test partitions
# together). Per the scikit-learn docs, the estimator is cloned for each fold,
# so the pipeline fitted above is not modified by this call.
cv_scores = cross_val_score(pipeline, X, y_encoded, cv=5)
mean_cv_accuracy_pct = 100 * cv_scores.mean()
print("Cross-Validated Accuracy: %.2f%%" % mean_cv_accuracy_pct)


Random Forest Classification Report:
Accuracy: 0.9333333333333333
              precision    recall  f1-score   support

         ALL       0.91      1.00      0.95        10
         AML       1.00      0.80      0.89         5

    accuracy                           0.93        15
   macro avg       0.95      0.90      0.92        15
weighted avg       0.94      0.93      0.93        15

Cross-Validated Accuracy: 98.57%
