# RAVEN ON LEUKEMIA DATASET

In [1]:
!pip install openml
!pip install scikit-learn




[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.datasets import fetch_openml
import openml
import pandas as pd
import numpy as np

In [8]:
from itertools import combinations
import numpy as np
import networkx as nx

def raven(data, mode='openml', sample_size=50, tau=0.95, target_col=None):
    """
    Implements the Raven algorithm that identifies redundant features in a dataset.

    Args: 
        data: DataFrame object, OpenML dataset ID (int), or name (str).
        mode (str): 'openml' (default) or 'df'. 
                    Specifies how to interpret the 'data' argument.
        tau (float): Threshold for correlation coefficient. Default is 0.95.
        sample_size (int): Number of samples to use. Default is 50.
        target_col (str, optional): Target column to drop.

    Returns:
        essential (list): Names of selected (non-redundant) features.
        redundant (list): Names of redundant features.
    """

    # --- Validate mode ---
    if mode not in ['openml', 'df']:
        raise ValueError("mode must be either 'openml' or 'df'")

    # --- Load dataset based on mode ---
    if mode == 'openml':
        if not isinstance(data, (int, str)):
            raise ValueError("If mode='openml', data must be an OpenML dataset ID (int) or name (str).")
        
        print(f"Fetching OpenML dataset: {data}...")
        dataset = openml.datasets.get_dataset(data)
        df, *_ = dataset.get_data(dataset_format="dataframe")
        if target_col is None and dataset.default_target_attribute:
            target_col = dataset.default_target_attribute
        if target_col and target_col in df.columns:
            df = df.drop(columns=[target_col])
        dataset_name = dataset.name


    elif mode == 'df':
        if not isinstance(data, pd.DataFrame):
            raise ValueError("If mode='df', data must be a pandas DataFrame.")
        
        df = data.copy()
        dataset_name = "Custom DataFrame"
        if target_col and target_col in df.columns:
            # Drop target column if specified for DataFrame
            df = df.drop(columns=[target_col])

    # --- Keep only numeric columns ---
    df = df.select_dtypes(include=[np.number])
    total_features = len(df.columns)

    # --- Validate parameters ---
    if tau <= 0 or tau >= 1:
        raise ValueError("tau must be greater than 0 and lesser than 1")
    if sample_size < 1:
        raise ValueError("sample_size must be greater than 0")
    if sample_size > len(df):
        print(f"Warning: sample_size ({sample_size}) is larger than dataset length ({len(df)}). Using full dataset (n={len(df)}) for sampling.")
        sample_size = len(df)
    if total_features < 2:
        raise ValueError("DataFrame must have at least 2 numeric columns")

    # --- Convert to numpy sample ---
    n_samples = min(sample_size, len(df))
    sample = df.sample(n_samples, random_state=42).to_numpy()
    r2_scores = {}
    col_idx = {col: df.columns.get_loc(col) for col in df.columns}

    # --- Compute R^2 between feature pairs ---

    for first, second in combinations(df.columns, 2):
        f_i, s_i = col_idx[first], col_idx[second]
        cov = np.cov(sample[:, f_i], sample[:, s_i])
        
        denom = cov[1, 1] * cov[0, 0]
        if denom == 0:
            r2_scores[first, second] = 0
        else:
            r2_scores[first, second] = cov[1, 0]**2 / denom

    # --- Build correlation graph ---

    def make_graph(scores, tau):
        def get_weight(r2): return (r2 - tau)/(1 - tau) * 0.5 + 0.5
        G = nx.Graph()
        for (a, b), r2 in scores.items():
            if r2 >= tau:
                G.add_edge(a, b, weight=get_weight(r2))
        return G

    G = make_graph(r2_scores, tau)
    del sample, r2_scores

    # --- Identify essential and redundant features ---

    essential = []
    for comp in nx.connected_components(G):
        sub = G.subgraph(comp)
        max_deg_node, _ = max(sub.degree(), key=lambda x: x[1])
        essential.append(max_deg_node)

    redundant = [node for node in G.nodes() if node not in essential]
    
    connected_features = set(G.nodes())
    all_features = set(df.columns)
    isolated = list(all_features - connected_features)
    essential = essential + isolated

    return essential, redundant

In [4]:
dataset = fetch_openml(data_id=1104, as_frame=True, parser='auto')

# Get data, separating features (X) and target (y)
X = dataset.data
y = dataset.target

print(f"Original feature shape: {X.shape}")
print(f"Target 'y' values: {y.unique()}")

Original feature shape: (72, 7129)
Target 'y' values: ['ALL', 'AML']
Categories (2, object): ['ALL', 'AML']


In [None]:
essential_features, redundant_features = raven(data=X, mode="df",sample_size=72, tau=0.99)

In [11]:
X_full = X
X_essential = X[essential_features]

print(f"\nFull feature set shape: {X_full.shape}")
print(f"Essential feature set shape: {X_essential.shape}")

le = LabelEncoder()
y_encoded = le.fit_transform(y)

model_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
    ('classifier', LogisticRegression(random_state=42, max_iter=1000))
])

NameError: name 'essential_features' is not defined

In [None]:
X_train_ess, X_test_ess, y_train_ess, y_test_ess = train_test_split(
    X_essential, y_encoded, test_size=0.3, random_state=42
)

model_pipeline.fit(X_train_ess, y_train_ess)

preds_ess = model_pipeline.predict(X_test_ess)
acc_essential = accuracy_score(y_test_ess, preds_ess)

print(f"Accuracy with essential features: {acc_essential * 100:.2f}%")

## WITHOUT RAVEN OR LASSO (Logistic regression)

In [10]:
X_train, X_test, y_train, y_test = train_test_split(
    X_full, y_encoded, test_size=0.3, random_state=42
)

model_pipeline.fit(X_train, y_train)

preds_full = model_pipeline.predict(X_test)
acc_full = accuracy_score(y_test, preds_full)

print(f"Accuracy with all features: {acc_full * 100:.2f}%")

NameError: name 'X_full' is not defined

## LASSO ON LEUKEMIA

In [20]:
leukemia = fetch_openml(data_id=1104, as_frame=False, parser='auto')
X = leukemia.data
y = leukemia.target

# Encode labels ('ALL', 'AML') to (0, 1)
le = LabelEncoder()
y_encoded = le.fit_transform(y)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [21]:
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y_encoded, test_size=0.3, random_state=42, stratify=y_encoded
)
# 'C' is inverse regularization strength. Lower C = stronger regularization.
lasso_classifier = LogisticRegression(penalty='l1', solver='liblinear', C=0.5, random_state=42)
lasso_classifier.fit(X_train, y_train)

0,1,2
,penalty,'l1'
,dual,False
,tol,0.0001
,C,0.5
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'liblinear'
,max_iter,100


In [22]:
# Evaluating the model
y_pred = lasso_classifier.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print(f"\nLASSO score for Leukemia dataset:")
print(f"Accuracy: {acc:.4f}")

coefficients = lasso_classifier.coef_[0]
selected_features = np.sum(coefficients != 0)
print(f"Features selected: {selected_features} / {X.shape[1]} \n")


LASSO score for Leukemia dataset:
Accuracy: 0.9545
Features selected: 22 / 7129 

