# RAVEN

In [22]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
import openml
import pandas as pd
from scipy.sparse import issparse
import numpy as np
from sklearn.datasets import fetch_openml
from sklearn.ensemble import RandomForestClassifier

In [7]:
from itertools import combinations
import numpy as np
import networkx as nx
import openml
import pandas as pd

def raven(data, mode='openml', sample_size=50, tau=0.95, target_col=None):
    """
    Implements the Raven algorithm that identifies redundant features in a dataset.

    Args: 
        data: DataFrame object, OpenML dataset ID (int), or name (str).
        mode (str): 'openml' (default) or 'df'. 
                    Specifies how to interpret the 'data' argument.
        tau (float): Threshold for correlation coefficient. Default is 0.95.
        sample_size (int): Number of samples to use. Default is 50.
        target_col (str, optional): Target column to drop.

    Returns:
        essential (list): Names of selected (non-redundant) features.
        redundant (list): Names of redundant features.
    """

    # --- Validate mode ---
    if mode not in ['openml', 'df']:
        raise ValueError("mode must be either 'openml' or 'df'")

    # --- Load dataset based on mode ---
    if mode == 'openml':
        if not isinstance(data, (int, str)):
            raise ValueError("If mode='openml', data must be an OpenML dataset ID (int) or name (str).")
        
        print(f"Fetching OpenML dataset: {data}...")
        dataset = openml.datasets.get_dataset(data)
        df, *_ = dataset.get_data(dataset_format="dataframe")
        if target_col is None and dataset.default_target_attribute:
            target_col = dataset.default_target_attribute
        if target_col and target_col in df.columns:
            df = df.drop(columns=[target_col])
        dataset_name = dataset.name


    elif mode == 'df':
        if not isinstance(data, pd.DataFrame):
            raise ValueError("If mode='df', data must be a pandas DataFrame.")
        
        df = data.copy()
        dataset_name = "Custom DataFrame"
        if target_col and target_col in df.columns:
            # Drop target column if specified for DataFrame
            df = df.drop(columns=[target_col])

    # --- Keep only numeric columns ---
    df = df.select_dtypes(include=[np.number])
    total_features = len(df.columns)

    # --- Validate parameters ---
    if tau <= 0 or tau >= 1:
        raise ValueError("tau must be greater than 0 and lesser than 1")
    if sample_size < 1:
        raise ValueError("sample_size must be greater than 0")
    if sample_size > len(df):
        print(f"Warning: sample_size ({sample_size}) is larger than dataset length ({len(df)}). Using full dataset (n={len(df)}) for sampling.")
        sample_size = len(df)
    if total_features < 2:
        raise ValueError("DataFrame must have at least 2 numeric columns")

    # --- Convert to numpy sample ---
    n_samples = min(sample_size, len(df))
    sample = df.sample(n_samples, random_state=42).to_numpy()
    r2_scores = {}
    col_idx = {col: df.columns.get_loc(col) for col in df.columns}

    # --- Compute R^2 between feature pairs ---

    for first, second in combinations(df.columns, 2):
        f_i, s_i = col_idx[first], col_idx[second]
        cov = np.cov(sample[:, f_i], sample[:, s_i])
        
        denom = cov[1, 1] * cov[0, 0]
        if denom == 0:
            r2_scores[first, second] = 0
        else:
            r2_scores[first, second] = cov[1, 0]**2 / denom

    # --- Build correlation graph ---

    def make_graph(scores, tau):
        def get_weight(r2): return (r2 - tau)/(1 - tau) * 0.5 + 0.5
        G = nx.Graph()
        for (a, b), r2 in scores.items():
            if r2 >= tau:
                G.add_edge(a, b, weight=get_weight(r2))
        return G

    G = make_graph(r2_scores, tau)
    del sample, r2_scores

    # --- Identify essential and redundant features ---

    essential = []
    for comp in nx.connected_components(G):
        sub = G.subgraph(comp)
        max_deg_node, _ = max(sub.degree(), key=lambda x: x[1])
        essential.append(max_deg_node)

    redundant = [node for node in G.nodes() if node not in essential]
    
    connected_features = set(G.nodes())
    all_features = set(df.columns)
    isolated = list(all_features - connected_features)
    essential = essential + isolated

    return essential, redundant

In [12]:
X_dexter, y_dexter = fetch_openml(
    data_id=4136, 
    as_frame=False, 
    return_X_y=True, 
    parser='auto'
)

print(f"Original feature shape: {X_dexter.shape}")


Original feature shape: (600, 20000)


In [13]:
essential_features_dex, redundant_features_dex = raven(
    data=X_dexter,
    sample_size=600,
    tau=0.92
)

ValueError: If mode='openml', data must be an OpenML dataset ID (int) or name (str).

In [None]:
X_full_dex = X_dexter
X_essential_dex = X_dexter[essential_features_dex] # Filter X to only include essential columns

print(f"\nFull feature set shape: {X_full_dex.shape}")
print(f"Essential feature set shape: {X_essential_dex.shape}")

# LabelEncoder will map [1, -1] to [1, 0]
le_dex = LabelEncoder()
y_encoded_dex = le_dex.fit_transform(y_dexter)
model_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler(with_mean=False)),
    ('classifier', LogisticRegression(random_state=42, max_iter=1000, solver='liblinear'))
])

In [None]:
X_train_ess_dex, X_test_ess_dex, y_train_ess_dex, y_test_ess_dex = train_test_split(
    X_essential_dex, y_encoded_dex, test_size=0.3, random_state=42
)

model_pipeline.fit(X_train_ess_dex, y_train_ess_dex)

preds_ess_dex = model_pipeline.predict(X_test_ess_dex)
acc_essential_dex = accuracy_score(y_test_ess_dex, preds_ess_dex)

print(f"Accuracy with essential features: {acc_essential_dex * 100:.2f}%")

# LASSO

In [16]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from sklearn.pipeline import Pipeline

In [14]:
dataset = openml.datasets.get_dataset(4136)

X, y, _, _ = dataset.get_data(
    dataset_format="dataframe",
    target=dataset.default_target_attribute
)

  pd.factorize(type_)[0]


In [None]:
le = LabelEncoder()
y_enc = le.fit_transform(y)
X_train, X_test, y_train, y_test = train_test_split(
    X, y_enc, test_size=0.3, random_state=42
)

# === LASSO CLASSIFICATION (L1 Logistic Regression) ===
lasso_clf = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler(with_mean=False)),
    ('logreg', LogisticRegression(
        penalty='l1',
        solver='liblinear',  
        max_iter=5000,
        C=1.0,                # inverse of regularization strength
        random_state=42
    ))
])

lasso_clf.fit(X_train, y_train)

0,1,2
,steps,"[('imputer', ...), ('scaler', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,False
,with_std,True

0,1,2
,penalty,'l1'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'liblinear'
,max_iter,5000


In [18]:
y_pred = lasso_clf.predict(X_test)
coef = lasso_clf.named_steps['logreg'].coef_[0]

n_redundant = np.sum(coef == 0)

n_selected = np.sum(coef != 0)

print(f"Number of redundant (zero-weight) features: {n_redundant}")
print(f"Number of selected features: {n_selected}")
print("\n--- TRUE LASSO Classification Results (L1 Logistic Regression) ---")
print(f"Accuracy: {accuracy_score(y_test, y_pred) * 100:.2f}%")
print(f"F1 Score: {f1_score(y_test, y_pred):.4f}")

Number of redundant (zero-weight) features: 19740
Number of selected features: 260

--- TRUE LASSO Classification Results (L1 Logistic Regression) ---
Accuracy: 91.67%
F1 Score: 0.9153


# without both RAVEN or LASSO

In [19]:
dataset = fetch_openml(data_id=4136, as_frame=False, parser='auto')
X_raw = dataset.data
y_raw = dataset.target
    
print(f"Dataset loaded: {X_raw.shape[0]} samples, {X_raw.shape[1]} features.")

Dataset loaded: 600 samples, 20000 features.


In [20]:
le = LabelEncoder()
y_enc = le.fit_transform(y_raw)
X_train, X_test, y_train, y_test = train_test_split(
    X_raw, y_enc, test_size=0.3, random_state=42
)

is_sparse_input = issparse(X_train)

In [23]:
rf_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler(with_mean=not is_sparse_input)), 
    ('model', RandomForestClassifier(
        n_estimators=100, 
        random_state=42, 
        n_jobs=-1  # Use all CPU cores
    ))
])
rf_pipeline.fit(X_train, y_train)
y_pred = rf_pipeline.predict(X_test)
acc = accuracy_score(y_test, y_pred)

In [24]:
print(f"Accuracy: {acc * 100:.2f}%")
print(f"Features Used: {X_raw.shape[1]} (All)")

Accuracy: 91.11%
Features Used: 20000 (All)
