# Raven

In [15]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LassoCV
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
from itertools import combinations
import numpy as np
import networkx as nx
import openml
import pandas as pd

def raven(data, mode='openml', sample_size=50, tau=0.95, target_col=None):
    """
    Identify redundant numeric features with the Raven algorithm.

    The squared Pearson correlation (R^2) of every pair of numeric columns is
    estimated on a fixed-seed random sample of rows. Columns whose pairwise
    R^2 is at least ``tau`` are linked in a graph. From every connected group
    of columns the one with the most links is kept (ties go to the left-most
    column) and the other members are reported as redundant. Columns without
    any link are always kept.

    Args:
        data: DataFrame object, OpenML dataset ID (int), or name (str).
        mode (str): 'openml' (default) or 'df'.
                    Specifies how to interpret the 'data' argument.
        sample_size (int): Number of rows to sample. Default is 50. Reduced to
                    the number of available rows (with a printed warning)
                    when larger.
        tau (float): R^2 threshold, strictly between 0 and 1. Default is 0.95.
        target_col (str, optional): Target column to drop. In 'openml' mode it
                    falls back to the dataset's default target attribute.

    Returns:
        essential (list): Names of selected (non-redundant) features: one
            representative per linked group, followed by the unlinked
            features in column order.
        redundant (list): Names of redundant features.

    Raises:
        ValueError: If ``mode``, ``tau`` or ``sample_size`` is invalid, if
            ``data`` does not match ``mode``, or if fewer than two numeric
            columns remain.
    """

    # --- Validate arguments before any (possibly remote) data access ---
    if mode not in ('openml', 'df'):
        raise ValueError("mode must be either 'openml' or 'df'")
    if tau <= 0 or tau >= 1:
        raise ValueError("tau must be greater than 0 and lesser than 1")
    if sample_size < 1:
        raise ValueError("sample_size must be greater than 0")

    # --- Load dataset based on mode ---
    if mode == 'openml':
        if not isinstance(data, (int, str)):
            raise ValueError("If mode='openml', data must be an OpenML dataset ID (int) or name (str).")

        print(f"Fetching OpenML dataset: {data}...")
        dataset = openml.datasets.get_dataset(data)
        df, *_ = dataset.get_data(dataset_format="dataframe")
        if target_col is None and dataset.default_target_attribute:
            target_col = dataset.default_target_attribute
    else:
        if not isinstance(data, pd.DataFrame):
            raise ValueError("If mode='df', data must be a pandas DataFrame.")
        # No defensive copy: every step below builds a new object and the
        # caller's frame is never modified.
        df = data

    if target_col and target_col in df.columns:
        df = df.drop(columns=[target_col])

    # --- Keep only numeric columns ---
    df = df.select_dtypes(include=[np.number])
    columns = list(df.columns)
    if len(columns) < 2:
        raise ValueError("DataFrame must have at least 2 numeric columns")
    if sample_size > len(df):
        print(f"Warning: sample_size ({sample_size}) is larger than dataset length ({len(df)}). Using full dataset (n={len(df)}) for sampling.")
        sample_size = len(df)

    # --- Compute R^2 between all feature pairs in one pass ---
    sample = df.sample(sample_size, random_state=42).to_numpy()
    with np.errstate(divide='ignore', invalid='ignore'):
        # Rows of ``sample.T`` are features, so ``cov`` is features x features.
        cov = np.cov(sample.T)
        variances = np.diag(cov)
        denom = np.outer(variances, variances)
        # A zero denominator (constant column) counts as "no correlation".
        r2 = np.where(denom == 0, 0.0, cov ** 2 / denom)
        # Upper triangle only: each unordered pair once, no self-pairs.
        # A NaN R^2 never satisfies ``>= tau``, so such a pair stays unlinked.
        linked = np.triu(r2 >= tau, k=1)

    # --- Build correlation graph ---
    # Nodes are column positions rather than names, which keeps tie-breaking
    # deterministic and does not rely on column names being unique.
    first_idx, second_idx = np.nonzero(linked)
    G = nx.Graph()
    G.add_edges_from(zip(first_idx.tolist(), second_idx.tolist()))

    # --- Identify essential and redundant features ---
    # Every neighbour of a node lies in the node's own component, so the
    # degree in G equals the degree inside the component.
    representatives = [
        max(component, key=lambda pos: (G.degree[pos], -pos))
        for component in nx.connected_components(G)
    ]
    kept = set(representatives)

    redundant = [columns[pos] for pos in G.nodes if pos not in kept]
    isolated = [name for pos, name in enumerate(columns) if pos not in G]
    essential = [columns[pos] for pos in representatives] + isolated

    return essential, redundant

In [8]:
# Load the table; the bookkeeping columns are removed only if they exist
data = pd.read_csv("train1000.csv")
data = data.drop(columns=["sample_id", "Unnamed: 0"], errors="ignore")

# Positional split: columns 0..555 are the features, column 556 is the target
y = data.iloc[:, 556]
X = data.iloc[:, :556]

# Force every feature to a numeric dtype; unparseable entries become NaN
X = X.apply(pd.to_numeric, errors='coerce')

# Keep numeric columns only, then discard columns without a single valid value
X = X.select_dtypes(include=[np.number])
X = X.dropna(axis=1, how='all')

In [5]:
# Run Raven on the in-memory feature table: 100 sampled rows, R^2 threshold 0.95
ess_features, red_features = raven(
    X,
    mode='df',
    tau=0.95,
    sample_size=100,
)

In [7]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

def train(x, y, reduction_status = ""):
    """
    Fit a random forest on the first 80% of the rows and print test metrics.

    The split is positional (no shuffling): the leading 80% of the rows are
    used for fitting and the trailing rows for evaluation.

    Args:
        x: Feature table, sliced by position with ``.iloc``.
        y: Target values aligned with ``x``, sliced by position with ``.iloc``.
        reduction_status (str): Suffix inserted into the printed metric
            labels, e.g. " after reduction".

    Returns:
        None. The mean squared error and the R2 score are printed.
    """
    cut = int(len(x) * 0.8)
    x_fit, x_eval = x.iloc[:cut], x.iloc[cut:]
    y_fit, y_eval = y.iloc[:cut], y.iloc[cut:]

    forest = RandomForestRegressor(n_estimators=100, max_depth=None, random_state=42)
    predictions = forest.fit(x_fit, y_fit).predict(x_eval)

    # Compute both metrics before printing either of them
    mse = mean_squared_error(y_eval, predictions)
    r2 = r2_score(y_eval, predictions)
    print(f"Mean Squared Error{reduction_status}: ", mse)
    print(f"R2 Score{reduction_status}: ", r2)

In [12]:
# Report how many columns Raven flagged, as a count and as a percentage of all columns
print(f"{len(red_features)} redundant columns identified, out of {len(X.columns)} columns")
print(f"{len(red_features)/len(X.columns) * 100}% Reduction")

# Feature table with the flagged columns removed
x1 = X.drop(columns=red_features)

349 redundant columns identified, out of 556 columns
62.76978417266187% Reduction


In [13]:
# Random forest on the Raven-reduced feature table
train(x1, y, reduction_status=" after reduction")

Mean Squared Error after reduction:  1.722481026681409e-11
R2 Score after reduction:  0.9794293200547943


# LASSO

In [17]:
# Hold out 20% of the rows (fixed seed) for evaluating the LASSO model.
# NOTE(review): train() evaluates on the trailing 20% of the rows instead, so
# the LASSO and random-forest metrics are computed on different test rows.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

In [18]:
# Mean imputation -> standardisation -> LASSO with 5-fold cross-validation,
# chained so that all three steps are fitted together on the training rows
lasso_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
        (
            'lasso',
            LassoCV(max_iter=10000, cv=5, n_jobs=-1, random_state=42),
        ),
    ]
)

print("Training LASSO model...")
lasso_pipeline.fit(X_train, y_train)

Training LASSO model...


0,1,2
,steps,"[('imputer', ...), ('scaler', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,eps,0.001
,n_alphas,'deprecated'
,alphas,'warn'
,fit_intercept,True
,precompute,'auto'
,max_iter,10000
,tol,0.0001
,copy_X,True
,cv,5
,verbose,False


In [19]:
# Score the fitted pipeline on the held-out rows
y_pred = lasso_pipeline.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# A feature counts as selected when the magnitude of its coefficient exceeds 1e-6
lasso_model = lasso_pipeline.named_steps['lasso']
selected_features_count = np.sum(np.abs(lasso_model.coef_) > 1e-6)

print("\n--- LASSO Results ---")
print(f"R² Score:           {r2:.4f}")
# Scientific notation: a fixed four-decimal format renders very small errors as 0.0000
print(f"Mean Squared Error: {mse:.4e}")
print(f"Features Selected:  {selected_features_count} / {X.shape[1]}")


--- LASSO Results ---
R² Score:           0.9691
Mean Squared Error: 0.0000
Features Selected:  36 / 556


# Baseline: without RAVEN or LASSO

In [21]:
# Random forest on the full, unreduced feature table
train(X, y, reduction_status=" without reduction")

Mean Squared Error without reduction:  1.7957039524099185e-11
R2 Score without reduction:  0.9785548573777135
