# Feature selection using chi-square

In [None]:
# Optional environment setup: add a machine-specific package directory to the
# module search path. You may not need to run this cell; if the imports in the
# next cell already work, skip it.
import sys
sys.path.append('/mnt/storage/faithfulco/python_packages')

In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import chi2



## Functions

### Data-Loading Functions

In [3]:
def load_parquet_data(parquet_path, engine='fastparquet'):
    """
    Reads a Parquet file and returns a DataFrame.

    The file is first read with ``engine`` (default 'fastparquet'). If that
    attempt raises and ``engine`` is not already 'pyarrow', the failure is
    printed and the read is retried once with 'pyarrow'.

    Parameters:
    -----------
    parquet_path : str
        Path to the Parquet file.
    engine : str
        Parquet engine to try first.

    Returns:
    --------
    df : pd.DataFrame
        The contents of the Parquet file.

    Raises:
    -------
    Exception
        Whatever ``pd.read_parquet`` raises when the 'pyarrow' read fails
        (either as the requested engine or as the fallback).
    """
    fallback_engine = 'pyarrow'
    try:
        return pd.read_parquet(parquet_path, engine=engine)
    except Exception as e:
        # Falling back to the engine that just failed would only repeat the
        # same read and print a misleading "retrying" message.
        if engine == fallback_engine:
            raise
        print(f"Reading with '{engine}' failed. Error: {e}")
        print("Retrying with 'pyarrow'...")
    # Retry outside the except block so a fallback failure is reported on its own.
    return pd.read_parquet(parquet_path, engine=fallback_engine)


def load_csv_data(csv_path):
    """
    Loads the CSV file at ``csv_path`` into a DataFrame using pandas' default
    parsing options.

    Parameters:
    -----------
    csv_path : str
        Path to the CSV file.

    Returns:
    --------
    pd.DataFrame
        The parsed CSV contents.
    """
    return pd.read_csv(csv_path)


### Data Preparation Function

In [4]:
def prepare_features_and_target(df_features, df_labels, label_col="sample_type", drop_col="sample_id"):
    """
    Splits the inputs into a feature matrix X and a target vector y.

    Note: when ``df_features`` has a named index, that index is turned into a
    regular column IN PLACE, so the caller's DataFrame is modified.

    Parameters:
    -----------
    df_features : pd.DataFrame
        Feature columns; the identifier may live either in a named index or
        in the ``drop_col`` column.
    df_labels : pd.DataFrame
        Labels for the same samples. Rows are not re-aligned here; X and y
        are returned exactly in the order they were given.
    label_col : str
        Column of ``df_labels`` holding the target variable.
    drop_col : str
        Non-feature column to remove from ``df_features`` (e.g., 'sample_id').

    Returns:
    --------
    X : pd.DataFrame
        ``df_features`` without ``drop_col``.
    y : pd.Series
        The ``label_col`` column of ``df_labels``.
    """
    # A named index is moved into the columns (mutating the caller's frame)
    # so that it can be dropped by name below.
    index_is_named = df_features.index.name is not None
    if index_is_named:
        df_features.reset_index(inplace=True)

    feature_matrix = df_features.drop(columns=[drop_col])
    target = df_labels[label_col]
    return feature_matrix, target


### Chi-Square Computation and Sorting

In [5]:
def compute_chi2_test(X, y):
    """
    Runs scikit-learn's ``chi2`` on every column of X against y and returns
    the results as a ranked table.

    Parameters:
    -----------
    X : pd.DataFrame
        Feature matrix (binary or categorical features).
    y : pd.Series
        Target vector (categorical / discrete).

    Returns:
    --------
    chi_df : pd.DataFrame
        One row per feature with columns ['Feature', 'Chi2', 'p_value'],
        ordered by ascending p_value (most significant first) and carrying a
        fresh 0..n-1 index. Rows whose p_value is NaN are placed last by
        ``sort_values``.
    """
    # chi2 yields one statistic and one p-value per column of X
    statistics, p_vals = chi2(X, y)

    results = pd.DataFrame({
        "Feature": X.columns,
        "Chi2": statistics,
        "p_value": p_vals
    })
    ranked = results.sort_values(by="p_value", ascending=True)
    return ranked.reset_index(drop=True)


### Filter Out Non-Significant Features (p-value ≥ 0.05)

In [6]:
def filter_insignificant_features(chi_df, alpha=0.05):
    """
    Lists the features that are not significant at level ``alpha``.

    A feature is kept only when its p-value is strictly below ``alpha``.
    Everything else is reported for removal, including features whose
    p-value is NaN (an undefined test result is not evidence of significance).

    Parameters:
    -----------
    chi_df : pd.DataFrame
        DataFrame with columns ['Feature', 'Chi2', 'p_value'].
    alpha : float
        Significance level. Features with p_value >= alpha, or with a NaN
        p_value, are considered insignificant.

    Returns:
    --------
    drop_features : list
        Names of the insignificant features, in the row order of ``chi_df``.
    """
    # NaN compares False with every operator, so testing `p_value >= alpha`
    # would silently let NaN rows through; negate the "significant" mask instead.
    is_significant = chi_df["p_value"] < alpha
    drop_features = chi_df.loc[~is_significant, "Feature"].tolist()
    return drop_features


### All-in-One Workflow: Chi-Square Feature Selection

In [7]:
def _align_labels_to_samples(df_labels, sample_ids, id_col="sample_id"):
    """Returns the rows of df_labels reordered so that row i belongs to sample_ids[i]."""
    positions = pd.Index(df_labels[id_col]).get_indexer(sample_ids)
    missing = positions < 0  # get_indexer marks ids it cannot find with -1
    if missing.any():
        raise ValueError(
            f"{int(missing.sum())} sample(s) in the feature table have no row in the labels table."
        )
    return df_labels.iloc[positions]


def select_features_by_chi2(
    parquet_path, 
    labels_csv_path,
    train_sampleids_path,
    test_sampleids_path,
    label_col="sample_type",
    drop_col="sample_id",
    alpha=0.05
):
    """
    A convenience function that:
      1) Loads features from a Parquet file,
      2) Loads labels and the train/test sample-ID lists from CSV files,
      3) Splits the features into train and test rows and prepares X, y
         from the training rows only,
      4) Performs the Chi-Square test on the training data,
      5) Drops features reported as insignificant at level alpha,
      6) Returns X_reduced_train, X_reduced_test, y, and the chi-square DataFrame.

    The features DataFrame is expected to be indexed by sample ID: rows are
    assigned to train/test by matching its index against the ID lists.

    Parameters:
    -----------
    parquet_path : str
        Path to the Parquet file with features.
    labels_csv_path : str
        Path to the CSV file with labels (must contain a 'sample_id' column).
    train_sampleids_path : str
        Path to a CSV file whose 'sample_id' column lists the training samples.
    test_sampleids_path : str
        Path to a CSV file whose 'sample_id' column lists the test samples.
    label_col : str
        The name of the target column in the labels DataFrame (default "sample_type").
    drop_col : str
        Name of the identifier column in the features DataFrame (default "sample_id").
        It is excluded from the test and re-attached as the first column of
        X_reduced_train.
    alpha : float
        Significance level for p-values (default 0.05).

    Returns:
    --------
    X_reduced_train : pd.DataFrame
        ``drop_col`` followed by the retained training features.
    X_reduced_test : pd.DataFrame
        The test rows restricted to the same retained features, with the
        sample-ID index converted to a leading column.
    y : pd.Series
        The target vector for the TRAINING rows, in the same row order as
        X_reduced_train.
    chi_df : pd.DataFrame
        The chi-square DataFrame with columns ['Feature', 'Chi2', 'p_value'].

    Raises:
    -------
    ValueError
        If a training sample present in the feature table has no label row.
    """
    # Column that identifies samples in the labels CSV and in the ID-list CSVs
    id_col = 'sample_id'

    # 1) Load features from Parquet
    df_features = load_parquet_data(parquet_path)

    # 2) Load labels and sample-ID CSVs
    df_labels = load_csv_data(labels_csv_path)
    train_ids = load_csv_data(train_sampleids_path)[id_col]
    test_ids = load_csv_data(test_sampleids_path)[id_col]

    # Split the feature rows by matching the (sample-ID) index against each list
    df_features_train = df_features[df_features.index.isin(train_ids)]
    df_features_test = df_features[df_features.index.isin(test_ids)]

    # Labels and features come from different files, so pair them by sample ID
    # rather than trusting that both files list the samples in the same order.
    df_labels_train = df_labels[df_labels[id_col].isin(train_ids)]
    df_labels_train = _align_labels_to_samples(df_labels_train, df_features_train.index, id_col)

    # Move a named index into the columns here, explicitly, so that drop_col
    # can be read back from df_features_train further down.
    if df_features_train.index.name is not None:
        df_features_train.reset_index(inplace=True)

    # 3) Prepare X, y
    X, y = prepare_features_and_target(df_features_train, df_labels_train, label_col, drop_col)

    # 4) Compute chi-square stats
    chi_df = compute_chi2_test(X, y)

    # 5) Identify insignificant features
    to_drop = filter_insignificant_features(chi_df, alpha=alpha)
    print(f"Number of features with p_value >= {alpha}: {len(to_drop)}")

    # 6) Create X_reduced_train by dropping insignificant features
    X_reduced_train = X.drop(columns=to_drop)
    selected_features = X_reduced_train.columns

    # Put the identifier back as the first column
    X_reduced_train.insert(0, drop_col, df_features_train[drop_col])

    print(f"Reduced training feature set shape: {X_reduced_train.shape}")

    # 7) Select the same features for the test rows and expose the index as a column
    X_reduced_test = df_features_test[selected_features].reset_index()

    print(f"Reduced test feature set shape: {X_reduced_test.shape}")

    return X_reduced_train, X_reduced_test, y, chi_df


## Usage for All 9 Feature Groups

In [8]:
# Inputs shared by every feature group: the labels CSV and the train/test sample-ID lists
labels_csv_path = '/mnt/storage/faithfulco/5_mlran_dataset/MLRan_labels.csv'
train_sampleids_path = 'FS_MLRan_Datasets/type_train_sampleids.csv'
test_sampleids_path = 'FS_MLRan_Datasets/type_test_sampleids.csv'

# 1) API Dataset
api_parquet_path = '/mnt/storage/faithfulco/5_mlran_dataset/1_api_dataset.parquet'

# Chi-square selection for the API group; features with p-value >= 0.05 are removed
X_reduced_train_api, X_reduced_test_api, y_api, chi_df_api = select_features_by_chi2(
    api_parquet_path,
    labels_csv_path,
    train_sampleids_path,
    test_sampleids_path,
    label_col="sample_type",
    drop_col="sample_id",
    alpha=0.05,
)


Number of features with p_value >= 0.05: 59
Reduced training feature set shape: (3907, 255)
Reduced test feature set shape: (977, 255)


In [9]:
# Preview the first rows of the reduced API training set
X_reduced_train_api.head()

Unnamed: 0,sample_id,1,2,3,4,5,7,8,9,10,...,280,283,285,289,291,294,299,304,305,306
0,10001,0,0,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,10002,0,0,1,0,1,1,1,0,1,...,0,0,0,0,0,0,0,0,0,0
2,10003,0,0,1,1,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,10004,0,0,1,1,1,1,0,1,0,...,0,0,0,0,0,0,1,0,0,0
4,10005,0,0,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0


In [10]:
# (rows, columns) of the reduced API training set; the columns include sample_id
X_reduced_train_api.shape

(3907, 255)

In [11]:
# Preview the first rows of the reduced API test set
X_reduced_test_api.head()

Unnamed: 0,sample_id,1,2,3,4,5,7,8,9,10,...,280,283,285,289,291,294,299,304,305,306
0,11027,0,0,1,1,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
1,11030,0,0,1,1,1,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,11033,0,0,1,1,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
3,11035,0,0,1,1,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
4,11036,0,0,1,1,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
# (rows, columns) of the reduced API test set; the columns include sample_id
X_reduced_test_api.shape

(977, 255)

In [13]:
# 2) REG Dataset
reg_parquet_path = '/mnt/storage/faithfulco/5_mlran_dataset/2_reg_dataset.parquet'

# Chi-square selection for the REG group; features with p-value >= 0.05 are removed
X_reduced_train_reg, X_reduced_test_reg, y_reg, chi_df_reg = select_features_by_chi2(
    reg_parquet_path,
    labels_csv_path,
    train_sampleids_path,
    test_sampleids_path,
    label_col="sample_type",
    drop_col="sample_id",
    alpha=0.05,
)

Number of features with p_value >= 0.05: 442916
Reduced training feature set shape: (3907, 82590)
Reduced test feature set shape: (977, 82590)


In [26]:
# Preview the first rows of the reduced REG training set
X_reduced_train_reg.head()

Unnamed: 0,sample_id,314,361,362,363,365,366,368,369,393,...,525804,525805,525806,525807,525808,525809,525810,525811,525813,525818
0,10001,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,10002,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,10003,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,10004,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,10005,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
# 3) FILE Dataset
file_parquet_path = '/mnt/storage/faithfulco/5_mlran_dataset/3_file_dataset.parquet'

# Chi-square selection for the FILE group; features with p-value >= 0.05 are removed
X_reduced_train_file, X_reduced_test_file, y_file, chi_df_file = select_features_by_chi2(
    file_parquet_path,
    labels_csv_path,
    train_sampleids_path,
    test_sampleids_path,
    label_col="sample_type",
    drop_col="sample_id",
    alpha=0.05,
)

Number of features with p_value >= 0.05: 1740704
Reduced training feature set shape: (3907, 337584)
Reduced test feature set shape: (977, 337584)


In [15]:
# 4) DIR Dataset
dir_parquet_path = '/mnt/storage/faithfulco/5_mlran_dataset/4_dir_dataset.parquet'

# Chi-square selection for the DIR group; features with p-value >= 0.05 are removed
X_reduced_train_dir, X_reduced_test_dir, y_dir, chi_df_dir = select_features_by_chi2(
    dir_parquet_path,
    labels_csv_path,
    train_sampleids_path,
    test_sampleids_path,
    label_col="sample_type",
    drop_col="sample_id",
    alpha=0.05,
)

Number of features with p_value >= 0.05: 106799
Reduced training feature set shape: (3907, 51325)
Reduced test feature set shape: (977, 51325)


In [16]:
# 5) STR Dataset
str_parquet_path = '/mnt/storage/faithfulco/5_mlran_dataset/5_str_dataset.parquet'

# Chi-square selection for the STR group; features with p-value >= 0.05 are removed
X_reduced_train_str, X_reduced_test_str, y_str, chi_df_str = select_features_by_chi2(
    str_parquet_path,
    labels_csv_path,
    train_sampleids_path,
    test_sampleids_path,
    label_col="sample_type",
    drop_col="sample_id",
    alpha=0.05,
)

Number of features with p_value >= 0.05: 3049544
Reduced training feature set shape: (3907, 582576)
Reduced test feature set shape: (977, 582576)


In [17]:
# 6) NET Dataset
net_parquet_path = '/mnt/storage/faithfulco/5_mlran_dataset/6_net_dataset.parquet'

# Chi-square selection for the NET group; features with p-value >= 0.05 are removed
X_reduced_train_net, X_reduced_test_net, y_net, chi_df_net = select_features_by_chi2(
    net_parquet_path,
    labels_csv_path,
    train_sampleids_path,
    test_sampleids_path,
    label_col="sample_type",
    drop_col="sample_id",
    alpha=0.05,
)

Number of features with p_value >= 0.05: 4553
Reduced training feature set shape: (3907, 262)
Reduced test feature set shape: (977, 262)


In [18]:
# 7) SYS Dataset
sys_parquet_path = '/mnt/storage/faithfulco/5_mlran_dataset/7_sys_dataset.parquet'

# Chi-square selection for the SYS group; features with p-value >= 0.05 are removed
X_reduced_train_sys, X_reduced_test_sys, y_sys, chi_df_sys = select_features_by_chi2(
    sys_parquet_path,
    labels_csv_path,
    train_sampleids_path,
    test_sampleids_path,
    label_col="sample_type",
    drop_col="sample_id",
    alpha=0.05,
)

Number of features with p_value >= 0.05: 13887
Reduced training feature set shape: (3907, 3025)
Reduced test feature set shape: (977, 3025)


In [19]:
# 8) DROP Dataset
drop_parquet_path = '/mnt/storage/faithfulco/5_mlran_dataset/8_drop_dataset.parquet'

# Chi-square selection for the DROP group; features with p-value >= 0.05 are removed
X_reduced_train_drop, X_reduced_test_drop, y_drop, chi_df_drop = select_features_by_chi2(
    drop_parquet_path,
    labels_csv_path,
    train_sampleids_path,
    test_sampleids_path,
    label_col="sample_type",
    drop_col="sample_id",
    alpha=0.05,
)

Number of features with p_value >= 0.05: 47149
Reduced training feature set shape: (3907, 4503)
Reduced test feature set shape: (977, 4503)


In [20]:
# 9) SIG Dataset
sig_parquet_path = '/mnt/storage/faithfulco/5_mlran_dataset/9_sig_dataset.parquet'

# Chi-square selection for the SIG group; features with p-value >= 0.05 are removed
X_reduced_train_sig, X_reduced_test_sig, y_sig, chi_df_sig = select_features_by_chi2(
    sig_parquet_path,
    labels_csv_path,
    train_sampleids_path,
    test_sampleids_path,
    label_col="sample_type",
    drop_col="sample_id",
    alpha=0.05,
)

Number of features with p_value >= 0.05: 64
Reduced training feature set shape: (3907, 140)
Reduced test feature set shape: (977, 140)


In [27]:
# Combine the nine reduced training sets into one wide table keyed on 'sample_id'.
# Each inner merge keeps only the samples present in both sides, and the groups
# are joined in the same order as before: API, REG, FILE, DIR, STR, NET, SYS, DROP, SIG.
X_train_all = X_reduced_train_api
for group_frame in (
    X_reduced_train_reg,
    X_reduced_train_file,
    X_reduced_train_dir,
    X_reduced_train_str,
    X_reduced_train_net,
    X_reduced_train_sys,
    X_reduced_train_drop,
    X_reduced_train_sig,
):
    X_train_all = X_train_all.merge(group_frame, on='sample_id', how='inner')

print(f"Merged training set shape: {X_train_all.shape}")
display(X_train_all.head())

Merged training set shape: (3907, 1062252)


Unnamed: 0,sample_id,1,2,3,4,5,7,8,9,10,...,6468255,6468256,6468258,6468260,6468261,6468264,6468266,6468267,6468269,6468270
0,10001,0,0,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,1,0,0
1,10002,0,0,1,0,1,1,1,0,1,...,1,0,0,0,0,0,0,0,0,0
2,10003,0,0,1,1,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,10004,0,0,1,1,1,1,0,1,0,...,0,0,0,0,0,0,0,0,1,0
4,10005,0,0,1,1,1,1,1,1,1,...,0,1,0,1,0,0,0,0,0,0


In [28]:
# Detect feature names that collided during the merges. pandas appends '_x' / '_y'
# to the END of overlapping column names, so test how each name ends instead of
# flagging any name that merely contains those characters. str() keeps the check
# working for non-string column labels.
merge_suffixes = ('_x', '_y')
columns_with_suffixes = [col for col in X_train_all.columns if str(col).endswith(merge_suffixes)]

# Print the result
if columns_with_suffixes:
    print(f"Columns with suffixes (_x, _y, etc.): {columns_with_suffixes}")
else:
    print("No columns with suffixes (_x, _y, etc.) in their names.")


No columns with suffixes (_x, _y, etc.) in their names.


In [29]:
# Save the merged training set as a snappy-compressed Parquet file
X_train_all.to_parquet('FS_MLRan_Datasets/MLRan_X_train_Chi.parquet', compression='snappy')

In [30]:
# Also save the merged training set as CSV (to_csv writes the row index as the
# first, unnamed column by default)
X_train_all.to_csv("FS_MLRan_Datasets/MLRan_X_train_Chi.csv")

In [31]:
# Combine the nine reduced test sets into one wide table keyed on 'sample_id'.
# Each inner merge keeps only the samples present in both sides, and the groups
# are joined in the same order as before: API, REG, FILE, DIR, STR, NET, SYS, DROP, SIG.
X_test_all = X_reduced_test_api
for group_frame in (
    X_reduced_test_reg,
    X_reduced_test_file,
    X_reduced_test_dir,
    X_reduced_test_str,
    X_reduced_test_net,
    X_reduced_test_sys,
    X_reduced_test_drop,
    X_reduced_test_sig,
):
    X_test_all = X_test_all.merge(group_frame, on='sample_id', how='inner')

print(f"Merged test set shape: {X_test_all.shape}")
display(X_test_all.head())

Merged test set shape: (977, 1062252)


Unnamed: 0,sample_id,1,2,3,4,5,7,8,9,10,...,6468255,6468256,6468258,6468260,6468261,6468264,6468266,6468267,6468269,6468270
0,11027,0,0,1,1,1,1,1,1,0,...,0,1,0,0,0,0,0,1,0,0
1,11030,0,0,1,1,1,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,11033,0,0,1,1,1,1,1,1,0,...,0,1,0,0,0,0,0,1,0,0
3,11035,0,0,1,1,1,1,1,1,0,...,0,1,0,0,0,0,0,1,0,0
4,11036,0,0,1,1,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0


In [32]:
# Detect feature names that collided during the merges. pandas appends '_x' / '_y'
# to the END of overlapping column names, so test how each name ends instead of
# flagging any name that merely contains those characters. str() keeps the check
# working for non-string column labels.
merge_suffixes = ('_x', '_y')
columns_with_suffixes = [col for col in X_test_all.columns if str(col).endswith(merge_suffixes)]

# Print the result
if columns_with_suffixes:
    print(f"Columns with suffixes (_x, _y, etc.): {columns_with_suffixes}")
else:
    print("No columns with suffixes (_x, _y, etc.) in their names.")


No columns with suffixes (_x, _y, etc.) in their names.


In [33]:
# Save the merged test set as a snappy-compressed Parquet file
X_test_all.to_parquet('FS_MLRan_Datasets/MLRan_X_test_Chi.parquet', compression='snappy')

In [34]:
# Also save the merged test set as CSV (to_csv writes the row index as the
# first, unnamed column by default)
X_test_all.to_csv("FS_MLRan_Datasets/MLRan_X_test_Chi.csv")