In [None]:
import sys
# Optional cell: appends a hard-coded absolute package directory to the import
# path. You may not need to run this cell; if the imports in the next cell
# already resolve, skip it and just run the next cell.
sys.path.append('/mnt/storage/faithfulco/python_packages')

In [None]:
import sys
import pandas as pd
import numpy as np
from sklearn.feature_selection import mutual_info_classif



## FUNCTIONS

### Data-Loading Functions

In [3]:
def load_parquet_data(parquet_path, engine='fastparquet'):
    """
    Reads a Parquet file and returns a DataFrame.

    The file is first read with `engine` (default 'fastparquet'). If that
    read fails and `engine` is not already 'pyarrow', the read is retried
    once with 'pyarrow'.

    Parameters:
    -----------
    parquet_path : str
        Path to the Parquet file.
    engine : str
        Engine passed to `pd.read_parquet` for the first attempt.

    Returns:
    --------
    df : pd.DataFrame
        The contents of the Parquet file.

    Raises:
    -------
    FileNotFoundError
        Re-raised immediately: switching engines cannot fix a missing path.
    Exception
        Whatever the last attempted engine raised.
    """
    fallback_engine = 'pyarrow'
    try:
        return pd.read_parquet(parquet_path, engine=engine)
    except FileNotFoundError:
        raise
    except Exception as e:
        if engine == fallback_engine:
            # Retrying with the very same engine would only repeat the failure.
            raise
        print(f"Reading with '{engine}' failed. Error: {e}")
        print("Retrying with 'pyarrow'...")
    # Fallback runs outside the except block so that a failure here is reported
    # on its own rather than as "during handling of another exception".
    return pd.read_parquet(parquet_path, engine=fallback_engine)


def load_csv_data(csv_path):
    """
    Loads a CSV file into a DataFrame.

    Thin wrapper around `pd.read_csv` using pandas' default parsing options.
    """
    return pd.read_csv(csv_path)


### Data Preparation Function

In [4]:
def prepare_features_and_target(df_features, df_labels, label_col="sample_type", drop_col="sample_id"):
    """
    Splits the inputs into a feature matrix X and a target vector y.

    Parameters:
    -----------
    df_features : pd.DataFrame
        Feature table. If its index has a name, the index is reset IN PLACE,
        so the caller's frame gains that index as a regular column and gets
        a default RangeIndex.
    df_labels : pd.DataFrame
        Label table; only `label_col` is read from it. No alignment with
        `df_features` is performed here, so rows must already be in the
        same order.
    label_col : str
        Column of df_labels holding the target variable.
    drop_col : str
        Non-feature column (e.g. 'sample_id') removed from df_features
        to form X.

    Returns:
    --------
    X : pd.DataFrame
        df_features without `drop_col`.
    y : pd.Series
        df_labels[label_col].
    """
    index_is_named = df_features.index.name is not None
    if index_is_named:
        # Deliberately in place: the named index becomes a column that the
        # caller can still read from its own reference to the frame.
        df_features.reset_index(inplace=True)

    target = df_labels[label_col]
    features = df_features.drop(columns=[drop_col])
    return features, target


### Mutual Information Computation and Sorting

In [5]:
def compute_mutual_information(X, y, discrete=True):
    """
    Scores every feature in X by its mutual information (MI) with y.

    Parameters:
    -----------
    X : pd.DataFrame
        Feature matrix; its column labels become the 'Feature' values.
    y : pd.Series or np.array
        Target vector.
    discrete : bool
        Forwarded to `mutual_info_classif` as `discrete_features`.

    Returns:
    --------
    mi_df : pd.DataFrame
        Columns ['Feature', 'Mutual Information'], ordered from highest to
        lowest MI, with a fresh 0..n-1 index.
    """
    scores = mutual_info_classif(X, y, discrete_features=discrete)

    ranking = pd.DataFrame({
        "Feature": X.columns,
        "Mutual Information": scores
    })

    # Highest-MI features first, then renumber the rows.
    ranking = ranking.sort_values(by="Mutual Information", ascending=False)
    return ranking.reset_index(drop=True)


### Filtering by Mutual Information Threshold

In [6]:
def filter_low_mi_features(mi_df, threshold=0.01):
    """
    Lists the features whose mutual information is strictly below a cutoff.

    Parameters:
    -----------
    mi_df : pd.DataFrame
        DataFrame with columns ['Feature', 'Mutual Information'].
    threshold : float
        Cutoff; a feature is listed when its MI < threshold.

    Returns:
    --------
    low_mi_features : list
        Names of the features below the threshold, in mi_df row order.
    """
    below_cutoff = mi_df["Mutual Information"] < threshold
    return mi_df.loc[below_cutoff, "Feature"].tolist()


### Putting It All Together in a Workflow Function

In [7]:
def select_features_by_mi(
    parquet_path,
    labels_csv_path,
    train_sampleids_path,
    test_sampleids_path,
    label_col="sample_type",
    drop_col="sample_id",
    threshold=0.01
):
    """
    Runs mutual-information (MI) feature selection on the training split and
    applies the same column selection to the test split.

    Steps:
      1) Load the feature table from a Parquet file,
      2) Load the labels and the train/test sample-id lists from CSV files,
      3) Split the feature rows into train and test by sample id,
      4) Align the training labels to the training rows by sample id,
      5) Compute MI scores on the training split only,
      6) Drop features whose MI is below `threshold`,
      7) Select the surviving columns from the test split.

    Sample ids are expected in the index of the feature table; if the table
    instead has a `drop_col` column, that column is promoted to the index.
    The labels CSV and both sample-id CSVs must contain a 'sample_id' column.

    Parameters:
    -----------
    parquet_path : str
        Path to the Parquet file with features.
    labels_csv_path : str
        Path to the CSV file with labels.
    train_sampleids_path : str
        Path to the CSV file listing the training sample ids.
    test_sampleids_path : str
        Path to the CSV file listing the test sample ids.
    label_col : str
        The name of the target column in the labels DataFrame (default "sample_type").
    drop_col : str
        Name of the sample-id column of the feature table; it is excluded
        from MI scoring and re-attached as the first output column
        (default "sample_id").
    threshold : float
        MI threshold below which features will be dropped (default 0.01).

    Returns:
    --------
    X_reduced_train : pd.DataFrame
        Training rows: `drop_col` first, then the features with MI >= threshold.
    X_reduced_test : pd.DataFrame
        Test rows with the same columns as X_reduced_train.
    y : pd.Series
        Training labels, row-aligned with X_reduced_train (default 0..n-1 index).
    mi_df : pd.DataFrame
        The MI DataFrame (all features and their MI scores).

    Raises:
    -------
    ValueError
        If a training sample has no label, or the labels CSV lists a
        training sample id more than once.
    """
    # Name of the sample-id column in the labels CSV and in both id-list CSVs.
    id_col = "sample_id"

    # 1) Load features from Parquet
    df_features = load_parquet_data(parquet_path)
    if drop_col in df_features.columns:
        # Row selection below matches on the index, so the ids must live there.
        df_features = df_features.set_index(drop_col)

    # 2) Load labels and the train/test id lists from CSV
    df_labels = load_csv_data(labels_csv_path)
    train_ids = load_csv_data(train_sampleids_path)[id_col]
    test_ids = load_csv_data(test_sampleids_path)[id_col]

    # 3) Split the feature rows by sample id
    df_features_train = df_features[df_features.index.isin(train_ids)]
    df_features_test = df_features[df_features.index.isin(test_ids)]

    # Captured before step 5: prepare_features_and_target resets the index of
    # the frame it receives, and the ids are needed again afterwards.
    train_ids_in_order = df_features_train.index

    # 4) Put the training labels in exactly the row order of the features;
    #    MI is computed positionally, so a different order would silently
    #    pair samples with the wrong labels.
    df_labels_train = _align_labels_to_ids(df_labels, train_ids_in_order, id_col)

    # 5) Prepare X, y and compute mutual information (MI) on the training split
    X, y = prepare_features_and_target(df_features_train, df_labels_train, label_col, drop_col)
    mi_df = compute_mutual_information(X, y, discrete=True)

    # 6) Drop low-MI features
    low_mi_feats = filter_low_mi_features(mi_df, threshold=threshold)
    print(f"Number of features with MI < {threshold}: {len(low_mi_feats)}")

    X_reduced_train = X.drop(columns=low_mi_feats)
    selected_features = list(X_reduced_train.columns)

    # Re-attach the sample ids as the first column.
    X_reduced_train.insert(0, drop_col, train_ids_in_order.to_numpy())
    print(f"Reduced training feature set shape: {X_reduced_train.shape}")

    # 7) Same feature columns for the test split; reset_index turns the
    #    sample-id index into the leading column.
    X_reduced_test = df_features_test[selected_features].reset_index()
    print(f"Reduced test feature set shape: {X_reduced_test.shape}")

    return X_reduced_train, X_reduced_test, y, mi_df


def _align_labels_to_ids(df_labels, sample_ids, id_col):
    """Returns the rows of df_labels for `sample_ids` (a pd.Index), in exactly that order."""
    labels_by_id = df_labels[df_labels[id_col].isin(sample_ids)].set_index(id_col)

    if labels_by_id.index.has_duplicates:
        raise ValueError(
            f"Labels contain duplicate '{id_col}' values for the requested samples."
        )

    missing = sample_ids.difference(labels_by_id.index)
    if len(missing) > 0:
        raise ValueError(
            f"{len(missing)} sample(s) have no label, e.g. {list(missing[:5])}"
        )

    return labels_by_id.loc[sample_ids.to_numpy()].reset_index()


## 1. API

In [8]:
# Inputs shared by every dataset section below: labels and the train/test id lists.
labels_csv_path = '/mnt/storage/faithfulco/5_mlran_dataset/MLRan_labels.csv'
train_sampleids_path = 'FS_MLRan_Datasets/type_train_sampleids.csv'
test_sampleids_path = 'FS_MLRan_Datasets/type_test_sampleids.csv'

# 1) API dataset, MI cutoff 0.01.
# NOTE(review): the MI table is stored as `chi_df_api`, unlike the `mi_df_*`
# names used for the other datasets; the name is kept so existing references
# keep working.
api_parquet_path = '/mnt/storage/faithfulco/5_mlran_dataset/1_api_dataset.parquet'
(
    X_reduced_train_api,
    X_reduced_test_api,
    y_api,
    chi_df_api,
) = select_features_by_mi(
    api_parquet_path,
    labels_csv_path,
    train_sampleids_path,
    test_sampleids_path,
    label_col="sample_type",
    drop_col="sample_id",
    threshold=0.01,  # features with mutual information < 0.01 are dropped
)


Number of features with MI < 0.01: 176
Reduced training feature set shape: (3907, 138)
Reduced test feature set shape: (977, 138)


In [9]:
# Preview the first rows of the reduced API training matrix.
X_reduced_train_api.head()

Unnamed: 0,sample_id,1,2,4,5,8,10,16,18,19,...,229,231,232,233,234,236,238,246,252,265
0,10001,0,0,1,1,1,1,0,1,1,...,1,0,0,0,0,0,0,0,0,0
1,10002,0,0,0,1,1,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,10003,0,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,10004,0,0,1,1,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,1
4,10005,0,0,1,1,1,1,0,1,1,...,0,0,0,0,0,0,0,0,0,1


In [10]:
# Preview the first rows of the reduced API test matrix.
X_reduced_test_api.head()

Unnamed: 0,sample_id,1,2,4,5,8,10,16,18,19,...,229,231,232,233,234,236,238,246,252,265
0,11027,0,0,1,1,1,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,11030,0,0,1,1,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
2,11033,0,0,1,1,1,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
3,11035,0,0,1,1,1,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
4,11036,0,0,1,1,1,0,1,1,1,...,0,0,0,0,0,0,1,0,0,0


In [51]:
# 1) API dataset again, this time with the smaller MI cutoff 0.001.
# NOTE(review): `chi_df_api2` holds MI scores despite its name; the name is
# kept so existing references keep working.
api_parquet_path = '/mnt/storage/faithfulco/5_mlran_dataset/1_api_dataset.parquet'
(
    X_reduced_train_api2,
    X_reduced_test_api2,
    y_api2,
    chi_df_api2,
) = select_features_by_mi(
    api_parquet_path,
    labels_csv_path,
    train_sampleids_path,
    test_sampleids_path,
    label_col="sample_type",
    drop_col="sample_id",
    threshold=0.001,  # features with mutual information < 0.001 are dropped
)

Number of features with MI < 0.001: 73
Reduced training feature set shape: (3907, 241)
Reduced test feature set shape: (977, 241)


## 2. REG

In [11]:
# 2) REG dataset, MI cutoff 0.01 (features with MI < 0.01 are dropped).
reg_parquet_path = '/mnt/storage/faithfulco/5_mlran_dataset/2_reg_dataset.parquet'
(
    X_reduced_train_reg,
    X_reduced_test_reg,
    y_reg,
    mi_df_reg,
) = select_features_by_mi(
    reg_parquet_path,
    labels_csv_path,
    train_sampleids_path,
    test_sampleids_path,
    label_col="sample_type",
    drop_col="sample_id",
    threshold=0.01,
)

Number of features with MI < 0.01: 520136
Reduced training feature set shape: (3907, 5370)
Reduced test feature set shape: (977, 5370)


In [27]:
# Preview the first rows of the reduced REG training matrix.
X_reduced_train_reg.head()

Unnamed: 0,sample_id,1389,1390,1391,1402,1405,1406,5346,5347,6693,...,433978,434276,502902,502903,502904,502905,503230,503499,522066,525482
0,10001,0,0,0,0,0,0,1,1,0,...,1,0,0,0,0,0,0,0,0,0
1,10002,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,10003,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,10004,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,10005,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [60]:
# 2) REG dataset, MI cutoff 0.001 (features with MI < 0.001 are dropped).
reg_parquet_path = '/mnt/storage/faithfulco/5_mlran_dataset/2_reg_dataset.parquet'
(
    X_reduced_train_reg2,
    X_reduced_test_reg2,
    y_reg2,
    mi_df_reg2,
) = select_features_by_mi(
    reg_parquet_path,
    labels_csv_path,
    train_sampleids_path,
    test_sampleids_path,
    label_col="sample_type",
    drop_col="sample_id",
    threshold=0.001,
)

Number of features with MI < 0.001: 503421
Reduced training feature set shape: (3907, 22085)
Reduced test feature set shape: (977, 22085)


## 3. FILE

In [12]:
# 3) FILE dataset, MI cutoff 0.01 (features with MI < 0.01 are dropped).
file_parquet_path = '/mnt/storage/faithfulco/5_mlran_dataset/3_file_dataset.parquet'
(
    X_reduced_train_file,
    X_reduced_test_file,
    y_file,
    mi_df_file,
) = select_features_by_mi(
    file_parquet_path,
    labels_csv_path,
    train_sampleids_path,
    test_sampleids_path,
    label_col="sample_type",
    drop_col="sample_id",
    threshold=0.01,
)

Number of features with MI < 0.01: 2064243
Reduced training feature set shape: (3907, 14045)
Reduced test feature set shape: (977, 14045)


In [32]:
# Preview the first rows of the reduced FILE training matrix.
X_reduced_train_file.head()

Unnamed: 0,sample_id,869429,869430,869431,869432,872835,872848,872850,872852,872943,...,2525874,2525885,2525887,2525900,2525940,2525944,2525946,2525949,2581123,2592955
0,10001,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,10002,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,10003,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,10004,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,10005,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [53]:
# 3) FILE dataset, MI cutoff 0.001 (features with MI < 0.001 are dropped).
file_parquet_path = '/mnt/storage/faithfulco/5_mlran_dataset/3_file_dataset.parquet'
(
    X_reduced_train_file2,
    X_reduced_test_file2,
    y_file2,
    mi_df_file2,
) = select_features_by_mi(
    file_parquet_path,
    labels_csv_path,
    train_sampleids_path,
    test_sampleids_path,
    label_col="sample_type",
    drop_col="sample_id",
    threshold=0.001,
)

Number of features with MI < 0.001: 2040045
Reduced training feature set shape: (3907, 38243)
Reduced test feature set shape: (977, 38243)


## 4. DIR

In [13]:
# 4) DIR dataset, MI cutoff 0.01 (features with MI < 0.01 are dropped).
dir_parquet_path = '/mnt/storage/faithfulco/5_mlran_dataset/4_dir_dataset.parquet'
(
    X_reduced_train_dir,
    X_reduced_test_dir,
    y_dir,
    mi_df_dir,
) = select_features_by_mi(
    dir_parquet_path,
    labels_csv_path,
    train_sampleids_path,
    test_sampleids_path,
    label_col="sample_type",
    drop_col="sample_id",
    threshold=0.01,
)

Number of features with MI < 0.01: 157303
Reduced training feature set shape: (3907, 821)
Reduced test feature set shape: (977, 821)


In [54]:
# 4) DIR dataset, MI cutoff 0.001 (features with MI < 0.001 are dropped).
dir_parquet_path = '/mnt/storage/faithfulco/5_mlran_dataset/4_dir_dataset.parquet'
(
    X_reduced_train_dir2,
    X_reduced_test_dir2,
    y_dir2,
    mi_df_dir2,
) = select_features_by_mi(
    dir_parquet_path,
    labels_csv_path,
    train_sampleids_path,
    test_sampleids_path,
    label_col="sample_type",
    drop_col="sample_id",
    threshold=0.001,
)

Number of features with MI < 0.001: 139077
Reduced training feature set shape: (3907, 19047)
Reduced test feature set shape: (977, 19047)


## 5. STR

In [14]:
# 5) STR dataset, MI cutoff 0.01 (features with MI < 0.01 are dropped).
str_parquet_path = '/mnt/storage/faithfulco/5_mlran_dataset/5_str_dataset.parquet'
(
    X_reduced_train_str,
    X_reduced_test_str,
    y_str,
    mi_df_str,
) = select_features_by_mi(
    str_parquet_path,
    labels_csv_path,
    train_sampleids_path,
    test_sampleids_path,
    label_col="sample_type",
    drop_col="sample_id",
    threshold=0.01,
)

Number of features with MI < 0.01: 3628772
Reduced training feature set shape: (3907, 3348)
Reduced test feature set shape: (977, 3348)


In [55]:
# 5) STR dataset, MI cutoff 0.001 (features with MI < 0.001 are dropped).
str_parquet_path = '/mnt/storage/faithfulco/5_mlran_dataset/5_str_dataset.parquet'
(
    X_reduced_train_str2,
    X_reduced_test_str2,
    y_str2,
    mi_df_str2,
) = select_features_by_mi(
    str_parquet_path,
    labels_csv_path,
    train_sampleids_path,
    test_sampleids_path,
    label_col="sample_type",
    drop_col="sample_id",
    threshold=0.001,
)

Number of features with MI < 0.001: 3586844
Reduced training feature set shape: (3907, 45276)
Reduced test feature set shape: (977, 45276)


## 6. NET

In [15]:
# 6) NET dataset, MI cutoff 0.01 (features with MI < 0.01 are dropped).
net_parquet_path = '/mnt/storage/faithfulco/5_mlran_dataset/6_net_dataset.parquet'
(
    X_reduced_train_net,
    X_reduced_test_net,
    y_net,
    mi_df_net,
) = select_features_by_mi(
    net_parquet_path,
    labels_csv_path,
    train_sampleids_path,
    test_sampleids_path,
    label_col="sample_type",
    drop_col="sample_id",
    threshold=0.01,
)

Number of features with MI < 0.01: 4811
Reduced training feature set shape: (3907, 4)
Reduced test feature set shape: (977, 4)


In [56]:
# 6) NET dataset, MI cutoff 0.001 (features with MI < 0.001 are dropped).
net_parquet_path = '/mnt/storage/faithfulco/5_mlran_dataset/6_net_dataset.parquet'
(
    X_reduced_train_net2,
    X_reduced_test_net2,
    y_net2,
    mi_df_net2,
) = select_features_by_mi(
    net_parquet_path,
    labels_csv_path,
    train_sampleids_path,
    test_sampleids_path,
    label_col="sample_type",
    drop_col="sample_id",
    threshold=0.001,
)

Number of features with MI < 0.001: 4740
Reduced training feature set shape: (3907, 75)
Reduced test feature set shape: (977, 75)


## 7. SYS

In [16]:
# 7) SYS dataset, MI cutoff 0.01 (features with MI < 0.01 are dropped).
sys_parquet_path = '/mnt/storage/faithfulco/5_mlran_dataset/7_sys_dataset.parquet'
(
    X_reduced_train_sys,
    X_reduced_test_sys,
    y_sys,
    mi_df_sys,
) = select_features_by_mi(
    sys_parquet_path,
    labels_csv_path,
    train_sampleids_path,
    test_sampleids_path,
    label_col="sample_type",
    drop_col="sample_id",
    threshold=0.01,
)

Number of features with MI < 0.01: 16590
Reduced training feature set shape: (3907, 322)
Reduced test feature set shape: (977, 322)


In [57]:
# 7) SYS dataset, MI cutoff 0.001 (features with MI < 0.001 are dropped).
sys_parquet_path = '/mnt/storage/faithfulco/5_mlran_dataset/7_sys_dataset.parquet'
(
    X_reduced_train_sys2,
    X_reduced_test_sys2,
    y_sys2,
    mi_df_sys2,
) = select_features_by_mi(
    sys_parquet_path,
    labels_csv_path,
    train_sampleids_path,
    test_sampleids_path,
    label_col="sample_type",
    drop_col="sample_id",
    threshold=0.001,
)

Number of features with MI < 0.001: 16012
Reduced training feature set shape: (3907, 900)
Reduced test feature set shape: (977, 900)


## 8. DROP

In [17]:
# 8) DROP dataset, MI cutoff 0.01 (features with MI < 0.01 are dropped).
drop_parquet_path = '/mnt/storage/faithfulco/5_mlran_dataset/8_drop_dataset.parquet'
(
    X_reduced_train_drop,
    X_reduced_test_drop,
    y_drop,
    mi_df_drop,
) = select_features_by_mi(
    drop_parquet_path,
    labels_csv_path,
    train_sampleids_path,
    test_sampleids_path,
    label_col="sample_type",
    drop_col="sample_id",
    threshold=0.01,
)

Number of features with MI < 0.01: 51576
Reduced training feature set shape: (3907, 76)
Reduced test feature set shape: (977, 76)


In [58]:
# 8) DROP dataset, MI cutoff 0.001 (features with MI < 0.001 are dropped).
drop_parquet_path = '/mnt/storage/faithfulco/5_mlran_dataset/8_drop_dataset.parquet'
(
    X_reduced_train_drop2,
    X_reduced_test_drop2,
    y_drop2,
    mi_df_drop2,
) = select_features_by_mi(
    drop_parquet_path,
    labels_csv_path,
    train_sampleids_path,
    test_sampleids_path,
    label_col="sample_type",
    drop_col="sample_id",
    threshold=0.001,
)

Number of features with MI < 0.001: 51009
Reduced training feature set shape: (3907, 643)
Reduced test feature set shape: (977, 643)


## 9. SIG

In [18]:
# 9) SIG dataset, MI cutoff 0.01 (features with MI < 0.01 are dropped).
sig_parquet_path = '/mnt/storage/faithfulco/5_mlran_dataset/9_sig_dataset.parquet'
(
    X_reduced_train_sig,
    X_reduced_test_sig,
    y_sig,
    mi_df_sig,
) = select_features_by_mi(
    sig_parquet_path,
    labels_csv_path,
    train_sampleids_path,
    test_sampleids_path,
    label_col="sample_type",
    drop_col="sample_id",
    threshold=0.01,
)

Number of features with MI < 0.01: 157
Reduced training feature set shape: (3907, 47)
Reduced test feature set shape: (977, 47)


In [59]:
# 9) SIG dataset, MI cutoff 0.001 (features with MI < 0.001 are dropped).
sig_parquet_path = '/mnt/storage/faithfulco/5_mlran_dataset/9_sig_dataset.parquet'
(
    X_reduced_train_sig2,
    X_reduced_test_sig2,
    y_sig2,
    mi_df_sig2,
) = select_features_by_mi(
    sig_parquet_path,
    labels_csv_path,
    train_sampleids_path,
    test_sampleids_path,
    label_col="sample_type",
    drop_col="sample_id",
    threshold=0.001,
)

Number of features with MI < 0.001: 77
Reduced training feature set shape: (3907, 127)
Reduced test feature set shape: (977, 127)


## Merging the per-dataset DataFrames

In [43]:
# Sanity check: load the training sample-id list and show its (rows, columns)
# so it can be compared with the row count of the merged training set.
df_tr = pd.read_csv('FS_MLRan_Datasets/type_train_sampleids.csv')
df_tr.shape

(3907, 1)

In [44]:
# Sanity check: load the test sample-id list and show its (rows, columns)
# so it can be compared with the row count of the merged test set.
df_ts = pd.read_csv('FS_MLRan_Datasets/type_test_sampleids.csv')
df_ts.shape

(977, 1)

In [39]:
# Inner-join the nine reduced training matrices (threshold 0.01) on 'sample_id',
# in dataset order: API, REG, FILE, DIR, STR, NET, SYS, DROP, SIG.
X_train_all = (
    X_reduced_train_api
    .merge(X_reduced_train_reg, on='sample_id', how='inner')
    .merge(X_reduced_train_file, on='sample_id', how='inner')
    .merge(X_reduced_train_dir, on='sample_id', how='inner')
    .merge(X_reduced_train_str, on='sample_id', how='inner')
    .merge(X_reduced_train_net, on='sample_id', how='inner')
    .merge(X_reduced_train_sys, on='sample_id', how='inner')
    .merge(X_reduced_train_drop, on='sample_id', how='inner')
    .merge(X_reduced_train_sig, on='sample_id', how='inner')
)
print(f"Merged training set shape: {X_train_all.shape}")
display(X_train_all.head())

Merged training set shape: (3907, 24163)


Unnamed: 0,sample_id,1,2,4,5,8,10,16,18,19,...,6468225,6468226,6468229,6468230,6468242,6468252,6468253,6468256,6468258,6468260
0,10001,0,0,1,1,1,1,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,10002,0,0,0,1,1,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,10003,0,0,1,0,0,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0
3,10004,0,0,1,1,0,0,1,1,1,...,0,0,0,0,1,0,0,0,0,0
4,10005,0,0,1,1,1,1,0,1,1,...,0,0,0,0,0,0,0,1,0,1


In [40]:
# pandas appends '_x' / '_y' to column names that collide during a merge.
# Look for names that END with one of those suffixes (a substring match would
# also flag names that merely contain '_x' or '_y' somewhere); str() keeps the
# check working for non-string column labels.
columns_with_suffixes = [
    col for col in X_train_all.columns
    if str(col).endswith(('_x', '_y'))
]

# Print the result
if columns_with_suffixes:
    print(f"Columns with suffixes (_x, _y, etc.): {columns_with_suffixes}")
else:
    print("No columns with suffixes (_x, _y, etc.) in their names.")


No columns with suffixes (_x, _y, etc.) in their names.


In [45]:
# Persist the merged training matrix (built from the threshold-0.01 selections)
# as a snappy-compressed Parquet file.
X_train_all.to_parquet('FS_MLRan_Datasets/MLRan_X_train_MI_01.parquet', compression='snappy')

In [46]:
# Also export the merged training matrix (threshold-0.01 selections) as CSV,
# without writing the DataFrame index.
X_train_all.to_csv("FS_MLRan_Datasets/MLRan_X_train_MI_01.csv", index=False)

In [47]:
# Inner-join the nine reduced test matrices (threshold 0.01) on 'sample_id',
# in dataset order: API, REG, FILE, DIR, STR, NET, SYS, DROP, SIG.
X_test_all = (
    X_reduced_test_api
    .merge(X_reduced_test_reg, on='sample_id', how='inner')
    .merge(X_reduced_test_file, on='sample_id', how='inner')
    .merge(X_reduced_test_dir, on='sample_id', how='inner')
    .merge(X_reduced_test_str, on='sample_id', how='inner')
    .merge(X_reduced_test_net, on='sample_id', how='inner')
    .merge(X_reduced_test_sys, on='sample_id', how='inner')
    .merge(X_reduced_test_drop, on='sample_id', how='inner')
    .merge(X_reduced_test_sig, on='sample_id', how='inner')
)

print(f"Merged test set shape: {X_test_all.shape}")
display(X_test_all.head())

Merged test set shape: (977, 24163)


Unnamed: 0,sample_id,1,2,4,5,8,10,16,18,19,...,6468225,6468226,6468229,6468230,6468242,6468252,6468253,6468256,6468258,6468260
0,11027,0,0,1,1,1,0,1,1,1,...,0,0,0,0,0,0,0,1,0,0
1,11030,0,0,1,1,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
2,11033,0,0,1,1,1,0,1,1,1,...,0,0,0,0,0,0,0,1,0,0
3,11035,0,0,1,1,1,0,0,1,1,...,0,0,0,0,0,0,0,1,0,0
4,11036,0,0,1,1,1,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0


In [48]:
# pandas appends '_x' / '_y' to column names that collide during a merge.
# Look for names that END with one of those suffixes (a substring match would
# also flag names that merely contain '_x' or '_y' somewhere); str() keeps the
# check working for non-string column labels.
columns_with_suffixes = [
    col for col in X_test_all.columns
    if str(col).endswith(('_x', '_y'))
]

# Print the result
if columns_with_suffixes:
    print(f"Columns with suffixes (_x, _y, etc.): {columns_with_suffixes}")
else:
    print("No columns with suffixes (_x, _y, etc.) in their names.")


No columns with suffixes (_x, _y, etc.) in their names.


In [49]:
# Persist the merged test matrix (built from the threshold-0.01 selections)
# as a snappy-compressed Parquet file.
X_test_all.to_parquet('FS_MLRan_Datasets/MLRan_X_test_MI_01.parquet', compression='snappy')

In [50]:
# Also export the merged test matrix (threshold-0.01 selections) as CSV,
# without writing the DataFrame index.
X_test_all.to_csv("FS_MLRan_Datasets/MLRan_X_test_MI_01.csv", index=False)

In [61]:
# Inner-join the nine reduced training matrices (threshold 0.001) on 'sample_id',
# in dataset order: API, REG, FILE, DIR, STR, NET, SYS, DROP, SIG.
X_train_all2 = (
    X_reduced_train_api2
    .merge(X_reduced_train_reg2, on='sample_id', how='inner')
    .merge(X_reduced_train_file2, on='sample_id', how='inner')
    .merge(X_reduced_train_dir2, on='sample_id', how='inner')
    .merge(X_reduced_train_str2, on='sample_id', how='inner')
    .merge(X_reduced_train_net2, on='sample_id', how='inner')
    .merge(X_reduced_train_sys2, on='sample_id', how='inner')
    .merge(X_reduced_train_drop2, on='sample_id', how='inner')
    .merge(X_reduced_train_sig2, on='sample_id', how='inner')
)
print(f"Merged training set shape: {X_train_all2.shape}")
display(X_train_all2.head())

Merged training set shape: (3907, 126629)


Unnamed: 0,sample_id,1,2,3,4,5,6,7,8,9,...,6468255,6468256,6468258,6468260,6468261,6468264,6468266,6468267,6468269,6468270
0,10001,0,0,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,1,0,0
1,10002,0,0,1,0,1,0,1,1,0,...,1,0,0,0,0,0,0,0,0,0
2,10003,0,0,1,1,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
3,10004,0,0,1,1,1,1,1,0,1,...,0,0,0,0,0,0,0,0,1,0
4,10005,0,0,1,1,1,1,1,1,1,...,0,1,0,1,0,0,0,0,0,0


In [62]:
# Persist the merged training matrix (built from the threshold-0.001 selections)
# as a snappy-compressed Parquet file.
X_train_all2.to_parquet('FS_MLRan_Datasets/MLRan_X_train_MI_001.parquet', compression='snappy')

In [63]:
# Also export the merged training matrix (threshold-0.001 selections) as CSV,
# without writing the DataFrame index.
X_train_all2.to_csv("FS_MLRan_Datasets/MLRan_X_train_MI_001.csv", index=False)

In [64]:
# Inner-join the nine reduced test matrices (threshold 0.001) on 'sample_id',
# in dataset order: API, REG, FILE, DIR, STR, NET, SYS, DROP, SIG.
X_test_all2 = (
    X_reduced_test_api2
    .merge(X_reduced_test_reg2, on='sample_id', how='inner')
    .merge(X_reduced_test_file2, on='sample_id', how='inner')
    .merge(X_reduced_test_dir2, on='sample_id', how='inner')
    .merge(X_reduced_test_str2, on='sample_id', how='inner')
    .merge(X_reduced_test_net2, on='sample_id', how='inner')
    .merge(X_reduced_test_sys2, on='sample_id', how='inner')
    .merge(X_reduced_test_drop2, on='sample_id', how='inner')
    .merge(X_reduced_test_sig2, on='sample_id', how='inner')
)

print(f"Merged test set shape: {X_test_all2.shape}")
display(X_test_all2.head())

Merged test set shape: (977, 126629)


Unnamed: 0,sample_id,1,2,3,4,5,6,7,8,9,...,6468255,6468256,6468258,6468260,6468261,6468264,6468266,6468267,6468269,6468270
0,11027,0,0,1,1,1,1,1,1,1,...,0,1,0,0,0,0,0,1,0,0
1,11030,0,0,1,1,1,1,1,0,1,...,0,0,0,0,0,0,0,0,0,0
2,11033,0,0,1,1,1,1,1,1,1,...,0,1,0,0,0,0,0,1,0,0
3,11035,0,0,1,1,1,1,1,1,1,...,0,1,0,0,0,0,0,1,0,0
4,11036,0,0,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0


In [65]:
# Persist the merged test matrix (built from the threshold-0.001 selections)
# as a snappy-compressed Parquet file.
X_test_all2.to_parquet('FS_MLRan_Datasets/MLRan_X_test_MI_001.parquet', compression='snappy')

In [66]:
# Also export the merged test matrix (threshold-0.001 selections) as CSV,
# without writing the DataFrame index.
X_test_all2.to_csv("FS_MLRan_Datasets/MLRan_X_test_MI_001.csv", index=False)