In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/acf-fd-table/FD_table.csv
/kaggle/input/ped-data/data_with_ratios.csv
/kaggle/input/acf-table-x/ACF_table.csv
/kaggle/input/new-dataset/classification_FD.csv
/kaggle/input/pcom-jcom/combined_output2.csv
/kaggle/input/be-data/classification_FD_nonphysical.csv


**Attribute selection**

  f = msb / msw


Selected Feature Names: ['Jcom_pre5', 'Jcom_ratio5', 'Jcom_ratio9']
Cross-Validation Score: 0.5900


In [58]:
import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.utils import shuffle
from sklearn.ensemble import RandomForestClassifier

def load_data(filepath):
    """Load data and filter columns starting with 'pcom' or 'Jcom'."""
    data = pd.read_csv(filepath)
    # Keep only relevant columns
    selected_columns = [col for col in data.columns]# if col.startswith(("pcom", "Jcom"))]
    X = data[selected_columns]  # Select only matching columns
    X = X.drop(['Sub_code','resp'], axis=1)
    y = data['resp'].map({'performer': 1, 'nonperformer': 0}).fillna(0).astype(int)
    return X, y

class WekaStyleSelector:
    def __init__(self, k=3, seed=1, num_folds=10):
        self.k = k
        self.seed = seed
        self.num_folds = num_folds
        self.pipeline = None
        self.feature_names = None

    def fit(self, X, y):
        """Fit the pipeline (feature selection + classifier)."""
        self.feature_names = X.columns.tolist()
        selector = SelectKBest(score_func=mutual_info_classif, k=self.k)
        clf = RandomForestClassifier(random_state=self.seed)
        self.pipeline = Pipeline([('selector', selector), ('clf', clf)])
        self.pipeline.fit(X, y)  # Fit on entire data (for inspection)

    def transform(self, X):
        """Return features selected during fitting."""
        return self.pipeline.named_steps['selector'].transform(X)

    def cross_validate(self, X, y):
        """Perform CV with per-fold feature selection (like Weka's FilteredClassifier)."""
        cv = StratifiedKFold(n_splits=self.num_folds, shuffle=True, random_state=self.seed)
        selector = SelectKBest(score_func=mutual_info_classif, k=self.k)
        pipeline = Pipeline([('selector', selector), ('clf', RandomForestClassifier(random_state=self.seed))])
        scores = cross_val_score(pipeline, X, y, cv=cv, n_jobs=-1)
        return np.mean(scores)

    def get_selected_feature_names(self):
        """Get names of features selected during fitting."""
        mask = self.pipeline.named_steps['selector'].get_support()
        return [self.feature_names[i] for i in np.where(mask)[0]]

# Load and shuffle data (Weka does not auto-shuffle; remove if undesired)
X, y = load_data('/kaggle/input/ped-data/data_with_ratios.csv')
X, y = shuffle(X, y, random_state=1)

# Initialize and run selector
selector = WekaStyleSelector(k=3, seed=1)
selector.fit(X, y)


# Get selected features (from full-dataset fit)
selected_features = selector.get_selected_feature_names()
print("Selected Features (Weka-style):", selected_features)

Selected Features (Weka-style): ['pcom_pre4', 'pcom_pre5', 'Jcom_pre5']


In [53]:
selected_features

['kur_tonic_TS_lh2',
 'skew_phasic_TS_rh5',
 'nor_kur_phasic_TS_rh5',
 'nor_var_phasic_TS_rh4',
 'Jcom_rh2']

**Ranker**

In [59]:
import numpy as np
import pandas as pd
from scipy.stats import entropy
from sklearn.base import BaseEstimator
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.utils.validation import check_X_y

class InfoGainAttributeEval(BaseEstimator):
    """Information Gain attribute evaluator"""
    
    def __init__(self, missing_merge=True, binarize=False):
        self.missing_merge = missing_merge
        self.binarize = binarize
        self.info_gains_ = None
        self.valid_features_ = []
        self.feature_names_ = []
        
    def fit(self, X, y):
        """Build the information gain evaluator"""
        # Add feature names capture
        if isinstance(X, pd.DataFrame):
            self.feature_names_ = X.columns.tolist()
        else:
            self.feature_names_ = [f"Feature_{i}" for i in range(X.shape[1])]
        X, y = check_X_y(X, y, dtype=None, force_all_finite='allow-nan')
        self.valid_features_ = list(range(X.shape[1]))
        self._preprocess_data(X, y)
        return self
        
    def _preprocess_data(self, X, y):
        """Handle discretization/binarization and missing values"""
        self.df = pd.DataFrame(X)
        self.classes_ = pd.Series(y).unique()
        
        # Handle numeric features
        for col in self.df.select_dtypes(include='number'):
            if self.binarize:
                self.df[col] = self._binarize(self.df[col])
            else:
                self.df[col] = self._discretize(self.df[col])
                
        # Store processed data and class labels
        self.df['__class__'] = y
        self._build_contingency_tables()
        
    def _discretize(self, feature):
        """Discretize numeric features using KBinsDiscretizer"""
        discretizer = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='uniform')
        return discretizer.fit_transform(feature.values.reshape(-1, 1)).ravel()
    
    def _binarize(self, feature):
        """Binarize numeric features using median threshold"""
        return (feature > feature.median()).astype(int)
    
    def _build_contingency_tables(self):
        """Build contingency tables for each feature"""
        self.contingency_tables = {}
        
        for col in self.df.columns[:-1]:  # Exclude class column
            # Create contingency table
            cont_table = pd.crosstab(
                self.df[col], 
                self.df['__class__'],
                rownames=[col],
                colnames=['class'],
                dropna=False
            )
            
            # Handle missing values
            if self.missing_merge:
                cont_table = self._distribute_missing(cont_table)
                
            self.contingency_tables[col] = cont_table
            
    def _distribute_missing(self, cont_table):
        """Distribute missing values proportionally"""
        # Calculate missing proportions
        row_missing = cont_table.loc[np.nan] if np.nan in cont_table.index else pd.Series(0, index=cont_table.columns)
        col_missing = cont_table.loc[:, np.nan] if np.nan in cont_table.columns else pd.Series(0, index=cont_table.index)
        
        # Remove missing entries
        cont_table = cont_table.dropna(how='any', axis=0)
        cont_table = cont_table.dropna(how='any', axis=1)
        
        # Calculate distribution proportions
        row_totals = cont_table.sum(axis=1)
        col_totals = cont_table.sum(axis=0)
        total = cont_table.sum().sum()
        
        # Distribute row missing values
        for idx, count in row_missing.items():
            if count > 0 and total > 0:
                proportions = row_totals / total
                cont_table.loc[:, idx] += proportions * count
                
        # Distribute column missing values
        for idx, count in col_missing.items():
            if count > 0 and total > 0:
                proportions = col_totals / total
                cont_table.loc[idx, :] += proportions * count
                
        return cont_table.fillna(0)
    
    def _calculate_entropy(self, cont_table):
        """Calculate entropy for a contingency table"""
        class_counts = cont_table.sum(axis=0)
        total = class_counts.sum()
        class_probs = class_counts / total
        return entropy(class_probs, base=2)
    
    def _calculate_conditional_entropy(self, cont_table):
        """Calculate conditional entropy for a feature"""
        feature_counts = cont_table.sum(axis=1)
        total = feature_counts.sum()
        entropies = []
        
        for _, row in cont_table.iterrows():
            row_total = row.sum()
            if row_total == 0:
                continue
            probs = row / row_total
            ent = entropy(probs, base=2)
            entropies.append((row_total / total) * ent)
            
        return sum(entropies)
    
    def evaluate_attribute(self, attribute_idx):
        """Evaluate information gain for a specific attribute"""
        if self.info_gains_ is None:
            self._compute_info_gains()
            
        if attribute_idx >= len(self.info_gains_):
            raise ValueError(f"Invalid attribute index: {attribute_idx}")
            
        return self.info_gains_[attribute_idx]
    
    def _compute_info_gains(self):
        """Compute information gains for all attributes"""
        self.info_gains_ = []
        
        # Calculate overall entropy
        class_counts = self.df['__class__'].value_counts()
        total = class_counts.sum()
        class_probs = class_counts / total
        H_class = entropy(class_probs, base=2)
        
        for col in self.df.columns[:-1]:
            cont_table = self.contingency_tables[col]
            H_conditional = self._calculate_conditional_entropy(cont_table)
            self.info_gains_.append(H_class - H_conditional)
    
    def get_ranked_features(self):
        """Return features ranked by information gain"""
        if self.info_gains_ is None:
            self._compute_info_gains()
            
        # Create list of (index, gain) pairs
        features = list(range(len(self.info_gains_)))
        ranked = sorted(zip(features, self.info_gains_), 
                        key=lambda x: x[1], reverse=True)
        return ranked

    def print_ranking(self):
        """Print feature ranking with information gain scores"""
        ranked = self.get_ranked_features()
        print("\nFeature Ranking based on Information Gain:")
        print("Rank\tFeature\t\tInformation Gain")
        print("-------------------------------------------")
        for i, (feat_idx, gain) in enumerate(ranked):
            # Use actual feature names
            feat_name = self.feature_names_[feat_idx]
            print(f"{i+1}\t{feat_name}\t\t{gain:.4f}")


    
    
# Initialize and fit evaluator
ig = InfoGainAttributeEval()
X=X[selected_features]
ig.fit(X, y)
    
ranked = ig.print_ranking()


Feature Ranking based on Information Gain:
Rank	Feature		Information Gain
-------------------------------------------
1	pcom_pre5		0.1575
2	pcom_pre4		0.1430
3	Jcom_pre5		0.0727


**Random Forest**

In [60]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_predict, StratifiedKFold
from sklearn.metrics import (accuracy_score, confusion_matrix, classification_report, 
                             cohen_kappa_score, roc_auc_score, precision_recall_curve, auc,
                             matthews_corrcoef)
from sklearn.metrics import mean_absolute_error, mean_squared_error

class WekaStyleRandomForest:
    def __init__(self, num_trees=100, max_depth=None, seed=1):
        """
        Mirrors Weka's RandomForest defaults:
        - numTrees = 100
        - maxDepth = unlimited (None)
        - use entropy for splitting (criterion='entropy')
        - compute out-of-bag (OOB) error (oob_score=True)
        - feature selection: log2(num_features) + 1 (max_features='log2')
        """
        self.classifier = RandomForestClassifier(
            n_estimators=num_trees,
            criterion='entropy',  # Weka uses entropy, not Gini
            max_depth=max_depth,
            max_features='log2',   # Weka's default feature selection
            oob_score=True,        # Enable OOB error (like Weka)
            random_state=seed,
            n_jobs=-1
        )

    def cross_validate(self, X, y, num_folds=10):
        """Return both predicted labels and probabilities."""
        skf = StratifiedKFold(n_splits=num_folds, shuffle=False)
        y_proba = cross_val_predict(
            self.classifier, X, y, cv=skf, method='predict_proba'
        )[:, 1]  # Probabilities of class 1
        y_pred = np.round(y_proba).astype(int)  # Convert to class labels
        return y_pred, y_proba  # Return both

    def get_weka_metrics(self, y_true, y_pred, y_proba):
        """Use probabilities to compute ROC/PRC AUC."""
        tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
        report = classification_report(y_true, y_pred, target_names=['Performer', 'NonPerpormer'], output_dict=True)
        
        # Compute AUC using probabilities
        roc_auc = roc_auc_score(y_true, y_proba)
        precision, recall, _ = precision_recall_curve(y_true, y_proba)
        prc_auc = auc(recall, precision)
        
 # Regression-style metrics
        mae = mean_absolute_error(y_true, y_pred)
        rmse = np.sqrt(mean_squared_error(y_true, y_pred))
        relative_absolute_error = mae / np.mean(np.abs(y_true))
        root_relative_squared_error = rmse / np.sqrt(np.mean(y_true**2))
        
        return {
            "confusion_matrix": [[tn, fp], [fn, tp]],
            "accuracy": accuracy_score(y_true, y_pred),
            "kappa": cohen_kappa_score(y_true, y_pred),
            "roc_auc": roc_auc,
            "prc_auc": prc_auc,
            "mcc": matthews_corrcoef(y_true, y_pred),
            "precision": report['Performer']['precision'],
            "recall": report['Performer']['recall'],
            "f1": report['Performer']['f1-score'],
            "weighted_avg_f1": report['weighted avg']['f1-score'],
            "mae": mae,
            "rmse": rmse,
            "relative_absolute_error": relative_absolute_error,
            "root_relative_squared_error": root_relative_squared_error
        }



# Initialize Weka-style classifier
rf = WekaStyleRandomForest(num_trees=100, max_depth=None, seed=1)

# Cross-validate (no shuffling, 10 folds)
y_pred, y_proba = rf.cross_validate(X, y, num_folds=10)


# Get metrics
metrics = rf.get_weka_metrics(y, y_pred, y_proba)  # Add y_proba if using predict_proba


# Print results (Weka-like format)
print(f"Correctly Classified Instances: {metrics['accuracy'] * len(y)} ({metrics['accuracy'] * 100:.2f}%)")
print(f"Incorrectly Classified Instances: {(1 - metrics['accuracy']) * len(y)} ({(1 - metrics['accuracy']) * 100:.2f}%)")
print(f"Kappa Statistic: {metrics['kappa']:.4f}")
print(f"Matthews Correlation Coefficient: {metrics['mcc']:.4f}")
print(f"ROC AUC: {metrics['roc_auc']:.4f}")
print(f"PRC AUC: {metrics['prc_auc']:.4f}")
print(f"Precision (Performer): {metrics['precision']:.4f}")
print(f"Recall (Performer): {metrics['recall']:.4f}")
print(f"F1-Score (Performer): {metrics['f1']:.4f}")
print(f"Weighted Avg F1-Score: {metrics['weighted_avg_f1']:.4f}")
print(f"Mean Absolute Error: {metrics['mae']:.4f}")
print(f"Root Mean Squared Error: {metrics['rmse']:.4f}")
print(f"Relative Absolute Error: {metrics['relative_absolute_error']:.4f}")
print(f"Root Relative Squared Error: {metrics['root_relative_squared_error']:.4f}")
print("\nConfusion Matrix:")
print(pd.DataFrame(metrics['confusion_matrix'], 
                   index=['Nonperformer (Actual)', 'Performer (Actual)'], 
                   columns=['Nonperformer (Predicted)', 'Performer (Predicted)']))

Correctly Classified Instances: 77.0 (77.00%)
Incorrectly Classified Instances: 23.0 (23.00%)
Kappa Statistic: 0.5344
Matthews Correlation Coefficient: 0.5345
ROC AUC: 0.7841
PRC AUC: 0.7775
Precision (Performer): 0.7333
Recall (Performer): 0.7500
F1-Score (Performer): 0.7416
Weighted Avg F1-Score: 0.7703
Mean Absolute Error: 0.2300
Root Mean Squared Error: 0.4796
Relative Absolute Error: 0.4107
Root Relative Squared Error: 0.6409

Confusion Matrix:
                       Nonperformer (Predicted)  Performer (Predicted)
Nonperformer (Actual)                        33                     11
Performer (Actual)                           12                     44


**Compute Ratios**

In [None]:
import pandas as pd
import numpy as np

def calculate_ratios(input_file, output_file):
    try:
        # Read the stacked CSV file
        df = pd.read_csv(input_file)
        
        # Add ratio columns for pcom values
        # pcom_ratio1 = pcom_pre1/pcom_pre1
        df['pcom_ratio1'] = df['pcom_pre1'] / df['pcom_pre1']  # This will be 1.0 for all non-NaN values
        
        # pcom_ratio2 through pcom_ratio5
        for i in range(2, 6):
            df[f'pcom_ratio{i}'] = df[f'pcom_pre{i}'] / df['pcom_pre1']
        
        # pcom_ratio6 through pcom_ratio10
        for i in range(1, 6):
            ratio_index = i + 5
            df[f'pcom_ratio{ratio_index}'] = df[f'pcom_post{i}'] / df['pcom_pre1']
        
        # Add ratio columns for Jcom values - same pattern as pcom
        # Jcom_ratio1 = Jcom_pre1/Jcom_pre1
        df['Jcom_ratio1'] = df['Jcom_pre1'] / df['Jcom_pre1']  # This will be 1.0 for all non-NaN values
        
        # Jcom_ratio2 through Jcom_ratio5
        for i in range(2, 6):
            df[f'Jcom_ratio{i}'] = df[f'Jcom_pre{i}'] / df['Jcom_pre1']
        
        # Jcom_ratio6 through Jcom_ratio10
        for i in range(1, 6):
            ratio_index = i + 5
            df[f'Jcom_ratio{ratio_index}'] = df[f'Jcom_post{i}'] / df['Jcom_pre1']
        
        # Save to a new CSV file
        df.to_csv(output_file, index=False)
        print(f"Successfully added ratio columns and saved to {output_file}")
        return True
    
    except Exception as e:
        print(f"An error occurred: {e}")
        return False


input_file = "stacked_hemispheres_data.csv"  # Output from previous script
output_file = "data_with_ratios.csv"  
calculate_ratios(input_file, output_file)