# 

In [4]:
# Step 1: Load and Preprocess Data
from ucimlrepo import fetch_ucirepo
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Fetch the dataset
statlog_landsat_satellite = fetch_ucirepo(id=146) 
  
# data (as pandas dataframes) 
X = statlog_landsat_satellite.data.features 
y = statlog_landsat_satellite.data.targets 

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Step 2: Statistical Feature Selection
from scipy.stats import ttest_ind, ks_2samp
from sklearn.metrics import roc_auc_score
import numpy as np
from statsmodels.stats.multitest import multipletests



IndexError: too many indices for array: array is 1-dimensional, but 2 were indexed

In [8]:
# Initialize lists for statistical metrics
t_values, ks_values, roc_auc_values = [], [], []

for i in range(X_train.shape[1]):
    feature = X_train[:, i]
    # One-vs-Others t-test
    t_pvals = []
    for label in np.unique(y_train):        
        group = feature[y_train == label]
        others = feature[y_train != label]
        t_stat, t_pval = ttest_ind(group, others)
        t_pvals.append(t_pval)
    t_values.append(np.mean(t_pvals))

    # One-vs-Others KS test
    ks_pvals = []
    for label in np.unique(y_train):
        group = feature[y_train == label]
        others = feature[y_train != label]
        ks_stat, ks_pval = ks_2samp(group, others)
        ks_pvals.append(ks_pval)
    ks_values.append(np.mean(ks_pvals))

    # One-vs-Others ROC AUC
    try:
        roc_auc = roc_auc_score((y_train == label).astype(int), feature)
    except:
        roc_auc = 0.5  # if score calculation fails
    roc_auc_values.append(roc_auc)

# FDR Adjustment
_, t_fdr = multipletests(t_values, method='fdr_bh')[:2]
_, ks_fdr = multipletests(ks_values, method='fdr_bh')[:2]

# Step 3: Heuristic Feature Selection (using Recursive Feature Elimination with Random Forest)
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE

rf = RandomForestClassifier(random_state=42)
rfe = RFE(estimator=rf, n_features_to_select=10)
rfe.fit(X_train, y_train)
rfe_ranking = rfe.ranking_

# Step 4: Principal Component Analysis (PCA)
from sklearn.decomposition import PCA

pca = PCA(n_components=0.95)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)
print(f"Number of PCA components explaining 95% variance: {pca.n_components_}")

# Step 5: Linear Discriminant Analysis (LDA)
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import accuracy_score, classification_report

lda = LinearDiscriminantAnalysis()
lda.fit(X_train_pca, y_train)
y_pred = lda.predict(X_test_pca)

# Report Performance
print("LDA Classification Report:")
print(classification_report(y_test, y_pred))


[]


IndexError: too many indices for array: array is 1-dimensional, but 2 were indexed

In [None]:
print

In [3]:
!pip install statsmodels


Collecting statsmodels
  Downloading statsmodels-0.14.4-cp312-cp312-macosx_11_0_arm64.whl.metadata (9.2 kB)
Collecting patsy>=0.5.6 (from statsmodels)
  Downloading patsy-1.0.1-py2.py3-none-any.whl.metadata (3.3 kB)
Downloading statsmodels-0.14.4-cp312-cp312-macosx_11_0_arm64.whl (9.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.9/9.9 MB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading patsy-1.0.1-py2.py3-none-any.whl (232 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.9/232.9 kB[0m [31m15.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: patsy, statsmodels
Successfully installed patsy-1.0.1 statsmodels-0.14.4
