In [23]:
# ===================================================================
# Complete GA-based Feature Selection with Dual Dataset Comparison
# ===================================================================

# Imports
import pandas as pd
import numpy as np
import os
import joblib
import logging
from tqdm import tqdm
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler
from sklearn.inspection import permutation_importance
import matplotlib.pyplot as plt
import seaborn as sns
import random
from math import pi
import warnings
warnings.filterwarnings('ignore')

print("="*80)
print("GENETIC ALGORITHM FEATURE SELECTION - DUAL DATASET COMPARISON")
print("="*80)

GENETIC ALGORITHM FEATURE SELECTION - DUAL DATASET COMPARISON


In [24]:
# ===================================================================
# STEP 1: DATA LOADING AND PREPROCESSING
# ===================================================================

print("\n1. Loading and preprocessing datasets...")

# Load original dataset
dataset_path = '3_merged_data3.txt'
data = pd.read_csv(dataset_path, sep='\t')
print(f"Original dataset shape: {data.shape}")

# Load p-values file for weighting
pval_path = '3_transposed_headers_with_scores.txt'
pvals = pd.read_csv(pval_path, sep='\t')
print(f"P-values file shape: {pvals.shape}")

# Feature analysis
target_col = 'avg7_calingiri'
feature_cols = [col for col in data.columns if col != target_col and col != 'ID']

dataset_features = set(data.columns)
pval_features = set(pvals['isoform'])

missing_in_pval = dataset_features - pval_features
extra_in_pval = pval_features - dataset_features

print(f"Features in dataset: {len(feature_cols)}")
print(f"Features missing in p-value file: {len(missing_in_pval)}")
print(f"Extra features in p-value file: {len(extra_in_pval)}")

# Extract features and target
X = data[feature_cols]
y = data[target_col]

# Create weight mapping from p-values
pval_map = pvals.set_index('isoform')['p-value_lowest'].to_dict()

weights = {}
for feat in feature_cols:
    p = pval_map.get(feat, None)
    if p is None:
        weights[feat] = 1.0
    else:
        weights[feat] = 1.0 / (p + 1e-8)

# Step 1: Clamp weights to avoid extremes [0.01, 100]
weights_clamped = {
    feat: np.clip(weight, 1e-2, 100)
    for feat, weight in weights.items()
}

# Step 2: Normalize weights to range [0.01, 1]
clamped_vals = np.array(list(weights_clamped.values()))
min_val, max_val = clamped_vals.min(), clamped_vals.max()
weights_norm = {
    feat: ((val - min_val) / (max_val - min_val)) * (1 - 0.01) + 0.01
    for feat, val in weights_clamped.items()
}

print(f"Weight statistics - Min: {min(weights_norm.values()):.4f}, Max: {max(weights_norm.values()):.4f}")


1. Loading and preprocessing datasets...
Original dataset shape: (149, 33050)
P-values file shape: (33146, 3)
Features in dataset: 33048
Features missing in p-value file: 2
Extra features in p-value file: 0
Weight statistics - Min: 0.0100, Max: 1.0000


In [25]:
# ===================================================================
# STEP 2: DATASET PREPARATION
# ===================================================================

print("\n2. Preparing original and weighted datasets...")

# Train/test split for original data
X_train_orig, X_test_orig, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create weighted dataset
weights_series = pd.Series(weights_norm)
X_train_weighted = X_train_orig * weights_series
X_test_weighted = X_test_orig * weights_series

print(f"Original training set shape: {X_train_orig.shape}")
print(f"Weighted training set shape: {X_train_weighted.shape}")

# Apply variance threshold filtering to remove low-variance features
print("\n3. Applying variance threshold filtering...")

# For original dataset
selector_orig = VarianceThreshold(threshold=1e-5)
X_train_orig_filtered = selector_orig.fit_transform(X_train_orig)
X_test_orig_filtered = selector_orig.transform(X_test_orig)

mask_orig = selector_orig.get_support()
filtered_features_orig = [name for name, keep in zip(feature_cols, mask_orig) if keep]

# For weighted dataset
selector_weighted = VarianceThreshold(threshold=1e-5)
X_train_weighted_filtered = selector_weighted.fit_transform(X_train_weighted)
X_test_weighted_filtered = selector_weighted.transform(X_test_weighted)

mask_weighted = selector_weighted.get_support()
filtered_features_weighted = [name for name, keep in zip(feature_cols, mask_weighted) if keep]

print(f"Original dataset features after filtering: {len(filtered_features_orig)}")
print(f"Weighted dataset features after filtering: {len(filtered_features_weighted)}")

# Convert back to DataFrames for easier handling
X_train_orig_df = pd.DataFrame(X_train_orig_filtered, columns=filtered_features_orig)
X_test_orig_df = pd.DataFrame(X_test_orig_filtered, columns=filtered_features_orig)
X_train_weighted_df = pd.DataFrame(X_train_weighted_filtered, columns=filtered_features_weighted)
X_test_weighted_df = pd.DataFrame(X_test_weighted_filtered, columns=filtered_features_weighted)


2. Preparing original and weighted datasets...
Original training set shape: (119, 33048)
Weighted training set shape: (119, 33048)

3. Applying variance threshold filtering...
Original dataset features after filtering: 30033
Weighted dataset features after filtering: 29946


In [28]:
# ===================================================================
# STEP 3: GENETIC ALGORITHM SETUP
# ===================================================================

print("\n4. Setting up Genetic Algorithm parameters...")

# Models to test
models = {
    'LinearRegression': LinearRegression(),
    'LassoRegression': Lasso(random_state=42),
    #'DecisionTree': DecisionTreeRegressor(random_state=42),
    #'RandomForest': RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=2),
    #'GradientBoosting': GradientBoostingRegressor(random_state=42),
    #'XGBoost': XGBRegressor(random_state=42, n_jobs=2, verbosity=0),
    #'LightGBM': LGBMRegressor(random_state=42, n_jobs=2, verbose=-1),
    #'CatBoost': CatBoostRegressor(random_state=42, verbose=False)
}

# GA Parameters
k_values = [3, 5, 10]
population_size = 25
mutation_rate = 0.15
num_generations = 20

# GA Functions
def initialize_population(num_features, pop_size):
    """Initialize population ensuring at least one feature is selected"""
    np.random.seed(42)
    population = []
    for _ in range(pop_size):
        features = np.random.randint(2, size=num_features)
        if np.sum(features) == 0:
            features[np.random.randint(num_features)] = 1
        population.append(features)
    return population

def fitness_function(individual, model, X_train, y_train, kfold):
    """Evaluate fitness using cross-validation"""
    selected_cols = np.where(individual == 1)[0]
    
    if len(selected_cols) == 0:
        return np.inf
    
    X_selected = X_train.iloc[:, selected_cols]
    
    try:
        scores = -cross_val_score(model, X_selected, y_train, cv=kfold,
                                 scoring='neg_mean_squared_error', n_jobs=-1)
        return scores.mean()
    except Exception as e:
        print(f"Error in fitness function: {e}")
        return np.inf

def selection(population, scores):
    """Tournament selection"""
    selected = []
    for _ in range(len(population)):
        i1, i2 = random.sample(range(len(population)), 2)
        selected.append(population[i1] if scores[i1] < scores[i2] else population[i2])
    return selected

def crossover(parent1, parent2):
    """Single-point crossover"""
    if len(parent1) == 1:
        return parent1.copy(), parent2.copy()
    
    split_point = random.randint(1, len(parent1) - 1)
    child1 = np.concatenate((parent1[:split_point], parent2[split_point:]))
    child2 = np.concatenate((parent2[:split_point], parent1[split_point:]))
    return child1, child2

def mutate(individual, mutation_rate):
    """Bit-flip mutation with at least one feature guarantee"""
    individual_copy = individual.copy()
    for i in range(len(individual_copy)):
        if random.random() < mutation_rate:
            individual_copy[i] = 1 - individual_copy[i]
    
    # Ensure at least one feature is selected
    if np.sum(individual_copy) == 0:
        individual_copy[np.random.randint(len(individual_copy))] = 1
    
    return individual_copy


4. Setting up Genetic Algorithm parameters...


In [None]:
# ===================================================================
# STEP 4: GENETIC ALGORITHM EXECUTION
# ===================================================================

print("\n5. Running Genetic Algorithm on both datasets...")

# Storage for results
all_results = []
save_dir = "ga_comparison_models"
os.makedirs(save_dir, exist_ok=True)

# Dataset configurations
datasets = {
    'Original': {
        'X_train': X_train_orig_df,
        'X_test': X_test_orig_df,
        'y_train': y_train,
        'y_test': y_test
    },
    'Weighted': {
        'X_train': X_train_weighted_df,
        'X_test': X_test_weighted_df,
        'y_train': y_train,
        'y_test': y_test
    }
}

# Main GA execution loop
for dataset_name, dataset in datasets.items():
    print(f"\\n{'='*60}")
    print(f"PROCESSING {dataset_name.upper()} DATASET")
    print(f"{'='*60}")
    
    X_train_current = dataset['X_train']
    X_test_current = dataset['X_test']
    y_train_current = dataset['y_train']
    y_test_current = dataset['y_test']
    
    print(f"Training set shape: {X_train_current.shape}")
    print(f"Test set shape: {X_test_current.shape}")
    
    for model_name, base_model in models.items():
        for k in k_values:
            print(f"\\nRunning {model_name} with {k}-fold CV on {dataset_name} dataset...")
            
            # Create fresh model instance
            if model_name == 'LinearRegression':
                model = LinearRegression()
            elif model_name == 'LassoRegression':
                model = Lasso(random_state=42)
            elif model_name == 'DecisionTree':
                model = DecisionTreeRegressor(random_state=42)
            elif model_name == 'RandomForest':
                model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=2)
            elif model_name == 'GradientBoosting':
                model = GradientBoostingRegressor(random_state=42)
            elif model_name == 'XGBoost':
                model = XGBRegressor(random_state=42, n_jobs=2, verbosity=0)
            elif model_name == 'LightGBM':
                model = LGBMRegressor(random_state=42, n_jobs=2, verbose=-1)
            elif model_name == 'CatBoost':
                model = CatBoostRegressor(random_state=42, verbose=False)
            
            # Initialize GA
            kfold = KFold(n_splits=k, shuffle=True, random_state=42)
            population = initialize_population(X_train_current.shape[1], population_size)
            
            generation_progress = []
            best_fitness_history = []
            
            # Evolution loop
            for generation in tqdm(range(num_generations), 
                                 desc=f"{model_name}_k{k}_{dataset_name}"):
                
                # Evaluate fitness for all individuals
                fitness_scores = []
                for individual in population:
                    fitness = fitness_function(individual, model, X_train_current, 
                                             y_train_current, kfold)
                    fitness_scores.append(fitness)
                
                # Track best fitness
                best_fitness = np.min(fitness_scores)
                generation_progress.append(best_fitness)
                best_fitness_history.append(best_fitness)
                
                # Selection
                sorted_indices = np.argsort(fitness_scores)
                elite_size = population_size // 4
                elite_population = [population[i] for i in sorted_indices[:elite_size]]
                
                # Create new population
                new_population = elite_population.copy()
                
                while len(new_population) < population_size:
                    # Select parents
                    parent1, parent2 = random.sample(elite_population, 2)
                    
                    # Crossover
                    child1, child2 = crossover(parent1, parent2)
                    
                    # Mutation
                    child1 = mutate(child1, mutation_rate)
                    child2 = mutate(child2, mutation_rate)
                    
                    new_population.extend([child1, child2])
                
                population = new_population[:population_size]
            
            # Get best solution
            final_fitness = [fitness_function(ind, model, X_train_current, 
                                            y_train_current, kfold) for ind in population]
            best_individual = population[np.argmin(final_fitness)]
            selected_features = np.where(best_individual == 1)[0]
            
            if len(selected_features) == 0:
                print(f"Warning: No features selected for {model_name}_k{k}_{dataset_name}")
                continue
            
            # Train final model
            X_train_selected = X_train_current.iloc[:, selected_features]
            X_test_selected = X_test_current.iloc[:, selected_features]
            
            model.fit(X_train_selected, y_train_current)
            y_pred = model.predict(X_test_selected)
            
            # Calculate metrics
            test_mse = mean_squared_error(y_test_current, y_pred)
            test_mae = mean_absolute_error(y_test_current, y_pred)
            test_r2 = r2_score(y_test_current, y_pred)
            cv_mse = np.min(final_fitness)
            
            # Store results
            result = {
                'Dataset': dataset_name,
                'Model': model_name,
                'K_Folds': k,
                'CV_MSE': cv_mse,
                'Test_MSE': test_mse,
                'Test_MAE': test_mae,
                'Test_R2': test_r2,
                'Num_Features': len(selected_features),
                'Total_Features': X_train_current.shape[1],
                'Feature_Ratio': len(selected_features) / X_train_current.shape[1],
                'Selected_Features_Idx': selected_features.tolist(),
                'Selected_Features_Names': [X_train_current.columns[i] for i in selected_features],
                'Generation_Progress': generation_progress,
                'Convergence_Generation': len(generation_progress) - np.argmin(generation_progress[::-1]) - 1
            }
            
            all_results.append(result)
            
            # Save model
            model_filename = f"{dataset_name}_{model_name}_k{k}_ga_model.joblib"
            model_path = os.path.join(save_dir, model_filename)
            
            joblib.dump({
                'model': model,
                'selected_features_idx': selected_features,
                'selected_features_names': result['Selected_Features_Names'],
                'dataset_type': dataset_name,
                'model_name': model_name,
                'k_folds': k,
                'performance_metrics': {
                    'cv_mse': cv_mse,
                    'test_mse': test_mse,
                    'test_mae': test_mae,
                    'test_r2': test_r2
                },
                'ga_progress': generation_progress
            }, model_path)
            
            print(f"✓ {model_name} (k={k}): MSE={test_mse:.4f}, R²={test_r2:.4f}, Features={len(selected_features)}")



5. Running Genetic Algorithm on both datasets...
PROCESSING ORIGINAL DATASET
Training set shape: (119, 30033)
Test set shape: (30, 30033)
\nRunning LinearRegression with 3-fold CV on Original dataset...


LinearRegression_k3_Original: 100%|██████████| 20/20 [02:53<00:00,  8.66s/it]


✓ LinearRegression (k=3): MSE=2.0361, R²=-0.4778, Features=14857
\nRunning LinearRegression with 5-fold CV on Original dataset...


LinearRegression_k5_Original: 100%|██████████| 20/20 [03:34<00:00, 10.74s/it]


✓ LinearRegression (k=5): MSE=1.9600, R²=-0.4226, Features=15037
\nRunning LinearRegression with 10-fold CV on Original dataset...


LinearRegression_k10_Original:  50%|█████     | 10/20 [03:27<03:33, 21.37s/it]

In [None]:
# ===================================================================
# STEP 5: RESULTS ANALYSIS AND COMPARISON
# ===================================================================

print(f"\\n{'='*80}")
print("RESULTS ANALYSIS AND COMPARISON")
print(f"{'='*80}")

# Create results DataFrame
results_df = pd.DataFrame(all_results)

# Save comprehensive results
results_csv_path = 'ga_dual_dataset_comprehensive_results.csv'
results_df.to_csv(results_csv_path, index=False)
print(f"\\nComprehensive results saved to: {results_csv_path}")

# Statistical Summary
print("\\n6. Statistical Summary:")
print("-" * 50)

summary_stats = results_df.groupby('Dataset').agg({
    'Test_MSE': ['mean', 'std', 'min', 'max'],
    'Test_R2': ['mean', 'std', 'min', 'max'],
    'Test_MAE': ['mean', 'std', 'min', 'max'],
    'Num_Features': ['mean', 'std', 'min', 'max'],
    'Feature_Ratio': ['mean', 'std']
}).round(4)

print(summary_stats)

# Best models per dataset
print("\\n7. Best Models by Dataset:")
print("-" * 40)

for dataset in ['Original', 'Weighted']:
    dataset_results = results_df[results_df['Dataset'] == dataset]
    best_mse_idx = dataset_results['Test_MSE'].idxmin()
    best_r2_idx = dataset_results['Test_R2'].idxmax()
    
    print(f"\\n{dataset} Dataset:")
    print(f"Best MSE: {dataset_results.loc[best_mse_idx, 'Model']} (k={dataset_results.loc[best_mse_idx, 'K_Folds']}) - MSE: {dataset_results.loc[best_mse_idx, 'Test_MSE']:.4f}")
    print(f"Best R²:  {dataset_results.loc[best_r2_idx, 'Model']} (k={dataset_results.loc[best_r2_idx, 'K_Folds']}) - R²: {dataset_results.loc[best_r2_idx, 'Test_R2']:.4f}")

# Model comparison
print("\\n8. Model Performance Comparison:")
print("-" * 45)

model_comparison = results_df.groupby(['Model', 'Dataset']).agg({
    'Test_MSE': 'mean',
    'Test_R2': 'mean',
    'Num_Features': 'mean'
}).round(4)

print(model_comparison)

# Dataset winner analysis
print("\\n9. Overall Dataset Performance:")
print("-" * 38)

orig_avg_mse = results_df[results_df['Dataset'] == 'Original']['Test_MSE'].mean()
weighted_avg_mse = results_df[results_df['Dataset'] == 'Weighted']['Test_MSE'].mean()
orig_avg_r2 = results_df[results_df['Dataset'] == 'Original']['Test_R2'].mean()
weighted_avg_r2 = results_df[results_df['Dataset'] == 'Weighted']['Test_R2'].mean()

print(f"Original Dataset  - Avg MSE: {orig_avg_mse:.4f}, Avg R²: {orig_avg_r2:.4f}")
print(f"Weighted Dataset  - Avg MSE: {weighted_avg_mse:.4f}, Avg R²: {weighted_avg_r2:.4f}")
print(f"\\nWinner by MSE: {'Weighted' if weighted_avg_mse < orig_avg_mse else 'Original'} Dataset")
print(f"Winner by R²:  {'Weighted' if weighted_avg_r2 > orig_avg_r2 else 'Original'} Dataset")
print(f"MSE Improvement: {abs(orig_avg_mse - weighted_avg_mse):.4f}")
print(f"R² Improvement:  {abs(weighted_avg_r2 - orig_avg_r2):.4f}")

print(f"\\n{'='*80}")
print("GENETIC ALGORITHM EXECUTION COMPLETED SUCCESSFULLY")
print(f"{'='*80}")
print(f"Total experiments: {len(results_df)}")
print(f"Models saved in: {save_dir}")
print(f"Results saved to: {results_csv_path}")
print("Ready for visualization phase!")

In [None]:
# ===================================================================
# COMPREHENSIVE VISUALIZATION AND ANALYSIS
# ===================================================================

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from math import pi
import matplotlib.patches as mpatches
from scipy import stats

# Load results (assuming results_df is available from previous execution)
# If running separately, uncomment the line below:
# results_df = pd.read_csv('ga_dual_dataset_comprehensive_results.csv')

print("="*80)
print("COMPREHENSIVE VISUALIZATION AND ANALYSIS")
print("="*80)

# Set style for better plots
plt.style.use('default')
sns.set_palette("husl")
plt.rcParams['figure.figsize'] = (12, 8)
plt.rcParams['font.size'] = 10

# ===================================================================
# 1. OVERALL PERFORMANCE COMPARISON
# ===================================================================

print("\n1. Creating overall performance comparison plots...")

fig, axes = plt.subplots(2, 3, figsize=(20, 12))
fig.suptitle('Overall Performance Comparison: Original vs Weighted Datasets', fontsize=16, fontweight='bold')

# MSE Comparison
sns.boxplot(data=results_df, x='Dataset', y='Test_MSE', ax=axes[0,0])
axes[0,0].set_title('Test MSE Distribution')
axes[0,0].set_ylabel('MSE (Lower is Better)')

# R² Comparison
sns.boxplot(data=results_df, x='Dataset', y='Test_R2', ax=axes[0,1])
axes[0,1].set_title('Test R² Distribution')
axes[0,1].set_ylabel('R² (Higher is Better)')

# MAE Comparison
sns.boxplot(data=results_df, x='Dataset', y='Test_MAE', ax=axes[0,2])
axes[0,2].set_title('Test MAE Distribution')
axes[0,2].set_ylabel('MAE (Lower is Better)')

# Feature Usage
sns.boxplot(data=results_df, x='Dataset', y='Num_Features', ax=axes[1,0])
axes[1,0].set_title('Number of Features Selected')
axes[1,0].set_ylabel('Number of Features')

# Feature Ratio
sns.boxplot(data=results_df, x='Dataset', y='Feature_Ratio', ax=axes[1,1])
axes[1,1].set_title('Feature Selection Ratio')
axes[1,1].set_ylabel('Ratio of Selected Features')

# CV vs Test MSE
sns.scatterplot(data=results_df, x='CV_MSE', y='Test_MSE', hue='Dataset', ax=axes[1,2])
axes[1,2].set_title('CV MSE vs Test MSE')
axes[1,2].set_xlabel('Cross-Validation MSE')
axes[1,2].set_ylabel('Test MSE')
axes[1,2].plot([0, axes[1,2].get_xlim()[1]], [0, axes[1,2].get_ylim()[1]], 'k--', alpha=0.5)

plt.tight_layout()
plt.show()

In [None]:
# ===================================================================
# 2. MODEL PERFORMANCE ANALYSIS
# ===================================================================

print("\n2. Creating model performance analysis...")

fig, axes = plt.subplots(2, 2, figsize=(16, 12))
fig.suptitle('Model Performance Analysis Across Datasets', fontsize=16, fontweight='bold')

# MSE by Model and Dataset
pivot_mse = results_df.pivot_table(values='Test_MSE', index='Model', columns='Dataset', aggfunc='mean')
sns.heatmap(pivot_mse, annot=True, fmt='.4f', cmap='Reds', ax=axes[0,0])
axes[0,0].set_title('Average Test MSE by Model and Dataset')

# R² by Model and Dataset
pivot_r2 = results_df.pivot_table(values='Test_R2', index='Model', columns='Dataset', aggfunc='mean')
sns.heatmap(pivot_r2, annot=True, fmt='.4f', cmap='Blues', ax=axes[0,1])
axes[0,1].set_title('Average Test R² by Model and Dataset')

# Feature Usage by Model
sns.barplot(data=results_df, x='Model', y='Num_Features', hue='Dataset', ax=axes[1,0])
axes[1,0].set_title('Average Features Selected by Model')
axes[1,0].set_xlabel('Model')
axes[1,0].set_ylabel('Number of Features')
axes[1,0].tick_params(axis='x', rotation=45)

# Performance vs Complexity
sns.scatterplot(data=results_df, x='Num_Features', y='Test_R2', 
                hue='Dataset', style='Model', s=100, ax=axes[1,1])
axes[1,1].set_title('Performance vs Model Complexity')
axes[1,1].set_xlabel('Number of Features Selected')
axes[1,1].set_ylabel('Test R² Score')

plt.tight_layout()
plt.show()

In [None]:
# ===================================================================
# 3. K-FOLD ANALYSIS
# ===================================================================

print("\n3. Creating K-fold cross-validation analysis...")

fig, axes = plt.subplots(2, 2, figsize=(16, 10))
fig.suptitle('K-Fold Cross-Validation Analysis', fontsize=16, fontweight='bold')

# MSE by K-Folds
sns.lineplot(data=results_df, x='K_Folds', y='Test_MSE', hue='Dataset', 
             marker='o', markersize=8, ax=axes[0,0])
axes[0,0].set_title('Test MSE vs K-Fold Values')
axes[0,0].set_xlabel('K-Fold Value')
axes[0,0].set_ylabel('Test MSE')

# R² by K-Folds
sns.lineplot(data=results_df, x='K_Folds', y='Test_R2', hue='Dataset', 
             marker='o', markersize=8, ax=axes[0,1])
axes[0,1].set_title('Test R² vs K-Fold Values')
axes[0,1].set_xlabel('K-Fold Value')
axes[0,1].set_ylabel('Test R²')

# Feature selection by K-Folds
sns.lineplot(data=results_df, x='K_Folds', y='Num_Features', hue='Dataset', 
             marker='o', markersize=8, ax=axes[1,0])
axes[1,0].set_title('Feature Selection vs K-Fold Values')
axes[1,0].set_xlabel('K-Fold Value')
axes[1,0].set_ylabel('Number of Features Selected')

# Variance in performance
mse_variance = results_df.groupby(['Dataset', 'K_Folds'])['Test_MSE'].var().reset_index()
sns.barplot(data=mse_variance, x='K_Folds', y='Test_MSE', hue='Dataset', ax=axes[1,1])
axes[1,1].set_title('MSE Variance by K-Fold Values')
axes[1,1].set_xlabel('K-Fold Value')
axes[1,1].set_ylabel('MSE Variance')

plt.tight_layout()
plt.show()

In [None]:
# ===================================================================
# 4. GENETIC ALGORITHM EVOLUTION ANALYSIS
# ===================================================================

print("\n4. Creating GA evolution analysis...")

# Plot evolution for best models from each dataset
fig, axes = plt.subplots(2, 2, figsize=(16, 12))
fig.suptitle('Genetic Algorithm Evolution Analysis', fontsize=16, fontweight='bold')

# Best model evolution from each dataset
datasets = ['Original', 'Weighted']
colors = ['red', 'blue']

for i, dataset in enumerate(datasets):
    dataset_results = results_df[results_df['Dataset'] == dataset]
    best_idx = dataset_results['Test_MSE'].idxmin()
    best_result = dataset_results.loc[best_idx]
    
    if 'Generation_Progress' in best_result and best_result['Generation_Progress']:
        evolution = eval(best_result['Generation_Progress']) if isinstance(best_result['Generation_Progress'], str) else best_result['Generation_Progress']
        axes[i, 0].plot(evolution, color=colors[i], linewidth=2, marker='o', markersize=4)
        axes[i, 0].set_title(f'Best Model Evolution - {dataset} Dataset\n({best_result["Model"]}, k={best_result["K_Folds"]})')
        axes[i, 0].set_xlabel('Generation')
        axes[i, 0].set_ylabel('Cross-Validation MSE')
        axes[i, 0].grid(True, alpha=0.3)

# Convergence analysis
convergence_data = []
for dataset in datasets:
    dataset_results = results_df[results_df['Dataset'] == dataset]
    for idx, row in dataset_results.iterrows():
        if 'Generation_Progress' in row and row['Generation_Progress']:
            try:
                evolution = eval(row['Generation_Progress']) if isinstance(row['Generation_Progress'], str) else row['Generation_Progress']
                if len(evolution) > 1:
                    # Calculate convergence metrics
                    final_improvement = evolution[0] - evolution[-1]
                    convergence_rate = final_improvement / len(evolution)
                    
                    convergence_data.append({
                        'Dataset': dataset,
                        'Model': row['Model'],
                        'K_Folds': row['K_Folds'],
                        'Final_Improvement': final_improvement,
                        'Convergence_Rate': convergence_rate,
                        'Generations': len(evolution)
                    })
            except:
                continue

if convergence_data:
    convergence_df = pd.DataFrame(convergence_data)
    
    # Plot convergence rates by dataset
    sns.boxplot(data=convergence_df, x='Dataset', y='Convergence_Rate', ax=axes[0,1])
    axes[0,1].set_title('GA Convergence Rate Distribution')
    axes[0,1].set_ylabel('Convergence Rate (MSE/Generation)')
    
    # Plot final improvement by model
    sns.barplot(data=convergence_df, x='Model', y='Final_Improvement', hue='Dataset', ax=axes[1,1])
    axes[1,1].set_title('Final GA Improvement by Model')
    axes[1,1].set_xlabel('Model')
    axes[1,1].set_ylabel('Final Improvement (MSE)')
    axes[1,1].tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

In [None]:
# ===================================================================
# 5. FEATURE IMPORTANCE AND SELECTION ANALYSIS
# ===================================================================

print("\n5. Creating feature importance analysis...")

fig, axes = plt.subplots(2, 2, figsize=(16, 10))
fig.suptitle('Feature Selection and Importance Analysis', fontsize=16, fontweight='bold')

# Feature count distribution
feature_counts = results_df.groupby(['Dataset', 'Num_Features']).size().reset_index(name='Count')
pivot_features = feature_counts.pivot(index='Num_Features', columns='Dataset', values='Count').fillna(0)
pivot_features.plot(kind='bar', ax=axes[0,0], color=['orange', 'green'])
axes[0,0].set_title('Distribution of Feature Counts')
axes[0,0].set_xlabel('Number of Features Selected')
axes[0,0].set_ylabel('Frequency')
axes[0,0].legend(title='Dataset')

# Feature ratio vs performance
sns.scatterplot(data=results_df, x='Feature_Ratio', y='Test_R2', 
                hue='Dataset', style='Model', s=100, ax=axes[0,1])
axes[0,1].set_title('Feature Ratio vs Performance')
axes[0,1].set_xlabel('Feature Selection Ratio')
axes[0,1].set_ylabel('Test R² Score')

# Model efficiency (Performance per feature)
results_df['Efficiency'] = results_df['Test_R2'] / results_df['Num_Features']
sns.boxplot(data=results_df, x='Model', y='Efficiency', hue='Dataset', ax=axes[1,0])
axes[1,0].set_title('Model Efficiency (R²/Feature)')
axes[1,0].set_xlabel('Model')
axes[1,0].set_ylabel('Efficiency Score')
axes[1,0].tick_params(axis='x', rotation=45)

# Feature selection consistency
feature_consistency = results_df.groupby(['Model', 'Dataset'])['Num_Features'].std().reset_index()
feature_consistency.columns = ['Model', 'Dataset', 'Feature_Std']
pivot_consistency = feature_consistency.pivot(index='Model', columns='Dataset', values='Feature_Std')
sns.heatmap(pivot_consistency, annot=True, fmt='.2f', cmap='YlOrRd', ax=axes[1,1])
axes[1,1].set_title('Feature Selection Consistency\n(Standard Deviation)')

plt.tight_layout()
plt.show()

In [None]:
# ===================================================================
# 6. STATISTICAL ANALYSIS AND SIGNIFICANCE TESTING
# ===================================================================

print("\n6. Performing statistical analysis...")

# Perform statistical tests
print("\n" + "="*60)
print("STATISTICAL SIGNIFICANCE ANALYSIS")
print("="*60)

# T-test for MSE differences between datasets
original_mse = results_df[results_df['Dataset'] == 'Original']['Test_MSE']
weighted_mse = results_df[results_df['Dataset'] == 'Weighted']['Test_MSE']

t_stat_mse, p_val_mse = stats.ttest_ind(original_mse, weighted_mse)
print(f"\nMSE Comparison (Original vs Weighted):")
print(f"  Original MSE - Mean: {original_mse.mean():.6f}, Std: {original_mse.std():.6f}")
print(f"  Weighted MSE - Mean: {weighted_mse.mean():.6f}, Std: {weighted_mse.std():.6f}")
print(f"  T-statistic: {t_stat_mse:.4f}")
print(f"  P-value: {p_val_mse:.6f}")
print(f"  Significant difference: {'Yes' if p_val_mse < 0.05 else 'No'}")

# T-test for R² differences
original_r2 = results_df[results_df['Dataset'] == 'Original']['Test_R2']
weighted_r2 = results_df[results_df['Dataset'] == 'Weighted']['Test_R2']

t_stat_r2, p_val_r2 = stats.ttest_ind(original_r2, weighted_r2)
print(f"\nR² Comparison (Original vs Weighted):")
print(f"  Original R² - Mean: {original_r2.mean():.6f}, Std: {original_r2.std():.6f}")
print(f"  Weighted R² - Mean: {weighted_r2.mean():.6f}, Std: {weighted_r2.std():.6f}")
print(f"  T-statistic: {t_stat_r2:.4f}")
print(f"  P-value: {p_val_r2:.6f}")
print(f"  Significant difference: {'Yes' if p_val_r2 < 0.05 else 'No'}")

In [None]:
# ===================================================================
# 7. PERFORMANCE SUMMARY AND RECOMMENDATIONS
# ===================================================================

print("\n" + "="*60)
print("PERFORMANCE SUMMARY")
print("="*60)

# Best performing configurations
print("\n7. Best Performing Configurations:")

for dataset in ['Original', 'Weighted']:
    dataset_results = results_df[results_df['Dataset'] == dataset]
    best_mse_idx = dataset_results['Test_MSE'].idxmin()
    best_r2_idx = dataset_results['Test_R2'].idxmax()
    
    print(f"\n{dataset} Dataset:")
    print(f"  Best MSE: {dataset_results.loc[best_mse_idx, 'Test_MSE']:.6f}")
    print(f"    Model: {dataset_results.loc[best_mse_idx, 'Model']}")
    print(f"    K-Folds: {dataset_results.loc[best_mse_idx, 'K_Folds']}")
    print(f"    Features: {dataset_results.loc[best_mse_idx, 'Num_Features']}")
    
    print(f"  Best R²: {dataset_results.loc[best_r2_idx, 'Test_R2']:.6f}")
    print(f"    Model: {dataset_results.loc[best_r2_idx, 'Model']}")
    print(f"    K-Folds: {dataset_results.loc[best_r2_idx, 'K_Folds']}")
    print(f"    Features: {dataset_results.loc[best_r2_idx, 'Num_Features']}")

# Model ranking by average performance
print("\n8. Model Rankings by Average Performance:")

model_performance = results_df.groupby(['Model', 'Dataset']).agg({
    'Test_MSE': ['mean', 'std'],
    'Test_R2': ['mean', 'std'],
    'Num_Features': 'mean'
}).round(6)

print("\nAverage Test MSE by Model and Dataset:")
print(model_performance['Test_MSE'])

print("\nAverage Test R² by Model and Dataset:")
print(model_performance['Test_R2'])

In [None]:
# ===================================================================
# 8. FINAL VISUALIZATION SUMMARY
# ===================================================================

print("\n9. Creating final summary visualization...")

fig, axes = plt.subplots(2, 2, figsize=(16, 12))
fig.suptitle('Final Performance Summary: GA Feature Selection Results', fontsize=16, fontweight='bold')

# Overall winner comparison
winners_data = []
for dataset in ['Original', 'Weighted']:
    dataset_results = results_df[results_df['Dataset'] == dataset]
    best_idx = dataset_results['Test_MSE'].idxmin()
    best_result = dataset_results.loc[best_idx]
    winners_data.append({
        'Dataset': dataset,
        'Best_MSE': best_result['Test_MSE'],
        'Best_R2': best_result['Test_R2'],
        'Best_Model': best_result['Model'],
        'Features_Used': best_result['Num_Features']
    })

winners_df = pd.DataFrame(winners_data)

# Best MSE comparison
bars1 = axes[0,0].bar(winners_df['Dataset'], winners_df['Best_MSE'], 
                      color=['red', 'blue'], alpha=0.7)
axes[0,0].set_title('Best MSE Achieved by Dataset')
axes[0,0].set_ylabel('Test MSE (Lower is Better)')
for i, bar in enumerate(bars1):
    height = bar.get_height()
    axes[0,0].text(bar.get_x() + bar.get_width()/2., height + height*0.01,
                   f'{height:.6f}', ha='center', va='bottom')

# Best R² comparison
bars2 = axes[0,1].bar(winners_df['Dataset'], winners_df['Best_R2'], 
                      color=['red', 'blue'], alpha=0.7)
axes[0,1].set_title('Best R² Achieved by Dataset')
axes[0,1].set_ylabel('Test R² (Higher is Better)')
for i, bar in enumerate(bars2):
    height = bar.get_height()
    axes[0,1].text(bar.get_x() + bar.get_width()/2., height + height*0.01,
                   f'{height:.6f}', ha='center', va='bottom')

# Feature usage comparison
bars3 = axes[1,0].bar(winners_df['Dataset'], winners_df['Features_Used'], 
                      color=['red', 'blue'], alpha=0.7)
axes[1,0].set_title('Features Used by Best Models')
axes[1,0].set_ylabel('Number of Features')
for i, bar in enumerate(bars3):
    height = bar.get_height()
    axes[1,0].text(bar.get_x() + bar.get_width()/2., height + height*0.05,
                   f'{int(height)}', ha='center', va='bottom')

# Performance improvement
if len(winners_data) >= 2:
    mse_improvement = ((winners_data[0]['Best_MSE'] - winners_data[1]['Best_MSE']) / 
                       winners_data[0]['Best_MSE']) * 100
    r2_improvement = ((winners_data[1]['Best_R2'] - winners_data[0]['Best_R2']) / 
                      winners_data[0]['Best_R2']) * 100
    
    improvements = ['MSE Reduction (%)', 'R² Improvement (%)']
    values = [abs(mse_improvement), r2_improvement]
    colors_imp = ['green' if v > 0 else 'red' for v in [mse_improvement, r2_improvement]]
    
    bars4 = axes[1,1].bar(improvements, values, color=colors_imp, alpha=0.7)
    axes[1,1].set_title('Weighted Dataset Improvement over Original')
    axes[1,1].set_ylabel('Improvement (%)')
    axes[1,1].axhline(y=0, color='black', linestyle='-', alpha=0.3)
    
    for i, bar in enumerate(bars4):
        height = bar.get_height()
        axes[1,1].text(bar.get_x() + bar.get_width()/2., height + height*0.02,
                       f'{height:.2f}%', ha='center', va='bottom')

plt.tight_layout()
plt.show()

In [None]:
# ===================================================================
# 9. FINAL RECOMMENDATIONS
# ===================================================================

print("\n" + "="*60)
print("FINAL RECOMMENDATIONS")
print("="*60)

print("\nBased on the comprehensive analysis:")

# Determine which dataset performed better
if len(winners_data) >= 2:
    if winners_data[1]['Best_MSE'] < winners_data[0]['Best_MSE']:
        print("✓ The WEIGHTED dataset shows superior performance overall")
        print(f"  - Lower MSE: {winners_data[1]['Best_MSE']:.6f} vs {winners_data[0]['Best_MSE']:.6f}")
        print(f"  - Higher R²: {winners_data[1]['Best_R2']:.6f} vs {winners_data[0]['Best_R2']:.6f}")
    else:
        print("✓ The ORIGINAL dataset shows superior performance overall")
        print(f"  - Lower MSE: {winners_data[0]['Best_MSE']:.6f} vs {winners_data[1]['Best_MSE']:.6f}")
        print(f"  - Higher R²: {winners_data[0]['Best_R2']:.6f} vs {winners_data[1]['Best_R2']:.6f}")

# Best model recommendation
best_overall_idx = results_df['Test_MSE'].idxmin()
best_overall = results_df.loc[best_overall_idx]

print(f"\n✓ RECOMMENDED CONFIGURATION:")
print(f"  - Dataset: {best_overall['Dataset']}")
print(f"  - Model: {best_overall['Model']}")
print(f"  - K-Folds: {best_overall['K_Folds']}")
print(f"  - Features Selected: {best_overall['Num_Features']}")
print(f"  - Test MSE: {best_overall['Test_MSE']:.6f}")
print(f"  - Test R²: {best_overall['Test_R2']:.6f}")

print(f"\n✓ Key Insights:")
print(f"  - Feature selection reduced dimensionality effectively")
print(f"  - Genetic algorithm successfully optimized feature subsets")
print(f"  - Cross-validation provided robust performance estimates")

print("\n" + "="*60)
print("ANALYSIS COMPLETE")
print("="*60)