In [18]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler # Keep for potential future use
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, f1_score
import gdown # Library to download from Google Drive
from sklearn.impute import SimpleImputer # Import imputer

# ANSI escape sequences used to colour printed text.
# Prefix a string with one of the colour codes and append RESET to stop the colouring.
RED = "\033[91m"
GREEN = "\033[92m"
YELLOW = "\033[93m"
BLUE = "\033[94m"
RESET = "\033[0m"  # Restores the default text attributes

In [9]:
# Alternatively, use the combined SECOM dataset CSV stored in the `dataset` subfolder.
from pathlib import Path

# Location of the combined dataset, relative to the notebook's working directory.
dataset_folder = Path('dataset')
dataset_filename = 'SECOM_combined_dataset.csv'
output_file = dataset_folder / dataset_filename

print(f"Attempting to open the combined file {dataset_filename} saved locally in the folder {dataset_folder}")

# Building a Path object never touches the disk, so it cannot raise
# FileNotFoundError; the file's presence has to be checked explicitly.
# A missing file is only reported here (best effort) -- the actual read
# happens in the loading step that consumes `output_file`.
if output_file.is_file():
    print('File successfully opened.')
else:
    print(f'File not found: {output_file}')

Attempting to open the combined file SECOM_combined_dataset.csv saved locally in the folder dataset
File successfully opened.


In [10]:
# --- 2. Load Data using Pandas (Assuming Header IS Present) ---
try:
    # No header=None here: pandas reads the first CSV row as the column names.
    data = pd.read_csv(output_file)
except Exception as e:
    print(f"Failed to load data from {output_file}: {e}")
    # Re-raise instead of calling exit(): inside Jupyter, exit() shuts the
    # kernel down (losing all state) and hides the traceback, whereas a raised
    # exception simply stops execution at this cell with the full error shown.
    raise

print(f"Data loaded successfully from {output_file} with shape: {data.shape}")
# Display first few rows and info to check data types
print("\nFirst 5 rows of loaded data:")
print(data.head())
print("\nData info:")
data.info()

Data loaded successfully from dataset\SECOM_combined_dataset.csv with shape: (1567, 591)

First 5 rows of loaded data:
   feature_0  feature_1  feature_2  feature_3  feature_4  feature_5  \
0    3030.93    2564.00  2187.7333  1411.1265     1.3602      100.0   
1    3095.78    2465.14  2230.4222  1463.6606     0.8294      100.0   
2    2932.61    2559.94  2186.4111  1698.0172     1.5102      100.0   
3    2988.72    2479.90  2199.0333   909.7926     1.3204      100.0   
4    3032.24    2502.87  2233.3667  1326.5200     1.5334      100.0   

   feature_6  feature_7  feature_8  feature_9  ...  feature_581  feature_582  \
0    97.6133     0.1242     1.5005     0.0162  ...          NaN       0.5005   
1   102.3433     0.1247     1.4966    -0.0005  ...     208.2045       0.5019   
2    95.4878     0.1241     1.4436     0.0041  ...      82.8602       0.4958   
3   104.2367     0.1217     1.4882    -0.0124  ...      73.8432       0.4990   
4   100.3967     0.1235     1.5031    -0.0031  ...    

In [11]:
# --- 3. Prepare Data ---
if data.shape[1] <= 1:
    # Raise instead of calling exit(): inside Jupyter, exit() shuts the kernel
    # down, whereas an exception stops at this cell and keeps the state.
    raise ValueError("Error: Loaded data has only one column. Cannot separate features and target.")

# Assume last column is target 'y', all others are features 'X'
feature_names = data.columns[:-1].tolist()  # Get actual feature names from header
X_df = data[feature_names]                  # Select feature columns using names
y_series = data.iloc[:, -1]                 # Select target column by position

print(f"\nFeature names extracted from header: {feature_names[:5]}... (Total: {len(feature_names)})")

# --- Keep Robust Cleaning Steps ---
# Convert everything to numeric; values that cannot be parsed become NaN.
X_df_numeric = X_df.apply(pd.to_numeric, errors='coerce')
y_series_numeric = pd.to_numeric(y_series, errors='coerce')

# Fill NaNs (from coercion or missing in the original file) with the column mean.
# NOTE(review): both imputers are fit on the full dataset, i.e. before the
# train/validation split performed later in the notebook, so validation rows
# contribute to the imputation statistics -- consider fitting on the training
# rows only.
imputer_X = SimpleImputer(strategy='mean')
X_imputed = imputer_X.fit_transform(X_df_numeric)

# Counted before imputation so the number of affected labels can be reported.
missing_labels_count = y_series_numeric.isna().sum()

# 'most_frequent' rather than 'mean' because the labels are binary classes.
imputer_y = SimpleImputer(strategy='most_frequent')
y_imputed = imputer_y.fit_transform(y_series_numeric.values.reshape(-1, 1)).flatten()

# With its default settings SimpleImputer discards columns that contain only
# missing values, so an all-NaN feature shows up as a *missing column* rather
# than as leftover NaNs. Detect that explicitly, because column indices would
# then no longer line up with `feature_names`.
if X_imputed.shape[1] != len(feature_names):
    print(f"Warning: {len(feature_names) - X_imputed.shape[1]} all-NaN feature column(s) were dropped "
          "during imputation; column indices no longer match feature_names.")

# Defensive check: no NaNs should remain after imputation.
if np.isnan(X_imputed).any() or np.isnan(y_imputed).any():
    print("Warning: NaNs still present after imputation. Check columns with all invalid values.")
    # Consider more advanced imputation or dropping problematic columns/rows if this occurs

X = X_imputed
y = y_imputed

print(f"Features shape after cleaning: {X.shape}")

print(f"Number of missing labels to be imputed: {missing_labels_count}")
print(f"Target shape after cleaning: {y.shape}")
n_features_loaded = X.shape[1]  # Keep track of the number of features


Feature names extracted from header: ['feature_0', 'feature_1', 'feature_2', 'feature_3', 'feature_4']... (Total: 590)
Features shape after cleaning: (1567, 590)
Number of missing labels to be imputed: 0
Target shape after cleaning: (1567,)


In [12]:

# --- 4. Split Data for Fitness Evaluation ---
# Hold out 30% of the cleaned samples for validation. Stratifying on y keeps
# the proportion of class labels the same in both partitions, and the fixed
# random_state makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

# Actual number of feature columns available to the selection step.
n_features = X_train.shape[1]

print(f"\nTraining features shape: {X_train.shape}")
print(f"Validation features shape: {X_val.shape}")


Training features shape: (1096, 590)
Validation features shape: (471, 590)


In [13]:
def initialize_population(d, pop_size, n_features):
    """Build a random binary population for feature-subset selection.

    Each individual is a 0/1 vector of length ``n_features`` in which a 1
    marks a selected feature. Every gene is set to 1 independently with
    probability ``d / n_features``, so an individual selects about ``d``
    features on average. Applying this constraint from the very first
    population is meant to steer the algorithm towards small subsets rather
    than large ones in later generations.

    Args:
        d: Expected number of selected features per individual
            (``0 <= d <= n_features``).
        pop_size: Number of individuals (rows) to generate.
        n_features: Total number of available features (columns).

    Returns:
        Integer ndarray of shape ``(pop_size, n_features)`` containing only
        0s and 1s; every row has at least one 1.

    Raises:
        ValueError: If ``n_features < 1`` or ``d`` lies outside
            ``[0, n_features]`` (the gene probability would be invalid).
    """
    if n_features < 1:
        raise ValueError(f"n_features must be at least 1, got {n_features}")
    if not 0 <= d <= n_features:
        raise ValueError(f"d must be between 0 and n_features ({n_features}), got {d}")

    probability_for_1 = d / n_features
    probability_for_0 = 1 - probability_for_1

    print(f'-Building Initial population of size: {pop_size} with approximately {d} features per individual')

    pop = np.random.choice([0, 1], size=(pop_size, n_features), p=[probability_for_0, probability_for_1])

    # Ensure no individual is all zeros. `.any` has to be *called*: the bare
    # bound method is always truthy, so a check written without parentheses
    # never fires and empty individuals slip through.
    empty_rows = np.flatnonzero(~pop.any(axis=1))
    if empty_rows.size:
        # Switch on one randomly chosen feature in each empty individual.
        random_columns = np.random.randint(0, n_features, size=empty_rows.size)
        pop[empty_rows, random_columns] = 1
    return pop

In [19]:
# Classifier fitted on each candidate feature subset.
model = RandomForestClassifier(
    n_estimators=15,   # Reduced from 50
    random_state=42,   # fixed seed so repeated fits are comparable
    n_jobs=-1,         # Use n_jobs=-1 for parallelization
)
# --- TUNING POINT ---

In [20]:
d = 50         # expected number of selected features per individual
pop_size = 10  # number of individuals in the population

population = initialize_population(d, pop_size, n_features)
individual = population[0]
print(f'\nIndividual sample: {individual}')

individual = np.array(individual)  # Ensure it's a NumPy array
selected_indices = np.where(individual == 1)[0]  # Get indices where bit is 1
num_selected_features = len(selected_indices)
print(f'\nNumber Selected features: {num_selected_features}')

# Select corresponding columns from train/validation sets
X_train_sel = X_train[:, selected_indices]
X_val_sel = X_val[:, selected_indices]

try:
    model.fit(X_train_sel, y_train)
    y_pred = model.predict(X_val_sel)

    print(f'\nY predicted: {y_pred}')

    # MSE is reported for reference only and kept under its own name so it is
    # not confused with (or overwritten by) the F1 score below.
    mse = mean_squared_error(y_val, y_pred)
    print(f'\nMSE: {mse}')

    # F1 score is more suitable for an imbalanced (binary) dataset.
    # zero_division=0 makes the "no positive predictions" case explicit: the
    # score is 0.0 (the same value as the default) without the
    # undefined-metric warning.
    score = f1_score(y_val, y_pred, average='binary', pos_label=1, zero_division=0)
    print(f'\nf1_SCORE: {score}')

    # TODO: planned fitness -- a penalty that discourages solutions that are
    # significantly larger or smaller than the target subset size:
    #   penalty = penalty_coef * abs(num_selected_features - num_desired_features)
    #   fitness = score - penalty
    # (num_desired_features presumably corresponds to `d` -- confirm.)

except ValueError as e:
    # Catch potential errors during fitting/prediction if data issues remain
    print(f"Error during model fitting/prediction: {e}")

-Building Initial population of size: 10 with approximately 50 features per individual

Individual sample: [1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0
 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0
 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0
 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 1 0 0 0 0
 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0
 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0