In [89]:
import pandas as pd
import pyswarms as ps
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder


# Define the file path
#file_path = 'featureoptimization/featureoptimization/densenet201_lbp_histog_feats.csv'
file_path = 'featureoptimization/featureoptimization/densenet201feat.csv'
#file_path = 'featureoptimization/featureoptimization/efficientnetb3newfeat.csv'
#file_path = 'featureoptimization/featureoptimization/combined_DIEnet_features.csv'
#file_path = 'featureoptimization/featureoptimization/inceptionv4newfeat.csv'




# Load the CSV file into a DataFrame
df = pd.read_csv(file_path)
df

Unnamed: 0,id_code,dense201_feature_0,dense201_feature_1,dense201_feature_2,dense201_feature_3,dense201_feature_4,dense201_feature_5,dense201_feature_6,dense201_feature_7,dense201_feature_8,...,dense201_feature_1911,dense201_feature_1912,dense201_feature_1913,dense201_feature_1914,dense201_feature_1915,dense201_feature_1916,dense201_feature_1917,dense201_feature_1918,dense201_feature_1919,label
0,000c1434d8d7.png,-0.000443,0.000154,0.008510,-0.01436,-0.001654,-0.003633,-0.107240,-0.06610,-0.001102,...,1.784,-1.0070,-0.33150,0.25980,-0.6030,0.6523,0.2448,0.3103,-0.89700,2
1,000c1434d8d7_aug1.png,-0.000448,0.000154,-0.006504,-0.06824,-0.001775,-0.003572,0.079100,0.14030,-0.001102,...,2.973,-2.2970,1.65100,0.21420,-1.7390,2.5200,1.5320,0.1005,-1.58800,2
2,000c1434d8d7_aug2.png,0.000141,0.000154,-0.010620,0.01137,-0.001896,-0.003150,0.023270,-0.07810,-0.001102,...,1.195,-0.7246,-0.38800,1.08500,-0.6196,0.7354,-0.6035,-0.1263,-1.25800,2
3,001639a390f0.png,-0.000527,0.000154,0.031020,-0.12310,0.002342,-0.005016,-0.109200,-0.02481,-0.001102,...,2.594,-2.0680,0.95850,0.11910,-1.6410,1.2330,1.3240,-0.3455,-1.61700,4
4,001639a390f0_aug1.png,-0.000412,0.000154,0.024730,-0.18380,0.000162,-0.006462,0.002012,-0.04570,-0.001102,...,1.463,-2.1290,2.39500,-0.20070,-1.7320,1.9830,3.0880,-1.3310,-0.11786,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16920,ffd97f8cd5aa.png,-0.000572,0.000154,0.041200,-0.21970,0.001616,-0.004597,-0.021710,0.02580,-0.001102,...,-1.004,0.8706,-0.58500,-1.24900,0.6226,-0.9487,1.3620,0.7534,1.49000,0
16921,ffd97f8cd5aa_aug1.png,-0.000015,0.000154,0.008385,-0.21510,-0.001533,-0.005077,0.152800,-0.13760,-0.001102,...,-0.710,-0.0340,-0.04608,-1.27200,0.1924,-0.4465,1.8380,1.5210,-0.03564,0
16922,ffec9a18a3ce.png,-0.000056,0.000154,0.012020,0.11570,-0.000685,-0.006040,-0.248400,-0.11743,-0.001102,...,0.589,-0.6465,0.63430,0.02954,-1.0120,0.4570,0.7686,-0.9316,-0.69600,2
16923,ffec9a18a3ce_aug1.png,0.000141,0.000154,0.013830,-0.06970,-0.003347,-0.002008,0.124630,-0.06384,-0.001102,...,0.950,-1.2840,0.44850,1.67400,-1.5030,0.8540,-0.6270,-2.0040,-1.25400,2


In [90]:
# Drop rows with null values
df_cleaned = df.dropna()

# Check for null values
null_values = df_cleaned.isnull().sum()

# Display the null value counts
print("Null value counts:")
print(null_values)

Null value counts:
id_code                  0
dense201_feature_0       0
dense201_feature_1       0
dense201_feature_2       0
dense201_feature_3       0
                        ..
dense201_feature_1916    0
dense201_feature_1917    0
dense201_feature_1918    0
dense201_feature_1919    0
label                    0
Length: 1922, dtype: int64


In [91]:
# Create a LabelEncoder object
label_encoder = LabelEncoder()

# Apply label encoding to categorical columns
categorical_columns = ['id_code']
for column in categorical_columns:
    df_cleaned[column] = label_encoder.fit_transform(df_cleaned[column])

# Display the DataFrame after label encoding
print(df_cleaned.head())

   id_code  dense201_feature_0  dense201_feature_1  dense201_feature_2  \
0        0           -0.000443            0.000154            0.008510   
1        1           -0.000448            0.000154           -0.006504   
2        2            0.000141            0.000154           -0.010620   
3        3           -0.000527            0.000154            0.031020   
4        4           -0.000412            0.000154            0.024730   

   dense201_feature_3  dense201_feature_4  dense201_feature_5  \
0            -0.01436           -0.001654           -0.003633   
1            -0.06824           -0.001775           -0.003572   
2             0.01137           -0.001896           -0.003150   
3            -0.12310            0.002342           -0.005016   
4            -0.18380            0.000162           -0.006462   

   dense201_feature_6  dense201_feature_7  dense201_feature_8  ...  \
0           -0.107240            -0.06610           -0.001102  ...   
1            0.079100   

In [92]:
X = df_cleaned.drop('label', axis=1).values  # Feature matrix
y = df_cleaned['label'].values  # Target variable
# Define binary PSO
options = {'c1': 0.5, 'c2': 0.3, 'w':0.9}
# Define bounds
max_bound = 1.0 * np.ones(X.shape[1])
min_bound = 0.0 * np.ones(X.shape[1])
bounds = (min_bound, max_bound)


In [93]:
def get_pca(x_train, x_test, n_components):
    pca = PCA(n_components=n_components, svd_solver='full')
    pca.fit(x_train)
    x_train = pca.transform(x_train)
    x_test = pca.transform(x_test)
    print(x_train.shape, x_test.shape)
    return x_train, x_test

In [94]:
# Objective function
def f_per_particle(m, alpha):
    """Computes for the objective function per particle

    Inputs
    ------
    m : numpy.ndarray
        Binary mask that can be obtained from BinaryPSO, will
        be used to mask features.
    alpha: float
        Penalty factor to control the number of features selected.

    Returns
    -------
    numpy.ndarray
    """
    total_features = X.shape[1]
    # Apply mask to features
    X_subset = X[:,m>0.5]
    if X_subset.shape[1] == 0:
        return float('inf')
    # Split the dataset
    X_train, X_test, y_train, y_test = train_test_split(X_subset, y, test_size=0.3, random_state=42)
    temp_train_data = X_train
    temp_test_data = X_test
    temp_train_data, temp_test_data = get_pca(temp_train_data, temp_test_data, 0.98)
    # Fit the model
    #clf = RandomForestClassifier(n_estimators=50, random_state=42)
    clf=SVC(C = 19,kernel='rbf', decision_function_shape='ovr')
    clf.fit(temp_train_data, y_train)
    # Predict and calculate accuracy
    y_pred = clf.predict(temp_test_data)
    accuracy = accuracy_score(y_test, y_pred)
    print("The Accuracy:")
    print(accuracy)
    # Calculate objective
    j = (alpha * (1.0 - accuracy) + (1.0 - alpha) * (1 - (X_subset.shape[1] / total_features)))
    print("The j:")
    print(j)
    return j

In [95]:
# Define objective function
def f(x, alpha=0.5):
    """Higher-level method to do classification in the
    whole swarm.

    Inputs
    ------
    x: numpy.ndarray of shape (n_particles, dimensions)
        The swarm that will perform the search

    Returns
    -------
    numpy.ndarray of shape (n_particles, )
        The computed loss for each particle
    """
    n_particles = x.shape[0]
    j = [f_per_particle(x[i], alpha) for i in range(n_particles)]
    return np.array(j)
# Initialize swarm
optimizer = ps.single.GlobalBestPSO(n_particles=50, dimensions=X.shape[1], options=options, bounds=bounds)

# Perform optimization
cost, pos = optimizer.optimize(f, iters=5)
#'pos' is the variable containing the best position returned by optimizer.optimize()
best_pos = np.array(pos)

print("Position as NumPy Array:", best_pos)
# Define a threshold to determine whether a feature is selected
threshold = 0.6

# Create a boolean mask from 'best_pos' based on the threshold
selected_features_mask = best_pos > threshold

# df_cleaned.columns[:-1] gives you all feature names excluding the target variable
feature_names = df_cleaned.columns[:-1]  # Modify this as necessary

# Use the mask to select the names of the features
selected_features = feature_names[selected_features_mask]

print("Selected Features:", selected_features)

2025-04-28 19:08:03,202 - pyswarms.single.global_best - INFO - Optimize for 5 iters with {'c1': 0.5, 'c2': 0.3, 'w': 0.9}
pyswarms.single.global_best:   0%|                                                                                 |0/5

(11847, 359) (5078, 359)
The Accuracy:
0.9456478928712091
The j:
0.28771743825986656
(11847, 385) (5078, 385)
The Accuracy:
0.9436786136274123
The j:
0.2746468982878035
(11847, 1) (5078, 1)
The Accuracy:
0.2203623473808586
The j:
0.6428120590008772
(11847, 377) (5078, 377)
The Accuracy:
0.9423001181567546
The j:
0.2779389570590511
(11847, 340) (5078, 340)
The Accuracy:
0.9403308389129579
The j:
0.2911568085497678
(11847, 362) (5078, 362)


pyswarms.single.global_best:   0%|                                                                                 |0/5


KeyboardInterrupt: 