In [None]:
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import seaborn as sns
import os
import glob
import numpy as np 

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Placeholder result tables, one per classifier abbreviation.
# Each name gets its own independent empty frame; they are reassigned
# further down once the result rows have been collected.
DT, KNN, SVM, NB = (pd.DataFrame() for _ in range(4))

In [None]:
def convert_to_accuracy_percentages(x):
    """Map ``x`` to ``100 * (1 - x)``.

    ``x`` can be anything that supports this arithmetic (a plain number,
    a NumPy array, a pandas Series, ...); e.g. an error rate of 0.25
    becomes 75.0.
    """
    complement = 1 - x
    return 100 * complement

In [None]:
# List of datasets
data_list = ['christensen', 'sun', 'alon', 'nakayama', 'tian', 'chin', 'chowdary', 
             'subramanian', 'chiaretti', 'golub', 'shipp', 'su', 'gordon', 'khan', 
             'singh', 'gravier', 'borovecki', 'sorlie', 'west', 'yeoh', 'burczynski',
             'pomeroy']

wrapper_list = ['Decision Tree', 'k-Nearest Neighbor', 'Naive Bayes', 'Support Vector Machine']

# Path to the folder containing data files.
# The original absolute location remains the default; set the PLOT_DATA_DIR
# environment variable to read the CSV files from somewhere else without
# editing this cell.
PATH_TEST = os.environ.get(
    'PLOT_DATA_DIR',
    '/Users/erirosv/fun/Master-Thesis/plotting-result/plots_v2/data_v2',
)
p = os.path.abspath(PATH_TEST)

# Load results_<name>.csv for every dataset and add a 'score_mean' column
# derived from 'cv_error_mean'.
datasets = []
missing_datasets = []  # names for which no results file was found
for d in data_list:
    # Escape the directory so glob metacharacters in the path are taken literally.
    csv_files = glob.glob(os.path.join(glob.escape(p), f'results_{d}.csv'))
    if not csv_files:
        missing_datasets.append(d)
        continue
    for csv_file in csv_files:
        dataset = pd.read_csv(csv_file)
        # Non-numeric error values are coerced to NaN, so their score is NaN too.
        cv_error = pd.to_numeric(dataset['cv_error_mean'], errors='coerce')
        dataset['score_mean'] = convert_to_accuracy_percentages(cv_error)
        datasets.append(dataset)  # Append the dataset to the list

# Report skipped datasets instead of dropping them silently.
if missing_datasets:
    print(f'No results file found in {p} for: {missing_datasets}')

# Later cells index datasets[0] and concatenate the list; fail here with a
# clear message rather than with an IndexError/ValueError further down.
if not datasets:
    raise FileNotFoundError(f'No results_<dataset>.csv files found in {p}')

In [None]:
# Show a compact per-frame summary (one row per loaded DataFrame, in load
# order) instead of echoing the full contents of every DataFrame in the list.
pd.DataFrame(
    [{'rows': frame.shape[0], 'columns': frame.shape[1]} for frame in datasets]
)

In [None]:
def convert_to_accuracy_percentages(x):
    """Return ``100 * (1 - x)``, the complement of ``x`` as a percentage.

    Note: a function with this name and the same formula is already defined
    in an earlier cell; running this cell re-binds the name to an
    equivalent implementation.
    """
    return 100 * (1 - x)

def calculate_method_results(data, col_names):
    """Build one pivot table per requested result column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame providing the 'num_features_algo' and '_wrapper' columns plus
        every column that ends up being pivoted.
    col_names : iterable of str
        Candidate column names. 'num_features_algo', '_wrapper',
        'score_mean' and 'CFS' are skipped.

    Returns
    -------
    dict
        Maps each remaining column name to
        ``data.pivot(index='num_features_algo', columns='_wrapper', values=name)``.
    """
    # Key columns, the derived score column and 'CFS' are never pivoted.
    skipped_names = ('num_features_algo', '_wrapper', 'score_mean', 'CFS')
    return {
        name: data.pivot(index='num_features_algo', columns='_wrapper', values=name)
        for name in col_names
        if name not in skipped_names
    }


# Column names for which results are to be calculated
col_names = [
    'CFS', 'FScore', 'GA',
    'InfoGain', 'MRMR', 'ReliefF',
    'SFS', 'SPFSR', 'RFI',
]

# Preview the first ten rows of the first loaded dataset
datasets[0].head(n=10)

In [None]:
# Stack all loaded datasets into a single frame with a fresh 0..n-1 index
combined_df_unique_names = pd.concat(datasets, ignore_index=True)

# Distinct values of the 'dataset' column across the combined frame
unique_datasets = combined_df_unique_names.loc[:, 'dataset'].unique()
print(unique_datasets)

In [None]:
# The first loaded dataset serves as the reference sample for this cell
test = datasets[0]
test.head()

# Every distinct dataset name seen across all loaded frames
ds_name = set()
for dataset in datasets:
    ds_name.update(dataset['dataset'])
unique_ds_names = list(ds_name)

# Columns of the reference frame that contain 'cv_' or are one of the key columns
key_columns = ['num_features_algo', '_wrapper', 'score_mean']
col_names = [column for column in test.columns if 'cv_' in column or column in key_columns]

# Every distinct 'num_features_algo' value seen across all loaded frames
ds_feature = set()
for dataset in datasets:
    ds_feature.update(dataset['num_features_algo'])
unique_ds_features = list(ds_feature)

# Values taken from the reference frame only
unique_folders = test['dataset'].unique()
unique_classifiers = test['_wrapper'].unique()

print('----- TEST values -----')
print(unique_folders)
print(unique_classifiers)
print('----- Actual values -----')
print(f'Dataset Names: {unique_ds_names}')
print(f'Column Names: {col_names}')
print(f'Features: {unique_ds_features}')

In [None]:
# Per-classifier lists of row subsets, keyed by the abbreviation used in '_wrapper'
matching_DT, matching_SVM, matching_NB, matching_KNN = [], [], [], []
rows_by_wrapper = {
    'DT': matching_DT,
    'SVM': matching_SVM,
    'NB': matching_NB,
    'KNN': matching_KNN,
}

for dataset in datasets:
    # Frames without a '_wrapper' column contribute nothing
    if '_wrapper' not in dataset.columns:
        continue
    wrapper_values = dataset['_wrapper'].values
    for wrapper_name, collected_rows in rows_by_wrapper.items():
        # Only append when the frame actually has rows for this classifier
        if wrapper_name in wrapper_values:
            collected_rows.append(dataset[dataset['_wrapper'] == wrapper_name])

# One combined frame per classifier, each with a fresh 0..n-1 index
DT = pd.concat(matching_DT, ignore_index=True)
SVM = pd.concat(matching_SVM, ignore_index=True)
NB = pd.concat(matching_NB, ignore_index=True)
KNN = pd.concat(matching_KNN, ignore_index=True)

In [None]:
# Distinct values of the 'fs_method' column in the DT results, as a plain list
fs_methods_name = [method for method in DT['fs_method'].unique()]

## PLOTS

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [None]:
# Distinct 'fs_method' values across the combined frame of all datasets
unique_fs_methods = combined_df_unique_names.loc[:, 'fs_method'].unique()
print(unique_fs_methods)

In [None]:
# Line styles and markers; both are cycled (index modulo length) over the fs_methods
line_styles = ['-', '--', '-.', ':', '-', '--', '-.', ':', '-']
markers = ['o', 's', 'D', 'X', '^', 'v', '<', '>', '+', '*']

# List of classifiers and their corresponding dataframes (same order)
classifiers = ['DT', 'KNN', 'NB', 'SVM']
classifier_dataframes = [DT, KNN, NB, SVM]

# Pair every abbreviation with its own frame explicitly. The plotting loop
# uses this mapping rather than zipping `unique_classifiers` (whose order is
# whatever the first dataset happens to contain) with the fixed frame list:
# a different order there would pair e.g. 'SVM' with the DT frame, and the
# '_wrapper' filter below would then return no rows at all.
frames_by_classifier = dict(zip(classifiers, classifier_dataframes))

# Create a new folder for saving the plots
new_folder_name = "FINAL_VERSION_INDIVIDUAL"
current_folder = os.getcwd()
new_folder_path = os.path.join(current_folder, new_folder_name)
os.makedirs(new_folder_path, exist_ok=True)

# Define the classifiers and methods to plot
# (kept with their original final values; the loop below does not rely on their order)
classifiers = unique_classifiers
methods = unique_fs_methods

# Plot-wide constants
FONT_SIZE = 28
custom_ticks = [5, 10, 15, 20, 25]  # custom x-axis ticks and labels

# Iterate through each dataset
for dataset_name in data_list:
    for classifier, dataframe in frames_by_classifier.items():
        classifier_df = dataframe[dataframe['_wrapper'] == classifier]

        # Collect the non-empty curves first so that no blank figure is created or saved.
        # `j` is the position of the fs_method among all groups of this classifier
        # (including 'PCA'), so a method keeps the same colour/style in every plot.
        curves = []
        for j, (fs_method, fs_method_data) in enumerate(classifier_df.groupby('fs_method')):
            if fs_method == 'PCA':
                continue  # Skip plotting 'PCA'
            # Filter the data for the current dataset
            dataset_data = fs_method_data[fs_method_data['dataset'] == dataset_name]
            if not dataset_data.empty:
                curves.append((j, fs_method, dataset_data))

        if not curves:
            print(f'No results for {classifier} on {dataset_name}; skipping plot')
            continue

        # Create a figure for the current dataset and classifier
        fig, ax = plt.subplots(figsize=(12, 6))

        # Define a custom color palette with distinct colors for each fs_method
        custom_palette = sns.color_palette("husl", len(classifier_df['fs_method'].unique()))

        for j, fs_method, dataset_data in curves:
            # Plot the data for the current fs_method with custom line style, marker, and color
            sns.lineplot(
                data=dataset_data, x='num_features_algo', y='score_mean',
                label=fs_method, linestyle=line_styles[j % len(line_styles)], marker=markers[j % len(markers)],
                color=custom_palette[j],  # Use a distinct color from the custom palette
                # No confidence band. NOTE(review): newer seaborn releases spell this
                # `errorbar=None`; `ci` is kept for compatibility with the installed version.
                ci=None,
                ax=ax
            )

        # Set plot labels and title
        ax.set_xlabel('Number of Selected Features', fontsize=FONT_SIZE)
        ax.set_ylabel('Accuracy (%)', fontsize=FONT_SIZE)
        ax.set_title(f'{classifier} - {dataset_name}', fontsize=FONT_SIZE + 2)

        # Add a legend to the plot (anchored outside the axes, top right)
        ax.legend(loc='upper left', bbox_to_anchor=(1, 1), fontsize=FONT_SIZE - 16)

        # Set custom x-axis ticks and labels
        ax.set_xticks(custom_ticks)
        ax.set_xticklabels(custom_ticks)

        # Save the individual plot for the dataset and classifier as .png and .eps
        plot_filename = f"{dataset_name}_{classifier}_plot"
        plot_filepath_png = os.path.join(new_folder_path, f"{plot_filename}.png")
        plot_filepath_eps = os.path.join(new_folder_path, f"{plot_filename}.eps")
        fig.savefig(plot_filepath_png, format='png', dpi=300, bbox_inches='tight')
        fig.savefig(plot_filepath_eps, format='eps', dpi=300, bbox_inches='tight')

        # Show the plot, then close this specific figure to release resources
        plt.show()
        plt.close(fig)