In [None]:
import pandas as pd
import os
import glob
import matplotlib.pyplot as plt
import seaborn as sns
import glob

In [None]:
# Names of the datasets whose merged result files are loaded below
data_list = ['christensen', 'sun', 'alon', 'nakayama', 'tian', 'chin', 'chowdary', 
             'subramanian', 'chiaretti', 'golub', 'shipp', 'su', 'gordon', 'khan', 
             'singh', 'gravier', 'borovecki', 'sorlie', 'west']

wrapper_list = ['Decision Tree', 'k-Nearest Neighbor', 'Naive Bayes', 'Support Vector Machine']

# Path to the folder containing data files
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere
PATH_TEST = '/Users/erirosv/fun/Master-Thesis/plotting-result/version2/merged-data'
p = os.path.abspath(PATH_TEST)

# Load `results_<name>.csv` for every dataset name and derive an accuracy column.
datasets = []
missing_datasets = []  # names for which no results file was found
for d in data_list:
    csv_files = glob.glob(os.path.join(p, f'results_{d}.csv'))
    if not csv_files:
        # glob returns an empty list for a missing file; remember the name
        # instead of skipping it silently.
        missing_datasets.append(d)
    for csv_file in csv_files:
        dataset = pd.read_csv(csv_file)
        # score_mean = 100 * (1 - cv_error_mean); values that cannot be parsed
        # as numbers are coerced to NaN.
        dataset['score_mean'] = 100 * (1 - pd.to_numeric(dataset['cv_error_mean'], errors='coerce'))
        datasets.append(dataset)  # Append the dataset to the list

if missing_datasets:
    print(f'WARNING: no results file found in {p} for: {missing_datasets}')

In [None]:
# Preview the first ten rows of the first loaded dataset
datasets[0].head(10)

In [None]:
# Map an error rate onto an accuracy expressed in percent
def convert_to_accuracy_percentages(x):
    """Return ``100 * (1 - x)`` for the given value ``x``."""
    complement = 1 - x
    return 100 * complement

# Build one pivot table per requested result column
def calculate_method_results(data, col_names):
    """Pivot ``data`` once for every usable name in ``col_names``.

    Each pivot has 'num_features_algo' as its index, one column per
    '_wrapper' value, and the named column as its values. The names
    'num_features_algo', '_wrapper', 'score_mean' and 'CFS' are skipped.

    Returns a dict mapping each remaining column name to its pivot table.

    NOTE: a later cell defines a one-argument function with this same name,
    which replaces this one once that cell has run.
    """
    skipped_names = ('num_features_algo', '_wrapper', 'score_mean', 'CFS')
    return {
        name: data.pivot(index='num_features_algo', columns='_wrapper', values=name)
        for name in col_names
        if name not in skipped_names
    }

# Define the column names to calculate results
# NOTE: `col_names` is rebuilt from the loaded columns in a later cell,
# which replaces this list once that cell has run.
col_names = ['CFS', 'FScore', 'GA', 'InfoGain', 'MRMR', 'ReliefF', 'SFS', 'SPFSR', 'RFI']

# Preview the first ten rows of the first loaded dataset
datasets[0].head(10)

In [None]:
# First loaded dataset, used as the reference for column names and the
# "TEST values" printed below.
test = datasets[0]

# Every dataset name that occurs in any loaded file, in order of first
# appearance. pandas `unique()` keeps the order stable between kernel runs
# (iterating a set of strings does not) and collapses repeated NaN values.
unique_ds_names = pd.concat(
    [dataset['dataset'] for dataset in datasets], ignore_index=True
).unique().tolist()
ds_name = set(unique_ds_names)  # kept for cells that still expect the set

# Columns of the reference dataset that hold cross-validation values
# ('cv_' in the name) plus the three key columns.
col_names = [
    column
    for column in test.columns
    if 'cv_' in column or column in ('num_features_algo', '_wrapper', 'score_mean')
]

# Every feature count that occurs in any loaded file, in order of first appearance.
unique_ds_features = pd.concat(
    [dataset['num_features_algo'] for dataset in datasets], ignore_index=True
).unique().tolist()
ds_feature = set(unique_ds_features)  # kept for cells that still expect the set

# Values taken from the reference dataset only
unique_folders = datasets[0]['dataset'].unique()
unique_classifiers = datasets[0]['_wrapper'].unique()

print('----- TEST values -----')
print(unique_folders)
print(unique_classifiers)
print('----- Actual values -----')
print(f'Dataset Names: {unique_ds_names}')
print(f'Column Names: {col_names}')
print(f'Features: {unique_ds_features}')

In [None]:
def _rows_for_wrapper(frames, wrapper):
    """Collect, per frame, the rows whose '_wrapper' value equals ``wrapper``.

    Frames without a '_wrapper' column, or without any row for ``wrapper``,
    contribute nothing. Returns a list of DataFrames, one per contributing
    frame, in the order of ``frames``.
    """
    return [
        frame[frame['_wrapper'] == wrapper]
        for frame in frames
        if '_wrapper' in frame.columns and wrapper in frame['_wrapper'].values
    ]


# Per-classifier row selections from every loaded dataset
matching_DT = _rows_for_wrapper(datasets, 'DT')
matching_SVM = _rows_for_wrapper(datasets, 'SVM')
matching_NB = _rows_for_wrapper(datasets, 'NB')
matching_KNN = _rows_for_wrapper(datasets, 'KNN')

# One frame per classifier, re-indexed from 0.
# pd.concat raises ValueError if a list is empty (no dataset had that wrapper).
DT = pd.concat(matching_DT, ignore_index=True)
SVM = pd.concat(matching_SVM, ignore_index=True)
NB = pd.concat(matching_NB, ignore_index=True)
KNN = pd.concat(matching_KNN, ignore_index=True)

In [None]:
# Preview the first ten rows of the concatenated KNN frame
KNN.head(10)

In [None]:
def calculate_method_results(dataframe):
    """Summarise 'score_mean' of ``dataframe`` at several groupings.

    Returns a dict with four entries, in this order:

    - 'num_features_algo': mean 'score_mean' per ('dataset', 'num_features_algo')
    - '_wrapper': the '_wrapper' column of the first row of each distinct
      ('dataset', '_wrapper') pair, with the original row index kept
    - 'score_mean': mean 'score_mean' per 'dataset'
    - 'fs_method': mean 'score_mean' per ('dataset', 'fs_method')

    The three mean tables are flat DataFrames (group keys as regular columns).
    """
    def mean_score_by(group_keys):
        # Average score within each group, with the keys turned back into columns
        return dataframe.groupby(group_keys)['score_mean'].mean().reset_index()

    return {
        'num_features_algo': mean_score_by(['dataset', 'num_features_algo']),
        '_wrapper': dataframe.drop_duplicates(['dataset', '_wrapper'])[['_wrapper']],
        'score_mean': mean_score_by(['dataset']),
        'fs_method': mean_score_by(['dataset', 'fs_method']),
    }


In [None]:
# Summarise each classifier frame; calls run in the order DT, NB, SVM, KNN
method_result_DT, method_result_NB, method_result_SVM, method_result_KNN = (
    calculate_method_results(classifier_frame)
    for classifier_frame in (DT, NB, SVM, KNN)
)

In [None]:
# Preview the first twenty rows of the concatenated NB frame
NB.head(20)

In [None]:
# Show the DT summary's keys, its (key, value) pairs and the whole dict,
# each separated by one blank line.
print(method_result_DT.keys(), method_result_DT.items(), method_result_DT, sep='\n\n')

## Plotting

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [None]:
# Line styles and markers, cycled over the feature-selection methods of a plot
line_styles = ['-', '--', '-.', ':', '-', '--', '-.', ':', '-']
markers = ['o', 's', 'D', 'X', '^', 'v', '<', '>', '+']

# Classifier labels as stored in the '_wrapper' column, paired with their frames
classifiers = ['DT', 'KNN', 'NB', 'SVM']
classifier_dataframes = [DT, KNN, NB, SVM]

# Output folder for the figures, created under the current working directory
new_folder_name = "acc_num_feature"
current_folder = os.getcwd()
new_folder_path = os.path.join(current_folder, new_folder_name)
os.makedirs(new_folder_path, exist_ok=True)

# Styling shared by every figure (does not change between classifiers)
FONT_SIZE = 28
custom_ticks = [5, 10, 15, 20, 25]

# One figure per classifier: accuracy against number of selected features,
# with one line per feature-selection method.
for classifier, dataframe in zip(classifiers, classifier_dataframes):
    fig, ax = plt.subplots(figsize=(12, 6))
    print(classifier)
    classifier_df = dataframe[dataframe['_wrapper'] == classifier]
    custom_palette = sns.color_palette("husl", len(classifier_df['fs_method'].unique()))

    for i, (fs_method, fs_method_data) in enumerate(classifier_df.groupby('fs_method')):
        sns.lineplot(
            data=fs_method_data, x='num_features_algo', y='score_mean',
            label=fs_method, linestyle=line_styles[i % len(line_styles)], marker=markers[i % len(markers)],
            color=custom_palette[i],
            errorbar=None,
            ax=ax,
        )

    ax.set_xlabel('Number of Selected Features', fontsize=FONT_SIZE)
    ax.set_ylabel('Accuracy (%)', fontsize=FONT_SIZE)
    ax.set_title(f'{classifier}', fontsize=FONT_SIZE + 2)
    ax.set_xticks(custom_ticks)
    ax.set_xticklabels(custom_ticks)
    # Legend sits to the right of the axes, outside the plotting area
    ax.legend(loc='center left', bbox_to_anchor=(1, 0.5), fontsize=FONT_SIZE - 16)

    # Apply the layout BEFORE saving so the files match what is displayed.
    fig.tight_layout()

    # bbox_inches='tight' grows the saved area to include the legend placed
    # outside the axes; without it the legend is cut off in the files.
    plot_filename_png = f"{classifier}_plot.png"
    plot_filepath_png = os.path.join(new_folder_path, plot_filename_png)
    fig.savefig(plot_filepath_png, format='png', dpi=300, bbox_inches='tight')

    plot_filename_eps = f"{classifier}_plot.eps"
    plot_filepath_eps = os.path.join(new_folder_path, plot_filename_eps)
    fig.savefig(plot_filepath_eps, format='eps', dpi=300, bbox_inches='tight')

    plt.show()
    # Release the figure so figures do not pile up in memory across iterations
    plt.close(fig)
