#### Import the packages and functions

In [None]:
## --- Import the packages and functions
from tools.functions import *
import matplotlib.pyplot as plt
import pickle

#### Definitions, parameters and directory paths

In [None]:
# --- Definitions, parameters and directory paths
# Root folder that holds every image used or produced by this notebook
base_path = '../img'

# Gray-scale dataset folder, located under the image root
dataset_path = base_path + '/dataset/gray/'

# Train/test split ratios to evaluate
train_ratio = [0.75, 0.8, 0.85, 0.9]

# Threshold used when normalizing the image data
thresh_normalization = 0.5

# Thresholds to be considered 'good' in evaluation
thresh_good = [0.0, 0.1, 0.15]

# Iteration indices (1 through 5) used during model training
iterations = range(1, 6)

# Classification types the model learns, each mapped to its ordered category labels
classification_types = {
    'n_grains': ['50', '60', '70', '80', '90', '100'],
    'defect_stratified': ['0%', '10%', '15%', '20%', '25%', '30%'],
    'defect_thresholded': ['With defects', 'Healthy'],
}

# Axis labels used when plotting (x axis first, then y axis)
x_label, y_label = 'Predicted', 'True'

## ---

#### Reading the dataframes and classification results

In [43]:
# Pickled artifacts to load, listed in the same order as the names they are unpacked into below
files = ['../resources/df_train.pkl', '../resources/df_test.pkl',
         '../resources/classification_metrics_results.pickle', '../resources/confusion_matrices.pickle']


def _load_pickle(path):
    """Return the object stored in the pickle file at `path`, closing the file afterwards."""
    # The context manager releases the file handle even if unpickling raises
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# NOTE: unpickling can execute arbitrary code; only load files produced by this project.
df_train, df_test, classification_metrics_results, confusion_matrices = (_load_pickle(file) for file in files)

#### Export a selected confusion matrix in PDF format

In [None]:
# Select the configuration to export: first train ratio, second 'good' threshold, grain-count classification
train_ratio_item = train_ratio[0]
thresh_good_item = thresh_good[1]
classification_type = 'n_grains'
image_name = 'cm_075_ngrains'

# Entry stored under the 'mean' key for that configuration
# (presumably the matrix averaged over the iterations; confirm against the code that builds it)
cm = confusion_matrices[train_ratio_item][thresh_good_item][classification_type]['mean']

# A forward slash keeps the output path portable and avoids the invalid '\{' escape sequence;
# the file is written one directory above the notebook, as before.
export_confusion_matrix_as_image(cm, classification_types[classification_type], f"../{image_name}.pdf", [x_label, y_label])

#### Show the classification results for a selected configuration

In [None]:
# Configuration whose metrics are displayed: first train ratio, second 'good' threshold
classification_type = 'defect_thresholded'
train_ratio_item, thresh_good_item = train_ratio[0], thresh_good[1]

# Fetch the entry stored under 'mean' and show it as a transposed table
selected_metrics = classification_metrics_results[train_ratio_item][thresh_good_item][classification_type]['mean']
metrics_table = pd.DataFrame(selected_metrics).T
display(metrics_table)

#### Visualize the effect of normalization

In [None]:
def plot_relation(df_train, df_test, normalization_thresh, save_path):
    """Plot the light/dark pixel ratio before and after normalization, then save and show it.

    Args:
        df_train: Training DataFrame; a copy of it is handed to `normalize_dataset`.
        df_test: Test DataFrame, passed to `normalize_dataset` as-is.
        normalization_thresh: Threshold forwarded to `normalize_dataset`.
        save_path: File path the figure is written to.
    """
    ratio_col = 'ratio_80to255_by_1to80'
    # Column expected to be present after normalize_dataset has run (TODO confirm in tools.functions)
    normalized_col = 'normalized_ratio_80to255_by_1to80'

    # Normalize a copy of the training frame; the returned test frame is not used further here
    normalized_train, _ = normalize_dataset(df_train.copy(), df_test, ratio_col, normalization_thresh)

    # Keep only rows with defect_percentage == 0 and grain_quantity == 100, and the columns of interest
    row_mask = (normalized_train['defect_percentage'] == 0.00) & (normalized_train['grain_quantity'] == 100)
    selected_columns = ['npixels_1to255_per_grain', ratio_col, normalized_col]
    subset = normalized_train.loc[row_mask, selected_columns].copy()

    # One x position per remaining row
    positions = np.arange(len(subset))

    # Plot styling and canvas
    sns.set(style='whitegrid', context='notebook')
    plt.figure(figsize=(10, 6))

    # Both series share the same marker and line settings
    line_style = dict(marker='o', linestyle='-', linewidth=2, markersize=4)
    plt.plot(positions, subset[ratio_col], label='Ratio between light/dark pixels', **line_style)
    plt.plot(positions, subset[normalized_col], label='Normalized ratio between light/dark pixels', **line_style)

    # Axis titles, legend and tick label sizes
    plt.xlabel('Occurrence', fontsize=18)
    plt.ylabel('Relation', fontsize=18)
    plt.legend(fontsize=18)
    plt.xticks(fontsize=16)
    plt.yticks(fontsize=16)

    # Thin black frame around the axes
    for border in plt.gca().spines.values():
        border.set_edgecolor('black')
        border.set_linewidth(1)

    # Write the figure to disk before showing it
    plt.savefig(save_path, dpi=300, bbox_inches='tight')
    plt.show()

# Normalization thresholds to compare. The first entry (100.0) is paired with the
# 'without_normalization' output file, i.e. presumably high enough to leave the data unchanged.
normalization_thresholds = [100.0, 1.000, 0.750, 0.500]

# Output file for each threshold, in the same order as above. Forward slashes match the
# separator already used in base_path and avoid the invalid '\w' escape sequence.
save_paths = [
    f'{base_path}/without_normalization.pdf',
    f'{base_path}/normalization_upTo_1.pdf',
    f'{base_path}/normalization_upTo_075.pdf',
    f'{base_path}/normalization_upTo_050.pdf',
]

# Produce one plot per (threshold, output file) pair
for normalization_thresh, save_path in zip(normalization_thresholds, save_paths):
    plot_relation(df_train, df_test, normalization_thresh, save_path)


#### Plots the boxplot to verify the importance of cross-validation

In [None]:
# Define the train ratios and the range of iterations
train_ratio = [0.75, 0.8, 0.85, 0.9]
iterations = range(1, 6)

# One list of per-iteration accuracies per training ratio, plus the matching x-axis labels
accuracy_data = []
x_labels = []

for ratio in train_ratio:
    # Results for this ratio at the second 'good' threshold, stratified-defect classification
    per_iteration = classification_metrics_results[ratio][thresh_good[1]]['defect_stratified']
    accuracy_data.append([per_iteration[i]['accuracy'] for i in iterations])
    x_labels.append(ratio)

# Set up the plot with a specified size and style
plt.figure(figsize=(10, 6))
plt.style.use('default')

# One box per training ratio.
# NOTE: Matplotlib 3.9 renamed `labels` to `tick_labels`; `labels` is kept here so the
# cell still runs on older Matplotlib versions.
plt.boxplot(accuracy_data, labels=x_labels)

# Set the x and y axis labels
plt.xlabel('Training Percentage', fontsize=16)
plt.ylabel('Accuracy', fontsize=16)

# Set the font size for x and y tick labels
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)

# Set the plot borders to black and set the border line width
for spine in plt.gca().spines.values():
    spine.set_edgecolor('black')
    spine.set_linewidth(1)

# Remove gridlines from the plot
plt.grid(False)

# Save the figure one directory above the notebook. A forward slash keeps the path portable;
# a backslash is only a path separator on Windows.
plt.savefig('../accuracy_boxplot_defect_stratified.pdf', dpi=300, bbox_inches='tight')

# Display the plot
plt.show()