## Setup

In [None]:
%pip install scipy
%pip install matplotlib
%pip install sentence_transformers

## Import

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import os

## Setup Directories

In [None]:
dataset_name = "RedHat"

# Every figure produced by this notebook is saved under ./RESULTS/<dataset_name>.
results_path = f'./RESULTS/{dataset_name}'

# exist_ok=True creates the directory (and any missing parents) in a single
# call, so there is no gap between "check it exists" and "create it".
os.makedirs(results_path, exist_ok=True)

## Metrics Plot

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from ast import literal_eval

# Load the monthly test results; the 'index' column holds the period dates.
test_results_data = pd.read_csv('test_results_monthly.csv', index_col='index')
test_results_data.index = pd.to_datetime(test_results_data.index)

# The fourth data column (position 3, not counting the index) stores a dict of
# metrics serialised as a string; parse every cell back into a dict.
# NOTE(review): presumably this is the 'macro avg' entry of a classification
# report -- confirm against the code that writes the CSV.
macro_avg_metrics = [literal_eval(cell) for cell in test_results_data.iloc[:, 3]]

# Expose the individual metrics as plain numeric columns for plotting.
test_results_data['Precision'] = [m['precision'] for m in macro_avg_metrics]
test_results_data['Recall'] = [m['recall'] for m in macro_avg_metrics]
test_results_data['F1-Score'] = [m['f1-score'] for m in macro_avg_metrics]
test_results_data['Accuracy'] = test_results_data['accuracy'].tolist()

# Filter out data from 2022
test_results_data = test_results_data[test_results_data.index.year != 2022]

# Build the metric lists only AFTER the 2022 rows are dropped, so that they
# cover exactly the same rows as the filtered DataFrame. Later cells feed these
# lists to the normality and correlation tests alongside a cosine-similarity
# series that also has 2022 removed; lists built before the filter would still
# contain the 2022 values.
precision_data = test_results_data['Precision'].tolist()
recall_data = test_results_data['Recall'].tolist()
f1_score_data = test_results_data['F1-Score'].tolist()
accuracy_data = test_results_data['Accuracy'].tolist()

# Function to plot metrics
def plot_two_metrics(data, metric1, metric2, title, is_annual=False):
    """Plot two metric columns against the index, save the figure and show it.

    Args:
        data: DataFrame with a datetime index containing both metric columns.
        metric1: Name of the first column to plot (also its legend label).
        metric2: Name of the second column to plot (also its legend label).
        title: Figure title; also used as the PNG file name under ``results_path``.
        is_annual: When True, label the x-axis 'Year' and put one tick on every
            data point showing its year. Otherwise label it 'Month' and put
            rotated ticks at each year start within the data range.
    """
    plt.figure(figsize=(10, 6))
    for metric in (metric1, metric2):
        plt.plot(data.index, data[metric], marker='o', label=metric)

    x_label = 'Year' if is_annual else 'Month'
    plt.title(title)
    plt.xlabel(x_label)
    plt.ylabel('Value')
    plt.legend()
    plt.grid(True)

    if not is_annual:
        # Monthly data: only mark the start of each year to keep the axis readable.
        year_starts = pd.date_range(start=data.index.min(), end=data.index.max(), freq='YS')
        plt.xticks(year_starts, rotation=45)
    else:
        # Annual data: one tick per point, labelled with the bare year.
        year_labels = [str(stamp.year) for stamp in data.index]
        plt.xticks(data.index, year_labels)

    plt.tight_layout()
    output_file = f'{results_path}/{title}.png'
    plt.savefig(output_file)
    plt.show()

# Monthly plots
plot_two_metrics(test_results_data, 'F1-Score', 'Accuracy', 'F1-Score and Accuracy (Monthly)')
plot_two_metrics(test_results_data, 'Precision', 'Recall', 'Precision and Recall (Monthly)')

# Annual plots: average every metric per calendar year, labelled at the year start.
# 'YS' is the current year-start alias (the older 'AS' spelling is deprecated) and
# matches the frequency already used for the monthly tick marks.
annual_metrics_data = test_results_data[['Precision', 'Recall', 'F1-Score', 'Accuracy']].resample('YS').mean()
plot_two_metrics(annual_metrics_data, 'F1-Score', 'Accuracy', 'F1-Score and Accuracy (Annual)', is_annual=True)
plot_two_metrics(annual_metrics_data, 'Precision', 'Recall', 'Precision and Recall (Annual)', is_annual=True)


## Cosine Similarity Plot

In [None]:
# Read the monthly cosine similarity values, indexed by the 'Month' column.
cosine_similarity_df = pd.read_csv("cosine_similarity_monthly.csv", index_col='Month')

# Turn the index into real timestamps so it can be filtered and resampled by date.
cosine_similarity_df.index = pd.to_datetime(cosine_similarity_df.index)

# Keep every row whose timestamp does not fall in 2022.
_keep_rows = cosine_similarity_df.index.year != 2022
cosine_similarity_df = cosine_similarity_df[_keep_rows]

# Function to plot cosine similarity with correct annual labels
def plot_cosine_similarity(data, title, is_annual=False):
    """Plot the 'Cosine Similarity' column against the index, save it and show it.

    Args:
        data: DataFrame with a datetime index and a 'Cosine Similarity' column.
        title: Figure title; also used as the PNG file name under ``results_path``.
        is_annual: When True, label the x-axis 'Year' and put one tick on every
            data point showing its year. Otherwise label it 'Month' and put
            rotated ticks at each year start within the data range.
    """
    series_name = 'Cosine Similarity'
    x_label = 'Year' if is_annual else 'Month'

    plt.figure(figsize=(10, 6))
    plt.plot(data.index, data[series_name], marker='o', label='Cosine Similarity')
    plt.title(title)
    plt.xlabel(x_label)
    plt.ylabel('Cosine Similarity')
    plt.legend()
    plt.grid(True)

    if not is_annual:
        # Monthly data: show a tick only at the first month of each year.
        year_starts = pd.date_range(start=data.index.min(), end=data.index.max(), freq='YS')
        plt.xticks(year_starts, rotation=45)
    else:
        # Annual data: one tick per point, labelled with the bare year.
        year_labels = [str(stamp.year) for stamp in data.index]
        plt.xticks(data.index, year_labels)

    plt.tight_layout()
    output_file = f'{results_path}/{title}.png'
    plt.savefig(output_file)
    plt.show()

# Monthly plot for cosine similarity (x-ticks at the start of each year)
plot_cosine_similarity(cosine_similarity_df, 'Monthly Cosine Similarity')

# Annual plot for cosine similarity
# 'YS' groups by calendar year and labels each group with the start of that year;
# it is the current spelling of the deprecated 'AS' alias and matches the
# frequency used for the monthly tick marks.
annual_cosine_similarity = cosine_similarity_df.resample('YS').mean()
plot_cosine_similarity(annual_cosine_similarity, 'Annual Cosine Similarity', is_annual=True)

## Shapiro-Wilk Test for Normality
This test will help determine if the variables have a normal distribution, which is a prerequisite for performing Pearson's correlation.

In [None]:
from scipy.stats import shapiro

# Function to perform Shapiro-Wilk test
def shapiro_wilk_test(data, alpha=0.05):
    """Run the Shapiro-Wilk normality test on *data* and print the outcome.

    Args:
        data: Sample values, passed unchanged to ``scipy.stats.shapiro``.
        alpha: Significance threshold. A p-value strictly greater than this
            is reported as looking normally distributed.

    Returns:
        None. The test statistic, the p-value and a one-line verdict are printed.
    """
    w_statistic, p_value = shapiro(data)
    print(f'Statistics={w_statistic:f}, p={p_value:f}')

    verdict = (
        'Sample looks normally distributed'
        if p_value > alpha
        else 'Sample does not look normally distributed'
    )
    print(verdict)

Execute the Shapiro-Wilk test on recall, precision, cosine similarity, and F1-score

In [None]:
# Check each series for normality, printing a heading before every result.
# Order: recall, precision, cosine similarity, F1-score.
_normality_checks = (
    ("Recall Shapiro-Wilk Test:", recall_data),
    ("Precision Shapiro-Wilk Test:", precision_data),
    ("Cosine Similarity Shapiro-Wilk Test:", cosine_similarity_df['Cosine Similarity']),
    ("F1-Score Shapiro-Wilk Test:", f1_score_data),
)
for _heading, _values in _normality_checks:
    print(_heading)
    shapiro_wilk_test(_values)

## Calculate Pearson Correlation Coefficient
This section calculates Pearson's correlation coefficient to explore the relationship between each classification metric (precision, recall, and F1-score) and the cosine similarity values.

In [None]:
from scipy.stats import pearsonr

def pearson(data):
    """Print Pearson's correlation between *data* and the cosine similarity series.

    Args:
        data: Sequence of values to correlate with the 'Cosine Similarity'
            column of the module-level ``cosine_similarity_df``.
            NOTE(review): presumably one value per row of that frame, in the
            same order -- confirm at the call site.
    """
    reference = cosine_similarity_df['Cosine Similarity']
    coefficient, significance = pearsonr(data, reference)

    print(
        f"Correlation coefficient: {coefficient}",
        f"P-value: {significance}",
        sep="\n",
    )

# Report Pearson's correlation with cosine similarity for each metric in turn:
# precision, recall, then F1-score.
_pearson_runs = (
    ("Precision Pearson's Correlation:", precision_data),
    ("Recall Pearson's Correlation:", recall_data),
    ("F1-Score Pearson's Correlation:", f1_score_data),
)
for _heading, _values in _pearson_runs:
    print(_heading)
    pearson(_values)

## Calculate Spearman Correlation Coefficient

In [None]:
from scipy.stats import spearmanr

def spearman(data):
    """Print Spearman's rank correlation between *data* and the cosine similarity series.

    Args:
        data: Sequence of values to correlate with the 'Cosine Similarity'
            column of the module-level ``cosine_similarity_df``.
            NOTE(review): presumably one value per row of that frame, in the
            same order -- confirm at the call site.
    """
    reference = cosine_similarity_df['Cosine Similarity']
    coefficient, significance = spearmanr(data, reference)

    print(
        f"Correlation coefficient: {coefficient}",
        f"P-value: {significance}",
        sep="\n",
    )
    
# Report Spearman's correlation with cosine similarity for each metric in turn:
# precision, recall, then F1-score.
_spearman_runs = (
    ("Precision Spearman's Correlation:", precision_data),
    ("Recall Spearman's Correlation:", recall_data),
    ("F1-Score Spearman's Correlation:", f1_score_data),
)
for _heading, _values in _spearman_runs:
    print(_heading)
    spearman(_values)