## Setup

In [None]:
%pip install scipy

## Import

In [None]:
import pandas as pd

## Load from CSV

In [None]:
from ast import literal_eval

# Read both monthly tables from disk before touching any column
cosine_frame = pd.read_csv('cosine_similarity_monthly.csv')
test_results_data = pd.read_csv('test_results_monthly.csv')

# Keep the non-missing cosine similarity values as a plain Python list
cosine_similarity_data = cosine_frame['Cosine Similarity'].dropna().tolist()

# Every 'macro avg' cell is a dict literal stored as text; parse each cell once
macro_avg_records = [literal_eval(cell) for cell in test_results_data['macro avg']]

# Split the parsed dicts into one list per metric, preserving row order
precision_data = [record['precision'] for record in macro_avg_records]
recall_data = [record['recall'] for record in macro_avg_records]
f1_score_data = [record['f1-score'] for record in macro_avg_records]

## Shapiro-Wilk Test for Normality
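This test checks whether each variable is plausibly normally distributed. Normality is an assumption behind the significance test (p-value) of Pearson's correlation; when it does not hold, Spearman's rank correlation is the safer choice.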

In [None]:
from scipy.stats import shapiro

def shapiro_wilk_test(data, alpha=0.05):
    """Run the Shapiro-Wilk normality test on ``data`` and print the outcome.

    Two lines are printed: the test statistic with its p-value, followed by
    a verdict. The sample is reported as looking normally distributed only
    when the p-value is strictly greater than ``alpha``.

    Args:
        data: Sample passed straight to ``scipy.stats.shapiro``.
        alpha: Significance threshold the p-value is compared against.

    Returns:
        None. The results are only printed.
    """
    result = shapiro(data)
    stat, p = result[0], result[1]
    print('Statistics=%f, p=%f' % (stat, p))

    # Keep the comparison as "p > alpha" so a NaN p-value falls to the
    # "does not look" branch.
    verdict = (
        'Sample looks normally distributed'
        if p > alpha
        else 'Sample does not look normally distributed'
    )
    print(verdict)

Execute the Shapiro-Wilk test on recall, precision, cosine similarity, and F1-score

In [None]:
# Check each series for normality, printing a heading before every result.
# The tuple order (recall, precision, cosine similarity, F1-score) fixes the
# order of the printed output.
normality_checks = (
    ("Recall Shapiro-Wilk Test:", recall_data),
    ("Precision Shapiro-Wilk Test:", precision_data),
    ("Cosine Similarity Shapiro-Wilk Test:", cosine_similarity_data),
    ("F1-Score Shapiro-Wilk Test:", f1_score_data),
)
for heading, series in normality_checks:
    print(heading)
    shapiro_wilk_test(series)

## Calculate Pearson Correlation Coefficient
This section calculates Pearson's correlation coefficient to explore the relationship between each classification metric (precision, recall, and F1-score) and the cosine similarity values.

In [None]:
from scipy.stats import pearsonr

def pearson(data):
    """Print Pearson's correlation between ``data`` and the cosine similarities.

    The second sample is always the notebook-level ``cosine_similarity_data``
    list. Both the coefficient and its p-value come from
    ``scipy.stats.pearsonr`` and are printed on separate lines.

    Args:
        data: Series of metric values to correlate with cosine similarity.

    Returns:
        None. The results are only printed.
    """
    outcome = pearsonr(data, cosine_similarity_data)

    # The result is a (coefficient, p-value) pair
    print(f"Correlation coefficient: {outcome[0]}")
    print(f"P-value: {outcome[1]}")

# Report Pearson's correlation with cosine similarity for each metric in turn
# (precision, recall, then F1-score), printing a heading before each result.
for heading, metric_values in (
    ("Precision Pearson's Correlation:", precision_data),
    ("Recall Pearson's Correlation:", recall_data),
    ("F1-Score Pearson's Correlation:", f1_score_data),
):
    print(heading)
    pearson(metric_values)

## Calculate Spearman Correlation Coefficient

In [None]:
from scipy.stats import spearmanr

def spearman(data):
    """Print Spearman's rank correlation between ``data`` and the cosine similarities.

    The second sample is always the notebook-level ``cosine_similarity_data``
    list. Both the coefficient and its p-value come from
    ``scipy.stats.spearmanr`` and are printed on separate lines.

    Args:
        data: Series of metric values to correlate with cosine similarity.

    Returns:
        None. The results are only printed.
    """
    outcome = spearmanr(data, cosine_similarity_data)

    # The result is a (coefficient, p-value) pair
    print(f"Correlation coefficient: {outcome[0]}")
    print(f"P-value: {outcome[1]}")
    
# Report Spearman's correlation with cosine similarity for each metric in turn
# (precision, recall, then F1-score), printing a heading before each result.
for heading, metric_values in (
    ("Precision Spearman's Correlation:", precision_data),
    ("Recall Spearman's Correlation:", recall_data),
    ("F1-Score Spearman's Correlation:", f1_score_data),
):
    print(heading)
    spearman(metric_values)