# Step 0: Installing dependencies




In [2]:
# !pip install bert-score
# !pip install lexical-diversity
# !pip install nltk
# !pip install openai
# !pip install openpipe
# !pip install seaborn
# !pip install sentence-transformers
# !pip install spacy
# !pip install transformers
# !pip install vaderSentiment
# !pip install tqdm
# !pip install ollama
# !pip install statsmodels

# Step 1: Building the metrics and calculating them.


In [None]:
# Standard libraries
from collections import Counter
import json
import os
import re
import time
from datetime import timedelta
import warnings

# Data manipulation and analysis
import pandas as pd
import numpy as np
from scipy import stats
from statsmodels.stats.multicomp import pairwise_tukeyhsd

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

# NLP and ML libraries
import nltk
import spacy
from nltk.util import ngrams
from transformers import AutoTokenizer, AutoModel
from bert_score import score, BERTScorer
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from lexical_diversity import lex_div as ld
from difflib import SequenceMatcher

# OpenAI and OpenPipe
import openai
from openai import OpenAI
from openpipe import OpenAI
import ollama

## Creating a function for each evaluation metric.


Note: Besides the metrics we actually use, several others were tested. Some of these extra metrics are still present (commented out) for testing purposes. Feel free to include them in the calculation function!

In [5]:
# Load spacy model for syntactic similarity
nlp = spacy.load('en_core_web_sm')

# BERTScore Calculation
# Scorers are cached per model type: constructing a BERTScorer loads the
# underlying model, which is far too slow to repeat for every text pair.
_BERT_SCORER_CACHE = {}


def _get_bert_scorer(model_type='bert-base-uncased'):
    """Return the shared BERTScorer for ``model_type``, creating it on first use."""
    if model_type not in _BERT_SCORER_CACHE:
        _BERT_SCORER_CACHE[model_type] = BERTScorer(model_type=model_type)
    return _BERT_SCORER_CACHE[model_type]


def bert_score(reference, candidate):
    """Compute BERTScore precision, recall and F1 for one text pair.

    Args:
        reference: the ground-truth text.
        candidate: the generated text scored against ``reference``.

    Returns:
        A ``(precision, recall, f1)`` tuple of Python floats.
    """
    scorer = _get_bert_scorer()
    # BERTScorer.score takes the candidates first and the references second.
    P, R, F1 = scorer.score([candidate], [reference])
    return P.mean().item(), R.mean().item(), F1.mean().item()


# Syntactic similarity
def syntactic_similarity(reference, candidate):
    """Compare the dependency-label sequences of two texts.

    Each text is parsed with the module-level spaCy pipeline ``nlp``; the
    per-token ``dep_`` labels of both parses are then matched with
    ``difflib.SequenceMatcher`` and its ``ratio()`` is returned.
    """
    dep_sequences = [
        [tok.dep_ for tok in nlp(text)]
        for text in (reference, candidate)
    ]
    return SequenceMatcher(None, *dep_sequences).ratio()

# Lexical diversity (ONLY MTLD - we also tested TTR and MATTR)
def lexical_diversity(text):
    """Return the MTLD lexical-diversity score of ``text``.

    ``ld.mtld`` operates on a sequence of tokens. Handing it a raw string
    makes it treat every character as a token, so strings are tokenized
    first. A pre-tokenized sequence is passed through unchanged.

    Args:
        text: a string, or an already tokenized sequence of words.

    Returns:
        The MTLD value computed by ``lexical_diversity.lex_div.mtld``.
    """
    tokens = ld.tokenize(text) if isinstance(text, str) else text
    return ld.mtld(tokens)

# Sentiment analysis
# Created lazily and reused: the analyzer does not need to be rebuilt for
# every text that is scored.
_VADER_ANALYZER = None


def sentiment_analysis(text):
    """Return VADER polarity scores for ``text``.

    Returns:
        The dict produced by ``SentimentIntensityAnalyzer.polarity_scores``;
        callers in this notebook read its 'neg', 'neu', 'pos' and 'compound'
        entries.
    """
    global _VADER_ANALYZER
    if _VADER_ANALYZER is None:
        _VADER_ANALYZER = SentimentIntensityAnalyzer()
    return _VADER_ANALYZER.polarity_scores(text)

# Embedding-based similarity: an alternative to BERTScore is to use the embedding similarity from the sentence-transformers library.
# def embedding_similarity(reference, candidate):
#     model = SentenceTransformer('all-MiniLM-L6-v2')
#     ref_embedding = model.encode([reference], convert_to_tensor=True)
#     cand_embedding = model.encode([candidate], convert_to_tensor=True)
#     similarity = cosine_similarity(ref_embedding, cand_embedding)
#     return similarity[0][0]

# N-gram overlap - not used, because we know it is not going to provide valuable insights here
# def ngram_overlap(reference, candidate, n=2):
#     ref_tokens = reference.split()
#     cand_tokens = candidate.split()
#     ref_ngrams = list(ngrams(ref_tokens, n))
#     cand_ngrams = list(ngrams(cand_tokens, n))
#     ref_counter = Counter(ref_ngrams)
#     cand_counter = Counter(cand_ngrams)
#     overlap = sum((ref_counter & cand_counter).values())
#     total_ngrams = len(ref_ngrams) + len(cand_ngrams)
#     return (2 * overlap) / total_ngrams * 100 if total_ngrams > 0 else 0

In [6]:
# Ignore specific FutureWarning about clean_up_tokenization_spaces because it is not fixable.
warnings.filterwarnings('ignore', category=FutureWarning, message='`clean_up_tokenization_spaces`')


## Creating a function to compute all metrics.

In [7]:
def calculate_metrics_v1(reference, candidate):
    """Compute every evaluation metric for one reference/candidate pair.

    Runs, in order: BERTScore, syntactic similarity, MTLD for each text and
    VADER sentiment for each text.

    Returns:
        A flat dict mapping metric name to value. The key order is the
        column order of any DataFrame built from these dicts.
    """
    precision, recall, f1 = bert_score(reference, candidate)
    syntax = syntactic_similarity(reference, candidate)
    mtld_ref, mtld_cand = lexical_diversity(reference), lexical_diversity(candidate)
    sent_ref, sent_cand = sentiment_analysis(reference), sentiment_analysis(candidate)

    # The embedding-similarity and 2-gram-overlap metrics are disabled.
    metrics = dict([
        ('BERTScore_Precision', precision),
        ('BERTScore_Recall', recall),
        ('BERTScore_F1', f1),
        ('Syntactic_Similarity', syntax),
        ('Reference_MTLD', mtld_ref),
        ('Candidate_MTLD', mtld_cand),
    ])
    metrics.update([
        ('Reference_Sentiment_Neg', sent_ref['neg']),
        ('Reference_Sentiment_Neu', sent_ref['neu']),
        ('Reference_Sentiment_Pos', sent_ref['pos']),
        ('Reference_Sentiment_Compound', sent_ref['compound']),
        ('Candidate_Sentiment_Neg', sent_cand['neg']),
        ('Candidate_Sentiment_Neu', sent_cand['neu']),
        ('Candidate_Sentiment_Pos', sent_cand['pos']),
        ('Candidate_Sentiment_Compound', sent_cand['compound']),
    ])
    return metrics


## Testing the function with some examples.

In [None]:
reference = "test"
candidate = "Not a test!"
calculate_metrics_v1(reference, candidate)

In [None]:

reference = "My mom drives me to school everyday. I love her very much. I am so happy."
candidate = "I go to school everyday with my mother. I truly love my mom. I am so happy!"
calculate_metrics_v1(reference, candidate)


In [None]:
reference = "NLTK (Natural Language Toolkit) is a popular Python library for natural language processing (NLP)."
candidate = "In natural language processing, n-grams are a contiguous sequence of n items from a given sample of text or speech."
calculate_metrics_v1(reference, candidate)

# Step 2: Generating Responses

Fine-tuning LLMs can be done in a few different ways, for example through the Hugging Face API or as (VM et al., 2024) does. We will use OpenPipe to fine-tune and call the models.


In [12]:
def load_data(filepath, start=0, limit=None):
    """Load records from a JSON Lines file.

    Args:
        filepath: path to a file holding one JSON document per line.
        start: zero-based index of the first line to read.
        limit: maximum number of lines to consume from ``start`` onwards.
            ``None`` (or 0) means no limit.

    Returns:
        A list with the parsed object of every non-blank line in range, in
        file order.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    stop = start + limit if limit else None
    data = []
    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(filepath, 'r', encoding='utf-8') as f:
        for i, line in enumerate(f):
            if i < start:
                continue  # Skip lines before the starting index
            if stop is not None and i >= stop:
                break
            if not line.strip():
                continue  # Tolerate blank or trailing empty lines
            data.append(json.loads(line))
    return data


In [13]:
def get_openpipe_response(system_msg, chat_thread, model_name, temperature_value, max_retries=3):
    """Generate one completion through the OpenPipe client, with retries.

    ``OpenAI`` here is the client imported from ``openpipe`` (that import
    comes after, and therefore shadows, the one from ``openai``).

    Args:
        system_msg: content of the system message placed first.
        chat_thread: list of ``{"role", "content"}`` dicts following it.
        model_name: model identifier passed to the API.
        temperature_value: sampling temperature passed to the API.
        max_retries: number of attempts before giving up.

    Returns:
        The concatenated content of all returned choices, or ``None`` when
        every attempt failed (or ``max_retries`` is not positive).
    """
    # Load a local .env file when python-dotenv is installed. The notebook
    # never imports it at the top, so import it here; without the package
    # the keys are read from the process environment only.
    try:
        from dotenv import load_dotenv
    except ImportError:
        pass
    else:
        load_dotenv()
    openai_api_key = os.getenv('OPENAI_API_KEY')
    openpipe_api_key = os.getenv('OPENPIPE_API_KEY')

    client = OpenAI(
        api_key=openai_api_key,
        openpipe={
            "api_key": openpipe_api_key,
            "base_url": "https://app.openpipe.ai/api/v1",
        }
    )

    # Combine system message and chat history (the caller's list is not mutated)
    messages = [{"role": "system", "content": system_msg}] + chat_thread

    for attempt in range(max_retries):
        try:
            # Call the OpenPipe model and generate response
            response = client.chat.completions.create(
                model=model_name,
                messages=messages,
                temperature=temperature_value,
                openpipe={
                    "tags": {},
                    "log_request": True
                }
            )
            # Join the text of every choice that actually carries content
            return ''.join(
                choice.message.content
                for choice in response.choices
                if choice.message.content is not None
            )
        except Exception as e:
            if attempt < max_retries - 1:
                print(f"Attempt {attempt + 1} failed: {e}. Retrying...")
                time.sleep(2 ** attempt)  # Exponential backoff
            else:
                print(f"All attempts failed for response generation: {e}")
                return None
    return None

In [14]:
def extract_and_generate_response(row, model_name, temperature_value):
    """Split a conversation into context and reference, then generate a candidate.

    The last assistant message of ``row['messages']`` is the reference. The
    first message is taken as the system message, and everything between the
    two is the chat history sent to the model.

    Returns:
        ``(context, reference, candidate)``, or ``(None, None, None)`` when
        the conversation has no usable assistant message.
    """
    messages = row['messages']

    # Position of the last assistant turn, searching from the end.
    last_assistant_index = next(
        (pos for pos in range(len(messages) - 1, -1, -1)
         if messages[pos]['role'] == 'assistant'),
        None,
    )
    if last_assistant_index is None:
        return None, None, None

    reference = messages[last_assistant_index]['content']
    if reference is None:
        return None, None, None

    # messages[0] is treated as the system prompt; the history runs from the
    # message after it up to (not including) the reference turn.
    system_msg = messages[0]['content']
    chat_thread = messages[1:last_assistant_index]

    context = ' '.join(f"{msg['role']}: {msg['content']}" for msg in chat_thread)
    candidate = get_openpipe_response(system_msg, chat_thread, model_name, temperature_value)

    return context, reference, candidate


In [15]:
def process_and_save_results(data, model_name, temperature_value, output_csv):
    """Generate a candidate for every conversation, score it, and save a CSV.

    Rows for which no reference was found or no candidate was generated are
    left out of the output.

    Args:
        data: iterable of conversation rows (dicts with a 'messages' list).
        model_name: model identifier forwarded to the generation call.
        temperature_value: sampling temperature forwarded to the generation call.
        output_csv: path of the CSV file to write.

    Returns:
        The DataFrame that was written: one row per scored conversation with
        the metric columns plus 'Context', 'Reference' and 'Candidate'. It is
        empty when nothing could be scored.
    """
    results = []

    for row in data:
        context, reference, candidate = extract_and_generate_response(row, model_name, temperature_value)
        if reference and candidate:
            metrics = calculate_metrics_v1(reference, candidate)

            # Add context, reference, and candidate to the metrics
            metrics['Context'] = context if context else ''
            metrics['Reference'] = reference
            metrics['Candidate'] = candidate

            results.append(metrics)

    # Convert results to a DataFrame and save them. The former unused
    # `df.describe()` summary is gone: it raised on an empty frame (every
    # generation failed) and shadowed the `scipy.stats` name.
    df = pd.DataFrame(results)
    df.to_csv(output_csv, index=False)

    return df


## Parameter cell - make any changes in the following cell:
### Takes ages to run - so I've added tracking at each step!

In [None]:
# Parameter Configuration
# Driver cell: for every creator, load its test set and run each listed model
# through process_and_save_results, writing one CSV per model and recording
# finished models in a per-creator checkpoint file so a re-run can skip them.
temperature_value = 0.0

# Define creators and their corresponding test datasets
# NOTE: the 'test_file' entries are empty placeholders and must be filled in
# before running; load_data is called outside the try block below, so a bad
# path stops the whole cell.
creators = {
    'creator1': {
        'test_file': '', # Add test file path for creator1
        'models': ['Strategy A Creator 1', 'Strategy B Creator 1', 'Strategy C Creator 1']
    },
    'creator2': {
        'test_file': '', # Add test file path for creator2
        'models': ['Strategy A Creator 2', 'Strategy B Creator 2', 'Strategy C Creator 2']
    },
    'creator3': {
        'test_file': '', # Add test file path for creator3
        'models': ['Strategy A Creator 3', 'Strategy B Creator 3', 'Strategy C Creator 3']
    }
}

# Passed to load_data as `limit` / `start`: None reads the whole file from line 0.
data_limit = None
data_start = 0

start_time = time.time()

for creator, config in creators.items():
    print(f"\nProcessing {creator}'s data")
    output_dir = f'results_{creator}'
    os.makedirs(output_dir, exist_ok=True)
    
    # Create checkpoint file path
    checkpoint_file = os.path.join(output_dir, 'checkpoint.json')
    
    # Load checkpoint if exists (a JSON list of model names already processed)
    completed_models = []
    if os.path.exists(checkpoint_file):
        with open(checkpoint_file, 'r') as f:
            completed_models = json.load(f)
        print(f"Found checkpoint with completed models: {completed_models}")
    
    data = load_data(config['test_file'], start=data_start, limit=data_limit)
    
    # `i` is the 1-based position of the model in the list; it names the
    # output file (Strategy_1.csv, Strategy_2.csv, ...).
    for i, model_name in enumerate(config['models'], 1):
        if model_name in completed_models:
            print(f"Skipping {model_name} (already completed)")
            continue
            
        print(f"Processing model {i}/{len(config['models'])}: {model_name}")
        try:
            process_and_save_results(data, model_name, temperature_value, f"{output_dir}/Strategy_{i}.csv")
            
            # Update checkpoint (rewritten in full after every finished model)
            completed_models.append(model_name)
            with open(checkpoint_file, 'w') as f:
                json.dump(completed_models, f)
                
            print(f"Completed {model_name}")
            
        except Exception as e:
            # Best effort: report the failure and move on to the next model.
            # The failed model is not checkpointed, so a re-run retries it.
            print(f"Error processing {model_name}: {e}")
            continue

print(f"\nAll processing completed in {timedelta(seconds=int(time.time() - start_time))}")

In [17]:
### Parameter cell - make any changes in the following cell:
# Maps each analysis dimension to the metric column(s) it uses; the column
# names are keys of the dict returned by calculate_metrics_v1.
metrics_to_analyze = {
    'semantic': 'BERTScore_F1',
    'syntax': 'Syntactic_Similarity',
    'lexical': ['Reference_MTLD', 'Candidate_MTLD'],
    'sentiment': ['Reference_Sentiment_Compound', 'Candidate_Sentiment_Compound']
}

# Sample size threshold for different tests
SAMPLE_SIZE_THRESHOLD = 30  # For normality assumptions

# Step 3: Calling the non fine-tuned model

##### There are two important things above all else to note.
##### Why are we doing it in this part of the code, and why do we limit the data?

##### 1. Personally, it was easier to do it after Generating the responses of the Fine-tuned models, because building them gave me the default code (just using it again with minor modifications) required to generate the responses. 

##### 2. Why do we limit the data, and how are we building it? To answer RQ1 (pretrained vs fine-tuned) we also need a pretrained model. How do we show the pretrained model what to learn from or, more precisely, what to replicate? I'm giving it the same training data that goes into any Strategy. Why is there a limit? We are simply feeding some sample data into its system message, and the context window of LLMs is limited to about 100k tokens (some have more, some less). We also can't share sensitive data with the model, so we put the data in the same form as any Strategy employed (JSON format), just like the fine-tuned models get it. The difference is that we instruct it, by appending to the initial sentence of the system message, about what the format is and what it should pay attention to (to make the comparison as fair as possible). So, with a simple token counter, we concluded that 3k rows of the training file (csv) are roughly 80k tokens, which leaves room to also prepend the actual system message that instructs the model.

##### For this part we will download the model using the opensource ollama.


We take the first 3k rows of the training csv file (not a random sample of 3k rows, because that would ruin the integrity of the conversations). It would be redundant to make one for each Creator, so we choose the data from the first Creator.

In [None]:
# Read the CSV file
# Placeholder: set this to the training file of the first creator. The
# original bare `file_path =` followed by a comment was a syntax error.
file_path = ''  # filepath to the training file of the first creator
df = pd.read_csv(file_path)

# Keep the first 3000 rows in their original order so conversations stay intact
sampled_df = df[:3000]

# Output file for the sampled rows
new_file_path = 'content_for_sysmsg.csv'

# Save to new file
sampled_df.to_csv(new_file_path, index=False)

print(f"Original dataset size: {len(df)}")
print(f"Sampled dataset size: {len(sampled_df)}")
print(f"Saved to: {new_file_path}")

Converting the selected training data into the Strategy A (one pair) format. It's the best way to keep the token count low and so maximize the amount of data included in the system message.

In [None]:
def generate_model_a_pairs(df, creator):
    """Build user/assistant message pairs from consecutive rows.

    A pair is emitted for every two adjacent rows where the first row's
    'sender_handle' is not ``creator`` and the second row's is. The first
    row's 'text' becomes the user message and the second row's the
    assistant message.

    Returns:
        A list of ``{"messages": [user_msg, assistant_msg]}`` dicts.
    """
    pairs = []

    for pos in range(1, len(df)):
        prompt_row = df.iloc[pos - 1]
        reply_row = df.iloc[pos]

        user_then_creator = (
            prompt_row['sender_handle'] != creator
            and reply_row['sender_handle'] == creator
        )
        if not user_then_creator:
            continue

        user_msg = {"role": "user", "content": prompt_row['text']}
        assistant_msg = {"role": "assistant", "content": reply_row['text']}
        pairs.append({"messages": [user_msg, assistant_msg]})

    return pairs

def save_to_jsonl(jsonl_pairs, output_file):
    """Write each item of ``jsonl_pairs`` as one JSON line to ``output_file``.

    The file is opened in write mode, so an existing file is overwritten.
    """
    with open(output_file, 'w') as sink:
        sink.writelines(json.dumps(record) + '\n' for record in jsonl_pairs)

def process_and_save_pairs(df, creator, output_path):
    """Generate the user/assistant pairs for ``creator`` and write them as JSONL.

    Prints how many pairs were generated.
    """
    conversation_pairs = generate_model_a_pairs(df, creator)
    save_to_jsonl(conversation_pairs, output_path)
    pair_count = len(conversation_pairs)
    print(f"Generated {pair_count} user-assistant pairs for {creator}")

# Process the sampled data
process_and_save_pairs(sampled_df, 'creator1', 'content_for_sysmsg.jsonl') 

Similar function to get the responses

In [24]:

def get_llama_response(system_msg, messages, model="", temperature=0): # the model is Meta Llama 3.1 8B Instruct just like the fine-tuned models. 
    """Ask a local Ollama model for one chat reply.

    The system message is placed first, followed by ``messages`` in order.
    The temperature is forwarded through Ollama's ``options`` dict.

    Returns:
        The content of the reply message, or ``None`` if the call (or
        reading the reply) raised; the error is printed.
    """
    # https://ollama.com/library/llama3.1:8b/blobs/8eeb52dfb3bb
    conversation = [{"role": "system", "content": system_msg}, *messages]
    try:
        reply = ollama.chat(
            model=model,
            messages=conversation,
            options={"temperature": temperature},
        )
        return reply["message"]["content"]
    except Exception as e:
        print(f"Error getting Llama response: {e}")
        return None

Obviously we are going to generate the responses against the same TESTSet as the fine-tuned models.

In [None]:
# Build the system message for the pretrained model: an instruction prefix
# followed by the full text of the sample-data file. 'filepath' and the
# instruction string are placeholders to be replaced before running.
# `system_msg` is read as a global by the Llama extract_and_generate_response.
with open('filepath', 'r') as f: #change for desired path
    lines = f.read()
system_msg = "System message goes here " + lines

def load_data(filepath, start=0, limit=None):
    """Load records from a JSON Lines file.

    Args:
        filepath: path to a file holding one JSON document per line.
        start: zero-based index of the first line to read.
        limit: maximum number of lines to consume from ``start`` onwards.
            ``None`` (or 0) means no limit.

    Returns:
        A list with the parsed object of every non-blank line in range, in
        file order.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    stop = start + limit if limit else None
    data = []
    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(filepath, 'r', encoding='utf-8') as f:
        for i, line in enumerate(f):
            if i < start:
                continue  # Skip lines before the starting index
            if stop is not None and i >= stop:
                break
            if not line.strip():
                continue  # Tolerate blank or trailing empty lines
            data.append(json.loads(line))
    return data

In [None]:
def extract_and_generate_response(row, model_name, temperature_value=0):
    """Split a conversation into context and reference, then ask Llama for a candidate.

    The last assistant message of ``row['messages']`` is the reference and
    every message before it is the chat history. The system prompt comes
    from the global ``system_msg``, not from the row.

    Returns:
        ``(context, reference, candidate)``, or ``(None, None, None)`` when
        the conversation has no usable assistant message.
    """
    messages = row['messages']

    # Position of the last assistant turn, searching from the end.
    last_assistant_index = next(
        (pos for pos in range(len(messages) - 1, -1, -1)
         if messages[pos]['role'] == 'assistant'),
        None,
    )
    if last_assistant_index is None:
        return None, None, None

    reference = messages[last_assistant_index]['content']
    if reference is None:
        return None, None, None

    # History is every message before the reference turn, starting at index 0.
    # NOTE(review): unlike the OpenPipe variant, messages[0] is kept here. If
    # the test rows begin with a system message, the model receives it in
    # addition to `system_msg` - confirm this is intended.
    chat_thread = messages[0:last_assistant_index]
    context = ' '.join(f"{msg['role']}: {msg['content']}" for msg in chat_thread)

    candidate = get_llama_response(system_msg, chat_thread, model_name, temperature_value)
    return context, reference, candidate

In [None]:
def process_and_save_results(data, model_name, temperature_value, output_csv):
    """Generate a candidate for every conversation and save the raw texts.

    No metrics are computed here. Every row is kept, including rows whose
    reference or candidate is ``None``.

    Args:
        data: iterable of conversation rows (dicts with a 'messages' list).
        model_name: model identifier forwarded to the generation call.
        temperature_value: sampling temperature forwarded to the generation
            call (it was previously accepted but never passed on).
        output_csv: path of the CSV file to write.

    Returns:
        The DataFrame that was written, with columns 'context', 'reference'
        and 'candidate'.
    """
    results = []
    # tqdm shows progress; generation is slow
    for row in tqdm(data, desc='Processing conversations'):
        context, reference, candidate = extract_and_generate_response(
            row, model_name, temperature_value
        )
        results.append({
            'context': context,
            'reference': reference,
            'candidate': candidate
        })
    # Convert results to a DataFrame
    df = pd.DataFrame(results)
    # Save the results to a CSV file
    df.to_csv(output_csv, index=False)
    return df

data = load_data('', 0, None) # same path as before
process_and_save_results(data, '', 0, '') # Dont forget to change to model and filepath

We now also have a file with the responses of the pretrained model. The next steps are to compute the metrics and compare against the fine-tuned models of creator 1, answering RQ1. If our assumption stands, that the fine-tuned strategies will perform better, we continue by comparing the fine-tuned models against each other.

# Step 4: Answering to RQ1. Compare the pretrained model vs fine-tuned models of the creator 1.

#### Loading the results and calculating the metrics externally (for the fine-tunes we did it internally).

In [68]:
# Load existing data with metrics
# (all three file paths in this cell are empty placeholders to fill in)
existing_df = pd.read_csv('') # path with results
print(f"Existing data shape: {existing_df.shape}")

# Load new data to append
new_df = pd.read_csv('')  # Replace with your new data path
print(f"New data shape: {new_df.shape}")

# Calculate metrics only for new rows
# NOTE(review): rows whose 'reference' or 'candidate' is missing (e.g. a
# failed generation saved as an empty cell) are passed on as-is - presumably
# they need filtering first; verify against the input file.
print("Calculating metrics for new data...")
new_metrics_results = []
for _, row in new_df.iterrows():
    metrics = calculate_metrics_v1(row['reference'], row['candidate'])
    new_metrics_results.append(metrics)

# Convert new metrics results to DataFrame
# The column-wise concat aligns on the index; both frames carry the default
# 0..n-1 index here, so row i of the metrics belongs to row i of new_df.
new_metrics_df = pd.DataFrame(new_metrics_results)
new_df_with_metrics = pd.concat([new_df, new_metrics_df], axis=1)

# Append new data with metrics to existing data
combined_df = pd.concat([existing_df, new_df_with_metrics], axis=0, ignore_index=True)
print(f"Combined data shape: {combined_df.shape}")

# Save the combined results
combined_df.to_csv('', index=False) # path
print("Combined data saved successfully!")

In [None]:
# Load the pretrained model's generations (columns 'reference' and
# 'candidate' are read below); the paths are placeholders to fill in.
pretrained_df = pd.read_csv('') # path

# Calculate metrics for pretrained model
# NOTE(review): rows with a missing 'candidate' are not filtered out before
# scoring - confirm the input has none.
print("Calculating metrics for pretrained model...")
metrics_results = []
for _, row in pretrained_df.iterrows():
    metrics = calculate_metrics_v1(row['reference'], row['candidate'])
    metrics_results.append(metrics)  

# Convert metrics results to DataFrame and keep original columns
metrics_df = pd.DataFrame(metrics_results)
pretrained_df_with_metrics = pd.concat([pretrained_df, metrics_df], axis=1)

# Save the results with metrics
pretrained_df_with_metrics.to_csv('', index=False) # path

print("Metrics calculated and saved successfully!")

RQ1; compare pretrained vs fine-tune across metrics:

In [None]:
def load_results(base_dir='.'):
    """Load the pretrained results and creator1's per-strategy results.

    Args:
        base_dir: directory containing the ``results_creator1`` folder.

    Returns:
        ``{'Pretrained': DataFrame, 'creator1': {'Strategy N': DataFrame}}``.
        Strategies are inserted in sorted file-name order, so plot order and
        any "first strategy" lookup are reproducible across machines.
    """
    all_results = {}

    # Load pretrained model results (the path is a placeholder to fill in)
    print("Loading pretrained model results...")
    pretrained_df = pd.read_csv('') # path
    all_results['Pretrained'] = pretrained_df

    # Load creator1 results only
    creator_path = os.path.join(base_dir, 'results_creator1')
    strategy_dfs = {}

    # os.listdir returns entries in arbitrary order; sort for determinism.
    for file in sorted(os.listdir(creator_path)):
        if not (file.startswith('Strategy_') and file.endswith('.csv')):
            continue
        file_path = os.path.join(creator_path, file)
        if os.path.isfile(file_path):
            # 'Strategy_2.csv' -> '2'
            strategy_num = file.split('_')[1].split('.')[0]
            strategy_dfs[f'Strategy {strategy_num}'] = pd.read_csv(file_path)

    all_results['creator1'] = strategy_dfs

    return all_results

# Load results and set style for incoming plots!
results = load_results()
sns.set_style('whitegrid')
sns.set_palette('Set2')

# Let's verify the data is loaded correctly
print("Available models:", list(results.keys()))
for model_type, model_results in results.items():
    if model_type == 'Pretrained':
        print(f"\nPretrained model shape:", model_results.shape)
    else:
        print(f"\nCreator: {model_type}")
        print("Available strategies:", list(model_results.keys()))
        for strategy, df in model_results.items():
            # The keys already read "Strategy N"; do not prefix them again.
            print(f"{strategy} shape:", df.shape)

## Results

### BertScore

In [None]:
metric = 'BERTScore_F1'

# Create figure
fig, ax = plt.subplots(figsize=(10, 6))
fig.suptitle(f'{metric} Distribution by Model', fontsize=16, y=1.05)

# Prepare data for plotting: one record per score, pretrained model first
data = []

# Add pretrained model data
pretrained_values = results['Pretrained'][metric].dropna()
for value in pretrained_values:
    data.append({'Model': 'Pre-trained', metric: value})

# Add strategy data
for strategy_name, df in results['creator1'].items():
    values = df[metric].dropna()
    for value in values:
        data.append({'Model': strategy_name, metric: value})

metric_df = pd.DataFrame(data)

# Pin the box order so the mean labels below can be placed on matching boxes
model_order = list(metric_df['Model'].unique())

# Create box plot
sns.boxplot(x='Model', y=metric, data=metric_df, order=model_order, ax=ax)
ax.set_xlabel('Model')
ax.set_ylabel(metric)

# Calculate and display means. groupby sorts its keys alphabetically, so the
# result is reindexed to the plotted order before position i is used as x.
means = metric_df.groupby('Model')[metric].mean().reindex(model_order)
for i, mean_val in enumerate(means):
    ax.text(i, mean_val, f'{mean_val:.3f}', 
            horizontalalignment='center', 
            verticalalignment='bottom')

# Save statistics ('w' starts a fresh file; the later metric cells append)
stats_output = f"statistics_metrics.txt"
with open(stats_output, 'w') as f:
    f.write(f"{metric} Statistics\n{'='*50}\n\n")
    summary = metric_df.groupby('Model')[metric].agg(['mean', 'std', 'min', 'max']).round(4)
    f.write(f"{summary.to_string()}\n\n")

plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

### Syntactic Similarity

In [None]:
metric = 'Syntactic_Similarity'

# Create figure
fig, ax = plt.subplots(figsize=(10, 6))
fig.suptitle(f'{metric} Distribution by Model', fontsize=16, y=1.05)

# Prepare data for plotting: one record per score, pretrained model first
data = []

# Add pretrained model data
pretrained_values = results['Pretrained'][metric].dropna()
for value in pretrained_values:
    data.append({'Model': 'Pre-trained', metric: value})

# Add strategy data (the key was misspelled 'creaor1', which raised KeyError)
for strategy_name, df in results['creator1'].items():
    values = df[metric].dropna()
    for value in values:
        data.append({'Model': strategy_name, metric: value})

metric_df = pd.DataFrame(data)

# Pin the box order so the mean labels below can be placed on matching boxes
model_order = list(metric_df['Model'].unique())

# Create box plot
sns.boxplot(x='Model', y=metric, data=metric_df, order=model_order, ax=ax)
ax.set_xlabel('Model')
ax.set_ylabel(metric)

# Calculate and display means. groupby sorts its keys alphabetically, so the
# result is reindexed to the plotted order before position i is used as x.
means = metric_df.groupby('Model')[metric].mean().reindex(model_order)
for i, mean_val in enumerate(means):
    ax.text(i, mean_val, f'{mean_val:.3f}', 
            horizontalalignment='center', 
            verticalalignment='bottom')

# Append statistics to file
stats_output = "statistics_metrics.txt"
with open(stats_output, 'a') as f:
    f.write(f"\n\n{metric} Statistics\n{'='*50}\n\n")
    summary = metric_df.groupby('Model')[metric].agg(['mean', 'std', 'min', 'max']).round(4)
    f.write(f"{summary.to_string()}\n\n")

plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

### Lexical diversity

In [None]:
# Get reference MTLD from a Strategy file
reference_mtld = results['creator1']['Strategy 1']['Reference_MTLD'].dropna()

# Create figure
fig, ax = plt.subplots(figsize=(10, 6))
fig.suptitle('MTLD Distribution by Model', fontsize=16, y=1.05)

# Prepare data for plotting: reference first, then pretrained, then strategies
data = []

# Add reference data first
for value in reference_mtld:
    data.append({
        'Model': 'Reference',
        'MTLD': value
    })

# Add pretrained model data
pretrained_values = results['Pretrained']['Candidate_MTLD'].dropna()
for value in pretrained_values:
    data.append({
        'Model': 'Pre-trained',
        'MTLD': value
    })

# Add strategy data
for strategy_name, df in results['creator1'].items():
    values = df['Candidate_MTLD'].dropna()
    for value in values:
        data.append({
            'Model': strategy_name,
            'MTLD': value
        })

metric_df = pd.DataFrame(data)

# Pin the box order so the mean labels below can be placed on matching boxes
model_order = list(metric_df['Model'].unique())

# Create box plot
sns.boxplot(x='Model', y='MTLD', data=metric_df, order=model_order, ax=ax)
ax.set_xlabel('Model')
ax.set_ylabel('MTLD Score')

# Calculate and display means. groupby sorts its keys alphabetically
# ('Pre-trained' before 'Reference'), which put labels on the wrong boxes;
# reindex to the plotted order before position i is used as x.
means = metric_df.groupby('Model')['MTLD'].mean().reindex(model_order)
for i, mean_val in enumerate(means):
    ax.text(i, mean_val, f'{mean_val:.3f}', 
            horizontalalignment='center', 
            verticalalignment='bottom')

# Append statistics to file
stats_output = "statistics_metrics.txt"
with open(stats_output, 'a') as f:
    f.write(f"\n\nMTLD Statistics\n{'='*50}\n\n")
    summary = metric_df.groupby('Model')['MTLD'].agg(['mean', 'std', 'min', 'max']).round(4)
    f.write(f"{summary.to_string()}\n\n")

plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

### Sentiment

In [None]:
# Get reference Sentiment from a Strategy 
reference_sentiment = results['creator1']['Strategy 1']['Reference_Sentiment_Compound'].dropna()

# Create figure
fig, ax = plt.subplots(figsize=(10, 6))
fig.suptitle('Sentiment Compound Score Distribution by Model', fontsize=16, y=1.05)

# Prepare data for plotting: reference first, then pretrained, then strategies
data = []

# Add reference data first
for value in reference_sentiment:
    data.append({
        'Strategy': 'Reference',
        'Sentiment': value
    })

# Add pretrained model data
pretrained_values = results['Pretrained']['Candidate_Sentiment_Compound'].dropna()
for value in pretrained_values:
    data.append({
        'Strategy': 'Pre-trained',
        'Sentiment': value
    })

# Add strategy data
for strategy_name, df in results['creator1'].items():
    values = df['Candidate_Sentiment_Compound'].dropna()
    for value in values:
        data.append({
            'Strategy': strategy_name,
            'Sentiment': value
        })

metric_df = pd.DataFrame(data)

# Pin the box order so the mean labels below can be placed on matching boxes
model_order = list(metric_df['Strategy'].unique())

# Create box plot
sns.boxplot(x='Strategy', y='Sentiment', data=metric_df, order=model_order, ax=ax)
ax.set_xlabel('Model')
ax.set_ylabel('Sentiment Compound Score')

# Calculate and display means. groupby sorts its keys alphabetically
# ('Pre-trained' before 'Reference'), which put labels on the wrong boxes;
# reindex to the plotted order before position i is used as x.
means = metric_df.groupby('Strategy')['Sentiment'].mean().reindex(model_order)
for i, mean_val in enumerate(means):
    ax.text(i, mean_val, f'{mean_val:.3f}', 
            horizontalalignment='center', 
            verticalalignment='bottom')

# Append statistics to file
stats_output = "statistics_metrics.txt"
with open(stats_output, 'a') as f:
    f.write(f"\n\nSentiment Statistics\n{'='*50}\n\n")
    summary = metric_df.groupby('Strategy')['Sentiment'].agg(['mean', 'std', 'min', 'max']).round(4)
    f.write(f"{summary.to_string()}\n\n")

plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Create figure with 3 subplots (one for each sentiment component)
fig, axes = plt.subplots(1, 3, figsize=(20, 6))
fig.suptitle('Sentiment Distribution by Component', fontsize=16, y=1.05)

# Define sentiment components (column-name suffixes) and their panel titles
sentiment_components = ['Sentiment_Neg', 'Sentiment_Neu', 'Sentiment_Pos']
titles = ['Negative', 'Neutral', 'Positive']

# Get reference sentiment from Strategy 1
reference_data = results['creator1']['Strategy 1']

for idx, (component, title) in enumerate(zip(sentiment_components, titles)):
    data = []
    
    # Add reference data
    ref_values = reference_data[f'Reference_{component}'].dropna()
    for value in ref_values:
        data.append({
            'Strategy': 'Reference',
            'Score': value
        })
    
    # Add pretrained model data
    pretrained_values = results['Pretrained'][f'Candidate_{component}'].dropna()
    for value in pretrained_values:
        data.append({
            'Strategy': 'Pre-trained',
            'Score': value
        })
    
    # Add strategy data
    for strategy_name, df in results['creator1'].items():
        values = df[f'Candidate_{component}'].dropna()
        for value in values:
            data.append({
                'Strategy': strategy_name,
                'Score': value
            })
    
    metric_df = pd.DataFrame(data)

    # Pin the box order so the mean labels can be placed on matching boxes
    model_order = list(metric_df['Strategy'].unique())
    
    # Create box plot
    sns.boxplot(x='Strategy', y='Score', data=metric_df, order=model_order, ax=axes[idx])
    axes[idx].set_title(f'{title} Sentiment')
    axes[idx].set_xlabel('Model')
    axes[idx].set_ylabel('Score' if idx == 0 else '')
    
    # Calculate and display means slightly to the right of the boxes.
    # groupby sorts its keys alphabetically, so reindex to the plotted order
    # before position i is used as the x coordinate.
    means = metric_df.groupby('Strategy')['Score'].mean().reindex(model_order)
    for i, mean_val in enumerate(means):
        axes[idx].text(i + 0.1, mean_val, f'{mean_val:.3f}', 
                      horizontalalignment='left', 
                      verticalalignment='bottom')
    
    axes[idx].tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

In [None]:
# RQ1: Compare pretrained model with Creator 1's strategies
fig, ax = plt.subplots(figsize=(10, 6))
fig.suptitle('RQ1: Sentiment Vector Distance Analysis (Creator 1)', fontsize=16, y=1.05)


def _sentiment_distances(frame):
    """Row-wise Euclidean distance between reference and candidate (pos, neu, neg) vectors."""
    parts = ['Pos', 'Neu', 'Neg']
    ref_points = frame[[f'Reference_Sentiment_{p}' for p in parts]].to_numpy(dtype=float)
    cand_points = frame[[f'Candidate_Sentiment_{p}' for p in parts]].to_numpy(dtype=float)
    return np.linalg.norm(ref_points - cand_points, axis=1)


distances = []

# Load pretrained model results separately (the path is a placeholder)
pretrained_df = pd.read_csv('#path')

# Calculate distances for pretrained model.
# Each candidate is compared with the reference stored in the SAME row of the
# pretrained results. Those columns are produced together with the candidate
# columns by calculate_metrics_v1. Pairing by position with a strategy file
# was unsafe: strategy files omit rows whose generation failed, so rows could
# be misaligned or the index could run past the end.
for distance in _sentiment_distances(pretrained_df):
    distances.append({
        'Model': 'Pre-trained',
        'Distance': distance
    })

# Calculate distances for Creator 1's strategies
for strategy_name, df in results['creator1'].items():
    for distance in _sentiment_distances(df):
        distances.append({
            'Model': strategy_name,
            'Distance': distance
        })

# Create violin plot
distance_df = pd.DataFrame(distances)
sns.violinplot(x='Model', y='Distance', data=distance_df, ax=ax)

ax.set_xlabel('Model')
ax.set_ylabel('Sentiment Vector Distance')
ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

# Step 5: Starting to answer RQ2. Plotting for all creators at once. Comparing the fine-tuned models.

## Loading the results. Making Sure everything is on point!

In [None]:
def load_results(base_dir='.'):
    """Load every per-strategy result CSV for each creator.

    For each of the directories ``results_creator1``, ``results_creator2`` and
    ``results_creator3`` under ``base_dir``, every regular file named
    ``Strategy_<id>...csv`` is read into a DataFrame.

    Args:
        base_dir: Directory that contains the ``results_<creator>`` folders.

    Returns:
        dict mapping the creator name (e.g. ``'creator1'``) to a dict that maps
        ``'Strategy <id>'`` to the DataFrame read from the matching file.
        Strategies are inserted in ascending id order (numeric ids compare
        numerically, so 2 comes before 10), which makes plot order and any
        "first strategy" lookup reproducible across machines.

    Raises:
        FileNotFoundError: propagated from ``os.listdir`` when one of the
            creator directories does not exist.
    """
    def _strategy_sort_key(strategy_id):
        # Numeric ids first, in numeric order; any other id afterwards, alphabetically.
        if strategy_id.isdigit():
            return (0, int(strategy_id), '')
        return (1, 0, strategy_id)

    all_results = {}

    # Look for results_{creator} directories
    for creator_dir in ['results_creator1', 'results_creator2', 'results_creator3']:
        creator_name = creator_dir.replace('results_', '')
        creator_path = os.path.join(base_dir, creator_dir)

        # os.listdir returns entries in arbitrary order, so walk them sorted and
        # remember one path per strategy id (the last file name wins on clashes).
        paths_by_id = {}
        for file in sorted(os.listdir(creator_path)):
            if file.startswith('Strategy_') and file.endswith('.csv'):
                file_path = os.path.join(creator_path, file)
                if os.path.isfile(file_path):
                    strategy_num = file.split('_')[1].split('.')[0]
                    paths_by_id[strategy_num] = file_path

        # Read the files in strategy-id order so the dict order is deterministic
        strategy_dfs = {}
        for strategy_num in sorted(paths_by_id, key=_strategy_sort_key):
            strategy_dfs[f'Strategy {strategy_num}'] = pd.read_csv(paths_by_id[strategy_num])

        all_results[creator_name] = strategy_dfs

    return all_results

# Load every creator's results and set the plotting style shared by the cells below
results = load_results()
sns.set_style('whitegrid')
sns.set_palette('Set2')

# Sanity check: show which creators/strategies were found and how large each table is
print("Available creators:", list(results.keys()))
for creator, creator_results in results.items():
    print(f"\nCreator: {creator}")
    print("Available strategies:", list(creator_results.keys()))
    for strategy, df in creator_results.items():
        # `strategy` already reads "Strategy <id>", so it is not prefixed a second time
        print(f"{strategy} shape:", df.shape)

## BERTScore

In [None]:
metric = 'BERTScore_F1'

# One panel per creator
fig, axes = plt.subplots(1, 3, figsize=(20, 6))
fig.suptitle(f'{metric} Distribution by Strategy', fontsize=16, y=1.05)

# Start this metric's statistics file from scratch (mode 'w' truncates it on re-run).
# NOTE(review): the other metric cells append to "statistics_metrics.txt" instead;
# confirm whether a single shared statistics file was intended.
stats_output = f"statistics_{metric}.txt"
with open(stats_output, 'w') as f:
    f.write(f"{metric} Statistics\n{'='*50}\n\n")

# Plot for each creator
for idx, (creator, creator_results) in enumerate(results.items(), 1):
    ax = axes[idx-1]

    # Long-form table: one row per non-missing metric value
    data = []
    for strategy_name, df in creator_results.items():
        values = df[metric].dropna()
        for value in values:
            data.append({'Strategy': strategy_name, metric: value})

    metric_df = pd.DataFrame(data)

    # Pin the category order once and use it for both the boxes and the mean
    # labels. groupby() sorts its keys alphabetically while the boxes follow
    # order of appearance, so without this the labels can sit on the wrong box.
    strategy_order = list(metric_df['Strategy'].unique())

    # Create box plot
    sns.boxplot(x='Strategy', y=metric, data=metric_df, order=strategy_order, ax=ax)
    ax.set_title(f'Creator {idx}')
    ax.set_xlabel('Strategy')
    ax.set_ylabel(metric if idx == 1 else '')

    # Annotate each box with the mean of the strategy drawn at that position
    means = metric_df.groupby('Strategy')[metric].mean()
    for i, strategy_name in enumerate(strategy_order):
        mean_val = means[strategy_name]
        ax.text(i, mean_val, f'{mean_val:.3f}',
                horizontalalignment='center',
                verticalalignment='bottom')

    # Save statistics to file
    with open(stats_output, 'a') as f:
        f.write(f"Creator {idx}\n{'-'*20}\n")
        summary = metric_df.groupby('Strategy')[metric].agg(['count', 'mean', 'std', 'min', 'max']).round(4)
        f.write(f"{summary.to_string()}\n\n")

plt.tight_layout()
plt.show()

## Syntactic Similarity

In [None]:
metric = 'Syntactic_Similarity'

# One panel per creator
fig, axes = plt.subplots(1, 3, figsize=(20, 6))
fig.suptitle(f'{metric} Distribution by Strategy', fontsize=16, y=1.05)

# Append this metric's section to the shared statistics file
stats_output = "statistics_metrics.txt"
with open(stats_output, 'a') as f:
    f.write(f"\n\n{metric} Statistics\n{'='*50}\n\n")

# Plot for each creator
for idx, (creator, creator_results) in enumerate(results.items(), 1):
    ax = axes[idx-1]

    # Long-form table: one row per non-missing metric value
    data = []
    for strategy_name, df in creator_results.items():
        values = df[metric].dropna()
        for value in values:
            data.append({'Strategy': strategy_name, metric: value})

    metric_df = pd.DataFrame(data)

    # Pin the category order once and use it for both the boxes and the mean
    # labels. groupby() sorts its keys alphabetically while the boxes follow
    # order of appearance, so without this the labels can sit on the wrong box.
    strategy_order = list(metric_df['Strategy'].unique())

    # Create box plot
    sns.boxplot(x='Strategy', y=metric, data=metric_df, order=strategy_order, ax=ax)
    ax.set_title(f'Creator {idx}')
    ax.set_xlabel('Strategy')
    ax.set_ylabel(metric if idx == 1 else '')

    # Annotate each box with the mean of the strategy drawn at that position
    means = metric_df.groupby('Strategy')[metric].mean()
    for i, strategy_name in enumerate(strategy_order):
        mean_val = means[strategy_name]
        ax.text(i, mean_val, f'{mean_val:.3f}',
                horizontalalignment='center',
                verticalalignment='bottom')

    # Save statistics to file
    with open(stats_output, 'a') as f:
        f.write(f"Creator {idx}\n{'-'*20}\n")
        summary = metric_df.groupby('Strategy')[metric].agg(['count', 'mean', 'std', 'min', 'max']).round(4)
        f.write(f"{summary.to_string()}\n\n")

plt.tight_layout()
plt.show()

## Lexical Diversity

In [None]:
metrics = ['Reference_MTLD', 'Candidate_MTLD']

# One panel per creator, reference and candidate MTLD side by side
fig, axes = plt.subplots(1, 3, figsize=(20, 6))
fig.suptitle('MTLD Distribution by Strategy', fontsize=16, y=1.05)

# Open a new MTLD section in the shared statistics file
stats_output = "statistics_metrics.txt"
with open(stats_output, 'a') as f:
    f.write(f"\n\nMTLD Statistics\n{'='*50}\n\n")

for idx, (creator, creator_results) in enumerate(results.items(), 1):
    panel = axes[idx-1]

    # Long-form rows: one per non-missing MTLD value, tagged with its
    # strategy and with whether it is a reference or a candidate score
    data = [
        {
            'Strategy': strategy_name,
            'MTLD_Type': mtld_metric.replace('_MTLD', ''),
            'MTLD': value
        }
        for strategy_name, df in creator_results.items()
        for mtld_metric in metrics
        for value in df[mtld_metric].dropna()
    ]
    metric_df = pd.DataFrame(data)

    # Grouped box plot: strategies on the x axis, reference/candidate as hue
    sns.boxplot(x='Strategy', y='MTLD', hue='MTLD_Type', data=metric_df, ax=panel)
    panel.set_title(f'Creator {idx}')
    panel.set_xlabel('Strategy')
    panel.set_ylabel('MTLD' if idx == 1 else '')

    # Append this creator's descriptive statistics to the file
    with open(stats_output, 'a') as f:
        f.write(f"Creator {idx}\n{'-'*20}\n")
        grouped = metric_df.groupby(['Strategy', 'MTLD_Type'])['MTLD']
        summary = grouped.agg(['count', 'mean', 'std', 'min', 'max']).round(4)
        f.write(f"{summary.to_string()}\n\n")

plt.tight_layout()
plt.show()

In [None]:
# Create figure with 3 subplots (one for each creator)
fig, axes = plt.subplots(1, 3, figsize=(20, 6))
fig.suptitle('MTLD Distribution by Strategy with Reference Comparison', fontsize=16, y=1.05)

# Plot for each creator
for idx, (creator, creator_results) in enumerate(results.items()):
    ax = axes[idx]

    # Prepare data for plotting
    data = []

    # Get the first strategy (whatever its name is)
    first_strategy = list(creator_results.keys())[0]

    # Reference scores are taken from the first strategy's table only
    reference_values = creator_results[first_strategy]['Reference_MTLD'].dropna()
    for value in reference_values:
        data.append({
            'Strategy': 'Reference',
            'MTLD': value
        })

    # Add strategy data
    for strategy_name, df in creator_results.items():
        values = df['Candidate_MTLD'].dropna()
        for value in values:
            data.append({
                'Strategy': strategy_name,
                'MTLD': value
            })

    metric_df = pd.DataFrame(data)

    # Pin the category order once and use it for both the boxes and the mean
    # labels. groupby() sorts its keys alphabetically while the boxes follow
    # order of appearance, so without this the labels can sit on the wrong box.
    group_order = list(metric_df['Strategy'].unique())

    # Create box plot
    sns.boxplot(x='Strategy', y='MTLD', data=metric_df, order=group_order, ax=ax)
    ax.set_title(f'Creator {idx+1}')
    ax.set_xlabel('Strategy')
    ax.set_ylabel('MTLD Score' if idx == 0 else '')

    # Annotate each box with its group mean, lifted slightly above the mean
    means = metric_df.groupby('Strategy')['MTLD'].mean()
    for i, group_name in enumerate(group_order):
        mean_val = means[group_name]
        ax.text(i, mean_val + 0.7, f'{mean_val:.3f}',
                horizontalalignment='center',
                verticalalignment='bottom')

    # Rotate x-axis labels
    ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

# Save statistics to file
stats_output = "statistics_metrics.txt"
with open(stats_output, 'a') as f:
    f.write(f"\n\nMTLD Statistics with Reference Comparison\n{'='*50}\n\n")
    for creator, creator_results in results.items():
        f.write(f"\nCreator: {creator}\n{'-'*20}\n")

        # Prepare data for statistics
        data = []
        # Add reference data from first strategy
        first_strategy = list(creator_results.keys())[0]
        reference_values = creator_results[first_strategy]['Reference_MTLD'].dropna()
        data.append({
            'Strategy': 'Reference',
            'Values': reference_values
        })

        # Add strategy data
        for strategy_name, df in creator_results.items():
            data.append({
                'Strategy': strategy_name,
                'Values': df['Candidate_MTLD'].dropna()
            })

        # Calculate statistics
        for item in data:
            values = item['Values']
            f.write(f"\n{item['Strategy']}:\n")
            f.write(f"Mean: {values.mean():.4f}\n")
            f.write(f"Std: {values.std():.4f}\n")
            f.write(f"Min: {values.min():.4f}\n")
            f.write(f"Max: {values.max():.4f}\n")

In [None]:
# Create figure with 3 subplots for average MTLD values
fig, axes = plt.subplots(1, 3, figsize=(20, 6))
fig.suptitle('Average MTLD by Strategy', fontsize=16, y=1.05)

# Columns compared in this cell. They are listed here so the cell does not
# depend on a `metrics` variable left behind by whichever cell ran last.
mtld_columns = ['Reference_MTLD', 'Candidate_MTLD']
type_order = [column.replace('_MTLD', '') for column in mtld_columns]

# Confidence level for the error bars
confidence = 0.95

# Plot for each creator
for idx, (creator, creator_results) in enumerate(results.items(), 1):
    ax = axes[idx-1]
    strategy_order = list(creator_results.keys())

    # One summary row (mean, std, n) per strategy and MTLD type
    data = []
    for strategy_name, df in creator_results.items():
        for mtld_metric in mtld_columns:
            values = df[mtld_metric].dropna()
            data.append({
                'Strategy': strategy_name,
                'MTLD_Type': mtld_metric.replace('_MTLD', ''),
                'MTLD': values.mean(),
                'std': values.std(),
                'count': len(values)
            })

    metric_df = pd.DataFrame(data)

    # t-based confidence interval half-width `h` around each mean
    metric_df['sem'] = metric_df['std'] / np.sqrt(metric_df['count'])
    metric_df['h'] = metric_df['sem'] * stats.t.ppf((1 + confidence) / 2., metric_df['count'] - 1)

    # Create bar plot with an explicit category/hue order so that bars can be
    # matched back to their summary rows below
    sns.barplot(x='Strategy', y='MTLD', hue='MTLD_Type', data=metric_df,
                order=strategy_order, hue_order=type_order,
                ax=ax, capsize=0.2)

    # seaborn only sees one value per bar and therefore cannot draw an interval
    # itself; draw the half-widths computed above on top of the bars.
    # Snapshot the containers first: errorbar() appends its own container.
    # NOTE(review): assumes one bar container per hue level, with bars in
    # `strategy_order` -- confirm against the installed seaborn version.
    bar_containers = list(ax.containers)
    for container, mtld_type in zip(bar_containers, type_order):
        half_widths = (
            metric_df[metric_df['MTLD_Type'] == mtld_type]
            .set_index('Strategy')['h']
            .reindex(strategy_order)
            .to_numpy()
        )
        centers = [bar.get_x() + bar.get_width() / 2 for bar in container]
        heights = [bar.get_height() for bar in container]
        if len(centers) != len(half_widths):
            # Bars were dropped (e.g. a missing mean); skip rather than mislabel
            continue
        ax.errorbar(centers, heights, yerr=half_widths,
                    fmt='none', ecolor='black', capsize=4)

    ax.set_title(f'Creator {idx}')
    ax.set_xlabel('Strategy')
    ax.set_ylabel('Average MTLD' if idx == 1 else '')

plt.tight_layout()
plt.show()

## Sentiment Analysis

In [None]:
# Compound Sentiment Comparison across creators
fig, axes = plt.subplots(1, 3, figsize=(20, 6))
fig.suptitle('Sentiment Compound Score Distribution by Strategy', fontsize=16, y=1.05)

# Plot for each creator
for idx, (creator, creator_results) in enumerate(results.items(), 1):
    ax = axes[idx-1]
    plot_data = []

    # Get reference data from Strategy 1 (consistent reference)
    reference_values = creator_results['Strategy 1']['Reference_Sentiment_Compound'].dropna()
    for value in reference_values:
        plot_data.append({
            'Strategy': 'Reference',
            'Compound': value
        })

    # Add strategy data
    for strategy_name, df in creator_results.items():
        values = df['Candidate_Sentiment_Compound'].dropna()
        for value in values:
            plot_data.append({
                'Strategy': strategy_name,
                'Compound': value
            })

    plot_df = pd.DataFrame(plot_data)

    # Pin the category order once and use it for both the boxes and the mean
    # labels. groupby() sorts its keys alphabetically while the boxes follow
    # order of appearance, so without this the labels can sit on the wrong box.
    group_order = list(plot_df['Strategy'].unique())

    # Create box plot
    sns.boxplot(x='Strategy', y='Compound', data=plot_df, order=group_order, ax=ax)
    ax.set_title(f'Creator {idx}')
    ax.set_xlabel('Strategy')
    ax.set_ylabel('Compound Score' if idx == 1 else '')

    # Add means as text, shifted slightly to the right of each box
    means = plot_df.groupby('Strategy')['Compound'].mean()
    for i, group_name in enumerate(group_order):
        mean_val = means[group_name]
        ax.text(i + 0.2, mean_val, f'{mean_val:.3f}',
                horizontalalignment='center',
                verticalalignment='bottom')

    ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

In [None]:
# Sentiment Components (Pos, Neg, Neu) Comparison
sentiment_components = ['Sentiment_Neg', 'Sentiment_Neu', 'Sentiment_Pos']
titles = ['Negative', 'Neutral', 'Positive']

fig, axes = plt.subplots(3, 3, figsize=(20, 15))  # 3 rows (one per component) x 3 cols (one per creator)
fig.suptitle('Sentiment Components Distribution by Strategy', fontsize=16, y=1.02)

for row, (component, title) in enumerate(zip(sentiment_components, titles)):
    for col, (creator, creator_results) in enumerate(results.items()):
        ax = axes[row, col]
        plot_data = []

        # Get reference data from Strategy 1
        ref_values = creator_results['Strategy 1'][f'Reference_{component}'].dropna()
        for value in ref_values:
            plot_data.append({
                'Strategy': 'Reference',
                'Score': value
            })

        # Add strategy data
        for strategy_name, df in creator_results.items():
            values = df[f'Candidate_{component}'].dropna()
            for value in values:
                plot_data.append({
                    'Strategy': strategy_name,
                    'Score': value
                })

        plot_df = pd.DataFrame(plot_data)

        # Pin the category order once and use it for both the boxes and the
        # mean labels. groupby() sorts its keys alphabetically while the boxes
        # follow order of appearance, so labels could otherwise be misplaced.
        group_order = list(plot_df['Strategy'].unique())

        # Create box plot
        sns.boxplot(x='Strategy', y='Score', data=plot_df, order=group_order, ax=ax)

        # Column titles on the top row only, y labels on the first column only
        if row == 0:
            ax.set_title(f'Creator {col+1}')
        if col == 0:
            ax.set_ylabel(f'{title} Score')
        else:
            ax.set_ylabel('')

        ax.set_xlabel('')

        # Add means as text, shifted slightly to the right of each box
        means = plot_df.groupby('Strategy')['Score'].mean()
        for i, group_name in enumerate(group_order):
            mean_val = means[group_name]
            ax.text(i + 0.2, mean_val, f'{mean_val:.3f}',
                    horizontalalignment='left',
                    verticalalignment='bottom')

        ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

In [None]:
sentiment_metrics = ['Sentiment_Neg', 'Sentiment_Neu', 'Sentiment_Pos', 'Sentiment_Compound']

# Append to statistics file
stats_output = "statistics_metrics.txt"
with open(stats_output, 'a') as f:
    f.write(f"\n\nSentiment Analysis Statistics\n{'='*50}\n\n")

# Create figure with 3 subplots for Compound sentiment (as a basic view)
fig, axes = plt.subplots(1, 3, figsize=(20, 6))
fig.suptitle('Sentiment Compound Score Distribution by Strategy', fontsize=16, y=1.05)

# Plot for each creator
for idx, (creator, creator_results) in enumerate(results.items(), 1):
    # Prepare data and calculate statistics for all sentiment metrics
    stats_data = []
    plot_data = []

    for strategy_name, df in creator_results.items():
        # For plotting (just compound scores)
        ref_compound = df['Reference_Sentiment_Compound'].dropna()
        cand_compound = df['Candidate_Sentiment_Compound'].dropna()

        for value in ref_compound:
            plot_data.append({'Strategy': strategy_name, 'Type': 'Reference', 'Compound': value})
        for value in cand_compound:
            plot_data.append({'Strategy': strategy_name, 'Type': 'Candidate', 'Compound': value})

        # For statistics (all sentiment metrics): one row of column means per
        # strategy and prefix. The row dict must NOT be called `stats`: that
        # name is the scipy.stats module used by the statistical-analysis
        # cells, and rebinding it here would break them.
        for prefix in ['Reference', 'Candidate']:
            row_stats = {
                'Strategy': strategy_name,
                'Type': prefix
            }
            for sentiment_metric in sentiment_metrics:
                col_name = f'{prefix}_{sentiment_metric}'
                if col_name in df.columns:
                    row_stats[sentiment_metric] = df[col_name].mean()
            stats_data.append(row_stats)

    # Save statistics to file
    with open(stats_output, 'a') as f:
        f.write(f"Creator {idx}\n{'-'*20}\n")
        stats_df = pd.DataFrame(stats_data)
        f.write(f"{stats_df.to_string()}\n\n")

    # Create box plot for compound scores
    plot_df = pd.DataFrame(plot_data)
    sns.boxplot(x='Strategy', y='Compound', hue='Type', data=plot_df, ax=axes[idx-1])
    axes[idx-1].set_title(f'Creator {idx}')
    axes[idx-1].set_xlabel('Strategy')
    axes[idx-1].set_ylabel('Compound Score' if idx == 1 else '')

plt.tight_layout()
plt.show()

In [None]:
# 3. Distance from Reference Plot
fig, axes = plt.subplots(1, 3, figsize=(20, 6))
fig.suptitle('Sentiment Distance from Reference', fontsize=16, y=1.05)

for idx, (creator, creator_results) in enumerate(results.items(), 1):
    panel = axes[idx-1]
    distances = []

    for strategy_name, df in creator_results.items():
        # Walk the reference and candidate (pos, neu, neg) triples row by row
        reference_rows = zip(df['Reference_Sentiment_Pos'],
                             df['Reference_Sentiment_Neu'],
                             df['Reference_Sentiment_Neg'])
        candidate_rows = zip(df['Candidate_Sentiment_Pos'],
                             df['Candidate_Sentiment_Neu'],
                             df['Candidate_Sentiment_Neg'])

        for reference_triple, candidate_triple in zip(reference_rows, candidate_rows):
            # Euclidean distance between the two points in sentiment space
            ref_point = np.array(reference_triple)
            cand_point = np.array(candidate_triple)
            distances.append({
                'Strategy': strategy_name,
                'Distance': np.linalg.norm(ref_point - cand_point)
            })

    # One violin per strategy showing the spread of the distances
    distance_df = pd.DataFrame(distances)
    sns.violinplot(x='Strategy', y='Distance', data=distance_df, ax=panel)

    panel.set_title(f'Creator {idx}')
    panel.set_xlabel('Strategy')
    panel.set_ylabel('Distance from Reference' if idx == 1 else '')

plt.tight_layout()
plt.show()

# Step 6: Statistical Analysis

In [90]:
def check_normality(data, metric_name):
    """Print Shapiro-Wilk p-values for one metric, per creator and strategy.

    Args:
        data: Mapping ``{creator: {strategy: DataFrame}}`` to test. When it does
            not have that shape (for example ``None``), the module-level
            ``results`` mapping is used instead, which is what this function
            always read before the parameter was honoured.
        metric_name: Column to test in every strategy DataFrame.

    Returns:
        None. The p-values are printed.
    """
    has_expected_shape = (
        isinstance(data, dict)
        and len(data) > 0
        and all(isinstance(value, dict) for value in data.values())
    )
    source = data if has_expected_shape else results

    # Shapiro-Wilk test for each strategy and creator
    for creator, creator_results in source.items():
        print(f"\nNormality Tests for {metric_name} - {creator}")
        for strategy, df in creator_results.items():
            values = df[metric_name].dropna()
            # scipy's Shapiro-Wilk needs at least three observations
            if len(values) < 3:
                print(f"{strategy}: not enough data for Shapiro-Wilk (n = {len(values)})")
                continue
            stat, p_value = stats.shapiro(values)
            print(f"{strategy}: p-value = {p_value:.4f}")

In [None]:
def perform_statistical_analysis(metric_name):
    """Print normality, group-comparison and effect-size results for one metric.

    Works on the module-level ``results`` mapping
    (``{creator: {strategy: DataFrame}}``) and prints three sections:

    1. Shapiro-Wilk normality test per creator and strategy.
    2. Per creator, a comparison across strategies: one-way ANOVA (followed by
       Tukey's HSD when significant) if *every* sample of *every* creator
       looked normal, otherwise Kruskal-Wallis (followed by pairwise
       Mann-Whitney U tests with Bonferroni correction when significant).
    3. Cohen's d for every pair of strategies, per creator.

    Args:
        metric_name: Column to analyse in every strategy DataFrame.

    Returns:
        None. Everything is printed.
    """
    print(f"\n{'='*20} Statistical Analysis for {metric_name} {'='*20}\n")

    # 1. Normality Tests
    print("1. Normality Test (Shapiro-Wilk)")
    print("---------------------------------")
    all_normal = True

    for creator, creator_results in results.items():
        print(f"\nCreator: {creator}")
        for strategy, df in creator_results.items():
            stat, p_value = stats.shapiro(df[metric_name].dropna())
            is_normal = p_value > 0.05
            all_normal = all_normal and is_normal
            print(f"{strategy}: p-value = {p_value:.4f} ({'Normal' if is_normal else 'Non-normal'})")

    # 2. Strategy Comparison Tests
    print("\n2. Strategy Comparison")
    print("----------------------")

    for creator, creator_results in results.items():
        print(f"\nCreator: {creator}")

        # Non-missing values of the metric, keyed by strategy name
        samples = {strategy: df[metric_name].dropna()
                   for strategy, df in creator_results.items()}
        strategy_data = list(samples.values())

        if all_normal:
            # Use one-way ANOVA for normal distributions
            f_stat, p_value = stats.f_oneway(*strategy_data)
            print(f"One-way ANOVA: p-value = {p_value:.4f}")

            if p_value < 0.05:
                # Post-hoc Tukey test. Groups carry the real strategy names so
                # the table can be read without knowing the load order.
                data = []
                groups = []
                for strategy, values in samples.items():
                    data.extend(values)
                    groups.extend([strategy] * len(values))

                tukey = pairwise_tukeyhsd(data, groups)
                print("\nTukey's HSD test:")
                print(tukey)
        else:
            # Use Kruskal-Wallis H-test for non-normal distributions
            h_stat, p_value = stats.kruskal(*strategy_data)
            print(f"Kruskal-Wallis H-test: p-value = {p_value:.4f}")

            if p_value < 0.05:
                # Post-hoc Mann-Whitney U tests with Bonferroni correction
                print("\nPairwise Mann-Whitney U tests (with Bonferroni correction):")
                strategies = list(samples)
                n_comparisons = len(strategies) * (len(strategies) - 1) / 2
                for i in range(len(strategies)):
                    for j in range(i+1, len(strategies)):
                        stat, p = stats.mannwhitneyu(
                            samples[strategies[i]],
                            samples[strategies[j]]
                        )
                        # Bonferroni correction; a p-value cannot exceed 1
                        p_adjusted = min(1.0, p * n_comparisons)
                        print(f"{strategies[i]} vs {strategies[j]}: p-value = {p_adjusted:.4f}")

    # 3. Effect Size Analysis
    print("\n3. Effect Size Analysis")
    print("----------------------")
    for creator, creator_results in results.items():
        print(f"\nCreator: {creator}")
        strategies = list(creator_results.keys())
        for i in range(len(strategies)):
            for j in range(i+1, len(strategies)):
                d = cohen_d(
                    creator_results[strategies[i]][metric_name].dropna(),
                    creator_results[strategies[j]][metric_name].dropna()
                )
                print(f"Cohen's d ({strategies[i]} vs {strategies[j]}): {d:.4f}")

# Helper function for Cohen's d
def cohen_d(x, y):
    """Return Cohen's d for two independent samples using the pooled sample SD.

    Args:
        x: First sample; a pandas Series or any sequence/array of numbers.
        y: Second sample, same kinds of input as ``x``.

    Returns:
        ``(mean(x) - mean(y)) / pooled_sd`` where the pooled SD is built from
        the sample standard deviations (ddof=1). ``nan`` is returned when the
        two samples together have fewer than three observations, because the
        pooled SD is undefined in that case.
    """
    # Coerce to Series so lists and numpy arrays get the same sample (ddof=1)
    # standard deviation that pandas input already had.
    x = pd.Series(x, dtype=float)
    y = pd.Series(y, dtype=float)

    nx = len(x)
    ny = len(y)
    dof = nx + ny - 2
    if dof <= 0:
        return float('nan')

    pooled_variance = ((nx-1)*x.std()**2 + (ny-1)*y.std()**2) / dof
    return (x.mean() - y.mean()) / np.sqrt(pooled_variance)

# Metrics that receive the full statistical analysis, in reporting order
metrics_to_analyze = [
    'BERTScore_F1',
    'Syntactic_Similarity',
    'Reference_MTLD',
    'Candidate_MTLD',
    'Reference_Sentiment_Compound',
    'Candidate_Sentiment_Compound',
]

# Print one complete report per metric
for metric in metrics_to_analyze:
    perform_statistical_analysis(metric)

In [None]:
### 1. Data Quality Check
def check_data_quality():
    """Print sample size, missing-value counts and summary statistics.

    Walks the module-level ``results`` mapping and prints one report section
    per creator and strategy table.
    """
    print("DATA QUALITY REPORT")
    print("==================\n")

    separator = "-" * 20
    for creator_name, strategy_tables in results.items():
        print(f"Creator: {creator_name}")
        print(separator)

        for strategy_name, table in strategy_tables.items():
            # Header and row count for this strategy
            print(f"\nStrategy: {strategy_name}")
            print(f"Sample size: {len(table)}")

            # Number of missing cells in every column
            print("\nMissing values:")
            print(table.isnull().sum())

            # Descriptive statistics of the numeric columns, 4 decimals
            print("\nBasic statistics:")
            print(table.describe().round(4))
        print("\n")

check_data_quality()

In [None]:
### 2. Cross-Metric Correlation Analysis and Visualization


def analyze_metric_correlations():
    """Print and plot the correlations between the key metrics.

    For every creator and strategy in the module-level ``results`` mapping this
    prints the Pearson correlation matrix of the key metrics, shows it as a
    heatmap and prints the matching table of p-values.

    Returns:
        None. Output is printed and plotted.
    """
    print("CROSS-METRIC CORRELATION ANALYSIS")
    print("================================\n")

    correlation_metrics = [
        'BERTScore_F1',
        'Syntactic_Similarity',
        'Candidate_MTLD',
        'Candidate_Sentiment_Compound'
    ]

    for creator, creator_results in results.items():
        print(f"Creator: {creator}")
        print("-" * 20)

        for strategy, df in creator_results.items():
            print(f"\nStrategy: {strategy}")

            # Correlation matrix
            corr_matrix = df[correlation_metrics].corr()
            print("\nCorrelation Matrix:")
            print(corr_matrix.round(4))

            # Create heatmap; the figure is closed afterwards so that one
            # figure per creator/strategy does not pile up in memory
            fig = plt.figure(figsize=(10, 8))
            sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1, center=0)
            plt.title(f'Correlation Matrix - {creator} - {strategy}')
            plt.tight_layout()
            plt.show()
            plt.close(fig)

            # Statistical significance. The table is float-typed so round()
            # really applies; the diagonal stays NaN.
            print("\nCorrelation P-values:")
            p_values = pd.DataFrame(np.nan, index=correlation_metrics,
                                    columns=correlation_metrics, dtype=float)
            for i in correlation_metrics:
                for j in correlation_metrics:
                    if i == j:
                        continue
                    # Drop rows where EITHER value is missing. Dropping NaNs
                    # per column would shift the pairs against each other or
                    # leave the two inputs with different lengths.
                    paired = df[[i, j]].dropna()
                    if len(paired) < 2:
                        # Pearson's r needs at least two complete pairs
                        continue
                    stat, p = stats.pearsonr(paired[i].values, paired[j].values)
                    p_values.loc[i, j] = p
            print(p_values.round(4))
        print("\n")

analyze_metric_correlations()

In [96]:
def perform_power_analysis():
    """Print the statistical power implied by the average effect and sample size.

    For every creator, key metric and pair of strategies in the module-level
    ``results`` mapping, the absolute Cohen's d and the smaller of the two
    sample sizes are collected. Their averages are fed to a t-test power
    calculation, and the sample size needed for 80% power is printed as well.

    Returns:
        None. Results are printed.
    """
    # Imported here so the function does not rely on another cell having
    # brought TTestPower into scope.
    # NOTE(review): TTestPower models a one-sample/paired t-test; the strategy
    # samples look independent, in which case statsmodels' TTestIndPower would
    # be the matching model -- confirm before relying on these numbers.
    from statsmodels.stats.power import TTestPower

    print("STATISTICAL POWER ANALYSIS")
    print("=========================\n")

    # Parameters
    alpha = 0.05
    sample_sizes = []
    effect_sizes = []

    metrics_list = [
        'BERTScore_F1',
        'Syntactic_Similarity',
        'Candidate_MTLD',
        'Candidate_Sentiment_Compound'
    ]

    for creator, creator_results in results.items():
        for metric in metrics_list:
            strategies = list(creator_results.keys())
            for i in range(len(strategies)-1):
                for j in range(i+1, len(strategies)):
                    data1 = creator_results[strategies[i]][metric].dropna()
                    data2 = creator_results[strategies[j]][metric].dropna()

                    effect_size = abs(cohen_d(data1, data2))
                    # Keep the sample size only together with a usable effect
                    # size, so both averages describe the same comparisons
                    if np.isfinite(effect_size):
                        effect_sizes.append(effect_size)
                        sample_sizes.append(min(len(data1), len(data2)))

    if effect_sizes:  # Check if we have valid effect sizes
        mean_effect_size = float(np.mean(effect_sizes))
        mean_sample_size = float(np.mean(sample_sizes))

        power_analysis = TTestPower()
        power = power_analysis.power(
            effect_size=mean_effect_size,
            nobs=mean_sample_size,
            alpha=alpha
        )

        print(f"Average effect size: {mean_effect_size:.4f}")
        print(f"Average sample size: {int(mean_sample_size)}")
        print(f"Statistical power: {power:.4f}")

        required_n = power_analysis.solve_power(
            effect_size=mean_effect_size,
            power=0.8,
            alpha=alpha
        )
        # Round up: truncating would report a size that falls short of 80% power
        print(f"Required sample size for 80% power: {int(np.ceil(required_n))}")
    else:
        print("No valid effect sizes calculated")

In [None]:
def create_summary_report():
    """Print a summary table of the key metrics and plot their means.

    Reads the module-level ``results`` mapping, prints mean, standard
    deviation, median and count per creator/strategy/metric, and draws one bar
    chart of the means per metric on a 2x2 grid.
    """
    print("SUMMARY STATISTICS REPORT")
    print("========================\n")

    # Metrics covered by the report
    metrics_list = [
        'BERTScore_F1',
        'Syntactic_Similarity',
        'Candidate_MTLD',
        'Candidate_Sentiment_Compound'
    ]

    # One record per (creator, strategy, metric) combination
    summary_data = [
        {
            'Creator': creator,
            'Strategy': strategy,
            'Metric': metric,
            'Mean': df[metric].mean(),
            'Std': df[metric].std(),
            'Median': df[metric].median(),
            'N': len(df[metric].dropna())
        }
        for creator, creator_results in results.items()
        for strategy, df in creator_results.items()
        for metric in metrics_list
    ]
    summary_df = pd.DataFrame(summary_data)

    # Print summary table
    print("Overall Summary Statistics:")
    print(summary_df.round(4))

    # 2x2 grid, filled row by row with one metric per panel
    fig, axes = plt.subplots(2, 2, figsize=(15, 15))
    fig.suptitle('Summary of Key Metrics Across Strategies and Creators')

    for position, metric in enumerate(metrics_list):
        row, col = divmod(position, 2)
        panel = axes[row, col]
        metric_rows = summary_df[summary_df['Metric'] == metric]

        sns.barplot(data=metric_rows,
                    x='Strategy', y='Mean', hue='Creator',
                    ax=panel)

        panel.set_title(f'{metric} by Strategy and Creator')
        panel.set_ylabel('Mean Value')
        panel.tick_params(axis='x', rotation=45)

    plt.tight_layout()
    plt.show()

create_summary_report()

In [None]:
# Create figure with 3 subplots for the different metrics
fig, axes = plt.subplots(1, 3, figsize=(20, 6))
fig.suptitle('Summary of Key Metrics Across Strategies and Creators', fontsize=16, y=1.05)

# Result column -> display name of the panel
metrics = {
    'BERTScore_F1': 'BERTScore',
    'Syntactic_Similarity': 'Syntactic Similarity', 
    'Candidate_MTLD': 'MTLD'
}

# Prepare data: one median per creator, strategy and metric
summary_data = []

for creator_idx, (creator, creator_results) in enumerate(results.items(), 1):
    creator_num = f'Creator {creator_idx}'

    for strategy, df in creator_results.items():
        for metric_key, metric_name in metrics.items():
            # Calculate median instead of mean to account for outliers
            summary_data.append({
                'Creator': creator_num,
                'Strategy': strategy,
                'Metric': metric_name,
                'Value': df[metric_key].median()
            })

            # Add reference MTLD if this is the MTLD metric
            if metric_key == 'Candidate_MTLD':
                summary_data.append({
                    'Creator': creator_num,
                    'Strategy': strategy,
                    'Metric': f'{metric_name} Reference',
                    'Value': df['Reference_MTLD'].median()
                })

summary_df = pd.DataFrame(summary_data)

# Explicit category orders, shared by all panels
creator_order = list(summary_df['Creator'].unique())
strategy_order = list(summary_df['Strategy'].unique())

# Create plots
for idx, (metric_key, metric_name) in enumerate(metrics.items()):
    ax = axes[idx]

    if metric_name == 'MTLD':
        # Candidate and reference medians get their own hue level. With
        # hue='Creator' alone seaborn would average the two rows of each
        # creator/strategy into a single bar.
        reference_name = f'{metric_name} Reference'
        plot_data = summary_df[summary_df['Metric'].isin([metric_name, reference_name])].copy()
        plot_data['Series'] = plot_data['Creator'].where(
            plot_data['Metric'] == metric_name,
            plot_data['Creator'] + ' (Reference)'
        )
        series_order = [label
                        for creator_num in creator_order
                        for label in (creator_num, f'{creator_num} (Reference)')]

        sns.barplot(data=plot_data,
                    x='Strategy', y='Value', hue='Series',
                    order=strategy_order, hue_order=series_order,
                    ax=ax)

        # Hatch the reference bars.
        # NOTE(review): assumes one bar container per hue level, in
        # `series_order` -- confirm against the installed seaborn version.
        bar_containers = list(ax.containers)
        for container, series_label in zip(bar_containers, series_order):
            if series_label.endswith('(Reference)'):
                for bar in container:
                    bar.set_hatch('//')
                    bar.set_alpha(0.7)
    else:
        # For other metrics, plot only candidate values
        plot_data = summary_df[summary_df['Metric'] == metric_name]
        sns.barplot(data=plot_data,
                    x='Strategy', y='Value', hue='Creator',
                    order=strategy_order, hue_order=creator_order,
                    ax=ax)
        bar_containers = list(ax.containers)

    # Add value labels on the data bars (taken from the bar containers so that
    # helper patches, if any, are not labelled)
    for container in bar_containers:
        for bar in container:
            height = bar.get_height()
            if np.isnan(height):
                continue
            ax.text(bar.get_x() + bar.get_width()/2., height,
                    f'{height:.2f}',
                    ha='center', va='bottom')

    ax.set_title(metric_name)
    ax.set_xlabel('Strategy')
    ax.set_ylabel('Median Value')
    ax.tick_params(axis='x', rotation=45)

    # Only keep one legend (on the rightmost plot); a panel may have none
    legend = ax.get_legend()
    if idx < 2 and legend is not None:
        legend.remove()

plt.tight_layout()
plt.show()

# Print numerical summary
print("\nNumerical Summary:")
summary_pivot = summary_df.pivot_table(
    values='Value',
    index=['Creator', 'Strategy'],
    columns=['Metric'],
    aggfunc='first'
).round(4)
print(summary_pivot)

# Step 6.1: New Statistics

In [None]:
def compute_comprehensive_statistics(pretrained_path='#path',
                                     output_path="comprehensive_statistics.txt",
                                     results_by_creator=None,
                                     effect_size_fn=None):
    """Write the RQ1 report comparing the pretrained model with the fine-tuned strategies.

    For each of four metric columns the report contains mean, median, standard
    deviation and quartiles for the pretrained model and for 'Strategy 1'..
    'Strategy 3' of 'creator1', a two-sided Mann-Whitney U p-value and an
    effect size for each strategy against the pretrained model, and a
    Kruskal-Wallis p-value across all four groups. The report ends with the
    header of a fine-tuned strategies section; no content is written under it.

    Args:
        pretrained_path: CSV file with the pretrained model's metric columns.
            The default '#path' is the notebook's placeholder and must be
            replaced with (or overridden by) a real path.
        output_path: Text file the report is written to (overwritten).
        results_by_creator: Mapping creator -> strategy -> DataFrame. Defaults
            to the notebook-level ``results`` variable.
        effect_size_fn: Callable ``(a, b) -> float`` used as the effect size.
            Defaults to the notebook-level ``cohen_d`` function.

    Returns:
        None. The report is written to ``output_path``.
    """
    # Fall back to the notebook globals so the zero-argument call keeps working.
    if results_by_creator is None:
        results_by_creator = results
    if effect_size_fn is None:
        effect_size_fn = cohen_d

    metrics = ['BERTScore_F1', 'Syntactic_Similarity', 'Candidate_MTLD', 'Candidate_Sentiment_Compound']
    strategies = ['Strategy 1', 'Strategy 2', 'Strategy 3']
    pretrained_df = pd.read_csv(pretrained_path)
    creator_frames = results_by_creator['creator1']

    def write_descriptives(f, values):
        # Descriptive statistics block shared by every group in the report.
        f.write(f"Mean: {values.mean():.4f}\n")
        f.write(f"Median: {values.median():.4f}\n")
        f.write(f"Std: {values.std():.4f}\n")
        f.write(f"Q1: {values.quantile(0.25):.4f}\n")
        f.write(f"Q3: {values.quantile(0.75):.4f}\n")

    with open(output_path, 'w') as f:
        # 1. RQ1: Pretrained vs Fine-tuned (First Creator, all strategies)
        f.write("="*80 + "\n")
        f.write("RQ1: PRETRAINED VS FINE-TUNED (First Creator)\n")
        f.write("="*80 + "\n\n")

        for metric in metrics:
            f.write(f"\n{metric} Statistics:\n{'-'*50}\n")

            # Pretrained stats
            pretrained_values = pretrained_df[metric].dropna()
            f.write("\nPretrained Model:\n")
            write_descriptives(f, pretrained_values)

            # Extract each strategy's series once; reused by the per-strategy
            # tests and by the overall Kruskal-Wallis test below.
            strategy_series = {
                strategy: creator_frames[strategy][metric].dropna()
                for strategy in strategies
            }

            for strategy, strategy_values in strategy_series.items():
                f.write(f"\n{strategy}:\n")
                write_descriptives(f, strategy_values)

                # Statistical tests. `alternative` is spelled out because the
                # default differs between SciPy versions.
                _, pval = stats.mannwhitneyu(pretrained_values, strategy_values,
                                             alternative='two-sided')
                d = effect_size_fn(pretrained_values, strategy_values)
                f.write(f"Mann-Whitney U test p-value vs Pretrained: {pval:.4f}\n")
                f.write(f"Cohen's d effect size vs Pretrained: {d:.4f}\n")

            # Overall comparison across pretrained + all strategies
            _, p_val = stats.kruskal(pretrained_values, *strategy_series.values())
            f.write(f"\nKruskal-Wallis H-test p-value (all models): {p_val:.4f}\n")

        # 2. Fine-tuned Strategies Comparison
        f.write("\n\n" + "="*80 + "\n")
        f.write("FINE-TUNED STRATEGIES COMPARISON\n")
        f.write("="*80 + "\n")

        # NOTE(review): the original placeholder here read "[Rest of the code
        # remains the same]", but no code follows, so only the header is written.

# Run the analysis: the function writes its report to
# comprehensive_statistics.txt; the print below only confirms that on stdout.
compute_comprehensive_statistics()
print("Statistics have been saved to 'comprehensive_statistics.txt'")

In [None]:
def compute_enhanced_statistics(pretrained_path='#path',
                                output_path="enhanced_statistics.txt",
                                results_by_creator=None,
                                effect_size_fn=None):
    """Write the RQ1 report with confidence intervals and an overall effect size.

    For each of four metric columns the report contains mean, 95% t-interval
    of the mean, median, standard deviation and quartiles for the pretrained
    model and for 'Strategy 1'..'Strategy 3' of 'creator1'; an effect size and
    a two-sided Mann-Whitney U p-value for each strategy against the
    pretrained model; and a Kruskal-Wallis p-value with the eta-squared
    estimate (H - k + 1) / (n - k) across all four groups.

    Args:
        pretrained_path: CSV file with the pretrained model's metric columns.
            The default '#path' is the notebook's placeholder and must be
            replaced with (or overridden by) a real path.
        output_path: Text file the report is written to (overwritten).
        results_by_creator: Mapping creator -> strategy -> DataFrame. Defaults
            to the notebook-level ``results`` variable.
        effect_size_fn: Callable ``(a, b) -> float`` used as the effect size.
            Defaults to the notebook-level ``cohen_d`` function.

    Returns:
        None. The report is written to ``output_path``.
    """
    # Fall back to the notebook globals so the zero-argument call keeps working.
    if results_by_creator is None:
        results_by_creator = results
    if effect_size_fn is None:
        effect_size_fn = cohen_d

    metrics = ['BERTScore_F1', 'Syntactic_Similarity', 'Candidate_MTLD', 'Candidate_Sentiment_Compound']
    strategies = ['Strategy 1', 'Strategy 2', 'Strategy 3']
    pretrained_df = pd.read_csv(pretrained_path)
    creator_frames = results_by_creator['creator1']

    def write_descriptives(f, values):
        # Descriptive statistics (with 95% t-interval of the mean) for one group.
        ci = stats.t.interval(confidence=0.95,
                              df=len(values)-1,
                              loc=values.mean(),
                              scale=stats.sem(values))
        f.write(f"Mean: {values.mean():.4f}\n")
        f.write(f"95% CI: [{ci[0]:.4f}, {ci[1]:.4f}]\n")
        f.write(f"Median: {values.median():.4f}\n")
        f.write(f"Std: {values.std():.4f}\n")
        f.write(f"Q1: {values.quantile(0.25):.4f}\n")
        f.write(f"Q3: {values.quantile(0.75):.4f}\n")

    def kruskal_with_eta_squared(groups):
        # Kruskal-Wallis p-value plus eta-squared = (H - k + 1) / (n - k).
        h_stat, p_val = stats.kruskal(*groups)
        k = len(groups)
        n = sum(len(group) for group in groups)
        # The estimate is undefined when n == k (one observation per group).
        eta_sq = (h_stat - k + 1) / (n - k) if n > k else float('nan')
        return p_val, eta_sq

    with open(output_path, 'w') as f:
        # 1. RQ1: Pretrained vs All Fine-tuned Strategies (Creator 1)
        f.write("="*80 + "\n")
        f.write("RQ1: PRETRAINED VS ALL FINE-TUNED STRATEGIES (Creator 1)\n")
        f.write("="*80 + "\n\n")

        for metric in metrics:
            f.write(f"\n{metric} Statistics:\n{'-'*50}\n")

            # Pretrained stats
            pretrained_values = pretrained_df[metric].dropna()
            f.write("\nPretrained Model:\n")
            write_descriptives(f, pretrained_values)

            # Extract each strategy's series once; reused by the per-strategy
            # comparisons and by the overall test below.
            strategy_series = {
                strategy: creator_frames[strategy][metric].dropna()
                for strategy in strategies
            }

            for strategy, strategy_values in strategy_series.items():
                f.write(f"\n{strategy}:\n")
                write_descriptives(f, strategy_values)

                # Effect sizes and statistical tests against pretrained.
                # `alternative` is spelled out because the default differs
                # between SciPy versions.
                d = effect_size_fn(pretrained_values, strategy_values)
                _, pval = stats.mannwhitneyu(pretrained_values, strategy_values,
                                             alternative='two-sided')

                f.write("\nComparison with Pretrained:\n")
                f.write(f"Cohen's d: {d:.4f}\n")
                f.write(f"Mann-Whitney U test p-value: {pval:.4f}\n")

            # Kruskal-Wallis test across all models (pretrained + 3 strategies)
            p_val, eta_sq = kruskal_with_eta_squared(
                [pretrained_values, *strategy_series.values()]
            )

            f.write("\nOverall Comparison:\n")
            f.write(f"Kruskal-Wallis H-test p-value: {p_val:.4f}\n")
            f.write(f"Eta-squared effect size: {eta_sq:.4f}\n")

        # NOTE(review): the original placeholder here read "[Rest of the code
        # remains the same for correlation analysis and fine-tuned
        # comparisons]", but no such code follows in this function.

# Run enhanced analysis: the function writes its report to
# enhanced_statistics.txt; the print below only confirms that on stdout.
compute_enhanced_statistics()
print("Enhanced statistics have been saved to 'enhanced_statistics.txt'")

In [None]:
def compute_reference_inclusive_statistics(pretrained_path='#path',
                                           output_path="reference_inclusive_statistics.txt",
                                           results_by_creator=None,
                                           effect_size_fn=None):
    """Write a report comparing reference, pretrained and fine-tuned metric values.

    Two metrics are covered (MTLD and sentiment compound), each with a
    reference column and a candidate column.

    RQ1 section ('creator1' only): descriptive statistics with a 95%
    t-interval for the reference values (taken from the 'Strategy 1' frame),
    the pretrained model and 'Strategy 1'..'Strategy 3'; effect sizes and
    two-sided Mann-Whitney U p-values against reference and pretrained; and a
    Kruskal-Wallis p-value with eta-squared across all five groups.

    RQ2 section (every creator): a descriptive table for the reference values
    and every strategy of that creator, followed by a Kruskal-Wallis p-value
    with eta-squared across those groups.

    Args:
        pretrained_path: CSV file with the pretrained model's candidate metric
            columns. The default '#path' is the notebook's placeholder and
            must be replaced with (or overridden by) a real path.
        output_path: Text file the report is written to (overwritten).
        results_by_creator: Mapping creator -> strategy -> DataFrame. Defaults
            to the notebook-level ``results`` variable.
        effect_size_fn: Callable ``(a, b) -> float`` used as the effect size.
            Defaults to the notebook-level ``cohen_d`` function.

    Returns:
        None. The report is written to ``output_path``.
    """
    # Fall back to the notebook globals so the zero-argument call keeps working.
    if results_by_creator is None:
        results_by_creator = results
    if effect_size_fn is None:
        effect_size_fn = cohen_d

    metrics = {
        'MTLD': {'reference': 'Reference_MTLD', 'candidate': 'Candidate_MTLD'},
        'Sentiment': {'reference': 'Reference_Sentiment_Compound', 'candidate': 'Candidate_Sentiment_Compound'}
    }
    strategies = ['Strategy 1', 'Strategy 2', 'Strategy 3']

    pretrained_df = pd.read_csv(pretrained_path)

    def write_descriptives(f, values):
        # Descriptive statistics (with 95% t-interval of the mean) for one group.
        ci = stats.t.interval(confidence=0.95,
                              df=len(values)-1,
                              loc=values.mean(),
                              scale=stats.sem(values))
        f.write(f"Mean: {values.mean():.4f}\n")
        f.write(f"95% CI: [{ci[0]:.4f}, {ci[1]:.4f}]\n")
        f.write(f"Median: {values.median():.4f}\n")
        f.write(f"Std: {values.std():.4f}\n")
        f.write(f"Q1: {values.quantile(0.25):.4f}\n")
        f.write(f"Q3: {values.quantile(0.75):.4f}\n")

    def summarize(values):
        # One column of the RQ2 descriptive table.
        return {
            'mean': values.mean(),
            'median': values.median(),
            'std': values.std(),
            'q1': values.quantile(0.25),
            'q3': values.quantile(0.75)
        }

    def kruskal_with_eta_squared(groups):
        # Kruskal-Wallis p-value plus eta-squared = (H - k + 1) / (n - k).
        h_stat, p_val = stats.kruskal(*groups)
        k = len(groups)
        n = sum(len(group) for group in groups)
        # The estimate is undefined when n == k (one observation per group).
        eta_sq = (h_stat - k + 1) / (n - k) if n > k else float('nan')
        return p_val, eta_sq

    with open(output_path, 'w') as f:
        # RQ1: Reference vs Pretrained vs Fine-tuned
        f.write("="*80 + "\n")
        f.write("RQ1: REFERENCE VS PRETRAINED VS FINE-TUNED COMPARISON\n")
        f.write("="*80 + "\n\n")

        creator_frames = results_by_creator['creator1']

        for metric_name, metric_keys in metrics.items():
            f.write(f"\n{metric_name} Statistics:\n{'-'*50}\n")

            # Get reference values (from Strategy 1 of first creator)
            reference_values = creator_frames['Strategy 1'][metric_keys['reference']].dropna()
            f.write("\nReference Values:\n")
            write_descriptives(f, reference_values)

            # Pretrained stats
            pretrained_values = pretrained_df[metric_keys['candidate']].dropna()
            f.write("\nPretrained Model:\n")
            write_descriptives(f, pretrained_values)

            # Compare with reference. `alternative` is spelled out because the
            # default differs between SciPy versions.
            # NOTE(review): Mann-Whitney U treats the two samples as
            # independent; confirm that is intended if reference and candidate
            # values describe the same items.
            d_ref_pre = effect_size_fn(reference_values, pretrained_values)
            _, pval = stats.mannwhitneyu(reference_values, pretrained_values,
                                         alternative='two-sided')
            f.write("\nComparison with Reference:\n")
            f.write(f"Cohen's d: {d_ref_pre:.4f}\n")
            f.write(f"Mann-Whitney U test p-value: {pval:.4f}\n")

            # Extract each strategy's series once; reused by the per-strategy
            # comparisons and by the overall test below.
            strategy_series = {
                strategy: creator_frames[strategy][metric_keys['candidate']].dropna()
                for strategy in strategies
            }

            for strategy, strategy_values in strategy_series.items():
                f.write(f"\n{strategy}:\n")
                write_descriptives(f, strategy_values)

                # Compare with reference and pretrained
                d_ref = effect_size_fn(reference_values, strategy_values)
                d_pre = effect_size_fn(pretrained_values, strategy_values)
                _, pval_ref = stats.mannwhitneyu(reference_values, strategy_values,
                                                 alternative='two-sided')
                _, pval_pre = stats.mannwhitneyu(pretrained_values, strategy_values,
                                                 alternative='two-sided')

                f.write("\nComparisons:\n")
                f.write(f"vs Reference - Cohen's d: {d_ref:.4f}, p-value: {pval_ref:.4f}\n")
                f.write(f"vs Pretrained - Cohen's d: {d_pre:.4f}, p-value: {pval_pre:.4f}\n")

            # Overall comparison including reference
            p_val, eta_sq = kruskal_with_eta_squared(
                [reference_values, pretrained_values, *strategy_series.values()]
            )

            f.write("\nOverall Comparison:\n")
            f.write(f"Kruskal-Wallis H-test p-value: {p_val:.4f}\n")
            f.write(f"Eta-squared effect size: {eta_sq:.4f}\n")

        # RQ2: Fine-tuned Strategies Comparison (including reference)
        f.write("\n\n" + "="*80 + "\n")
        f.write("RQ2: FINE-TUNED STRATEGIES COMPARISON (Including Reference)\n")
        f.write("="*80 + "\n")

        for creator, creator_results in results_by_creator.items():
            f.write(f"\nCreator: {creator}\n{'-'*50}\n")

            for metric_name, metric_keys in metrics.items():
                f.write(f"\n{metric_name} Statistics:\n")

                # Get reference values for this creator
                reference_values = creator_results['Strategy 1'][metric_keys['reference']].dropna()

                # Candidate series for every strategy this creator has
                candidate_series = {
                    strategy: df[metric_keys['candidate']].dropna()
                    for strategy, df in creator_results.items()
                }

                # Descriptive table: the reference column first, then strategies
                stats_dict = {'Reference': summarize(reference_values)}
                for strategy, values in candidate_series.items():
                    stats_dict[strategy] = summarize(values)

                stats_df = pd.DataFrame(stats_dict).round(4)
                f.write(f"\n{stats_df.to_string()}\n")

                # Kruskal-Wallis including reference
                p_val, eta_sq = kruskal_with_eta_squared(
                    [reference_values, *candidate_series.values()]
                )

                f.write(f"\nKruskal-Wallis H-test p-value: {p_val:.4f}\n")
                f.write(f"Eta-squared effect size: {eta_sq:.4f}\n")

# Run the analysis: the function writes its report to
# reference_inclusive_statistics.txt; the print below only confirms that on stdout.
compute_reference_inclusive_statistics()
print("Reference-inclusive statistics have been saved to 'reference_inclusive_statistics.txt'")