## Human Results

In [1]:
import pandas as pd

# Read the human evaluators' ratings from the exported spreadsheet.
human_results_path = '/content/Experiment_results - Humans.csv'
data = pd.read_csv(human_results_path)

# Function to perform analysis
def analyze_data(data, metrics_pos, metrics_neg, threads=('T1', 'T2'),
                 score_col='Overall Score (1-5)', thread_col='Comment ID'):
    """Print summary statistics for a table of ratings.

    Four reports are written to stdout; nothing is returned:

    1. ``describe()`` of ``score_col`` for each entry of ``threads``.
    2. Column sums of the positive and of the negative metrics over all rows.
    3. A table of per-thread metric sums, one column per thread, labelled
       ``Thread 1``, ``Thread 2``, ... following the order of ``threads``.
    4. The correlation of every metric column with ``score_col`` (computed
       with ``DataFrame.corr`` and its default method).

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``thread_col``, ``score_col`` and every metric column.
    metrics_pos, metrics_neg : sequence of str
        Column names of the positive / negative metrics.
    threads : sequence, optional
        Values of ``thread_col`` to report on. Defaults to ``('T1', 'T2')``,
        which reproduces the original two-thread report.
    score_col : str, optional
        Name of the overall-score column.
    thread_col : str, optional
        Name of the column that identifies the thread of each row.
    """
    # Accept any sequence (list, tuple, Index) for the metric names.
    metrics_pos = list(metrics_pos)
    metrics_neg = list(metrics_neg)
    all_metrics = metrics_pos + metrics_neg

    # Filter each thread once and reuse the slice for both per-thread reports.
    rows_by_thread = {thread: data[data[thread_col] == thread] for thread in threads}

    for thread, rows in rows_by_thread.items():
        print(f"\nDescriptive Statistics for Overall Scores - Thread {thread}:\n",
              rows[score_col].describe())

    print("\nPositive Metrics Frequency:\n", data[metrics_pos].sum(),
          "\nNegative Metrics Frequency:\n", data[metrics_neg].sum())

    # Columns are numbered by position so the default call keeps the
    # original 'Thread 1' / 'Thread 2' labels.
    comparison = pd.DataFrame({f'Thread {position}': rows[all_metrics].sum()
                               for position, rows in enumerate(rows_by_thread.values(), start=1)})
    print("\nThread Comparison:\n", comparison)

    # Drop the score's correlation with itself; it carries no information.
    print("\nCorrelation with Overall Score:\n",
          data[all_metrics + [score_col]].corr()[score_col].drop(score_col))

# Define metrics
# The hostile-disagreement column is spelled 'Hostile disagreement' so these
# lists agree with the metric lists in the LLM and human-vs-LLM cells further
# down, which index the same Humans CSV using that spelling.
positive_metrics = ['Politeness', 'Empathy', 'Balanced Participation', 'Respectful Disagreement',
                    'Engagement', 'Purpose of Responses', 'Persuasiveness', 'Argument Clarity',
                    'Simple Language (Positive)', 'Complex Language (Positive)', 'Sarcasm (Positive)',
                    'Information Sharing (Positive)']
negative_metrics = ['Simple Language (Negative)', 'Complex Language (Negative)', 'Sarcasm (Negative)',
                    'Information Sharing (Negative)', 'Rudeness', 'Dominating Behaviour',
                    'Hostile disagreement', 'Lack of engagement', 'Lack of meaningful contribution',
                    'Lack persuasiveness', 'Argument unclarity']

# An earlier recorded output of this notebook shows the CSV header misspelled as
# 'Hostile disaggreement'. Normalise it so the cell runs against either
# version of the file; rename() ignores the mapping when the key is absent.
data = data.rename(columns={'Hostile disaggreement': 'Hostile disagreement'})

# Run analysis
analyze_data(data, positive_metrics, negative_metrics)

# Save results
# Build the two per-thread tables in a single pass over the thread numbers.
score_stats_by_thread = {}
metric_totals_by_thread = {}
for thread_number in (1, 2):
    thread_rows = data[data['Comment ID'] == f'T{thread_number}']
    score_stats_by_thread[f'Thread {thread_number}'] = thread_rows['Overall Score (1-5)'].describe()
    metric_totals_by_thread[f'Thread {thread_number}'] = thread_rows[positive_metrics + negative_metrics].sum()

# Correlation of each metric with the overall score (self-correlation removed).
score_correlations = data[positive_metrics + negative_metrics + ['Overall Score (1-5)']].corr()['Overall Score (1-5)']

# One entry per worksheet; the key becomes the sheet name.
results = {
    'Overall Scores Statistics': pd.DataFrame(score_stats_by_thread),
    'Positive Metrics Frequency': data[positive_metrics].sum(),
    'Negative Metrics Frequency': data[negative_metrics].sum(),
    'Thread Comparison': pd.DataFrame(metric_totals_by_thread),
    'Correlation with Overall Score': score_correlations.drop('Overall Score (1-5)'),
}

with pd.ExcelWriter('/content/experiment_analysis_results.xlsx') as writer:
    for sheet_name, table in results.items():
        table.to_excel(writer, sheet_name=sheet_name)



Descriptive Statistics for Overall Scores - Thread T1:
 count    37.000000
mean      2.918919
std       0.795067
min       2.000000
25%       2.000000
50%       3.000000
75%       4.000000
max       4.000000
Name: Overall Score (1-5), dtype: float64

Descriptive Statistics for Overall Scores - Thread T2:
 count    37.000000
mean      3.783784
std       0.583816
min       3.000000
25%       3.000000
50%       4.000000
75%       4.000000
max       5.000000
Name: Overall Score (1-5), dtype: float64

Positive Metrics Frequency:
 Politeness                        38
Empathy                           20
Balanced Participation            29
Respectful Disagreement           30
Engagement                        21
Purpose of Responses              28
Persuasiveness                    14
Argument Clarity                  31
Simple Language (Positive)        36
Complex Language (Positive)        2
Sarcasm (Positive)                11
Information Sharing (Positive)    35
dtype: int64 
Negative M

In [4]:
# Pairwise correlations among every metric column and the overall score.
correlation_matrix = data[positive_metrics + negative_metrics + ['Overall Score (1-5)']].corr()

# Keep the overall-score column only, drop its self-correlation, round to 2 d.p.
correlations = correlation_matrix['Overall Score (1-5)'].drop('Overall Score (1-5)').round(2)

# Show the rounded values
print("Correlation with Overall Score:\n", correlations)


Correlation with Overall Score:
 Politeness                         0.59
Empathy                            0.45
Balanced Participation             0.27
Respectful Disagreement            0.46
Engagement                         0.50
Purpose of Responses               0.21
Persuasiveness                     0.43
Argument Clarity                   0.44
Simple Language (Positive)         0.31
Complex Language (Positive)       -0.07
Sarcasm (Positive)                 0.10
Information Sharing (Positive)     0.46
Simple Language (Negative)         0.09
Complex Language (Negative)       -0.25
Sarcasm (Negative)                -0.29
Information Sharing (Negative)    -0.25
Rudeness                          -0.42
Dominating Behaviour              -0.32
Hostile disaggreement             -0.32
Lack of engagement                -0.33
Lack of meaningful contribution   -0.51
Lack persuasiveness               -0.34
Argument unclarity                -0.26
Name: Overall Score (1-5), dtype: float64


## LLM Results

In [10]:
import pandas as pd

# Read the LLM evaluation results.
df = pd.read_csv('/content/Experiment_results - LLMs.csv')

# 1. Overall score per model: mean and standard deviation over that model's rows.
scores_by_model = df.groupby('Model')['Overall Score (1-5)']
overall_score_stats = scores_by_model.agg(['mean', 'std']).reset_index()
print("Overall Score Stats:\n", overall_score_stats)

# 2. Per-model metric totals: every metric column is summed within each model.
#    NOTE(review): the sum equals "number of times flagged" only if flags are
#    stored as 0/1 - confirm against the CSV.
metrics = [
    'Politeness',
    'Empathy',
    'Balanced Participation',
    'Respectful Disagreement',
    'Engagement',
    'Purpose of Responses',
    'Persuasiveness',
    'Argument Clarity',
    'Simple Language (Positive)',
    'Complex Language (Positive)',
    'Sarcasm (Positive)',
    'Information Sharing (Positive)',
    'Simple Language (Negative)',
    'Complex Language (Negative)',
    'Sarcasm (Negative)',
    'Information Sharing (Negative)',
    'Rudeness',
    'Dominating Behaviour',
    'Hostile disagreement',
    'Lack of engagement',
    'Lack of meaningful contribution',
    'Lack persuasiveness',
    'Argument unclarity',
]  # extend this list to include further metric columns
flags_by_model = df.groupby('Model')[metrics]
metric_flags = flags_by_model.sum().reset_index()
print("\nMetric Flags by Model:\n", metric_flags)

# 3. Score consistency: mean and std of the overall score within each
#    (Model, Comment ID) group.
scores_by_model_and_comment = df.groupby(['Model', 'Comment ID'])['Overall Score (1-5)']
inter_model_consistency = scores_by_model_and_comment.agg(['mean', 'std']).reset_index()
print("\nInter-model Score Consistency:\n", inter_model_consistency)

# Write each summary table to its own CSV file, without the index column.
csv_outputs = (
    ('overall_score_stats.csv', overall_score_stats),
    ('metric_flags_by_model.csv', metric_flags),
    ('inter_model_consistency.csv', inter_model_consistency),
)
for csv_name, summary_table in csv_outputs:
    summary_table.to_csv(csv_name, index=False)

# The free-text "Additional Comments" are not analysed here; that needs manual review or NLP techniques.


Overall Score Stats:
                   Model  mean       std
0               1o-mini   4.0  0.471405
1            1o-preview   4.0  0.471405
2                gpt-4o   3.5  0.527046
3           gpt-4o-mini   4.0  0.000000
4  mistral-large-latest   4.0  0.000000
5     open-mistral-nemo   3.0  0.000000

Metric Flags by Model:
                   Model  Politeness  Empathy  Balanced Participation  \
0               1o-mini          10        9                       5   
1            1o-preview           9        9                       7   
2                gpt-4o          10        5                       4   
3           gpt-4o-mini          10       10                       9   
4  mistral-large-latest          10       10                      10   
5     open-mistral-nemo          10        5                       0   

   Respectful Disagreement  Engagement  Purpose of Responses  Persuasiveness  \
0                        8          10                    10               8   
1       

In [20]:
import pandas as pd

# Read both result sheets (adjust the paths to where the CSVs live).
llm_data = pd.read_csv('/content/Experiment_results - LLMs.csv')
human_data = pd.read_csv('/content/Experiment_results - Humans.csv')

# Metric column names, split into positive and negative indicators.
positive_metrics = [
    'Politeness', 'Empathy', 'Balanced Participation',
    'Respectful Disagreement', 'Engagement', 'Purpose of Responses',
    'Persuasiveness', 'Argument Clarity', 'Simple Language (Positive)',
    'Complex Language (Positive)', 'Sarcasm (Positive)',
    'Information Sharing (Positive)',
]
negative_metrics = [
    'Simple Language (Negative)', 'Complex Language (Negative)',
    'Sarcasm (Negative)', 'Information Sharing (Negative)', 'Rudeness',
    'Dominating Behaviour', 'Hostile disagreement', 'Lack of engagement',
    'Lack of meaningful contribution', 'Lack persuasiveness',
    'Argument unclarity',
]

# Step 1: total each metric over all human rows and keep the three largest
# totals on each side.
human_pos_totals = human_data[positive_metrics].sum()
human_neg_totals = human_data[negative_metrics].sum()
human_pos_freq = human_pos_totals.nlargest(3)
human_neg_freq = human_neg_totals.nlargest(3)

# Report the human evaluators' top three positive and negative metrics.
print("Top 3 Positive Metrics by Human Evaluators:\n", human_pos_freq)
print("\nTop 3 Negative Metrics by Human Evaluators:\n", human_neg_freq)

# Step 2: restrict the LLM table to the model name plus those six metrics.
top_human_metrics = [*human_pos_freq.index, *human_neg_freq.index]
llm_filtered_data = llm_data[['Model', *top_human_metrics]]

# Keep a copy of the restricted table on disk.
llm_filtered_data.to_csv('filtered_llm_metrics.csv', index=False)

# Step 3: per-model totals for each top metric, printed one metric at a time.
print("\nNumber of Times Each LLM Flagged Top Human Metrics:")
flag_totals_by_model = llm_filtered_data.groupby('Model')[top_human_metrics].sum()
for metric in top_human_metrics:
    print(f"\nMetric: {metric}")
    print(flag_totals_by_model[metric])


Top 3 Positive Metrics by Human Evaluators:
 Politeness                        38.0
Simple Language (Positive)        36.0
Information Sharing (Positive)    35.0
dtype: float64

Top 3 Negative Metrics by Human Evaluators:
 Lack of meaningful contribution    15.0
Lack persuasiveness                15.0
Sarcasm (Negative)                 14.0
dtype: float64

Number of Times Each LLM Flagged Top Human Metrics:

Metric: Politeness
Model
1o-mini                 10
1o-preview               9
gpt-4o                  10
gpt-4o-mini             10
mistral-large-latest    10
open-mistral-nemo       10
Name: Politeness, dtype: int64

Metric: Simple Language (Positive)
Model
1o-mini                 10
1o-preview              10
gpt-4o                  10
gpt-4o-mini             10
mistral-large-latest    10
open-mistral-nemo       10
Name: Simple Language (Positive), dtype: int64

Metric: Information Sharing (Positive)
Model
1o-mini                 10
1o-preview              10
gpt-4o             

In [22]:
import pandas as pd

# Read both result sheets (adjust the paths to where the CSVs live).
llm_data = pd.read_csv('/content/Experiment_results - LLMs.csv')
human_data = pd.read_csv('/content/Experiment_results - Humans.csv')

# Metric column names, split into positive and negative indicators.
positive_metrics = [
    'Politeness', 'Empathy', 'Balanced Participation',
    'Respectful Disagreement', 'Engagement', 'Purpose of Responses',
    'Persuasiveness', 'Argument Clarity', 'Simple Language (Positive)',
    'Complex Language (Positive)', 'Sarcasm (Positive)',
    'Information Sharing (Positive)',
]
negative_metrics = [
    'Simple Language (Negative)', 'Complex Language (Negative)',
    'Sarcasm (Negative)', 'Information Sharing (Negative)', 'Rudeness',
    'Dominating Behaviour', 'Hostile disagreement', 'Lack of engagement',
    'Lack of meaningful contribution', 'Lack persuasiveness',
    'Argument unclarity',
]

# Step 1: total each metric over all human rows and keep the three largest
# totals on each side.
human_pos_totals = human_data[positive_metrics].sum()
human_neg_totals = human_data[negative_metrics].sum()
human_pos_freq = human_pos_totals.nlargest(3)
human_neg_freq = human_neg_totals.nlargest(3)

# Report the human evaluators' top three positive and negative metrics.
print("Top 3 Positive Metrics by Human Evaluators:\n", human_pos_freq)
print("\nTop 3 Negative Metrics by Human Evaluators:\n", human_neg_freq)

# Step 2: restrict the LLM table to the model name plus those six metrics.
top_human_metrics = [*human_pos_freq.index, *human_neg_freq.index]
llm_filtered_data = llm_data[['Model', *top_human_metrics]]

# Step 3: per-model totals of the selected metrics.
llm_aggregated = llm_filtered_data.groupby('Model')[top_human_metrics].sum()

# NOTE(review): 10 and 74 below are hard-coded denominators - presumably the
# number of rows per model in the LLM CSV and the total number of human
# ratings. Confirm against the data: the percentages are wrong if either
# count changes.

# LLM totals as a percentage of the maximum possible per model.
max_llm_flag = 10  # Max possible flags for LLMs
llm_flag_share = llm_aggregated / max_llm_flag
llm_percentage_data = llm_flag_share * 100

# Human totals as a percentage of the maximum possible.
max_human_flag = 74  # Max possible flags for humans
human_flag_share = human_data[top_human_metrics].sum() / max_human_flag
human_percentage_data = human_flag_share * 100

# LLM percentages
print("\nPercentage of Top Metrics Flagged by Each LLM Model:\n", llm_percentage_data)

# Human percentages
print("\nPercentage of Top Metrics Flagged by Human Evaluators:\n", human_percentage_data)


Top 3 Positive Metrics by Human Evaluators:
 Politeness                        38.0
Simple Language (Positive)        36.0
Information Sharing (Positive)    35.0
dtype: float64

Top 3 Negative Metrics by Human Evaluators:
 Lack of meaningful contribution    15.0
Lack persuasiveness                15.0
Sarcasm (Negative)                 14.0
dtype: float64

Percentage of Top Metrics Flagged by Each LLM Model:
                       Politeness  Simple Language (Positive)  \
Model                                                          
1o-mini                    100.0                       100.0   
1o-preview                  90.0                       100.0   
gpt-4o                     100.0                       100.0   
gpt-4o-mini                100.0                       100.0   
mistral-large-latest       100.0                       100.0   
open-mistral-nemo          100.0                       100.0   

                      Information Sharing (Positive)  \
Model             

In [37]:
import pandas as pd
import numpy as np

# Mean human overall score per thread, ordered [Thread 1, Thread 2].
human_means = np.array([2.92, 3.78])

# Mean overall score given by each LLM, in the same thread order.
llm_means = {
    '1o-mini':       [3.8, 4.2],
    '1o-preview':    [3.8, 4.2],
    'GPT-4o':        [3.0, 4.0],
    'GPT-4o-mini':   [4.0, 4.0],
    'Mistral Large': [4.0, 4.0],
    'Mistral Nemo':  [3.0, 3.0],
}

# MAE per model: the average over threads of |human mean - model mean|.
mae_results = {
    model_name: np.mean(np.abs(human_means - np.array(thread_scores)))
    for model_name, thread_scores in llm_means.items()
}

# One-column table indexed by model name, for readable printing.
mae_results_df = pd.DataFrame.from_dict(
    mae_results,
    orient='index',
    columns=['Mean Absolute Error (MAE)'],
)

# Show the table
print("\nMean Absolute Error (MAE) of LLM Model Scores with Human Scores:\n", mae_results_df)



Mean Absolute Error (MAE) of LLM Model Scores with Human Scores:
                Mean Absolute Error (MAE)
1o-mini                             0.65
1o-preview                          0.65
GPT-4o                              0.15
GPT-4o-mini                         0.65
Mistral Large                       0.65
Mistral Nemo                        0.43
