In [14]:
# Importando bibliotecas necessárias
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.io as pio
from pathlib import Path
import warnings

warnings.filterwarnings("ignore")

# Configurações de visualização
pio.templates.default = "plotly_white"
pd.set_option("display.max_columns", None)
pd.set_option("display.width", None)
# set default to plotly in pandas
pd.options.plotting.backend = "plotly"

In [15]:
# Definindo caminhos dos arquivos
data_path = Path("20250915")

# Carregando os arquivos
print("Loading files...")

# Dados principais para análises dos pontos 1-3
db_val2 = pd.read_excel(data_path / "DB_val2.xlsx")
print(f"DB_val2.xlsx loaded: {db_val2.shape[0]} rows, {db_val2.shape[1]} columns")

# Dicionário de espécies
dicionario = pd.read_excel(data_path / "Dicionário.xlsx")
print(f"Dicionário.xlsx loaded: {dicionario.shape[0]} rows, {dicionario.shape[1]} columns")

# Dados para análise do ponto 4 (IVC)
ivc = pd.read_excel(data_path / "IVC.xlsx")
print(f"IVC.xlsx loaded: {ivc.shape[0]} rows, {ivc.shape[1]} columns")

Loading files...
DB_val2.xlsx loaded: 156 rows, 98 columns
Dicionário.xlsx loaded: 207 rows, 7 columns
IVC.xlsx loaded: 4 rows, 38 columns


In [16]:
ivc

Unnamed: 0.1,Unnamed: 0,q01_r1,q02_r1,q03_r1,q04_r1,q05_r1,q06_r1,q07_r1,q08_r1,q09_r1,q10_r1,q11_r1,q12_r1,q13_r1,q14_r1,q01_r2,q02_r2,q03_r2,q04_r2,q05_r2,q06_r2,q07_r2,q08_r2,q09_r2,q10_r2,q11_r2,q12_r2,q13_r2,q14_r2,q15_r2,q16_r2,q17_r2,q18_r2,q19_r2,q20_r2,q21_r2,q22_r2,q23_r2
0,ava1,sim,sim,sim,sim,sim,sim,sim,sim,sim,sim,sim,sim,sim,sim,4,4,4,3,4,3,4,3,3,3,4,4,4,3,4,3,4,3,3,3,4,4,4
1,ava2,sim,sim,sim,sim,sim,sim,sim,sim,sim,sim,sim,sim,sim,sim,3,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4
2,ava3,sim,sim,sim,sim,sim,sim,sim,sim,sim,sim,sim,sim,sim,sim,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4
3,ava4,nao,nao,nao,nao,nao,nao,nao,nao,nao,nao,nao,nao,nao,nao,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3


In [17]:
round_1 = ivc[[col for col in ivc.columns if col.endswith("_r1")]].T
round_2 = ivc[[col for col in ivc.columns if col.endswith("_r2")]].T
round_1.columns = [f"annotator_{i + 1}" for i in range(round_1.shape[1])]
round_2.columns = [f"annotator_{i + 1}" for i in range(round_2.shape[1])]
round_1

Unnamed: 0,annotator_1,annotator_2,annotator_3,annotator_4
q01_r1,sim,sim,sim,nao
q02_r1,sim,sim,sim,nao
q03_r1,sim,sim,sim,nao
q04_r1,sim,sim,sim,nao
q05_r1,sim,sim,sim,nao
q06_r1,sim,sim,sim,nao
q07_r1,sim,sim,sim,nao
q08_r1,sim,sim,sim,nao
q09_r1,sim,sim,sim,nao
q10_r1,sim,sim,sim,nao


In [18]:
round_2

Unnamed: 0,annotator_1,annotator_2,annotator_3,annotator_4
q01_r2,4,3,4,3
q02_r2,4,4,4,3
q03_r2,4,4,4,3
q04_r2,3,4,4,3
q05_r2,4,4,4,3
q06_r2,3,4,4,3
q07_r2,4,4,4,3
q08_r2,3,4,4,3
q09_r2,3,4,4,3
q10_r2,3,4,4,3


In [19]:
round_2.max(), round_2.min()

(annotator_1    4
 annotator_2    4
 annotator_3    4
 annotator_4    3
 dtype: int64,
 annotator_1    3
 annotator_2    3
 annotator_3    4
 annotator_4    3
 dtype: int64)

In [20]:
# Examining the data structure
print("Round 1 shape:", round_1.shape)
print("Round 2 shape:", round_2.shape)
print("\nRound 1 sample:")
print(round_1.head())
print("\nRound 2 sample:")
print(round_2.head())
print("\nUnique values in round_1:")
print(round_1.stack().unique())
print("\nUnique values in round_2:")
print(round_2.stack().unique())

Round 1 shape: (14, 4)
Round 2 shape: (23, 4)

Round 1 sample:
       annotator_1 annotator_2 annotator_3 annotator_4
q01_r1         sim         sim         sim         nao
q02_r1         sim         sim         sim         nao
q03_r1         sim         sim         sim         nao
q04_r1         sim         sim         sim         nao
q05_r1         sim         sim         sim         nao

Round 2 sample:
        annotator_1  annotator_2  annotator_3  annotator_4
q01_r2            4            3            4            3
q02_r2            4            4            4            3
q03_r2            4            4            4            3
q04_r2            3            4            4            3
q05_r2            4            4            4            3

Unique values in round_1:
['sim' 'nao']

Unique values in round_2:
[4 3]


In [21]:
# Import required libraries for inter-annotator agreement
from sklearn.metrics import cohen_kappa_score
from itertools import combinations
from scipy.stats import pearsonr, spearmanr
from sklearn.metrics import accuracy_score

In [22]:
# Functions for calculating inter-annotator agreement metrics


def calculate_pairwise_agreement(data, metric="cohen_kappa"):
    """
    Calculate pairwise agreement between all annotators

    Parameters:
    data: DataFrame with annotators as columns
    metric: 'cohen_kappa', 'accuracy', 'pearson', 'spearman'

    Returns:
    DataFrame with pairwise agreement scores
    """
    annotators = data.columns
    n_annotators = len(annotators)

    # Initialize results matrix
    agreement_matrix = np.zeros((n_annotators, n_annotators))

    for i, ann1 in enumerate(annotators):
        for j, ann2 in enumerate(annotators):
            if i == j:
                agreement_matrix[i, j] = 1.0  # Perfect agreement with self
            elif i < j:  # Calculate only upper triangle
                if metric == "cohen_kappa":
                    score = cohen_kappa_score(data[ann1], data[ann2])
                elif metric == "accuracy":
                    score = accuracy_score(data[ann1], data[ann2])
                elif metric == "pearson":
                    score, _ = pearsonr(data[ann1], data[ann2])
                elif metric == "spearman":
                    score, _ = spearmanr(data[ann1], data[ann2])

                agreement_matrix[i, j] = score
                agreement_matrix[j, i] = score  # Symmetric matrix

    return pd.DataFrame(agreement_matrix, index=annotators, columns=annotators)


def calculate_fleiss_kappa(data):
    """
    Calculate Fleiss' kappa for multi-rater agreement

    Parameters:
    data: DataFrame with annotators as columns and items as rows

    Returns:
    float: Fleiss' kappa value
    dict: Additional statistics including P_bar, P_e, and interpretation
    """
    n_items, n_annotators = data.shape

    # Get all unique categories/labels
    all_categories = pd.unique(data.values.ravel())
    n_categories = len(all_categories)

    # Create contingency table: items x categories
    contingency_table = np.zeros((n_items, n_categories))

    for i in range(n_items):
        for j, category in enumerate(all_categories):
            contingency_table[i, j] = (data.iloc[i] == category).sum()

    # Calculate P_i (proportion of agreement for each item)
    P_i = np.zeros(n_items)
    for i in range(n_items):
        sum_squares = np.sum(contingency_table[i] ** 2)
        P_i[i] = (sum_squares - n_annotators) / (n_annotators * (n_annotators - 1))

    # Calculate P_bar (mean proportion of agreement)
    P_bar = np.mean(P_i)

    # Calculate marginal proportions (p_j for each category)
    marginal_props = np.zeros(n_categories)
    for j in range(n_categories):
        marginal_props[j] = np.sum(contingency_table[:, j]) / (n_items * n_annotators)

    # Calculate P_e (expected agreement by chance)
    P_e = np.sum(marginal_props**2)

    # Calculate Fleiss' kappa
    if P_e == 1.0:
        kappa = 1.0  # Perfect agreement case
    else:
        kappa = (P_bar - P_e) / (1 - P_e)

    # Interpretation
    if kappa < 0:
        interpretation = "Poor (worse than chance)"
    elif kappa < 0.20:
        interpretation = "Slight"
    elif kappa < 0.40:
        interpretation = "Fair"
    elif kappa < 0.60:
        interpretation = "Moderate"
    elif kappa < 0.80:
        interpretation = "Substantial"
    else:
        interpretation = "Almost Perfect"

    return kappa, {"P_bar": P_bar, "P_e": P_e, "interpretation": interpretation, "n_items": n_items, "n_annotators": n_annotators, "n_categories": n_categories, "categories": all_categories, "marginal_proportions": dict(zip(all_categories, marginal_props))}


def calculate_fleiss_kappa_simple(data):
    """
    Calculate a simplified version of overall agreement (for backwards compatibility)
    """
    kappa, stats = calculate_fleiss_kappa(data)
    return stats["P_bar"]  # Return observed agreement proportion

In [23]:
# =============================================================================
# PROPER FLEISS' KAPPA CALCULATION
# =============================================================================

print("=" * 60)
print("PROPER FLEISS' KAPPA ANALYSIS")
print("=" * 60)

# Calculate proper Fleiss' kappa for Round 1
print("\nROUND 1 - FLEISS' KAPPA ANALYSIS")
print("-" * 45)

fleiss_kappa_r1, stats_r1 = calculate_fleiss_kappa(round_1)
print(f"Fleiss' Kappa: {fleiss_kappa_r1:.4f}")
print(f"Interpretation: {stats_r1['interpretation']}")
print(f"Observed Agreement (P̄): {stats_r1['P_bar']:.4f}")
print(f"Expected Agreement (Pe): {stats_r1['P_e']:.4f}")
print(f"Number of items: {stats_r1['n_items']}")
print(f"Number of annotators: {stats_r1['n_annotators']}")
print(f"Categories: {list(stats_r1['categories'])}")
print("Marginal proportions:")
for cat, prop in stats_r1["marginal_proportions"].items():
    print(f"  - '{cat}': {prop:.3f}")

# Calculate proper Fleiss' kappa for Round 2
print(f"\nROUND 2 - FLEISS' KAPPA ANALYSIS")
print("-" * 45)

fleiss_kappa_r2, stats_r2 = calculate_fleiss_kappa(round_2)
print(f"Fleiss' Kappa: {fleiss_kappa_r2:.4f}")
print(f"Interpretation: {stats_r2['interpretation']}")
print(f"Observed Agreement (P̄): {stats_r2['P_bar']:.4f}")
print(f"Expected Agreement (Pe): {stats_r2['P_e']:.4f}")
print(f"Number of items: {stats_r2['n_items']}")
print(f"Number of annotators: {stats_r2['n_annotators']}")
print(f"Categories: {list(stats_r2['categories'])}")
print("Marginal proportions:")
for cat, prop in stats_r2["marginal_proportions"].items():
    print(f"  - {cat}: {prop:.3f}")

print(f"\nCOMPARISON SUMMARY")
print("-" * 25)
print(f"Round 1 Fleiss' κ: {fleiss_kappa_r1:.4f} ({stats_r1['interpretation']})")
print(f"Round 2 Fleiss' κ: {fleiss_kappa_r2:.4f} ({stats_r2['interpretation']})")

# Significance test information
print(f"\nSTATISTICAL INTERPRETATION")
print("-" * 30)
print("Fleiss' Kappa interpretation guidelines:")
print("  < 0.00: Poor (worse than chance)")
print("  0.00-0.20: Slight agreement")
print("  0.21-0.40: Fair agreement")
print("  0.41-0.60: Moderate agreement")
print("  0.61-0.80: Substantial agreement")
print("  0.81-1.00: Almost perfect agreement")

PROPER FLEISS' KAPPA ANALYSIS

ROUND 1 - FLEISS' KAPPA ANALYSIS
---------------------------------------------
Fleiss' Kappa: -0.3333
Interpretation: Poor (worse than chance)
Observed Agreement (P̄): 0.5000
Expected Agreement (Pe): 0.6250
Number of items: 14
Number of annotators: 4
Categories: ['sim', 'nao']
Marginal proportions:
  - 'sim': 0.750
  - 'nao': 0.250

ROUND 2 - FLEISS' KAPPA ANALYSIS
---------------------------------------------
Fleiss' Kappa: -0.2441
Interpretation: Poor (worse than chance)
Observed Agreement (P̄): 0.4203
Expected Agreement (Pe): 0.5340
Number of items: 23
Number of annotators: 4
Categories: [np.int64(4), np.int64(3)]
Marginal proportions:
  - 4: 0.630
  - 3: 0.370

COMPARISON SUMMARY
-------------------------
Round 1 Fleiss' κ: -0.3333 (Poor (worse than chance))
Round 2 Fleiss' κ: -0.2441 (Poor (worse than chance))

STATISTICAL INTERPRETATION
------------------------------
Fleiss' Kappa interpretation guidelines:
  < 0.00: Poor (worse than chance)
  0.00-

In [24]:
# =============================================================================
# FLEISS' KAPPA VISUALIZATION
# =============================================================================

print("\nCreating Fleiss' Kappa Comparison Visualizations...")

# Create a comprehensive comparison of agreement metrics
fig_fleiss = make_subplots(
    rows=2, cols=2, subplot_titles=["Fleiss' Kappa Comparison", "Agreement Components - Round 1", "Agreement Components - Round 2", "Interpretation Categories"], specs=[[{"type": "bar"}, {"type": "bar"}], [{"type": "bar"}, {"type": "bar"}]], vertical_spacing=0.12, horizontal_spacing=0.1
)

# 1. Fleiss' Kappa Comparison
datasets = ["Round 1<br>(Categorical)", "Round 2<br>(Ordinal)"]
fleiss_values = [fleiss_kappa_r1, fleiss_kappa_r2]
colors = ["#FF6B6B" if val < 0 else "#4ECDC4" for val in fleiss_values]

fig_fleiss.add_trace(go.Bar(x=datasets, y=fleiss_values, text=[f"{val:.3f}" for val in fleiss_values], textposition="auto", marker_color=colors, name="Fleiss' Kappa"), row=1, col=1)

# 2. Agreement Components - Round 1
components_r1 = ["Observed Agreement<br>(P̄)", "Expected Agreement<br>(Pe)", "Fleiss' Kappa<br>(κ)"]
values_r1 = [stats_r1["P_bar"], stats_r1["P_e"], fleiss_kappa_r1]
colors_r1 = ["#3498DB", "#95A5A6", "#FF6B6B" if fleiss_kappa_r1 < 0 else "#4ECDC4"]

fig_fleiss.add_trace(go.Bar(x=components_r1, y=values_r1, text=[f"{val:.3f}" for val in values_r1], textposition="auto", marker_color=colors_r1, name="Round 1 Components"), row=1, col=2)

# 3. Agreement Components - Round 2
components_r2 = ["Observed Agreement<br>(P̄)", "Expected Agreement<br>(Pe)", "Fleiss' Kappa<br>(κ)"]
values_r2 = [stats_r2["P_bar"], stats_r2["P_e"], fleiss_kappa_r2]
colors_r2 = ["#3498DB", "#95A5A6", "#FF6B6B" if fleiss_kappa_r2 < 0 else "#4ECDC4"]

fig_fleiss.add_trace(go.Bar(x=components_r2, y=values_r2, text=[f"{val:.3f}" for val in values_r2], textposition="auto", marker_color=colors_r2, name="Round 2 Components"), row=2, col=1)

# 4. Interpretation scale reference
interpretation_ranges = ["Poor<br>(<0.00)", "Slight<br>(0.00-0.20)", "Fair<br>(0.21-0.40)", "Moderate<br>(0.41-0.60)", "Substantial<br>(0.61-0.80)", "Almost Perfect<br>(0.81-1.00)"]
range_values = [-0.1, 0.1, 0.3, 0.5, 0.7, 0.9]  # Representative values for each range
range_colors = ["#FF6B6B", "#FFA07A", "#FFD700", "#98FB98", "#32CD32", "#228B22"]

fig_fleiss.add_trace(go.Bar(x=interpretation_ranges, y=range_values, text=[f"{val:.1f}" for val in range_values], textposition="auto", marker_color=range_colors, name="Interpretation Scale"), row=2, col=2)

# Add horizontal lines to show where our values fall
fig_fleiss.add_hline(y=fleiss_kappa_r1, line_dash="dash", line_color="red", annotation_text=f"Round 1: {fleiss_kappa_r1:.3f}", row=2, col=2)
fig_fleiss.add_hline(y=fleiss_kappa_r2, line_dash="dash", line_color="blue", annotation_text=f"Round 2: {fleiss_kappa_r2:.3f}", row=2, col=2)

# Update layout
fig_fleiss.update_layout(height=800, title={"text": "Comprehensive Fleiss' Kappa Analysis", "x": 0.5, "font": {"size": 20}}, showlegend=False)

# Update y-axes
fig_fleiss.update_yaxes(title_text="Kappa Value", row=1, col=1)
fig_fleiss.update_yaxes(title_text="Agreement Value", row=1, col=2)
fig_fleiss.update_yaxes(title_text="Agreement Value", row=2, col=1)
fig_fleiss.update_yaxes(title_text="Reference Value", row=2, col=2)

fig_fleiss.show()


Creating Fleiss' Kappa Comparison Visualizations...


In [25]:
# =============================================================================
# DETAILED FLEISS' KAPPA INTERPRETATION
# =============================================================================

print("=" * 70)
print("DETAILED FLEISS' KAPPA INTERPRETATION & RECOMMENDATIONS")
print("=" * 70)

print(f"\nROUND 1 DETAILED ANALYSIS")
print("-" * 35)
print(f"Data Type: Categorical (sim/nao)")
print(f"Fleiss' Kappa: {fleiss_kappa_r1:.4f}")
print(f"95% of labels are 'sim' (75%), only 25% are 'nao'")
print(f"Problem: High expected agreement by chance (Pe = {stats_r1['P_e']:.3f})")
print(f"Observed agreement ({stats_r1['P_bar']:.3f}) is lower than chance expectation")
print(f"Result: Kappa is negative, indicating systematic disagreement")

print(f"\nROUND 2 DETAILED ANALYSIS")
print("-" * 35)
print(f"Data Type: Ordinal (3-4 scale)")
print(f"Fleiss' Kappa: {fleiss_kappa_r2:.4f}")
print(f"63% of labels are '4', 37% are '3'")
print(f"Problem: High expected agreement by chance (Pe = {stats_r2['P_e']:.3f})")
print(f"Observed agreement ({stats_r2['P_bar']:.3f}) is lower than chance expectation")
print(f"Result: Kappa is negative, indicating systematic disagreement")

print(f"\nCRITICAL ISSUES IDENTIFIED")
print("-" * 35)
print("1. ROUND 1: Perfect split - 3 annotators always say 'sim', 1 always says 'nao'")
print("2. ROUND 2: Limited variation - some annotators use only one category")
print("3. Both rounds show negative kappa (worse than random chance)")
print("4. High marginal imbalance creates inflated chance agreement")

print(f"\nSTATISTICAL CONTEXT")
print("-" * 25)
print("- Negative Fleiss' kappa indicates systematic bias or disagreement")
print("- Values worse than chance suggest fundamental issues with:")
print("  - Annotation guidelines clarity")
print("  - Annotator training/calibration")
print("  - Task definition ambiguity")
print("  - Potential systematic biases")

print(f"\nACTIONABLE RECOMMENDATIONS")
print("-" * 35)
print("IMMEDIATE ACTIONS:")
print("1. Review and clarify annotation guidelines")
print("2. Conduct annotator retraining sessions")
print("3. Implement calibration exercises with gold standard examples")
print("4. Consider removing consistently disagreeing annotators")

print("\nDATA COLLECTION IMPROVEMENTS:")
print("5. Use more balanced rating scales (e.g., 1-5 instead of 3-4)")
print("6. Provide clearer category definitions with examples")
print("7. Implement pilot testing of annotation protocols")
print("8. Add inter-annotator discussion sessions")

print("\nANALYSIS ENHANCEMENTS:")
print("9. Calculate confidence intervals for kappa values")
print("10. Perform significance testing")
print("11. Analyze disagreement patterns by item characteristics")
print("12. Consider weighted kappa for ordinal data")

print(f"\nQUALITY THRESHOLDS")
print("-" * 25)
print("Target Fleiss' Kappa values:")
print("- Minimum acceptable: κ > 0.40 (Moderate)")
print("- Good quality: κ > 0.60 (Substantial)")
print("- Excellent quality: κ > 0.80 (Almost Perfect)")
print(f"\nCurrent status: Both rounds are below minimum threshold")
print(f"Gap to minimum: Round 1 needs +{0.40 - fleiss_kappa_r1:.3f}, Round 2 needs +{0.40 - fleiss_kappa_r2:.3f}")

DETAILED FLEISS' KAPPA INTERPRETATION & RECOMMENDATIONS

ROUND 1 DETAILED ANALYSIS
-----------------------------------
Data Type: Categorical (sim/nao)
Fleiss' Kappa: -0.3333
95% of labels are 'sim' (75%), only 25% are 'nao'
Problem: High expected agreement by chance (Pe = 0.625)
Observed agreement (0.500) is lower than chance expectation
Result: Kappa is negative, indicating systematic disagreement

ROUND 2 DETAILED ANALYSIS
-----------------------------------
Data Type: Ordinal (3-4 scale)
Fleiss' Kappa: -0.2441
63% of labels are '4', 37% are '3'
Problem: High expected agreement by chance (Pe = 0.534)
Observed agreement (0.420) is lower than chance expectation
Result: Kappa is negative, indicating systematic disagreement

CRITICAL ISSUES IDENTIFIED
-----------------------------------
1. ROUND 1: Perfect split - 3 annotators always say 'sim', 1 always says 'nao'
2. ROUND 2: Limited variation - some annotators use only one category
3. Both rounds show negative kappa (worse than random 

In [26]:
# =============================================================================
# ROUND 1 ANALYSIS (Categorical Data: "sim"/"nao")
# =============================================================================

print("=" * 50)
print("ROUND 1 - CATEGORICAL DATA AGREEMENT ANALYSIS")
print("=" * 50)

# Calculate Cohen's Kappa for all pairs
kappa_matrix_r1 = calculate_pairwise_agreement(round_1, metric="cohen_kappa")
print("\nCohen's Kappa Matrix (Round 1):")
print(kappa_matrix_r1.round(3))

# Calculate accuracy for all pairs
accuracy_matrix_r1 = calculate_pairwise_agreement(round_1, metric="accuracy")
print("\nAccuracy Matrix (Round 1):")
print(accuracy_matrix_r1.round(3))

# Calculate overall agreement
overall_agreement_r1 = calculate_fleiss_kappa_simple(round_1)
print(f"\nOverall Agreement (Round 1): {overall_agreement_r1:.3f}")

# Summary statistics
kappa_values = []
accuracy_values = []

for i in range(len(round_1.columns)):
    for j in range(i + 1, len(round_1.columns)):
        kappa_values.append(kappa_matrix_r1.iloc[i, j])
        accuracy_values.append(accuracy_matrix_r1.iloc[i, j])

print(f"\nSummary Statistics (Round 1):")
print(f"Mean Cohen's Kappa: {np.mean(kappa_values):.3f}")
print(f"Std Cohen's Kappa: {np.std(kappa_values):.3f}")
print(f"Mean Accuracy: {np.mean(accuracy_values):.3f}")
print(f"Std Accuracy: {np.std(accuracy_values):.3f}")

ROUND 1 - CATEGORICAL DATA AGREEMENT ANALYSIS

Cohen's Kappa Matrix (Round 1):
             annotator_1  annotator_2  annotator_3  annotator_4
annotator_1          1.0          NaN          NaN          0.0
annotator_2          NaN          1.0          NaN          0.0
annotator_3          NaN          NaN          1.0          0.0
annotator_4          0.0          0.0          0.0          1.0

Accuracy Matrix (Round 1):
             annotator_1  annotator_2  annotator_3  annotator_4
annotator_1          1.0          1.0          1.0          0.0
annotator_2          1.0          1.0          1.0          0.0
annotator_3          1.0          1.0          1.0          0.0
annotator_4          0.0          0.0          0.0          1.0

Overall Agreement (Round 1): 0.500

Summary Statistics (Round 1):
Mean Cohen's Kappa: nan
Std Cohen's Kappa: nan
Mean Accuracy: 0.500
Std Accuracy: 0.500


In [27]:
# Let's examine why Cohen's Kappa is giving NaN values
print("Examining Round 1 data distribution per annotator:")
for col in round_1.columns:
    print(f"{col}: {round_1[col].value_counts().to_dict()}")

print("\n" + "=" * 50)
print("ROUND 2 - ORDINAL DATA AGREEMENT ANALYSIS")
print("=" * 50)

# Calculate Cohen's Kappa for Round 2 (ordinal data)
kappa_matrix_r2 = calculate_pairwise_agreement(round_2, metric="cohen_kappa")
print("\nCohen's Kappa Matrix (Round 2):")
print(kappa_matrix_r2.round(3))

# Calculate Pearson correlation for Round 2
pearson_matrix_r2 = calculate_pairwise_agreement(round_2, metric="pearson")
print("\nPearson Correlation Matrix (Round 2):")
print(pearson_matrix_r2.round(3))

# Calculate Spearman correlation for Round 2
spearman_matrix_r2 = calculate_pairwise_agreement(round_2, metric="spearman")
print("\nSpearman Correlation Matrix (Round 2):")
print(spearman_matrix_r2.round(3))

# Calculate overall agreement for Round 2
overall_agreement_r2 = calculate_fleiss_kappa_simple(round_2)
print(f"\nOverall Agreement (Round 2): {overall_agreement_r2:.3f}")

# Summary statistics for Round 2
kappa_values_r2 = []
pearson_values = []
spearman_values = []

for i in range(len(round_2.columns)):
    for j in range(i + 1, len(round_2.columns)):
        kappa_values_r2.append(kappa_matrix_r2.iloc[i, j])
        pearson_values.append(pearson_matrix_r2.iloc[i, j])
        spearman_values.append(spearman_matrix_r2.iloc[i, j])

print(f"\nSummary Statistics (Round 2):")
print(f"Mean Cohen's Kappa: {np.mean(kappa_values_r2):.3f}")
print(f"Std Cohen's Kappa: {np.std(kappa_values_r2):.3f}")
print(f"Mean Pearson r: {np.mean(pearson_values):.3f}")
print(f"Std Pearson r: {np.std(pearson_values):.3f}")
print(f"Mean Spearman ρ: {np.mean(spearman_values):.3f}")
print(f"Std Spearman ρ: {np.std(spearman_values):.3f}")

Examining Round 1 data distribution per annotator:
annotator_1: {'sim': 14}
annotator_2: {'sim': 14}
annotator_3: {'sim': 14}
annotator_4: {'nao': 14}

ROUND 2 - ORDINAL DATA AGREEMENT ANALYSIS

Cohen's Kappa Matrix (Round 2):
             annotator_1  annotator_2  annotator_3  annotator_4
annotator_1        1.000       -0.086          0.0          0.0
annotator_2       -0.086        1.000          0.0          0.0
annotator_3        0.000        0.000          1.0          0.0
annotator_4        0.000        0.000          0.0          1.0

Pearson Correlation Matrix (Round 2):
             annotator_1  annotator_2  annotator_3  annotator_4
annotator_1        1.000       -0.187          NaN          NaN
annotator_2       -0.187        1.000          NaN          NaN
annotator_3          NaN          NaN          1.0          NaN
annotator_4          NaN          NaN          NaN          1.0

Spearman Correlation Matrix (Round 2):
             annotator_1  annotator_2  annotator_3  an

In [28]:
# =============================================================================
# INDIVIDUAL VISUALIZATIONS WITH PLOTLY
# =============================================================================

# 1. Round 1 Accuracy Heatmap
print("1. Round 1: Accuracy Matrix")
fig1 = go.Figure(data=go.Heatmap(z=accuracy_matrix_r1.values, x=accuracy_matrix_r1.columns, y=accuracy_matrix_r1.index, colorscale="RdYlBu_r", zmin=0, zmax=1, text=accuracy_matrix_r1.round(3).values, texttemplate="%{text}", textfont={"size": 12}, showscale=True))

fig1.update_layout(title={"text": "Round 1: Inter-Annotator Accuracy Matrix", "x": 0.5, "font": {"size": 16}}, xaxis_title="Annotators", yaxis_title="Annotators", height=500, width=600)

fig1.show()

1. Round 1: Accuracy Matrix


In [29]:
# 2. Round 2 Cohen's Kappa Heatmap
print("2. Round 2: Cohen's Kappa Matrix")
fig2 = go.Figure(data=go.Heatmap(z=kappa_matrix_r2.values, x=kappa_matrix_r2.columns, y=kappa_matrix_r2.index, colorscale="RdYlBu_r", zmin=-1, zmax=1, text=kappa_matrix_r2.round(3).values, texttemplate="%{text}", textfont={"size": 12}, showscale=True))

fig2.update_layout(title={"text": "Round 2: Cohen's Kappa Agreement Matrix", "x": 0.5, "font": {"size": 16}}, xaxis_title="Annotators", yaxis_title="Annotators", height=500, width=600)

fig2.show()

2. Round 2: Cohen's Kappa Matrix


In [30]:
# 3. Round 2 Correlations Comparison
print("3. Round 2: Agreement Metrics Comparison")

# Filter out NaN values for comparison
valid_pearson = [val for val in pearson_values if not np.isnan(val)]
valid_spearman = [val for val in spearman_values if not np.isnan(val)]
valid_kappa_r2 = [val for val in kappa_values_r2 if not np.isnan(val)]

correlation_types = ["Cohen's Kappa", "Pearson r", "Spearman ρ"]
correlation_means = [np.mean(valid_kappa_r2) if valid_kappa_r2 else 0, np.mean(valid_pearson) if valid_pearson else 0, np.mean(valid_spearman) if valid_spearman else 0]

fig3 = go.Figure(data=go.Bar(x=correlation_types, y=correlation_means, text=[f"{val:.3f}" for val in correlation_means], textposition="auto", marker_color=["#1f77b4", "#ff7f0e", "#2ca02c"]))

fig3.update_layout(title={"text": "Round 2: Mean Agreement Metrics Comparison", "x": 0.5, "font": {"size": 16}}, xaxis_title="Metric Type", yaxis_title="Mean Agreement Value", height=500, width=700)

fig3.show()

3. Round 2: Agreement Metrics Comparison


In [31]:
# 4. Round 1 Response Distribution
print("4. Round 1: Response Distribution")

r1_dist = round_1.stack().value_counts()
fig4 = go.Figure(data=go.Bar(x=r1_dist.index, y=r1_dist.values, text=r1_dist.values, textposition="auto", marker_color="lightblue"))

fig4.update_layout(title={"text": "Round 1: Distribution of Responses Across All Annotators", "x": 0.5, "font": {"size": 16}}, xaxis_title="Response", yaxis_title="Count", height=500, width=600)

fig4.show()

4. Round 1: Response Distribution


In [32]:
# 5. Round 2 Response Distribution
print("5. Round 2: Response Distribution")

r2_dist = round_2.stack().value_counts()
fig5 = go.Figure(data=go.Bar(x=r2_dist.index, y=r2_dist.values, text=r2_dist.values, textposition="auto", marker_color="lightcoral"))

fig5.update_layout(title={"text": "Round 2: Distribution of Responses Across All Annotators", "x": 0.5, "font": {"size": 16}}, xaxis_title="Response (Scale: 3-4)", yaxis_title="Count", height=500, width=600)

fig5.show()

5. Round 2: Response Distribution


In [33]:
# 6. Overall Agreement Summary
print("6. Overall Agreement Summary")

agreement_summary = ["Round 1\n(Categorical)", "Round 2\n(Ordinal)"]
overall_agreements = [overall_agreement_r1, overall_agreement_r2]

fig6 = go.Figure(data=go.Bar(x=agreement_summary, y=overall_agreements, text=[f"{val:.1%}" for val in overall_agreements], textposition="auto", marker_color=["#9467bd", "#8c564b"]))

fig6.update_layout(title={"text": "Overall Inter-Annotator Agreement Comparison", "x": 0.5, "font": {"size": 16}}, xaxis_title="Dataset", yaxis_title="Overall Agreement Score", yaxis=dict(range=[0, 1]), height=500, width=600)

fig6.show()

6. Overall Agreement Summary


In [34]:
# =============================================================================
# DETAILED INDIVIDUAL PLOTS
# =============================================================================

# 1. Detailed Pairwise Agreement Analysis for Round 1
print("Detailed Pairwise Analysis - Round 1")
print("-" * 40)

pairwise_results_r1 = []
for i, ann1 in enumerate(round_1.columns):
    for j, ann2 in enumerate(round_1.columns):
        if i < j:
            acc = accuracy_score(round_1[ann1], round_1[ann2])
            pairwise_results_r1.append({"Pair": f"{ann1} vs {ann2}", "Accuracy": acc, "Annotator_1": ann1, "Annotator_2": ann2})

pairwise_df_r1 = pd.DataFrame(pairwise_results_r1)
print(pairwise_df_r1)

# Visualize pairwise agreements for Round 1
fig_r1 = px.bar(pairwise_df_r1, x="Pair", y="Accuracy", title="Round 1: Pairwise Accuracy Between Annotators", labels={"Accuracy": "Accuracy Score", "Pair": "Annotator Pairs"}, text="Accuracy")
fig_r1.update_traces(texttemplate="%{text:.3f}", textposition="outside")
fig_r1.update_layout(height=400, xaxis_tickangle=-45)
fig_r1.show()

Detailed Pairwise Analysis - Round 1
----------------------------------------
                         Pair  Accuracy  Annotator_1  Annotator_2
0  annotator_1 vs annotator_2       1.0  annotator_1  annotator_2
1  annotator_1 vs annotator_3       1.0  annotator_1  annotator_3
2  annotator_1 vs annotator_4       0.0  annotator_1  annotator_4
3  annotator_2 vs annotator_3       1.0  annotator_2  annotator_3
4  annotator_2 vs annotator_4       0.0  annotator_2  annotator_4
5  annotator_3 vs annotator_4       0.0  annotator_3  annotator_4


In [35]:
# 2. Detailed Pairwise Agreement Analysis for Round 2
print("\nDetailed Pairwise Analysis - Round 2")
print("-" * 40)

pairwise_results_r2 = []
for i, ann1 in enumerate(round_2.columns):
    for j, ann2 in enumerate(round_2.columns):
        if i < j:
            # Calculate multiple metrics
            kappa = cohen_kappa_score(round_2[ann1], round_2[ann2])

            # Calculate correlation only if there's variation in both annotators
            try:
                pearson_r, _ = pearsonr(round_2[ann1], round_2[ann2])
                spearman_r, _ = spearmanr(round_2[ann1], round_2[ann2])
            except:
                pearson_r = np.nan
                spearman_r = np.nan

            pairwise_results_r2.append({"Pair": f"{ann1} vs {ann2}", "Cohen_Kappa": kappa, "Pearson_r": pearson_r, "Spearman_r": spearman_r, "Annotator_1": ann1, "Annotator_2": ann2})

pairwise_df_r2 = pd.DataFrame(pairwise_results_r2)
print(pairwise_df_r2.round(3))

# Create multiple visualizations for Round 2
fig_r2_multi = make_subplots(rows=1, cols=3, subplot_titles=["Cohen's Kappa", "Pearson Correlation", "Spearman Correlation"], shared_yaxes=True)

# Cohen's Kappa
fig_r2_multi.add_trace(go.Bar(x=pairwise_df_r2["Pair"], y=pairwise_df_r2["Cohen_Kappa"], text=pairwise_df_r2["Cohen_Kappa"].round(3), textposition="auto", name="Cohen's Kappa", marker_color="lightblue"), row=1, col=1)

# Pearson (filter out NaN)
valid_pearson_df = pairwise_df_r2.dropna(subset=["Pearson_r"])
if not valid_pearson_df.empty:
    fig_r2_multi.add_trace(go.Bar(x=valid_pearson_df["Pair"], y=valid_pearson_df["Pearson_r"], text=valid_pearson_df["Pearson_r"].round(3), textposition="auto", name="Pearson r", marker_color="lightcoral"), row=1, col=2)

# Spearman (filter out NaN)
valid_spearman_df = pairwise_df_r2.dropna(subset=["Spearman_r"])
if not valid_spearman_df.empty:
    fig_r2_multi.add_trace(go.Bar(x=valid_spearman_df["Pair"], y=valid_spearman_df["Spearman_r"], text=valid_spearman_df["Spearman_r"].round(3), textposition="auto", name="Spearman ρ", marker_color="lightgreen"), row=1, col=3)

fig_r2_multi.update_layout(height=500, title={"text": "Round 2: Pairwise Agreement Metrics Between Annotators", "x": 0.5, "font": {"size": 16}}, showlegend=False)

fig_r2_multi.update_xaxes(tickangle=-45)
fig_r2_multi.show()


Detailed Pairwise Analysis - Round 2
----------------------------------------
                         Pair  Cohen_Kappa  Pearson_r  Spearman_r  \
0  annotator_1 vs annotator_2       -0.086     -0.187      -0.187   
1  annotator_1 vs annotator_3        0.000        NaN         NaN   
2  annotator_1 vs annotator_4        0.000        NaN         NaN   
3  annotator_2 vs annotator_3        0.000        NaN         NaN   
4  annotator_2 vs annotator_4        0.000        NaN         NaN   
5  annotator_3 vs annotator_4        0.000        NaN         NaN   

   Annotator_1  Annotator_2  
0  annotator_1  annotator_2  
1  annotator_1  annotator_3  
2  annotator_1  annotator_4  
3  annotator_2  annotator_3  
4  annotator_2  annotator_4  
5  annotator_3  annotator_4  
                         Pair  Cohen_Kappa  Pearson_r  Spearman_r  \
0  annotator_1 vs annotator_2       -0.086     -0.187      -0.187   
1  annotator_1 vs annotator_3        0.000        NaN         NaN   
2  annotator_1 vs an

In [36]:
# =============================================================================
# SUMMARY AND INTERPRETATION
# =============================================================================

print("=" * 60)
print("INTER-ANNOTATOR AGREEMENT ANALYSIS SUMMARY")
print("=" * 60)

print("\nROUND 1 (Categorical Data: 'sim'/'nao'):")
print("-" * 45)
print(f"- Dataset size: {round_1.shape[0]} items, {round_1.shape[1]} annotators")
print(f"- Overall agreement: {overall_agreement_r1:.1%}")
print(f"- Mean pairwise accuracy: {np.mean(accuracy_values):.1%}")

print("\nKey findings:")
print(f"  - Annotators 1, 2, 3 show perfect agreement (all 'sim')")
print(f"  - Annotator 4 disagrees completely (all 'nao')")
print(f"  - This creates a clear 3 vs 1 split in annotations")

print(f"\nROUND 2 (Ordinal Data: 3-4 scale):")
print("-" * 45)
print(f"- Dataset size: {round_2.shape[0]} items, {round_2.shape[1]} annotators")
print(f"- Overall agreement: {overall_agreement_r2:.1%}")
print(f"- Mean Cohen's Kappa: {np.nanmean([kappa_matrix_r2.iloc[i, j] for i in range(len(kappa_matrix_r2)) for j in range(i + 1, len(kappa_matrix_r2))]):.3f}")

# Check variation in Round 2
print(f"\nAnnotator variation (Round 2):")
for col in round_2.columns:
    values = round_2[col].unique()
    print(f"  - {col}: {sorted(values)} (variation: {len(values) > 1})")


# Create a final summary visualization
fig_summary = go.Figure()

# Add overall agreement comparison
fig_summary.add_trace(go.Bar(x=["Round 1<br>(Categorical)", "Round 2<br>(Ordinal)"], y=[overall_agreement_r1, overall_agreement_r2], text=[f"{overall_agreement_r1:.1%}", f"{overall_agreement_r2:.1%}"], textposition="auto", marker_color=["#2E86AB", "#A23B72"], name="Overall Agreement"))

fig_summary.update_layout(title={"text": "Overall Inter-Annotator Agreement Comparison", "x": 0.5, "font": {"size": 18}}, xaxis_title="Dataset", yaxis_title="Agreement Score", yaxis=dict(range=[0, 1]), height=400, showlegend=False)

fig_summary.show()

INTER-ANNOTATOR AGREEMENT ANALYSIS SUMMARY

ROUND 1 (Categorical Data: 'sim'/'nao'):
---------------------------------------------
- Dataset size: 14 items, 4 annotators
- Overall agreement: 50.0%
- Mean pairwise accuracy: 50.0%

Key findings:
  - Annotators 1, 2, 3 show perfect agreement (all 'sim')
  - Annotator 4 disagrees completely (all 'nao')
  - This creates a clear 3 vs 1 split in annotations

ROUND 2 (Ordinal Data: 3-4 scale):
---------------------------------------------
- Dataset size: 23 items, 4 annotators
- Overall agreement: 42.0%
- Mean Cohen's Kappa: -0.014

Annotator variation (Round 2):
  - annotator_1: [np.int64(3), np.int64(4)] (variation: True)
  - annotator_2: [np.int64(3), np.int64(4)] (variation: True)
  - annotator_3: [np.int64(4)] (variation: False)
  - annotator_4: [np.int64(3)] (variation: False)
