In [1]:
# Importando bibliotecas necessárias
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.io as pio
from pathlib import Path
import warnings

warnings.filterwarnings("ignore")

# Configurações de visualização
pio.templates.default = "plotly_white"
pd.set_option("display.max_columns", None)
pd.set_option("display.width", None)
# set default to plotly in pandas
pd.options.plotting.backend = "plotly"

# Create images directory if it doesn't exist
images_path = Path("src/survey_garden/images")
images_path.mkdir(parents=True, exist_ok=True)
print(f"Images will be saved to: {images_path}")

Images will be saved to: src/survey_garden/images


In [2]:
# Definindo caminhos dos arquivos
data_path = Path("20250915")

# Carregando os arquivos
print("Loading files...")

# Dados principais para análises dos pontos 1-3
db_val2 = pd.read_excel(data_path / "DB_val2.xlsx")
print(f"DB_val2.xlsx loaded: {db_val2.shape[0]} rows, {db_val2.shape[1]} columns")

# Dicionário de espécies
dicionario = pd.read_excel(data_path / "Dicionário.xlsx")
print(f"Dicionário.xlsx loaded: {dicionario.shape[0]} rows, {dicionario.shape[1]} columns")

# Dados para análise do ponto 4 (IVC)
round_1 = pd.read_excel(data_path / "IVC.xlsx", sheet_name="Rodada 1 ")
round_2 = pd.read_excel(data_path / "IVC.xlsx", sheet_name="Rodada 2")
print(f"IVC Rounda 1 loaded: {round_1.shape[0]} rows, {round_1.shape[1]} columns")
print(f"IVC Rounda 2 loaded: {round_2.shape[0]} rows, {round_2.shape[1]} columns")

Loading files...
DB_val2.xlsx loaded: 155 rows, 98 columns
Dicionário.xlsx loaded: 207 rows, 7 columns
IVC Rounda 1 loaded: 14 rows, 9 columns
IVC Rounda 2 loaded: 23 rows, 7 columns
Dicionário.xlsx loaded: 207 rows, 7 columns
IVC Rounda 1 loaded: 14 rows, 9 columns
IVC Rounda 2 loaded: 23 rows, 7 columns


In [3]:
round_1

Unnamed: 0,Item,Avaliador 1,Avaliador 2,Avaliador 3,Avaliador 4,"N° de ""sim""",IVC-i,Unnamed: 7,Unnamed: 8
0,Q01,Sim,Sim,Sim,Não,3,0.75,,"IVC-i = (Nº de Juízes que votaram ""SIM"") / (Nº..."
1,Q02,Sim,Sim,Sim,Não,3,0.75,,
2,Q03,Sim,Sim,Sim,Não,3,0.75,,
3,Q04,Sim,Sim,Sim,Não,3,0.75,,
4,Q05,Sim,Sim,Sim,Não,3,0.75,,
5,Q06,Sim,Sim,Sim,Não,3,0.75,,
6,Q07,Sim,Sim,Sim,Não,3,0.75,,
7,Q08,Sim,Sim,Sim,Não,3,0.75,,
8,Q09,Sim,Sim,Sim,Não,3,0.75,,
9,Q10,Sim,Sim,Sim,Não,3,0.75,,


In [4]:
round_2

Unnamed: 0,Item,Avaliador 1,Avaliador 2,Avaliador 3,Avaliador 4,"N° de ""sim""",IVC-i
0,Q01,Sim,Sim,Sim,Sim,4,1
1,Q02,Sim,Sim,Sim,Sim,4,1
2,Q03,Sim,Sim,Sim,Sim,4,1
3,Q04,Sim,Sim,Sim,Sim,4,1
4,Q05,Sim,Sim,Sim,Sim,4,1
5,Q06,Sim,Sim,Sim,Sim,4,1
6,Q07,Sim,Sim,Sim,Sim,4,1
7,Q08,Sim,Sim,Sim,Sim,4,1
8,Q09,Sim,Sim,Sim,Sim,4,1
9,Q10,Sim,Sim,Sim,Sim,4,1


In [5]:
round_1.columns = [i.lower().strip().replace(" ", "_") for i in round_1.columns]
round_2.columns = [i.lower().strip().replace(" ", "_") for i in round_2.columns]

round_1

Unnamed: 0,item,avaliador_1,avaliador_2,avaliador_3,avaliador_4,"n°_de_""sim""",ivc-i,unnamed:_7,unnamed:_8
0,Q01,Sim,Sim,Sim,Não,3,0.75,,"IVC-i = (Nº de Juízes que votaram ""SIM"") / (Nº..."
1,Q02,Sim,Sim,Sim,Não,3,0.75,,
2,Q03,Sim,Sim,Sim,Não,3,0.75,,
3,Q04,Sim,Sim,Sim,Não,3,0.75,,
4,Q05,Sim,Sim,Sim,Não,3,0.75,,
5,Q06,Sim,Sim,Sim,Não,3,0.75,,
6,Q07,Sim,Sim,Sim,Não,3,0.75,,
7,Q08,Sim,Sim,Sim,Não,3,0.75,,
8,Q09,Sim,Sim,Sim,Não,3,0.75,,
9,Q10,Sim,Sim,Sim,Não,3,0.75,,


In [6]:
round_2

Unnamed: 0,item,avaliador_1,avaliador_2,avaliador_3,avaliador_4,"n°_de_""sim""",ivc-i
0,Q01,Sim,Sim,Sim,Sim,4,1
1,Q02,Sim,Sim,Sim,Sim,4,1
2,Q03,Sim,Sim,Sim,Sim,4,1
3,Q04,Sim,Sim,Sim,Sim,4,1
4,Q05,Sim,Sim,Sim,Sim,4,1
5,Q06,Sim,Sim,Sim,Sim,4,1
6,Q07,Sim,Sim,Sim,Sim,4,1
7,Q08,Sim,Sim,Sim,Sim,4,1
8,Q09,Sim,Sim,Sim,Sim,4,1
9,Q10,Sim,Sim,Sim,Sim,4,1


In [7]:
round_1.set_index("item", inplace=True)
round_2.set_index("item", inplace=True)
round_1.replace({"Sim": 1, "Não": 0}, inplace=True)
round_2.replace({"Sim": 1, "Não": 0}, inplace=True)


In [8]:
round_1 = round_1[[col for col in round_1.columns if "avaliador_" in col]]
round_1

Unnamed: 0_level_0,avaliador_1,avaliador_2,avaliador_3,avaliador_4
item,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Q01,1,1,1,0
Q02,1,1,1,0
Q03,1,1,1,0
Q04,1,1,1,0
Q05,1,1,1,0
Q06,1,1,1,0
Q07,1,1,1,0
Q08,1,1,1,0
Q09,1,1,1,0
Q10,1,1,1,0


In [9]:
round_2 = round_2[[col for col in round_2.columns if "avaliador_" in col]]
round_2

Unnamed: 0_level_0,avaliador_1,avaliador_2,avaliador_3,avaliador_4
item,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Q01,1,1,1,1
Q02,1,1,1,1
Q03,1,1,1,1
Q04,1,1,1,1
Q05,1,1,1,1
Q06,1,1,1,1
Q07,1,1,1,1
Q08,1,1,1,1
Q09,1,1,1,1
Q10,1,1,1,1


In [10]:
round_2

Unnamed: 0_level_0,avaliador_1,avaliador_2,avaliador_3,avaliador_4
item,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Q01,1,1,1,1
Q02,1,1,1,1
Q03,1,1,1,1
Q04,1,1,1,1
Q05,1,1,1,1
Q06,1,1,1,1
Q07,1,1,1,1
Q08,1,1,1,1
Q09,1,1,1,1
Q10,1,1,1,1


In [11]:
round_2.max(), round_2.min()

(avaliador_1    1
 avaliador_2    1
 avaliador_3    1
 avaliador_4    1
 dtype: int64,
 avaliador_1    1
 avaliador_2    1
 avaliador_3    1
 avaliador_4    1
 dtype: int64)

In [12]:
# Examining the data structure
print("Round 1 shape:", round_1.shape)
print("Round 2 shape:", round_2.shape)
print("\nRound 1 sample:")
print(round_1.head())
print("\nRound 2 sample:")
print(round_2.head())
print("\nUnique values in round_1:")
print(round_1.stack().unique())
print("\nUnique values in round_2:")
print(round_2.stack().unique())

Round 1 shape: (14, 4)
Round 2 shape: (23, 4)

Round 1 sample:
      avaliador_1  avaliador_2  avaliador_3  avaliador_4
item                                                    
Q01             1            1            1            0
Q02             1            1            1            0
Q03             1            1            1            0
Q04             1            1            1            0
Q05             1            1            1            0

Round 2 sample:
      avaliador_1  avaliador_2  avaliador_3  avaliador_4
item                                                    
Q01             1            1            1            1
Q02             1            1            1            1
Q03             1            1            1            1
Q04             1            1            1            1
Q05             1            1            1            1

Unique values in round_1:
[1 0]

Unique values in round_2:
[1]


In [13]:
# Import required libraries for inter-annotator agreement
from sklearn.metrics import cohen_kappa_score
from itertools import combinations
from scipy.stats import pearsonr, spearmanr
from sklearn.metrics import accuracy_score

In [14]:
# Functions for calculating inter-annotator agreement metrics


def calculate_pairwise_agreement(data, metric="cohen_kappa"):
    """
    Calculate pairwise agreement between all annotators

    Parameters:
    data: DataFrame with annotators as columns
    metric: 'cohen_kappa', 'accuracy', 'pearson', 'spearman'

    Returns:
    DataFrame with pairwise agreement scores
    """
    annotators = data.columns
    n_annotators = len(annotators)

    # Initialize results matrix
    agreement_matrix = np.zeros((n_annotators, n_annotators))

    for i, ann1 in enumerate(annotators):
        for j, ann2 in enumerate(annotators):
            if i == j:
                agreement_matrix[i, j] = 1.0  # Perfect agreement with self
            elif i < j:  # Calculate only upper triangle
                if metric == "cohen_kappa":
                    score = cohen_kappa_score(data[ann1], data[ann2])
                elif metric == "accuracy":
                    score = accuracy_score(data[ann1], data[ann2])
                elif metric == "pearson":
                    score, _ = pearsonr(data[ann1], data[ann2])
                elif metric == "spearman":
                    score, _ = spearmanr(data[ann1], data[ann2])

                agreement_matrix[i, j] = score
                agreement_matrix[j, i] = score  # Symmetric matrix

    return pd.DataFrame(agreement_matrix, index=annotators, columns=annotators)


def calculate_fleiss_kappa(data):
    """
    Calculate Fleiss' kappa for multi-rater agreement

    Parameters:
    data: DataFrame with annotators as columns and items as rows

    Returns:
    float: Fleiss' kappa value
    dict: Additional statistics including P_bar, P_e, and interpretation
    """
    n_items, n_annotators = data.shape

    # Get all unique categories/labels
    all_categories = pd.unique(data.values.ravel())
    n_categories = len(all_categories)

    # Create contingency table: items x categories
    contingency_table = np.zeros((n_items, n_categories))

    for i in range(n_items):
        for j, category in enumerate(all_categories):
            contingency_table[i, j] = (data.iloc[i] == category).sum()

    # Calculate P_i (proportion of agreement for each item)
    P_i = np.zeros(n_items)
    for i in range(n_items):
        sum_squares = np.sum(contingency_table[i] ** 2)
        P_i[i] = (sum_squares - n_annotators) / (n_annotators * (n_annotators - 1))

    # Calculate P_bar (mean proportion of agreement)
    P_bar = np.mean(P_i)

    # Calculate marginal proportions (p_j for each category)
    marginal_props = np.zeros(n_categories)
    for j in range(n_categories):
        marginal_props[j] = np.sum(contingency_table[:, j]) / (n_items * n_annotators)

    # Calculate P_e (expected agreement by chance)
    P_e = np.sum(marginal_props**2)

    # Calculate Fleiss' kappa
    if P_e == 1.0:
        kappa = 1.0  # Perfect agreement case
    else:
        kappa = (P_bar - P_e) / (1 - P_e)

    # Interpretation
    if kappa < 0:
        interpretation = "Poor (worse than chance)"
    elif kappa < 0.20:
        interpretation = "Slight"
    elif kappa < 0.40:
        interpretation = "Fair"
    elif kappa < 0.60:
        interpretation = "Moderate"
    elif kappa < 0.80:
        interpretation = "Substantial"
    else:
        interpretation = "Almost Perfect"

    return kappa, {"P_bar": P_bar, "P_e": P_e, "interpretation": interpretation, "n_items": n_items, "n_annotators": n_annotators, "n_categories": n_categories, "categories": all_categories, "marginal_proportions": dict(zip(all_categories, marginal_props))}


def calculate_fleiss_kappa_simple(data):
    """
    Calculate a simplified version of overall agreement (for backwards compatibility)
    """
    kappa, stats = calculate_fleiss_kappa(data)
    return stats["P_bar"]  # Return observed agreement proportion

In [15]:
# =============================================================================
# PROPER FLEISS' KAPPA CALCULATION
# =============================================================================

print("=" * 60)
print("PROPER FLEISS' KAPPA ANALYSIS")
print("=" * 60)

# Calculate proper Fleiss' kappa for Round 1
print("\nROUND 1 - FLEISS' KAPPA ANALYSIS")
print("-" * 45)

fleiss_kappa_r1, stats_r1 = calculate_fleiss_kappa(round_1)
print(f"Fleiss' Kappa: {fleiss_kappa_r1:.4f}")
print(f"Interpretation: {stats_r1['interpretation']}")
print(f"Observed Agreement (P̄): {stats_r1['P_bar']:.4f}")
print(f"Expected Agreement (Pe): {stats_r1['P_e']:.4f}")
print(f"Number of items: {stats_r1['n_items']}")
print(f"Number of annotators: {stats_r1['n_annotators']}")
print(f"Categories: {list(stats_r1['categories'])}")
print("Marginal proportions:")
for cat, prop in stats_r1["marginal_proportions"].items():
    print(f"  - '{cat}': {prop:.3f}")

# Calculate proper Fleiss' kappa for Round 2
print(f"\nROUND 2 - FLEISS' KAPPA ANALYSIS")
print("-" * 45)

fleiss_kappa_r2, stats_r2 = calculate_fleiss_kappa(round_2)
print(f"Fleiss' Kappa: {fleiss_kappa_r2:.4f}")
print(f"Interpretation: {stats_r2['interpretation']}")
print(f"Observed Agreement (P̄): {stats_r2['P_bar']:.4f}")
print(f"Expected Agreement (Pe): {stats_r2['P_e']:.4f}")
print(f"Number of items: {stats_r2['n_items']}")
print(f"Number of annotators: {stats_r2['n_annotators']}")
print(f"Categories: {list(stats_r2['categories'])}")
print("Marginal proportions:")
for cat, prop in stats_r2["marginal_proportions"].items():
    print(f"  - '{cat}': {prop:.3f}")

print(f"\nCOMPARISON SUMMARY")
print("-" * 25)
print(f"Round 1 Fleiss' κ: {fleiss_kappa_r1:.4f} ({stats_r1['interpretation']})")
print(f"Round 2 Fleiss' κ: {fleiss_kappa_r2:.4f} ({stats_r2['interpretation']})")

# Significance test information
print(f"\nSTATISTICAL INTERPRETATION")
print("-" * 30)
print("Fleiss' Kappa interpretation guidelines:")
print("  < 0.00: Poor (worse than chance)")
print("  0.00-0.20: Slight agreement")
print("  0.21-0.40: Fair agreement")
print("  0.41-0.60: Moderate agreement")
print("  0.61-0.80: Substantial agreement")
print("  0.81-1.00: Almost perfect agreement")

PROPER FLEISS' KAPPA ANALYSIS

ROUND 1 - FLEISS' KAPPA ANALYSIS
---------------------------------------------
Fleiss' Kappa: -0.3333
Interpretation: Poor (worse than chance)
Observed Agreement (P̄): 0.5000
Expected Agreement (Pe): 0.6250
Number of items: 14
Number of annotators: 4
Categories: [np.int64(1), np.int64(0)]
Marginal proportions:
  - '1': 0.750
  - '0': 0.250

ROUND 2 - FLEISS' KAPPA ANALYSIS
---------------------------------------------
Fleiss' Kappa: 1.0000
Interpretation: Almost Perfect
Observed Agreement (P̄): 1.0000
Expected Agreement (Pe): 1.0000
Number of items: 23
Number of annotators: 4
Categories: [np.int64(1)]
Marginal proportions:
  - '1': 1.000

COMPARISON SUMMARY
-------------------------
Round 1 Fleiss' κ: -0.3333 (Poor (worse than chance))
Round 2 Fleiss' κ: 1.0000 (Almost Perfect)

STATISTICAL INTERPRETATION
------------------------------
Fleiss' Kappa interpretation guidelines:
  < 0.00: Poor (worse than chance)
  0.00-0.20: Slight agreement
  0.21-0.40: Fa

In [16]:
# =============================================================================
# FLEISS' KAPPA VISUALIZATION
# =============================================================================

print("\nCreating Fleiss' Kappa Comparison Visualizations...")

# Create a comprehensive comparison of agreement metrics
fig_fleiss = make_subplots(
    rows=2, cols=2, subplot_titles=["Fleiss' Kappa Comparison", "Agreement Components - Round 1", "Agreement Components - Round 2", "Interpretation Categories"], specs=[[{"type": "bar"}, {"type": "bar"}], [{"type": "bar"}, {"type": "bar"}]], vertical_spacing=0.12, horizontal_spacing=0.1
)

# 1. Fleiss' Kappa Comparison
datasets = ["Round 1<br>(Binary)", "Round 2<br>(Binary)"]
fleiss_values = [fleiss_kappa_r1, fleiss_kappa_r2]
colors = ["#FF6B6B" if val < 0 else "#4ECDC4" for val in fleiss_values]

fig_fleiss.add_trace(go.Bar(x=datasets, y=fleiss_values, text=[f"{val:.3f}" for val in fleiss_values], textposition="auto", marker_color=colors, name="Fleiss' Kappa"), row=1, col=1)

# 2. Agreement Components - Round 1
components_r1 = ["Observed Agreement<br>(P̄)", "Expected Agreement<br>(Pe)", "Fleiss' Kappa<br>(κ)"]
values_r1 = [stats_r1["P_bar"], stats_r1["P_e"], fleiss_kappa_r1]
colors_r1 = ["#3498DB", "#95A5A6", "#FF6B6B" if fleiss_kappa_r1 < 0 else "#4ECDC4"]

fig_fleiss.add_trace(go.Bar(x=components_r1, y=values_r1, text=[f"{val:.3f}" for val in values_r1], textposition="auto", marker_color=colors_r1, name="Round 1 Components"), row=1, col=2)

# 3. Agreement Components - Round 2
components_r2 = ["Observed Agreement<br>(P̄)", "Expected Agreement<br>(Pe)", "Fleiss' Kappa<br>(κ)"]
values_r2 = [stats_r2["P_bar"], stats_r2["P_e"], fleiss_kappa_r2]
colors_r2 = ["#3498DB", "#95A5A6", "#FF6B6B" if fleiss_kappa_r2 < 0 else "#4ECDC4"]

fig_fleiss.add_trace(go.Bar(x=components_r2, y=values_r2, text=[f"{val:.3f}" for val in values_r2], textposition="auto", marker_color=colors_r2, name="Round 2 Components"), row=2, col=1)

# 4. Interpretation scale reference
interpretation_ranges = ["Poor<br>(<0.00)", "Slight<br>(0.00-0.20)", "Fair<br>(0.21-0.40)", "Moderate<br>(0.41-0.60)", "Substantial<br>(0.61-0.80)", "Almost Perfect<br>(0.81-1.00)"]
range_values = [-0.1, 0.1, 0.3, 0.5, 0.7, 0.9]  # Representative values for each range
range_colors = ["#FF6B6B", "#FFA07A", "#FFD700", "#98FB98", "#32CD32", "#228B22"]

fig_fleiss.add_trace(go.Bar(x=interpretation_ranges, y=range_values, text=[f"{val:.1f}" for val in range_values], textposition="auto", marker_color=range_colors, name="Interpretation Scale"), row=2, col=2)

# Add horizontal lines to show where our values fall
fig_fleiss.add_hline(y=fleiss_kappa_r1, line_dash="dash", line_color="red", annotation_text=f"Round 1: {fleiss_kappa_r1:.3f}", row=2, col=2)
fig_fleiss.add_hline(y=fleiss_kappa_r2, line_dash="dash", line_color="blue", annotation_text=f"Round 2: {fleiss_kappa_r2:.3f}", row=2, col=2)

# Update layout
fig_fleiss.update_layout(height=800, title={"text": "Comprehensive Fleiss' Kappa Analysis", "x": 0.5, "font": {"size": 20}}, showlegend=False)

# Update y-axes
fig_fleiss.update_yaxes(title_text="Kappa Value", row=1, col=1)
fig_fleiss.update_yaxes(title_text="Agreement Value", row=1, col=2)
fig_fleiss.update_yaxes(title_text="Agreement Value", row=2, col=1)
fig_fleiss.update_yaxes(title_text="Reference Value", row=2, col=2)

fig_fleiss.show()

# Save the figure
fig_fleiss.write_html(images_path / "03_iaa_fleiss_kappa_comprehensive.html")
fig_fleiss.write_image(images_path / "03_iaa_fleiss_kappa_comprehensive.png", width=1200, height=800, scale=3)
print(f"Saved: 03_iaa_fleiss_kappa_comprehensive.html and .png")


Creating Fleiss' Kappa Comparison Visualizations...


Saved: 03_iaa_fleiss_kappa_comprehensive.html and .png


In [17]:
# =============================================================================
# DETAILED FLEISS' KAPPA INTERPRETATION
# =============================================================================

print("=" * 70)
print("DETAILED FLEISS' KAPPA INTERPRETATION & RECOMMENDATIONS")
print("=" * 70)

print(f"\nROUND 1 DETAILED ANALYSIS")
print("-" * 35)
print(f"Data Type: Binary (0/1)")
print(f"Fleiss' Kappa: {fleiss_kappa_r1:.4f}")
print(f"Label distribution: {dict(stats_r1['marginal_proportions'])}")
print(f"Problem: High expected agreement by chance (Pe = {stats_r1['P_e']:.3f})")
print(f"Observed agreement ({stats_r1['P_bar']:.3f}) vs chance expectation")
if fleiss_kappa_r1 < 0:
    print(f"Result: Kappa is negative, indicating systematic disagreement")
else:
    print(f"Result: {stats_r1['interpretation']}")

print(f"\nROUND 2 DETAILED ANALYSIS")
print("-" * 35)
print(f"Data Type: Binary (0/1)")
print(f"Fleiss' Kappa: {fleiss_kappa_r2:.4f}")
print(f"Label distribution: {dict(stats_r2['marginal_proportions'])}")
print(f"Problem: High expected agreement by chance (Pe = {stats_r2['P_e']:.3f})")
print(f"Observed agreement ({stats_r2['P_bar']:.3f}) vs chance expectation")
if fleiss_kappa_r2 < 0:
    print(f"Result: Kappa is negative, indicating systematic disagreement")
else:
    print(f"Result: {stats_r2['interpretation']}")

print(f"\nCRITICAL ISSUES IDENTIFIED")
print("-" * 35)
if fleiss_kappa_r1 < 0 or fleiss_kappa_r2 < 0:
    print("1. Negative kappa values indicate systematic disagreement")
    print("2. Annotators may have systematic biases or different interpretations")
    print("3. High marginal imbalance creates inflated chance agreement")
    print("4. Both rounds show binary classification challenges")
else:
    print("1. Both rounds use binary classification (0/1)")
    print("2. Agreement levels may be affected by class imbalance")
    print("3. Consider the distribution of positive and negative labels")

print(f"\nSTATISTICAL CONTEXT")
print("-" * 25)
print("- Fleiss' kappa measures agreement beyond chance for multiple raters")
print("- Negative values indicate systematic bias or disagreement")
print("- Binary data can be affected by prevalence and bias effects")
print("- Consider complementary metrics like accuracy or percent agreement")

print(f"\nACTIONABLE RECOMMENDATIONS")
print("-" * 35)
print("IMMEDIATE ACTIONS:")
print("1. Review and clarify annotation guidelines")
print("2. Conduct annotator retraining sessions")
print("3. Implement calibration exercises with gold standard examples")
print("4. Analyze systematic patterns in disagreements")

print("\nDATA COLLECTION IMPROVEMENTS:")
print("5. Ensure balanced representation of both classes if possible")
print("6. Provide clearer category definitions with examples")
print("7. Implement pilot testing of annotation protocols")
print("8. Add inter-annotator discussion sessions")

print("\nANALYSIS ENHANCEMENTS:")
print("9. Calculate confidence intervals for kappa values")
print("10. Perform significance testing")
print("11. Analyze disagreement patterns by item characteristics")
print("12. Consider prevalence-adjusted bias-adjusted kappa (PABAK)")

print(f"\nQUALITY THRESHOLDS")
print("-" * 25)
print("Target Fleiss' Kappa values:")
print("- Minimum acceptable: κ > 0.40 (Moderate)")
print("- Good quality: κ > 0.60 (Substantial)")
print("- Excellent quality: κ > 0.80 (Almost Perfect)")
print(f"\nCurrent status:")
print(f"  Round 1: κ = {fleiss_kappa_r1:.3f} ({stats_r1['interpretation']})")
print(f"  Round 2: κ = {fleiss_kappa_r2:.3f} ({stats_r2['interpretation']})")
if fleiss_kappa_r1 < 0.40:
    print(f"  Round 1 gap to minimum: needs +{0.40 - fleiss_kappa_r1:.3f}")
if fleiss_kappa_r2 < 0.40:
    print(f"  Round 2 gap to minimum: needs +{0.40 - fleiss_kappa_r2:.3f}")

DETAILED FLEISS' KAPPA INTERPRETATION & RECOMMENDATIONS

ROUND 1 DETAILED ANALYSIS
-----------------------------------
Data Type: Binary (0/1)
Fleiss' Kappa: -0.3333
Label distribution: {np.int64(1): np.float64(0.75), np.int64(0): np.float64(0.25)}
Problem: High expected agreement by chance (Pe = 0.625)
Observed agreement (0.500) vs chance expectation
Result: Kappa is negative, indicating systematic disagreement

ROUND 2 DETAILED ANALYSIS
-----------------------------------
Data Type: Binary (0/1)
Fleiss' Kappa: 1.0000
Label distribution: {np.int64(1): np.float64(1.0)}
Problem: High expected agreement by chance (Pe = 1.000)
Observed agreement (1.000) vs chance expectation
Result: Almost Perfect

CRITICAL ISSUES IDENTIFIED
-----------------------------------
1. Negative kappa values indicate systematic disagreement
2. Annotators may have systematic biases or different interpretations
3. High marginal imbalance creates inflated chance agreement
4. Both rounds show binary classification c

In [18]:
# =============================================================================
# ROUND 1 ANALYSIS (Binary Data: 0/1)
# =============================================================================

print("=" * 50)
print("ROUND 1 - BINARY DATA AGREEMENT ANALYSIS")
print("=" * 50)

# Calculate Cohen's Kappa for all pairs
kappa_matrix_r1 = calculate_pairwise_agreement(round_1, metric="cohen_kappa")
print("\nCohen's Kappa Matrix (Round 1):")
print(kappa_matrix_r1.round(3))

# Calculate accuracy for all pairs
accuracy_matrix_r1 = calculate_pairwise_agreement(round_1, metric="accuracy")
print("\nAccuracy Matrix (Round 1):")
print(accuracy_matrix_r1.round(3))

# Calculate overall agreement
overall_agreement_r1 = calculate_fleiss_kappa_simple(round_1)
print(f"\nOverall Agreement (Round 1): {overall_agreement_r1:.3f}")

# Summary statistics
kappa_values = []
accuracy_values = []

for i in range(len(round_1.columns)):
    for j in range(i + 1, len(round_1.columns)):
        kappa_values.append(kappa_matrix_r1.iloc[i, j])
        accuracy_values.append(accuracy_matrix_r1.iloc[i, j])

print(f"\nSummary Statistics (Round 1):")
print(f"Mean Cohen's Kappa: {np.mean(kappa_values):.3f}")
print(f"Std Cohen's Kappa: {np.std(kappa_values):.3f}")
print(f"Mean Accuracy: {np.mean(accuracy_values):.3f}")
print(f"Std Accuracy: {np.std(accuracy_values):.3f}")

ROUND 1 - BINARY DATA AGREEMENT ANALYSIS

Cohen's Kappa Matrix (Round 1):
             avaliador_1  avaliador_2  avaliador_3  avaliador_4
avaliador_1          1.0          NaN          NaN          0.0
avaliador_2          NaN          1.0          NaN          0.0
avaliador_3          NaN          NaN          1.0          0.0
avaliador_4          0.0          0.0          0.0          1.0

Accuracy Matrix (Round 1):
             avaliador_1  avaliador_2  avaliador_3  avaliador_4
avaliador_1          1.0          1.0          1.0          0.0
avaliador_2          1.0          1.0          1.0          0.0
avaliador_3          1.0          1.0          1.0          0.0
avaliador_4          0.0          0.0          0.0          1.0

Overall Agreement (Round 1): 0.500

Summary Statistics (Round 1):
Mean Cohen's Kappa: nan
Std Cohen's Kappa: nan
Mean Accuracy: 0.500
Std Accuracy: 0.500


In [19]:
# Let's examine why Cohen's Kappa is giving NaN values
print("Examining Round 1 data distribution per annotator:")
for col in round_1.columns:
    print(f"{col}: {round_1[col].value_counts().to_dict()}")

print("\n" + "=" * 50)
print("ROUND 2 - BINARY DATA AGREEMENT ANALYSIS")
print("=" * 50)

# Calculate Cohen's Kappa for Round 2 (binary data)
kappa_matrix_r2 = calculate_pairwise_agreement(round_2, metric="cohen_kappa")
print("\nCohen's Kappa Matrix (Round 2):")
print(kappa_matrix_r2.round(3))

# Calculate accuracy for Round 2
accuracy_matrix_r2 = calculate_pairwise_agreement(round_2, metric="accuracy")
print("\nAccuracy Matrix (Round 2):")
print(accuracy_matrix_r2.round(3))

# Calculate Pearson correlation for Round 2 (for binary data, this is phi coefficient)
pearson_matrix_r2 = calculate_pairwise_agreement(round_2, metric="pearson")
print("\nPearson Correlation Matrix (Round 2) - Phi Coefficient for binary data:")
print(pearson_matrix_r2.round(3))

# Calculate Spearman correlation for Round 2
spearman_matrix_r2 = calculate_pairwise_agreement(round_2, metric="spearman")
print("\nSpearman Correlation Matrix (Round 2):")
print(spearman_matrix_r2.round(3))

# Calculate overall agreement for Round 2
overall_agreement_r2 = calculate_fleiss_kappa_simple(round_2)
print(f"\nOverall Agreement (Round 2): {overall_agreement_r2:.3f}")

# Summary statistics for Round 2
kappa_values_r2 = []
pearson_values = []
spearman_values = []

for i in range(len(round_2.columns)):
    for j in range(i + 1, len(round_2.columns)):
        kappa_values_r2.append(kappa_matrix_r2.iloc[i, j])
        pearson_values.append(pearson_matrix_r2.iloc[i, j])
        spearman_values.append(spearman_matrix_r2.iloc[i, j])

print(f"\nSummary Statistics (Round 2):")
print(f"Mean Cohen's Kappa: {np.mean(kappa_values_r2):.3f}")
print(f"Std Cohen's Kappa: {np.std(kappa_values_r2):.3f}")
print(f"Mean Phi Coefficient (Pearson for binary): {np.mean(pearson_values):.3f}")
print(f"Std Phi Coefficient: {np.std(pearson_values):.3f}")
print(f"Mean Spearman ρ: {np.mean(spearman_values):.3f}")
print(f"Std Spearman ρ: {np.std(spearman_values):.3f}")

Examining Round 1 data distribution per annotator:
avaliador_1: {1: 14}
avaliador_2: {1: 14}
avaliador_3: {1: 14}
avaliador_4: {0: 14}

ROUND 2 - BINARY DATA AGREEMENT ANALYSIS

Cohen's Kappa Matrix (Round 2):
             avaliador_1  avaliador_2  avaliador_3  avaliador_4
avaliador_1          1.0          NaN          NaN          NaN
avaliador_2          NaN          1.0          NaN          NaN
avaliador_3          NaN          NaN          1.0          NaN
avaliador_4          NaN          NaN          NaN          1.0

Accuracy Matrix (Round 2):
             avaliador_1  avaliador_2  avaliador_3  avaliador_4
avaliador_1          1.0          1.0          1.0          1.0
avaliador_2          1.0          1.0          1.0          1.0
avaliador_3          1.0          1.0          1.0          1.0
avaliador_4          1.0          1.0          1.0          1.0

Pearson Correlation Matrix (Round 2) - Phi Coefficient for binary data:
             avaliador_1  avaliador_2  avaliador_

In [20]:
# =============================================================================
# INDIVIDUAL VISUALIZATIONS WITH PLOTLY
# =============================================================================

# 1. Round 1 Accuracy Heatmap
print("1. Round 1: Accuracy Matrix")
fig1 = go.Figure(data=go.Heatmap(z=accuracy_matrix_r1.values, x=accuracy_matrix_r1.columns, y=accuracy_matrix_r1.index, colorscale="RdYlBu_r", zmin=0, zmax=1, text=accuracy_matrix_r1.round(3).values, texttemplate="%{text}", textfont={"size": 12}, showscale=True))

fig1.update_layout(title={"text": "Round 1: Inter-Annotator Accuracy Matrix", "x": 0.5, "font": {"size": 16}}, xaxis_title="Annotators", yaxis_title="Annotators", height=500, width=600)

fig1.show()
fig1.write_html(images_path / "03_iaa_r1_accuracy_matrix.html")
fig1.write_image(images_path / "03_iaa_r1_accuracy_matrix.png", width=600, height=500, scale=3)
print("Saved: 03_iaa_r1_accuracy_matrix.html and .png")

1. Round 1: Accuracy Matrix


Saved: 03_iaa_r1_accuracy_matrix.html and .png


In [21]:
# 2. Round 2 Cohen's Kappa Heatmap
print("2. Round 2: Cohen's Kappa Matrix")
fig2 = go.Figure(data=go.Heatmap(z=kappa_matrix_r2.values, x=kappa_matrix_r2.columns, y=kappa_matrix_r2.index, colorscale="RdYlBu_r", zmin=-1, zmax=1, text=kappa_matrix_r2.round(3).values, texttemplate="%{text}", textfont={"size": 12}, showscale=True))

fig2.update_layout(title={"text": "Round 2: Cohen's Kappa Agreement Matrix", "x": 0.5, "font": {"size": 16}}, xaxis_title="Annotators", yaxis_title="Annotators", height=500, width=600)

fig2.show()
fig2.write_html(images_path / "03_iaa_r2_kappa_matrix.html")
fig2.write_image(images_path / "03_iaa_r2_kappa_matrix.png", width=600, height=500, scale=3)
print("Saved: 03_iaa_r2_kappa_matrix.html and .png")

2. Round 2: Cohen's Kappa Matrix


Saved: 03_iaa_r2_kappa_matrix.html and .png


In [22]:
# 3. Round 2 Correlations Comparison
print("3. Round 2: Agreement Metrics Comparison")

# Filter out NaN values for comparison
valid_pearson = [val for val in pearson_values if not np.isnan(val)]
valid_spearman = [val for val in spearman_values if not np.isnan(val)]
valid_kappa_r2 = [val for val in kappa_values_r2 if not np.isnan(val)]

correlation_types = ["Cohen's Kappa", "Phi Coefficient\n(Pearson)", "Spearman ρ"]
correlation_means = [np.mean(valid_kappa_r2) if valid_kappa_r2 else 0, np.mean(valid_pearson) if valid_pearson else 0, np.mean(valid_spearman) if valid_spearman else 0]

fig3 = go.Figure(data=go.Bar(x=correlation_types, y=correlation_means, text=[f"{val:.3f}" for val in correlation_means], textposition="auto", marker_color=["#1f77b4", "#ff7f0e", "#2ca02c"]))

fig3.update_layout(title={"text": "Round 2: Mean Agreement Metrics Comparison (Binary Data)", "x": 0.5, "font": {"size": 16}}, xaxis_title="Metric Type", yaxis_title="Mean Agreement Value", height=500, width=700)

fig3.show()
fig3.write_html(images_path / "03_iaa_r2_metrics_comparison.html")
fig3.write_image(images_path / "03_iaa_r2_metrics_comparison.png", width=700, height=500, scale=3)
print("Saved: 03_iaa_r2_metrics_comparison.html and .png")

3. Round 2: Agreement Metrics Comparison


Saved: 03_iaa_r2_metrics_comparison.html and .png


In [23]:
# 4. Round 1 Response Distribution
print("4. Round 1: Response Distribution")

r1_dist = round_1.stack().value_counts()
fig4 = go.Figure(data=go.Bar(x=r1_dist.index, y=r1_dist.values, text=r1_dist.values, textposition="auto", marker_color="lightblue"))

fig4.update_layout(title={"text": "Round 1: Distribution of Responses Across All Annotators", "x": 0.5, "font": {"size": 16}}, xaxis_title="Response", yaxis_title="Count", height=500, width=600)

fig4.show()

4. Round 1: Response Distribution


In [24]:
# 5. Round 2 Response Distribution
print("5. Round 2: Response Distribution")

r2_dist = round_2.stack().value_counts()
fig5 = go.Figure(data=go.Bar(x=r2_dist.index, y=r2_dist.values, text=r2_dist.values, textposition="auto", marker_color="lightcoral"))

fig5.update_layout(title={"text": "Round 2: Distribution of Responses Across All Annotators (Binary)", "x": 0.5, "font": {"size": 16}}, xaxis_title="Response (0/1)", yaxis_title="Count", height=500, width=600)

fig5.show()

5. Round 2: Response Distribution


In [25]:
# 6. Overall Agreement Summary
print("6. Overall Agreement Summary")

agreement_summary = ["Round 1\n(Binary)", "Round 2\n(Binary)"]
overall_agreements = [overall_agreement_r1, overall_agreement_r2]

fig6 = go.Figure(data=go.Bar(x=agreement_summary, y=overall_agreements, text=[f"{val:.1%}" for val in overall_agreements], textposition="auto", marker_color=["#9467bd", "#8c564b"]))

fig6.update_layout(title={"text": "Overall Inter-Annotator Agreement Comparison", "x": 0.5, "font": {"size": 16}}, xaxis_title="Dataset", yaxis_title="Overall Agreement Score", yaxis=dict(range=[0, 1]), height=500, width=600)

fig6.show()

6. Overall Agreement Summary


In [26]:
# =============================================================================
# DETAILED INDIVIDUAL PLOTS
# =============================================================================

# 1. Detailed Pairwise Agreement Analysis for Round 1
print("Detailed Pairwise Analysis - Round 1")
print("-" * 40)

pairwise_results_r1 = []
for i, ann1 in enumerate(round_1.columns):
    for j, ann2 in enumerate(round_1.columns):
        if i < j:
            acc = accuracy_score(round_1[ann1], round_1[ann2])
            pairwise_results_r1.append({"Pair": f"{ann1} vs {ann2}", "Accuracy": acc, "Annotator_1": ann1, "Annotator_2": ann2})

pairwise_df_r1 = pd.DataFrame(pairwise_results_r1)
print(pairwise_df_r1)

# Visualize pairwise agreements for Round 1
fig_r1 = px.bar(pairwise_df_r1, x="Pair", y="Accuracy", title="Round 1: Pairwise Accuracy Between Annotators", labels={"Accuracy": "Accuracy Score", "Pair": "Annotator Pairs"}, text="Accuracy")
fig_r1.update_traces(texttemplate="%{text:.3f}", textposition="outside")
fig_r1.update_layout(height=400, xaxis_tickangle=-45)
fig_r1.show()

Detailed Pairwise Analysis - Round 1
----------------------------------------
                         Pair  Accuracy  Annotator_1  Annotator_2
0  avaliador_1 vs avaliador_2       1.0  avaliador_1  avaliador_2
1  avaliador_1 vs avaliador_3       1.0  avaliador_1  avaliador_3
2  avaliador_1 vs avaliador_4       0.0  avaliador_1  avaliador_4
3  avaliador_2 vs avaliador_3       1.0  avaliador_2  avaliador_3
4  avaliador_2 vs avaliador_4       0.0  avaliador_2  avaliador_4
5  avaliador_3 vs avaliador_4       0.0  avaliador_3  avaliador_4


In [27]:
# 2. Detailed Pairwise Agreement Analysis for Round 2
print("\nDetailed Pairwise Analysis - Round 2")
print("-" * 40)

pairwise_results_r2 = []
for i, ann1 in enumerate(round_2.columns):
    for j, ann2 in enumerate(round_2.columns):
        if i < j:
            # Calculate multiple metrics
            kappa = cohen_kappa_score(round_2[ann1], round_2[ann2])

            # Calculate correlation only if there's variation in both annotators
            try:
                pearson_r, _ = pearsonr(round_2[ann1], round_2[ann2])  # Phi coefficient for binary data
                spearman_r, _ = spearmanr(round_2[ann1], round_2[ann2])
            except:
                pearson_r = np.nan
                spearman_r = np.nan

            pairwise_results_r2.append({"Pair": f"{ann1} vs {ann2}", "Cohen_Kappa": kappa, "Phi_Coefficient": pearson_r, "Spearman_r": spearman_r, "Annotator_1": ann1, "Annotator_2": ann2})

pairwise_df_r2 = pd.DataFrame(pairwise_results_r2)
print(pairwise_df_r2.round(3))

# Create multiple visualizations for Round 2
fig_r2_multi = make_subplots(rows=1, cols=3, subplot_titles=["Cohen's Kappa", "Phi Coefficient (Binary)", "Spearman Correlation"], shared_yaxes=True)

# Cohen's Kappa
fig_r2_multi.add_trace(go.Bar(x=pairwise_df_r2["Pair"], y=pairwise_df_r2["Cohen_Kappa"], text=pairwise_df_r2["Cohen_Kappa"].round(3), textposition="auto", name="Cohen's Kappa", marker_color="lightblue"), row=1, col=1)

# Phi Coefficient (filter out NaN)
valid_pearson_df = pairwise_df_r2.dropna(subset=["Phi_Coefficient"])
if not valid_pearson_df.empty:
    fig_r2_multi.add_trace(go.Bar(x=valid_pearson_df["Pair"], y=valid_pearson_df["Phi_Coefficient"], text=valid_pearson_df["Phi_Coefficient"].round(3), textposition="auto", name="Phi Coefficient", marker_color="lightcoral"), row=1, col=2)

# Spearman (filter out NaN)
valid_spearman_df = pairwise_df_r2.dropna(subset=["Spearman_r"])
if not valid_spearman_df.empty:
    fig_r2_multi.add_trace(go.Bar(x=valid_spearman_df["Pair"], y=valid_spearman_df["Spearman_r"], text=valid_spearman_df["Spearman_r"].round(3), textposition="auto", name="Spearman ρ", marker_color="lightgreen"), row=1, col=3)

fig_r2_multi.update_layout(height=500, title={"text": "Round 2: Pairwise Agreement Metrics Between Annotators (Binary Data)", "x": 0.5, "font": {"size": 16}}, showlegend=False)

fig_r2_multi.update_xaxes(tickangle=-45)
fig_r2_multi.show()


Detailed Pairwise Analysis - Round 2
----------------------------------------
                         Pair  Cohen_Kappa  Phi_Coefficient  Spearman_r  \
0  avaliador_1 vs avaliador_2          NaN              NaN         NaN   
1  avaliador_1 vs avaliador_3          NaN              NaN         NaN   
2  avaliador_1 vs avaliador_4          NaN              NaN         NaN   
3  avaliador_2 vs avaliador_3          NaN              NaN         NaN   
4  avaliador_2 vs avaliador_4          NaN              NaN         NaN   
5  avaliador_3 vs avaliador_4          NaN              NaN         NaN   

   Annotator_1  Annotator_2  
0  avaliador_1  avaliador_2  
1  avaliador_1  avaliador_3  
2  avaliador_1  avaliador_4  
3  avaliador_2  avaliador_3  
4  avaliador_2  avaliador_4  
5  avaliador_3  avaliador_4  


In [28]:
# =============================================================================
# SUMMARY AND INTERPRETATION
# =============================================================================

print("=" * 60)
print("INTER-ANNOTATOR AGREEMENT ANALYSIS SUMMARY")
print("=" * 60)

print("\nROUND 1 (Binary Data: 0/1):")
print("-" * 45)
print(f"- Dataset size: {round_1.shape[0]} items, {round_1.shape[1]} annotators")
print(f"- Overall agreement: {overall_agreement_r1:.1%}")
print(f"- Mean pairwise accuracy: {np.mean(accuracy_values):.1%}")

print("\nKey findings (Round 1):")
for col in round_1.columns:
    dist = round_1[col].value_counts().to_dict()
    print(f"  - {col}: {dist}")

print(f"\nROUND 2 (Binary Data: 0/1):")
print("-" * 45)
print(f"- Dataset size: {round_2.shape[0]} items, {round_2.shape[1]} annotators")
print(f"- Overall agreement: {overall_agreement_r2:.1%}")
print(f"- Mean Cohen's Kappa: {np.nanmean([kappa_matrix_r2.iloc[i, j] for i in range(len(kappa_matrix_r2)) for j in range(i + 1, len(kappa_matrix_r2))]):.3f}")

# Check variation in Round 2
print(f"\nAnnotator distribution (Round 2):")
for col in round_2.columns:
    dist = round_2[col].value_counts().to_dict()
    print(f"  - {col}: {dist}")


# Create a final summary visualization
fig_summary = go.Figure()

# Add overall agreement comparison
fig_summary.add_trace(go.Bar(x=["Round 1<br>(Binary)", "Round 2<br>(Binary)"], y=[overall_agreement_r1, overall_agreement_r2], text=[f"{overall_agreement_r1:.1%}", f"{overall_agreement_r2:.1%}"], textposition="auto", marker_color=["#2E86AB", "#A23B72"], name="Overall Agreement"))

fig_summary.update_layout(title={"text": "Overall Inter-Annotator Agreement Comparison", "x": 0.5, "font": {"size": 18}}, xaxis_title="Dataset", yaxis_title="Agreement Score", yaxis=dict(range=[0, 1]), height=400, showlegend=False)

fig_summary.show()

INTER-ANNOTATOR AGREEMENT ANALYSIS SUMMARY

ROUND 1 (Binary Data: 0/1):
---------------------------------------------
- Dataset size: 14 items, 4 annotators
- Overall agreement: 50.0%
- Mean pairwise accuracy: 50.0%

Key findings (Round 1):
  - avaliador_1: {1: 14}
  - avaliador_2: {1: 14}
  - avaliador_3: {1: 14}
  - avaliador_4: {0: 14}

ROUND 2 (Binary Data: 0/1):
---------------------------------------------
- Dataset size: 23 items, 4 annotators
- Overall agreement: 100.0%
- Mean Cohen's Kappa: nan

Annotator distribution (Round 2):
  - avaliador_1: {1: 23}
  - avaliador_2: {1: 23}
  - avaliador_3: {1: 23}
  - avaliador_4: {1: 23}
