In [62]:
import pandas as pd
import json
from collections import defaultdict

# Lista dos arquivos de cada anotador
arquivos = ['cesar.jsonl', 'marilia.jsonl', 'murilo.jsonl', 'natassia.jsonl']
dados = []

# Função para extrair apenas os nomes das labels
def extrair_labels(label_list):
    return [l[2] for l in label_list] if isinstance(label_list, list) else []

# Lê cada .jsonl e guarda os dados
for i, caminho in enumerate(arquivos, 1):
    anotador = f"Anotador_{i}"
    with open(caminho, 'r', encoding='utf-8') as f:
        for linha in f:
            obj = json.loads(linha)
            texto = obj['text'].strip()
            labels = extrair_labels(obj.get('label', []))
            dados.append({'text': texto, 'anotador': anotador, 'labels': labels})

# Converte para DataFrame
df = pd.DataFrame(dados)

# Pivotar: um texto por linha, uma coluna por anotador com as labels extraídas
df_pivot = df.pivot_table(index='text', columns='anotador', values='labels', aggfunc=lambda x: x)

# Corrigir valores NaN (não anotado) para lista vazia
df_pivot = df_pivot.applymap(lambda x: x if isinstance(x, list) else [])

# Visualizar
df_pivot =df_pivot.reset_index()



  df_pivot = df_pivot.applymap(lambda x: x if isinstance(x, list) else [])


In [63]:
df_pivot

anotador,text,Anotador_1,Anotador_2,Anotador_3,Anotador_4
0,## A small molecule chaperone rescues the stab...,"[Method, Method, Mutation, Mutation, Mutation,...","[Gene, Mutation, Mutation, Mutation, Gene, Gen...","[Gene, Mutation]","[Mutation, Mutation, Mutation, Mutation, Mutat..."
1,## Concomitant targeting of BCL2 with venetocl...,"[Gene, Gene, Gene, Gene, Gene, Gene]","[Gene, Gene, Gene, Gene, Gene, Gene, Gene, Gen...","[Gene, Gene, Gene, Gene]","[Mutation, Mutation, Mutation, Mutation]"
2,## Controlling Structure and Dimensions of a D...,"[Measure, Measure, Measure, Measure]","[Gene, Mutation, Mutation, Measure, Mutation, ...",[],[Measure]
3,## Deletion of heat shock protein 60 in adult ...,"[Method, Method]","[Mutation, Mutation, Mutation, Mutation, Mutat...","[Mutation, Gene, Gene, Gene]",[]
4,## Europe PMC Funders Group\n\nAuthor Manuscri...,"[Mutation, Method, Method, Method, Measure, Me...","[Mutation, Gene, Measure, Gene, Gene, Measure,...",[Mutation],[Measure]
5,## Intracerebroventricular enzyme replacement ...,[Measure],"[Gene, Gene, Method, Method, Method, Method, M...",[Gene],"[Mutation, Mutation, Mutation]"
6,## Reversible folding energetics of Yersinia A...,"[Mutation, Method, Method, Method, Measure, Me...","[Mutation, Mutation, Mutation, Mutation, Gene,...","[Mutation, Mutation]",[]
7,## USP33 deubiquitinates PRKN/parkin and antag...,[],"[Gene, Gene, Gene, Gene, Gene, Gene, Gene, Gen...","[Gene, Gene]","[Gene, Gene, Gene, Gene, Gene, Mutation, Mutat..."


In [64]:
pesos = {
    "Measure": 0.5,
    "Gene": 0.1666,
    "Method": 0.1666,
    "Mutation": 0.1666
}

# Limite da moda: label deve aparecer em pelo menos 2 anotadores
limite_moda = 2

# Função que calcula o score considerando a moda e os pesos definidos
def score_com_pesos(row):
    contador = Counter()
    for labels in row:
        if isinstance(labels, list):
            for label in labels:
                if label in pesos:
                    contador[label] += 1
    # Aplica a moda
    labels_validas = [label for label, count in contador.items() if count >= limite_moda]
    # Soma os pesos
    return sum(pesos[label] for label in labels_validas)

# Aplica por linha (por artigo)
df_pivot['score_moda_pesado'] = df_pivot.apply(score_com_pesos, axis=1)

# Visualizar resultado
print(df_pivot[['score_moda_pesado']].value_counts())


score_moda_pesado
0.4998               4
0.9998               3
0.8332               1
Name: count, dtype: int64


In [65]:
df_pivot['model'] = [0, 0, 1, 0, 1, 1, 1, 1 ]
#df_pivot['Real'] = [0, 0, 1, 0, 1, 0, 1, 0 ]

In [68]:
# Se score == 1.0, então binário = 1; senão 0
df_pivot['estimado_binario'] = df_pivot['score_moda_pesado'].apply(lambda x: 1 if x > 0.5 else 0)

from sklearn.metrics import cohen_kappa_score

# Calcular kappa
kappa = cohen_kappa_score(df_pivot['model'], df_pivot['estimado_binario'])

print(f"Cohen's Kappa: {kappa:.4f}")

Cohen's Kappa: 0.7500


In [69]:
df_pivot

anotador,text,Anotador_1,Anotador_2,Anotador_3,Anotador_4,score_moda_pesado,model,estimado_binario
0,## A small molecule chaperone rescues the stab...,"[Method, Method, Mutation, Mutation, Mutation,...","[Gene, Mutation, Mutation, Mutation, Gene, Gen...","[Gene, Mutation]","[Mutation, Mutation, Mutation, Mutation, Mutat...",0.4998,0,0
1,## Concomitant targeting of BCL2 with venetocl...,"[Gene, Gene, Gene, Gene, Gene, Gene]","[Gene, Gene, Gene, Gene, Gene, Gene, Gene, Gen...","[Gene, Gene, Gene, Gene]","[Mutation, Mutation, Mutation, Mutation]",0.4998,0,0
2,## Controlling Structure and Dimensions of a D...,"[Measure, Measure, Measure, Measure]","[Gene, Mutation, Mutation, Measure, Mutation, ...",[],[Measure],0.9998,1,1
3,## Deletion of heat shock protein 60 in adult ...,"[Method, Method]","[Mutation, Mutation, Mutation, Mutation, Mutat...","[Mutation, Gene, Gene, Gene]",[],0.4998,0,0
4,## Europe PMC Funders Group\n\nAuthor Manuscri...,"[Mutation, Method, Method, Method, Measure, Me...","[Mutation, Gene, Measure, Gene, Gene, Measure,...",[Mutation],[Measure],0.9998,1,1
5,## Intracerebroventricular enzyme replacement ...,[Measure],"[Gene, Gene, Method, Method, Method, Method, M...",[Gene],"[Mutation, Mutation, Mutation]",0.9998,1,1
6,## Reversible folding energetics of Yersinia A...,"[Mutation, Method, Method, Method, Measure, Me...","[Mutation, Mutation, Mutation, Mutation, Gene,...","[Mutation, Mutation]",[],0.8332,1,1
7,## USP33 deubiquitinates PRKN/parkin and antag...,[],"[Gene, Gene, Gene, Gene, Gene, Gene, Gene, Gen...","[Gene, Gene]","[Gene, Gene, Gene, Gene, Gene, Mutation, Mutat...",0.4998,1,0
