In [43]:
import pandas as pd
import json
from collections import defaultdict
import os
from collections import Counter

In [44]:
# One JSONL export per annotator; the file stem (e.g. "cesar") becomes the annotator name.
arquivos = [
    '../data/checkers/cesar.jsonl',
    '../data/checkers/marilia.jsonl',
    '../data/checkers/murilo.jsonl',
    '../data/checkers/natassia.jsonl'
]


def _linha_para_registro(linha, anotador):
    """Convert one JSONL annotation line into a flat record for the DataFrame.

    Parameters
    ----------
    linha : str
        A JSON object with a 'text' string and a 'label' list of
        ``[start, end, label]`` spans that index into that text.
    anotador : str
        Value stored under the record's "annotator" key.

    Returns
    -------
    dict
        ``{"annotator": ..., "text": ..., <label>: "a, b", ...}``; each label key
        holds the de-duplicated, sorted span texts joined by ", ".

    Raises
    ------
    ValueError
        If a span label would overwrite the "annotator" or "text" key.
    """
    entrada = json.loads(linha)
    # `or` also covers keys that are present but null in the export.
    texto = entrada.get('text') or ''
    spans = entrada.get('label') or []

    # A set per label de-duplicates repeated mentions; dict insertion order keeps
    # the columns in first-seen order.
    entidades = defaultdict(set)
    for start, end, label in spans:
        entidade = texto[start:end].strip()
        if entidade:  # ignore empty / whitespace-only spans
            entidades[label].add(entidade)

    row = {"annotator": anotador, "text": texto}
    for label, valores in entidades.items():
        if label in row:
            raise ValueError(f"label {label!r} collides with a reserved column name")
        row[label] = ', '.join(sorted(valores))
    return row


dados = []
for caminho in arquivos:
    anotador = os.path.basename(caminho).replace('.jsonl', '')  # e.g. "cesar"

    with open(caminho, 'r', encoding='utf-8') as f:
        for linha in f:
            # Blank lines (e.g. a stray empty line at the end) are not valid JSON.
            if not linha.strip():
                continue
            dados.append(_linha_para_registro(linha, anotador))

# One row per (annotator, article); one column per label found in the data.
df = pd.DataFrame(dados)

# Optional export of the combined table:
#df.to_csv('../data/double_checkers.csv')
df

Unnamed: 0,annotator,text,Mutation,Method,Measure,Gene
0,cesar,## Reversible folding energetics of Yersinia A...,"W 42, W42X","GdnHCl, guanidine hydrochloride, urea","CD, circular dichroism",
1,cesar,## Controlling Structure and Dimensions of a D...,,,"CD, circular dichroism",
2,cesar,## Deletion of heat shock protein 60 in adult ...,,"Urea, urea",,
3,cesar,## Europe PMC Funders Group\n\nAuthor Manuscri...,"W 42, W42X","GdnHCl, guanidine hydrochloride, urea","CD, circular dichroism",
4,cesar,## A small molecule chaperone rescues the stab...,P187S,"HDX-MS, Thermal shift assay, urea",,NQO1
5,cesar,## USP33 deubiquitinates PRKN/parkin and antag...,,,,
6,cesar,## Intracerebroventricular enzyme replacement ...,,,gl,
7,cesar,## Concomitant targeting of BCL2 with venetocl...,,,,"EGFR, KRAS, KRASmutant, TP53"
8,marilia,## Reversible folding energetics of Yersinia A...,"W 149, W 149 → Y/F/A, W 42","GdnHCl, GdnHCl (guanidine hydrochloride), SYPR...",,Ail
9,marilia,## Controlling Structure and Dimensions of a D...,"A29V, A29V/A48M, D34S, K35, K35Q, P33A, P33G, ...","SEC, analytical ultracentrifugation",CD,CytR


In [45]:
# Weight each label adds to an article's score once the label is validated.
pesos = {
    "Measure": 0.5,
    "Gene": 0.1666,
    "Method": 0.1666,
    "Mutation": 0.1666
}

# Validation threshold: a label counts towards the score when its count in the
# row reaches this value.
# NOTE(review): this comment used to say "at least 2 annotators", but the value
# is 3 and the default counting below is per annotated span (all annotators
# pooled), not per annotator. Confirm the intended rule; `por_anotador=True`
# switches to counting annotators.
limite_moda = 3


def score_com_pesos(row, por_anotador=False, limite=None, pesos_label=None):
    """Weighted score of one row from the labels that reach the threshold.

    Every list-valued cell in ``row`` is treated as one collection of labels;
    non-list cells are ignored. Labels that are not keys of the weight table
    are ignored as well.

    Parameters
    ----------
    row : iterable
        Cells of one row (e.g. a pandas Series passed by ``DataFrame.apply``
        with ``axis=1``).
    por_anotador : bool, default False
        If False (original behaviour), every occurrence of a label counts, so
        one list containing the label ``limite`` times is enough. If True, a
        label counts at most once per list.
    limite : int, optional
        Minimum count for a label to be validated. Defaults to the module-level
        ``limite_moda``.
    pesos_label : dict, optional
        Mapping label -> weight. Defaults to the module-level ``pesos``.

    Returns
    -------
    int or float
        Sum of the weights of the validated labels (``0`` when none qualify).
    """
    if limite is None:
        limite = limite_moda
    if pesos_label is None:
        pesos_label = pesos

    contador = Counter()
    for celula in row:
        if not isinstance(celula, list):
            continue
        labels = [label for label in celula if label in pesos_label]
        if por_anotador:
            # Order-preserving de-duplication. Must stay a list: Counter.update()
            # on a mapping would add its values instead of counting its keys.
            labels = list(dict.fromkeys(labels))
        contador.update(labels)

    return sum(pesos_label[label] for label, n in contador.items() if n >= limite)

# Aplica por linha (por artigo)
# Score each row of df_pivot (one article per row) with score_com_pesos.
df_pivot['score_moda_pesado'] = df_pivot.apply(score_com_pesos, axis='columns')

# Identifiers assigned by row position, so the list must follow df_pivot's
# current row order and length.
# NOTE(review): presumably PubMed IDs; '31672545' appears twice (positions 4
# and 6) — confirm this is intentional.
pmids_por_linha = [
    '31605637', '31123034', '31557007', '31209364',
    '31672545', '31481471', '31672545', '31432739',
]
df_pivot['pmid'] = pmids_por_linha

# How many rows received each score.
df_pivot[['score_moda_pesado']].value_counts()

score_moda_pesado
0.4998               5
0.6666               1
0.8332               1
0.9998               1
Name: count, dtype: int64

In [46]:
# Display the table after scoring.
# NOTE(review): the saved output of this cell already contains the 'model' and
# 'estimado_binario' columns, which are only assigned in later cells — the
# output looks stale / executed out of order; re-run on a fresh kernel.
df_pivot

anotador,text,Anotador_1,Anotador_2,Anotador_3,Anotador_4,score_moda_pesado,pmid,model,estimado_binario
0,## A small molecule chaperone rescues the stab...,"[Method, Method, Mutation, Mutation, Mutation,...","[Gene, Mutation, Mutation, Mutation, Gene, Gen...","[Gene, Mutation]","[Mutation, Mutation, Mutation, Mutation, Mutat...",0.4998,31605637,0,0
1,## Concomitant targeting of BCL2 with venetocl...,"[Gene, Gene, Gene, Gene, Gene, Gene]","[Gene, Gene, Gene, Gene, Gene, Gene, Gene, Gen...","[Gene, Gene, Gene, Gene]","[Mutation, Mutation, Mutation, Mutation]",0.4998,31123034,0,0
2,## Controlling Structure and Dimensions of a D...,"[Measure, Measure, Measure, Measure]","[Gene, Mutation, Mutation, Measure, Mutation, ...",[],[Measure],0.6666,31557007,1,1
3,## Deletion of heat shock protein 60 in adult ...,"[Method, Method]","[Mutation, Mutation, Mutation, Mutation, Mutat...","[Mutation, Gene, Gene, Gene]",[],0.4998,31209364,0,0
4,## Europe PMC Funders Group\n\nAuthor Manuscri...,"[Mutation, Method, Method, Method, Measure, Me...","[Mutation, Gene, Measure, Gene, Gene, Measure,...",[Mutation],[Measure],0.9998,31672545,1,1
5,## Intracerebroventricular enzyme replacement ...,[Measure],"[Gene, Gene, Method, Method, Method, Method, M...",[Gene],"[Mutation, Mutation, Mutation]",0.4998,31481471,1,1
6,## Reversible folding energetics of Yersinia A...,"[Mutation, Method, Method, Method, Measure, Me...","[Mutation, Mutation, Mutation, Mutation, Gene,...","[Mutation, Mutation]",[],0.8332,31672545,1,1
7,## USP33 deubiquitinates PRKN/parkin and antag...,[],"[Gene, Gene, Gene, Gene, Gene, Gene, Gene, Gen...","[Gene, Gene]","[Gene, Gene, Gene, Gene, Gene, Mutation, Mutat...",0.4998,31432739,0,0


In [47]:
# Hard-coded 0/1 values for the 'model' column, assigned by row position
# (the list must match df_pivot's current row order and length).
# NOTE(review): these equal the commented-out scores below binarised at > 0.5;
# presumably that is where they come from — confirm. The commented 'Real'
# column is not used anywhere in the visible cells.
#df_pivot['model'] =[0.49, 0.33, 0.67, 0.498, 0.83, 0.83, 0.83, 0.33]
df_pivot['model'] = [0,    0,    1,    0,     1,    1,    1,    0]
#df_pivot['Real'] = [0,    0,    1,    0,     1,    0,    1,    0 ]

In [48]:
from sklearn.metrics import cohen_kappa_score

# Binarise the weighted score: 1 when it is strictly greater than the threshold,
# otherwise 0. With the weights in `pesos`, a score of exactly 0.5 (Measure
# alone) and 0.4998 (the three 0.1666 labels together) both map to 0.
limiar_binario = 0.5
df_pivot['estimado_binario'] = (df_pivot['score_moda_pesado'] > limiar_binario).astype(int)

# Cohen's kappa between the 'model' column and the binarised annotator score.
kappa = cohen_kappa_score(df_pivot['model'], df_pivot['estimado_binario'])

print(f"Cohen's Kappa: {kappa:.4f}")

Cohen's Kappa: 0.7500


In [49]:
# Display the table with the score, the 'model' column and the binarised
# estimate side by side.
df_pivot

anotador,text,Anotador_1,Anotador_2,Anotador_3,Anotador_4,score_moda_pesado,pmid,model,estimado_binario
0,## A small molecule chaperone rescues the stab...,"[Method, Method, Mutation, Mutation, Mutation,...","[Gene, Mutation, Mutation, Mutation, Gene, Gen...","[Gene, Mutation]","[Mutation, Mutation, Mutation, Mutation, Mutat...",0.4998,31605637,0,0
1,## Concomitant targeting of BCL2 with venetocl...,"[Gene, Gene, Gene, Gene, Gene, Gene]","[Gene, Gene, Gene, Gene, Gene, Gene, Gene, Gen...","[Gene, Gene, Gene, Gene]","[Mutation, Mutation, Mutation, Mutation]",0.4998,31123034,0,0
2,## Controlling Structure and Dimensions of a D...,"[Measure, Measure, Measure, Measure]","[Gene, Mutation, Mutation, Measure, Mutation, ...",[],[Measure],0.6666,31557007,1,1
3,## Deletion of heat shock protein 60 in adult ...,"[Method, Method]","[Mutation, Mutation, Mutation, Mutation, Mutat...","[Mutation, Gene, Gene, Gene]",[],0.4998,31209364,0,0
4,## Europe PMC Funders Group\n\nAuthor Manuscri...,"[Mutation, Method, Method, Method, Measure, Me...","[Mutation, Gene, Measure, Gene, Gene, Measure,...",[Mutation],[Measure],0.9998,31672545,1,1
5,## Intracerebroventricular enzyme replacement ...,[Measure],"[Gene, Gene, Method, Method, Method, Method, M...",[Gene],"[Mutation, Mutation, Mutation]",0.4998,31481471,1,0
6,## Reversible folding energetics of Yersinia A...,"[Mutation, Method, Method, Method, Measure, Me...","[Mutation, Mutation, Mutation, Mutation, Gene,...","[Mutation, Mutation]",[],0.8332,31672545,1,1
7,## USP33 deubiquitinates PRKN/parkin and antag...,[],"[Gene, Gene, Gene, Gene, Gene, Gene, Gene, Gen...","[Gene, Gene]","[Gene, Gene, Gene, Gene, Gene, Mutation, Mutat...",0.4998,31432739,0,0
