In [18]:
#!python -m spacy download en_core_web_sm
#!pip install scispacy
#!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_ner_bionlp13cg_md-0.5.4.tar.gz

In [14]:
import os
import re
import shutil
from tqdm import tqdm
import spacy
import os
from pprint import pprint
import pandas as pd

In [5]:
def remove_references_and_save(folder_path, output_folder, keywords=None):
    """Copy every ``.md`` file to ``output_folder``, dropping its references section.

    Each file in ``folder_path`` whose name ends with ``.md`` is read as UTF-8.
    The text is cut at the first case-insensitive occurrence of any keyword and
    the remainder (right-stripped) is written under the same file name in
    ``output_folder``. Files with no keyword match are copied unchanged.

    Args:
        folder_path: Directory containing the source ``.md`` files.
        output_folder: Directory receiving the results; created when missing.
        keywords: Literal heading markers that start the references section.
            Defaults to level-1 and level-2 "References" / "Reference" /
            "Bibliography" Markdown headings. An empty list disables
            truncation, so every file is copied as is.

    Note:
        Keywords are matched as plain substrings, so "# Reference" also
        matches inside e.g. "# Referenced data".
    """
    if keywords is None:
        keywords = ["# References", "# Reference", "# Bibliography", "## References", "## Reference", "## Bibliography" ]

    # exist_ok removes the separate existence check (and its race window).
    os.makedirs(output_folder, exist_ok=True)

    # The pattern depends only on `keywords`, so build it once instead of per
    # file. With no keywords the alternation would be "()", which matches at
    # offset 0 and would truncate every file to nothing -> skip matching.
    heading_re = None
    if keywords:
        pattern = '|'.join(re.escape(k) for k in keywords)
        heading_re = re.compile(f"({pattern})", re.IGNORECASE)

    md_files = [f for f in os.listdir(folder_path) if f.endswith(".md")]

    for filename in tqdm(md_files, desc="Processing files"):
        file_path = os.path.join(folder_path, filename)
        new_file_path = os.path.join(output_folder, filename)
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()

        match = heading_re.search(content) if heading_re is not None else None
        if match is None:
            # Copy the file if no references section was found
            shutil.copy(file_path, new_file_path)
            continue

        # A deeper heading such as "### References" is matched one character
        # late (at "## References"); step back over the extra '#' so no stray
        # hash is left at the end of the kept text.
        cut = match.start()
        while cut > 0 and content[cut - 1] == '#':
            cut -= 1

        with open(new_file_path, 'w', encoding='utf-8') as f:
            f.write(content[:cut].rstrip())

In [22]:
def extract_entities(model, document_text, document_name):
    """Extract the unique entities of interest from one document.

    The pipeline returned by ``model.load()`` gets an ``entity_ruler`` holding
    the rule-based patterns below: an existing ruler is cleared and reused,
    otherwise a new one is inserted before the ``ner`` component. The document
    is then processed and every entity whose label is one of MUTATION,
    MEASURE, METHOD or GENE_OR_GENE_PRODUCT is collected once per distinct
    text, in order of first appearance (the first label seen for a text wins).

    Args:
        model: Object whose ``load()`` returns a spaCy-style pipeline.
        document_text: Full text of the document.
        document_name: Identifier stored under ``"pmid"`` in the result.

    Returns:
        dict with ``"pmid"`` (``document_name``) and the parallel lists
        ``"entity"`` (entity texts) and ``"label"`` (their labels).
    """
    nlp = model.load()

    if "entity_ruler" not in nlp.pipe_names:
        ruler = nlp.add_pipe("entity_ruler", before="ner")
    else:
        # Reuse the existing ruler, but start from an empty pattern set.
        ruler = nlp.get_pipe("entity_ruler")
        ruler.clear()

    custom_labels = {"MUTATION", "MEASURE", "METHOD", 'GENE_OR_GENE_PRODUCT'}

    # --- small builders for token specs and rules ---------------------------
    def lower(word):
        return {"LOWER": word}

    def maybe_lower(word):
        return {"LOWER": word, "OP": "?"}

    def maybe_text(value):
        return {"TEXT": value, "OP": "?"}

    def regex(expression):
        return {"TEXT": {"REGEX": expression}}

    def rule(label_name, *tokens):
        return {"label": label_name, "pattern": list(tokens)}

    # Single-token short mutation notation, used both on its own and after the
    # word "mutation".
    short_mutation = r"^(Δ|[A-Z])[0-9]+(?:[A-Za-z]*|del|ins|dup|fs|X)$"

    patterns = [
        # MUTATION
        rule("MUTATION", regex(short_mutation)),
        rule("MUTATION", lower("mutation"), regex(short_mutation)),
        rule("MUTATION", regex(r"^p\.[A-Z][a-z]{2}[0-9]+(?:[A-Za-z]*|del|ins|dup)")),
        rule("MUTATION", regex(r"^c\.[0-9]+(?:_[0-9]+)?(?:del|ins|dup)[A-Z]*")),

        # MEASURE - differential scanning calorimetry (DSC)
        rule(
            "MEASURE",
            lower("differential"), lower("scanning"), lower("calorimetry"),
            maybe_text("("), maybe_lower("dsc"), maybe_text(")"),
        ),
        rule("MEASURE", maybe_text("("), lower("dsc"), maybe_text(")")),

        # MEASURE - circular dichroism (CD).
        # NOTE(review): this section used to be headed "METHOD - CD" while the
        # rules emit MEASURE; confirm which label is intended.
        rule(
            "MEASURE",
            lower("circular"), lower("dichroism"),
            maybe_text("("), maybe_lower("cd"), maybe_text(")"),
            lower("spectroscopy"),
        ),
        rule("MEASURE", maybe_text("("), lower("cd"), maybe_text(")"), lower("spectroscopy")),
        rule("MEASURE", maybe_text("("), lower("cd"), maybe_text(")")),

        # METHOD - denaturing agents and assays
        rule("METHOD", lower("urea")),
        rule("METHOD", lower("gdn"), lower("hcl")),
        rule("METHOD", lower("guanidine"), lower("hydrochloride")),
        rule("METHOD", lower("thermal"), lower("shift"), lower("assay")),
        rule("METHOD", lower("tsa")),
    ]

    # Register the patterns with the ruler
    ruler.add_patterns(patterns)

    # Run the pipeline
    doc = nlp(document_text)

    # Collect entities: insertion-ordered dict keeps the first label per text.
    label_by_text = {}
    for ent in doc.ents:
        if ent.label_ in custom_labels and ent.text not in label_by_text:
            label_by_text[ent.text] = ent.label_

    entity = list(label_by_text.keys())
    label = list(label_by_text.values())

    # Disabled: regex-based extraction of thermodynamic terms.
    #delta_pattern = re.compile(r'''
    #    (?P<symbol>Δ|delta|Delta)\s*(?P<variable>[GT])|𝚫\s*(?P<variable2>[GT]?)                       
    #    (?:\s*[:=~]\s*(?P<value>[-\d.]+)\s*(?P<unit>kcal/mol|kJ/mol|°C|K)?)?
    #    (?:.*?(Fig(?:ure)?\s*[\dS]+)?)?
    #''', flags=re.IGNORECASE | re.VERBOSE)

    #delta_matches = []
    #for match in delta_pattern.finditer(document_text):
    #    termo = match.group("variable") or match.group("variable2")
    #    if termo:
    #        termo = termo.upper()
    #        valor = match.group("value") or ""
    #        unidade = match.group("unit") or ""
    #        delta_matches.append({
    #            "termo": f"Δ{termo}",
    #            "valor": valor,
    #            "unidade": unidade
    #        })

    result = {
        "pmid": document_name,
        "entity": entity,
        "label": label,
        #"termos_termodinamicos": delta_matches,
        #"tem_termo_termodinamico": "Sim" if delta_matches else "Não"
    }
    return result

class SpacyModel:
    """Lazy, cached loader for a spaCy pipeline.

    ``load()`` used to call ``spacy.load`` on every invocation; because
    ``extract_entities`` calls ``load()`` once per document, the whole model
    was re-read from disk for every file. The pipeline is now loaded on first
    use and the same object is returned afterwards.

    The returned pipeline is shared between calls, so components a caller adds
    to it persist. ``extract_entities`` already copes with that by clearing an
    existing ``entity_ruler`` before adding its patterns.

    Args:
        name: Name of the installed spaCy pipeline package to load.
    """

    # Class-level default so instances created without __init__ still work.
    _nlp = None

    def __init__(self, name="en_ner_bionlp13cg_md"):
        self.name = name
        self._nlp = None

    def _load_pipeline(self):
        """Read the pipeline from disk (the expensive step)."""
        return spacy.load(getattr(self, "name", "en_ner_bionlp13cg_md"))

    def load(self):
        """Return the pipeline, loading it only on the first call."""
        if self._nlp is None:
            self._nlp = self._load_pipeline()
        return self._nlp


In [23]:
# Input folder with the original .md files and the destination for the copies
# written by remove_references_and_save.
source_folder = "../data/md"
# NOTE(review): the extraction cell further down reads "../data/md_cleaned",
# not this "../data/cleaned_md" folder — confirm which name is intended.
output_folder = "../data/cleaned_md"
# Cleaning step, currently disabled; uncomment to regenerate the cleaned files.
#remove_references_and_save(source_folder, output_folder)

In [None]:
# Folder holding the .md files to process.
# NOTE(review): this is "../data/md_cleaned" while the cleaning cell writes to
# `output_folder` ("../data/cleaned_md") — confirm which folder is intended.
caminho_pasta = "../data/md_cleaned"

# Model wrapper passed to extract_entities (which calls model.load()).
model = SpacyModel()

# os.listdir returns names in arbitrary order; sorting makes the processing
# order, and therefore the row order of `df`, reproducible between runs.
arquivos_md = sorted(f for f in os.listdir(caminho_pasta) if f.endswith(".md"))

# One record per (document, entity) pair.
todos_resultados = []

for nome_arquivo in tqdm(arquivos_md, desc="Processando arquivos"):
    caminho_completo = os.path.join(caminho_pasta, nome_arquivo)
    with open(caminho_completo, "r", encoding="utf-8") as f:
        document_text = f.read()

    resultado = extract_entities(model, document_text, nome_arquivo)

    # "entity" and "label" are parallel lists; flatten them into rows.
    todos_resultados.extend(
        {"pmid": resultado["pmid"], "entity": ent, "label": label}
        for ent, label in zip(resultado["entity"], resultado["label"])
    )

# Explicit columns keep the schema stable even when no entity was found at
# all; an empty record list would otherwise give a frame without columns and
# break the later groupby on 'pmid' / 'label'.
df = pd.DataFrame(todos_resultados, columns=["pmid", "entity", "label"])

  deserializers["tokenizer"] = lambda p: self.tokenizer.from_disk(  # type: ignore[union-attr]
Processando arquivos:  17%|███▊                   | 1/6 [00:39<03:15, 39.13s/it]

In [None]:
# Labels the scoring step reads; each must exist as a column even when no
# document contains an entity with that label.
EXPECTED_LABELS = ['GENE_OR_GENE_PRODUCT', 'MEASURE', 'METHOD', 'MUTATION']

# One row per paper, one column per label, each cell the list of entities.
entities_by_label = df.groupby(['pmid', 'label'])['entity'].apply(list).unstack()
entities_by_label = entities_by_label.reindex(
    columns=entities_by_label.columns.union(EXPECTED_LABELS)
)

# Replace missing cells with a fresh empty list per cell.
# (unstack(fill_value=[]) would put the *same* list object in every gap.)
for label_name in entities_by_label.columns:
    entities_by_label[label_name] = entities_by_label[label_name].map(
        lambda cell: cell if isinstance(cell, list) else []
    )

df_grouped = entities_by_label.reset_index()

# Extra per-paper fields (thermodynamics). The extraction step currently does
# not emit 'tem_termo_termodinamico', so only select it when it is present
# instead of failing with a KeyError.
optional_columns = [c for c in ['tem_termo_termodinamico'] if c in df.columns]
df_termos = df[['pmid'] + optional_columns].drop_duplicates()

# Join everything
df_final = pd.merge(df_grouped, df_termos, on='pmid', how='left')

def calc_term_probability(row):
    """Score one paper from the entity labels that were found in it.

    A non-empty METHOD cell adds 0.5; non-empty MUTATION,
    GENE_OR_GENE_PRODUCT and MEASURE cells add 0.166 each. The sum is rounded
    to two decimals.

    Args:
        row: Mapping-like object with a ``get`` method (a ``pandas.Series`` row
            or a ``dict``) holding one entity list per label.

    Returns:
        The rounded score; ``0`` when no label is present.
    """
    # Keys must match the labels produced by the extraction step. The previous
    # 'METODH' and 'GENE' keys never exist as columns and raised KeyError.
    weights = (
        ('METHOD', 0.5),
        ('MUTATION', 0.166),
        ('GENE_OR_GENE_PRODUCT', 0.166),
        ('MEASURE', 0.166),
    )

    def has_entities(value):
        # Missing label: absent key (None) or NaN left by a pivot. NaN is
        # truthy, so it has to be excluded explicitly.
        if value is None:
            return False
        if isinstance(value, float) and value != value:
            return False
        return bool(value)

    score = 0
    for label_name, weight in weights:
        if has_entities(row.get(label_name)):
            score += weight

    return round(score, 2)

# Score every paper row by row and store the result as a new column.
probability_by_row = df_final.apply(calc_term_probability, axis=1)
df_final['probability'] = probability_by_row

In [None]:
# Binary screening flag. The comparison is strict, so a score of exactly 0.5
# gets bin = 0.
df_final['bin'] = (df_final['probability'] > 0.5).astype(int)

# Turn the file name into the bare identifier by removing only a trailing
# ".md" (a plain replace would also drop ".md" in the middle of a name).
# astype(str) keeps the cell re-runnable after 'pmid' was cast to int below.
df_final['pmid'] = df_final['pmid'].astype(str).str.replace(r'\.md$', '', regex=True)

df_final.to_csv('../data/model/papers_screened.csv')

In [None]:
# 'pmid' holds strings at this point (file names without ".md") and is only
# cast to int in a later cell; comparing it with the integer 31605637 would
# select nothing. Comparing as text works in both states.
df_final[df_final['pmid'].astype(str) == '31605637']

### First version joined methodologies

In [None]:
# Load the curated spreadsheet.
# NOTE(review): it is read from 'oficial_data.xlsx' but the result is written
# to 'official_data.xlsx' — confirm the differing file names are intentional.
official_data = pd.read_excel('../data/model/oficial_data.xlsx')

# Both frames need an integer 'pmid' so the join keys compare equal.
official_data['pmid'] = official_data['pmid'].astype(int)
df_final['pmid'] = df_final['pmid'].astype(int)

# Attach the screening flag to every curated row (left join keeps them all).
screening_flags = df_final[['pmid', 'bin']]
official_with_flags = official_data.merge(screening_flags, on='pmid', how='left')
official_with_flags.to_excel('../data/model/official_data.xlsx')