In [18]:
#!python -m spacy download en_core_web_sm
#!pip install scispacy
#!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_ner_bionlp13cg_md-0.5.4.tar.gz

In [13]:
import os
import re
import shutil
from tqdm import tqdm
import spacy
import os
from pprint import pprint
import pandas as pd

In [5]:
def remove_references_and_save(folder_path, output_folder, keywords=None):
    """Write a copy of every ``.md`` file in ``folder_path`` without its references section.

    Each file is searched (case-insensitively, anywhere in the text) for the
    first occurrence of any keyword. Everything from that point on is dropped
    and the remaining text, with trailing whitespace removed, is written to
    ``output_folder`` under the same file name. Files without a match are
    copied unchanged.

    Args:
        folder_path: Directory containing the source ``.md`` files.
        output_folder: Directory that receives the processed files; created
            when it does not exist.
        keywords: Iterable of literal heading strings that mark the start of
            the references section. Defaults to level-1 and level-2 markdown
            headings for "References", "Reference" and "Bibliography". An
            empty iterable disables truncation, so files are only copied.
    """
    if keywords is None:
        keywords = ["# References", "# Reference", "# Bibliography", "## References", "## Reference", "## Bibliography" ]

    # Ensure the output folder exists
    os.makedirs(output_folder, exist_ok=True)

    # The pattern does not depend on the file, so build it once. Empty
    # keywords are ignored: an empty regex matches at offset 0 and would
    # blank out every file.
    alternatives = [re.escape(k) for k in keywords if k]
    heading_re = re.compile("|".join(alternatives), re.IGNORECASE) if alternatives else None

    md_files = [f for f in os.listdir(folder_path) if f.endswith(".md")]

    for filename in tqdm(md_files, desc="Processing files"):
        file_path = os.path.join(folder_path, filename)
        new_file_path = os.path.join(output_folder, filename)
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()

        match = heading_re.search(content) if heading_re is not None else None
        if match is None:
            # Copy the file if no references section was found
            shutil.copy(file_path, new_file_path)
            continue

        # A deeper heading such as "### References" matches from its second
        # "#"; step back over the remaining markers so no stray "#" is left
        # at the end of the kept text.
        cut = match.start()
        while cut > 0 and content[cut - 1] == "#":
            cut -= 1

        # Save the truncated content to the output folder
        with open(new_file_path, 'w', encoding='utf-8') as f:
            f.write(content[:cut].rstrip())

In [22]:
import os
import re
from tqdm import tqdm
import spacy
import pandas as pd

# Load your NCBI gene list
def load_ncbi_gene_names(filepath):
    """Read a tab-separated gene table and return its upper-cased symbols.

    Lines starting with ``#`` are skipped, as are lines with fewer than three
    tab-separated fields or an empty third field. The third field (index 2)
    is taken as the symbol; presumably this is the ``Symbol`` column of an
    NCBI ``gene_info`` file, so verify that against the file you pass in.

    Args:
        filepath: Path of the UTF-8 encoded, tab-separated file.

    Returns:
        A ``set`` of stripped, upper-cased symbols.
    """
    def symbol_of(record):
        # Third column of the record, or "" when the record is too short.
        fields = record.strip().split("\t")
        return fields[2].strip() if len(fields) >= 3 else ""

    with open(filepath, "r", encoding="utf-8") as handle:
        symbols = (symbol_of(row) for row in handle if not row.startswith("#"))
        return {name.upper() for name in symbols if name}

# Heuristic used to discard false positives
def is_valid_gene(ent):
    """Return ``True`` when ``ent`` passes every surface-form check for a gene mention.

    A string is rejected when it contains a comma, ends with an upper-case
    ``M``, contains a square bracket, a dot, a hyphen or a space, or is at
    most two characters long.

    NOTE(review): the trailing-``M`` rule also rejects symbols such as "ATM";
    confirm that this is intended.

    Args:
        ent: Entity text exactly as extracted.

    Returns:
        ``bool`` - ``False`` as soon as any rejection rule applies.
    """
    has_comma_or_trailing_m = re.search(r"[,]|M$", ent) is not None
    has_bracket = "[" in ent or "]" in ent
    # The single "X." form escapes this rule but is then caught by the length
    # rule below, so in practice any dot leads to rejection.
    has_disallowed_dot = "." in ent and not re.fullmatch(r"[A-Z]\.", ent)
    has_hyphen_or_space = "-" in ent or " " in ent
    too_short = len(ent) <= 2

    return not any((
        has_comma_or_trailing_m,
        has_bracket,
        has_disallowed_dot,
        has_hyphen_or_space,
        too_short,
    ))

# Lower-cased entity strings that are always discarded for gene hits; the main
# loop compares them against the stripped, lower-cased entity text.
BLACKLIST = {
    "28,880 m",
    "bldg. 3",
    "e. coli",
    "foma",
    "gdn",
    "gdn hcl",
    "gndhcl",
    "omp",
    "omps",
    "pagp",
    "s2",
    "solvent-membrane",
    "transmembrane α -helices",
    "trp",
    "tyr",
    "vdac",
    "yop",
}

# Checks whether a gene is in the NCBI list
def is_known_gene(ent, gene_names=None):
    """Return ``True`` when ``ent`` is one of the known gene symbols.

    The lookup is case-insensitive and ignores surrounding whitespace: the
    entity is stripped and upper-cased before the membership test, so the
    reference collection is expected to hold upper-cased symbols.

    Args:
        ent: Entity text to look up.
        gene_names: Optional collection of upper-cased symbols. When omitted,
            the notebook-level ``valid_genes`` is used, which must have been
            defined before the call (otherwise a ``NameError`` is raised).

    Returns:
        ``bool`` membership result.
    """
    if gene_names is None:
        # Resolved at call time so the global may be defined in a later cell.
        gene_names = valid_genes
    return ent.strip().upper() in gene_names

# Extraction function: NER with scispaCy plus an entity ruler
def extract_entities(model, document_text, document_name):
    """Run the pipeline from ``model.load()`` on one document and collect its entities.

    An ``entity_ruler`` component is added after ``ner`` (or, when one is
    already present, emptied and reused) and filled with token patterns for
    the labels MUTATION, METHOD and MEASURE. From the resulting entities only
    those labelled MUTATION, METHOD, MEASURE or GENE_OR_GENE_PRODUCT are kept,
    and only the first occurrence of each text (compared case-insensitively).

    Args:
        model: Object whose ``load()`` returns a spaCy-style pipeline.
        document_text: Text to analyse.
        document_name: Identifier stored unchanged under the ``"pmid"`` key.

    Returns:
        ``dict`` with ``"pmid"`` plus two parallel lists, ``"entity"`` (texts)
        and ``"label"`` (labels), in order of first appearance.
    """
    pipeline = model.load()

    # Reuse an existing ruler after emptying it; otherwise insert a new one.
    if "entity_ruler" not in pipeline.pipe_names:
        entity_ruler = pipeline.add_pipe("entity_ruler", after="ner")
    else:
        entity_ruler = pipeline.get_pipe("entity_ruler")
        entity_ruler.clear()

    def regex_token(expression):
        # Single-token spec matched by regular expression on the token text.
        return {"TEXT": {"REGEX": expression}}

    def phrase(tag, *words):
        # Multi-token spec matching the given words case-insensitively.
        return {"label": tag, "pattern": [{"LOWER": word} for word in words]}

    mutation_token_regex = r"^(Δ|[A-Z])[0-9]+(?:[A-Za-z]*|del|ins|dup|fs|X)$"

    rules = [
        # MUTATION patterns
        {"label": "MUTATION", "pattern": [regex_token(mutation_token_regex)]},
        {"label": "MUTATION", "pattern": [{"LOWER": "mutation"}, regex_token(mutation_token_regex)]},
        {"label": "MUTATION", "pattern": [regex_token(r"^p\.[A-Z][a-z]{2}[0-9]+(?:[A-Za-z]*|del|ins|dup)")]},
        {"label": "MUTATION", "pattern": [regex_token(r"^c\.[0-9]+(?:_[0-9]+)?(?:del|ins|dup)[A-Z]*")]},

        # METHOD patterns
        phrase("METHOD", "urea"),
        phrase("METHOD", "gdn", "hcl"),
        phrase("METHOD", "guanidine", "hydrochloride"),
        phrase("METHOD", "thermal", "shift", "assay"),
        phrase("METHOD", "tsa"),

        # MEASURE patterns
        phrase("MEASURE", "differential", "scanning", "calorimetry"),
        phrase("MEASURE", "dsc"),
        phrase("MEASURE", "circular", "dichroism", "spectroscopy"),
        phrase("MEASURE", "cd", "spectroscopy"),
        phrase("MEASURE", "cd"),
    ]
    entity_ruler.add_patterns(rules)

    parsed = pipeline(document_text)

    wanted_labels = {"MUTATION", "METHOD", "MEASURE", "GENE_OR_GENE_PRODUCT"}
    # Lower-cased text -> (text, label) of its first occurrence; dict order
    # preserves the order in which entities were first seen.
    first_hits = {}
    for span in parsed.ents:
        if span.label_ not in wanted_labels:
            continue
        key = span.text.lower()
        if key not in first_hits:
            first_hits[key] = (span.text, span.label_)

    return {
        "pmid": document_name,
        "entity": [text for text, _ in first_hits.values()],
        "label": [tag for _, tag in first_hits.values()],
    }

class SpacyModel:
    """Lazy, caching loader for a spaCy pipeline.

    ``extract_entities`` calls ``load()`` once per document; caching the
    loaded pipeline means the model is read from disk only once per instance
    instead of once per file. ``extract_entities`` already copes with a
    reused pipeline by clearing an existing ``entity_ruler`` before adding
    its patterns.

    Args:
        model_name: Name of the installed spaCy model to load. Defaults to
            ``"en_ner_bionlp13cg_md"``.
    """

    def __init__(self, model_name="en_ner_bionlp13cg_md"):
        self.model_name = model_name
        # Filled in by the first load() call and returned by all later ones.
        self._nlp = None

    def load(self):
        """Return the pipeline, loading it on first use only."""
        if self._nlp is None:
            self._nlp = spacy.load(self.model_name)
        return self._nlp

In [23]:
# Folder holding the raw markdown files and folder that receives the copies
# with the references section removed.
# NOTE(review): the main extraction cell reads from "data/md_cleaned", not from
# this output folder; confirm which path is intended.
source_folder = "../data/md"
output_folder = "../data/cleaned_md"
# Preprocessing step, currently disabled; uncomment to regenerate the cleaned files.
#remove_references_and_save(source_folder, output_folder)

In [None]:
# --- MAIN EXECUTION ---
# Extracts entities from every cleaned markdown file, filters gene hits, and
# writes one CSV row per document with one column per entity label.

# Load the NCBI Gene list (before the loop)
ncbi_gene_path = "data/Homo_sapiens.gene_info"  # Adjust the path to match your file
valid_genes = load_ncbi_gene_names(ncbi_gene_path)

# NOTE(review): the preprocessing cell writes to "../data/cleaned_md";
# confirm that this folder is the one it produces.
caminho_pasta = "data/md_cleaned"
model = SpacyModel()
todos_resultados = []
arquivos_md = [f for f in os.listdir(caminho_pasta) if f.endswith(".md")]

for nome_arquivo in tqdm(arquivos_md, desc="Processando arquivos"):
    caminho_completo = os.path.join(caminho_pasta, nome_arquivo)
    with open(caminho_completo, "r", encoding="utf-8") as f:
        document_text = f.read()

    # The file name (including ".md") is what ends up in the "pmid" column.
    resultado = extract_entities(model, document_text, nome_arquivo)

    # The extra filter applies *only* to genes; other labels are accepted as-is
    for ent, label in zip(resultado["entity"], resultado["label"]):
        ent_clean = ent.strip().lower()
        if label == "GENE_OR_GENE_PRODUCT":
            if ent_clean in BLACKLIST or not is_valid_gene(ent) or not is_known_gene(ent):
                continue  # Skip false positive
        # MUTATION, METHOD and MEASURE are stored directly, without extra filtering
        todos_resultados.append({
            "pmid": resultado["pmid"],
            "entity": ent,
            "label": label,
        })

# Explicit columns keep the frame well-formed even when nothing was extracted.
df = pd.DataFrame(todos_resultados, columns=["pmid", "entity", "label"])

if df.empty:
    # No entities at all (for example, no .md files in the folder): there is
    # nothing to group or pivot, so produce a frame with just the key column
    # instead of failing on the missing 'pmid'/'label' columns.
    df_wide = pd.DataFrame(columns=["pmid"])
else:
    # Join the distinct entities of each label into one sorted,
    # comma-separated string per pmid, with one column per label.
    df_wide = df.groupby(['pmid', 'label'])['entity'] \
        .apply(lambda x: ', '.join(sorted(set(x)))) \
        .unstack(fill_value='')

    # Turn the 'pmid' index back into a regular column
    df_wide = df_wide.reset_index()

# Save the new CSV
df_wide.to_csv('result_model.csv', index=False)

  deserializers["tokenizer"] = lambda p: self.tokenizer.from_disk(  # type: ignore[union-attr]
Processando arquivos:  17%|███▊                   | 1/6 [00:39<03:15, 39.13s/it]

In [14]:
# Reload the aggregated per-document results from disk and display them.
# NOTE(review): the extraction cell writes 'result_model.csv' relative to the
# working directory with index=False, while this reads '../data/result_model.csv'
# and the displayed table has an 'Unnamed: 0' column; confirm this is the same,
# up-to-date file.
df_wide = pd.read_csv('../data/result_model.csv')
df_wide

Unnamed: 0,pmid,GENE_OR_GENE_PRODUCT,MEASURE,METHOD,MUTATION
0,31123034.md,"ASXL1, BAX, BCL2, BCL2L11, CD34, CDKN2A, CEBPA...",,,"C36950, G1, S1, S10, S2, S3, S4, S5A, S5B, S5C..."
1,31209364.md,"ALDH2, BAX, BCL2, Bckdhb, CPT2, GAPDH, HTRA2, ...",,Urea,"B15001, C18, L2080, M36008, S1, S11, S1A, S1B,..."
2,31432739.md,"CANX, DUSP1, DUSP2, GAPDH, LMNB1, PINK1, PTEN,...",,,"A0149, C0221A, C10102, C10412, C194S, C2211, C..."
3,31481471.md,"GFAP, GLB1, LAMP2, Neu1",CD,,"A30052, C18, G2Si, M6876, Q255H, R351, S1, S2,..."
4,31557007.md,,CD,,"A29V, D34S, K35Q, P33, P33A, R28Q, R43E, R43N,..."
5,31605637.md,NQO1,,"Thermal shift assay, urea","A5, C609, H15, H2O, I3792, P11, P187, P187S, P..."
6,31672545.md,,CD,urea,"S1, S11, S12, S13, S14, S2, S3, S4, S5, S6, S6..."


In [16]:
def calc_term_probability(row):
    """Score one document row by which entity-label columns are populated.

    A populated ``MEASURE`` column adds 0.5; populated ``MUTATION``,
    ``GENE_OR_GENE_PRODUCT`` and ``METHOD`` columns add 0.166 each, so the
    maximum score is 0.998. A column counts as populated when it exists in
    the row, is not missing (``None``/NaN) and, for strings, is not blank.
    This makes the function work both on frames re-read from CSV (empty
    cells become NaN) and on the in-memory pivot (empty cells are ``''``),
    and on frames where a label never occurred and its column is absent.

    Args:
        row: Mapping-like row (e.g. a ``pandas.Series``) supporting ``.get``.

    Returns:
        The summed weights rounded to three decimals.
    """
    weights = (
        ('MEASURE', 0.5),
        ('MUTATION', 0.166),
        ('GENE_OR_GENE_PRODUCT', 0.166),
        ('METHOD', 0.166),
    )

    score = 0
    for column, weight in weights:
        value = row.get(column)  # None when the column does not exist
        if isinstance(value, str):
            populated = bool(value.strip())
        else:
            populated = value is not None and pd.notna(value)
        if populated:
            score += weight

    return round(score, 3)

# Score each row (axis=1) and store the result in a new 'probability' column.
df_wide['probability'] = df_wide.apply(calc_term_probability, axis=1)

# Display the scored table.
df_wide

Unnamed: 0,pmid,GENE_OR_GENE_PRODUCT,MEASURE,METHOD,MUTATION,probability
0,31123034.md,"ASXL1, BAX, BCL2, BCL2L11, CD34, CDKN2A, CEBPA...",,,"C36950, G1, S1, S10, S2, S3, S4, S5A, S5B, S5C...",0.332
1,31209364.md,"ALDH2, BAX, BCL2, Bckdhb, CPT2, GAPDH, HTRA2, ...",,Urea,"B15001, C18, L2080, M36008, S1, S11, S1A, S1B,...",0.498
2,31432739.md,"CANX, DUSP1, DUSP2, GAPDH, LMNB1, PINK1, PTEN,...",,,"A0149, C0221A, C10102, C10412, C194S, C2211, C...",0.332
3,31481471.md,"GFAP, GLB1, LAMP2, Neu1",CD,,"A30052, C18, G2Si, M6876, Q255H, R351, S1, S2,...",0.832
4,31557007.md,,CD,,"A29V, D34S, K35Q, P33, P33A, R28Q, R43E, R43N,...",0.666
5,31605637.md,NQO1,,"Thermal shift assay, urea","A5, C609, H15, H2O, I3792, P11, P187, P187S, P...",0.498
6,31672545.md,,CD,urea,"S1, S11, S12, S13, S14, S2, S3, S4, S5, S6, S6...",0.832
