In [8]:
import os
import json
import ast
import re
from typing import List, Tuple, Dict
import pandas as pd

# Placeholder keys (as they appear in the substitutions dictionary) mapped to
# the entity-tag suffix used in the emitted B-/I- labels.
_PLACEHOLDER_TO_LABEL = {
    "<author_clinical_condition>": "CLIN_COND",
    "<author_medical_report>": "MED_REP",
    "<author_genetic>": "GENETIC",
    "<author_fertility>": "FERTILITY",
    "<author_disability>": "DISABILITY",
    "<author_addiction>": "ADDICTION",
}

# Characters removed when they are the last character of a token.
_TRAILING_PUNCT = "?.!:;,()[]'"


def tokenize_and_label(text: str, subs: Dict[str,str]) -> Tuple[List[str], List[str]]:
    """Split ``text`` on whitespace and tag each token with a BIO entity label.

    Every value in ``subs`` whose key is a known placeholder is searched for in
    ``text`` (case-insensitively, as a literal substring, non-overlapping
    occurrences). The characters of each occurrence are marked ``B-<LABEL>`` /
    ``I-<LABEL>``; tokens then inherit the label found at their first
    character, or the first ``B-`` label inside the token when the token
    starts outside an entity (e.g. ``"(asthma"``).

    Args:
        text: The text to tokenize.
        subs: Mapping from placeholder (e.g. ``"<author_genetic>"``) to the
            real string that replaced it in ``text``. Unknown placeholders and
            empty / whitespace-only values are ignored. Values are converted
            with ``str()`` before matching.

    Returns:
        A ``(tokens, labels)`` pair of equal-length lists. A single trailing
        punctuation character is removed from each token, unless the token
        consists of that one character only.
    """
    char_labels = ["O"] * len(text)

    for ph, real in subs.items():
        lbl = _PLACEHOLDER_TO_LABEL.get(ph)
        if not lbl:
            continue
        real = str(real)
        if not real.strip():
            continue

        # Match against the ORIGINAL text rather than text.lower(): lowercasing
        # can change the string length for some characters (e.g. "İ"), which
        # would shift every following index away from char_labels.
        for m in re.finditer(re.escape(real), text, flags=re.IGNORECASE):
            start, end = m.span()
            char_labels[start:end] = ["I-" + lbl] * (end - start)
            char_labels[start] = "B-" + lbl

    tokens, labels = [], []
    for m in re.finditer(r"\S+", text):
        tok = m.group()
        start = m.start()
        lbl = char_labels[start]

        # Token starts outside an entity but an entity begins inside it
        # (leading punctuation, or the entity is a suffix of the word).
        if lbl == "O":
            for i in range(start, m.end()):
                if char_labels[i].startswith("B-"):
                    lbl = char_labels[i]
                    break

        # An "I-" token not preceded by the same "I-" label is the first token
        # of its entity (e.g. the matched value began with whitespace).
        if lbl.startswith("I-") and (start == 0 or char_labels[start-1] != lbl):
            lbl = "B-" + lbl.split("-",1)[1]

        # Strip one trailing punctuation character, but never reduce a token
        # to the empty string (a free-standing "." stays ".").
        if len(tok) > 1 and tok[-1] in _TRAILING_PUNCT:
            tok = tok[:-1]

        tokens.append(tok)
        labels.append(lbl)

    return tokens, labels

def save_to_json(records: List[Dict], filename: str):
    """Serialize ``records`` to ``filename`` as UTF-8 JSON inside a dataset envelope.

    The records are nested under
    ``DatasetDict -> medical_consultations -> Dataset`` together with the
    feature names and the row count. The file is written with a two-space
    indent and non-ASCII characters are kept as-is (not escaped).

    Args:
        records: The rows to store under the ``"data"`` key.
        filename: Path of the JSON file to create or overwrite.
    """
    dataset_body = {
        "features": ["tokens", "ent_tags"],
        "num_rows": len(records),
        "data": records,
    }
    payload = {"DatasetDict": {"medical_consultations": {"Dataset": dataset_body}}}

    with open(filename, mode='w', encoding='utf-8') as out_file:
        json.dump(payload, out_file, ensure_ascii=False, indent=2)

In [9]:
# Build the NER dataset: one {"tokens", "ent_tags"} record per usable CSV row.
aggregated_records = []

df = pd.read_csv('/kaggle/input/dataset-training-ner/dataset_with_only_medical_pii.csv')

# Upper bound on the number of rows processed. The loop is additionally capped
# by the actual frame length so a shorter CSV does not raise a KeyError.
MAX_ROWS = 1002

for i in range(min(MAX_ROWS, len(df))):
    final_text = df['final_text'].iloc[i]
    # pandas represents a missing cell as float NaN; such rows are skipped.
    if isinstance(final_text, float):
        print("final_text è float. È NaN?", pd.isna(final_text))
        continue

    substitutions_dictionary = df['substitutions_dictionary'].iloc[i]

    # The dictionary is stored as text: try a Python literal first, then JSON.
    try:
        subs = ast.literal_eval(substitutions_dictionary)
    except Exception:
        try:
            subs = json.loads(substitutions_dictionary)
        except Exception as e2:
            # Chain explicitly so the underlying parse error stays visible.
            raise ValueError("Impossibile convertire la stringa in dict") from e2

    # Both parsers can return non-dict values (list, number, ...); reject them
    # here instead of failing later on subs.items().
    if not isinstance(subs, dict):
        raise ValueError("Impossibile convertire la stringa in dict")

    tokens, labels = tokenize_and_label(final_text, subs)

    aggregated_records.append({
        "tokens": tokens,
        "ent_tags": labels
    })

save_to_json(aggregated_records, "medical_dataset_NER.json")
print("Dataset per NER creato con successo.")

Dataset per NER creato con successo.
