In [7]:
import pandas as pd
import numpy as np
import re

# Load the raw table from a relative CSV path. Later cells read and overwrite
# its 'Categories', 'Diseases' and 'Chemicals' columns.
# NOTE(review): presumably one row per abstract (inferred from the file name) - confirm.
df = pd.read_csv('preprocessing/abstracts_with_categories.csv')

In [8]:
# Lookup table that collapses raw disease mentions onto one canonical label.
#
# * Keys are raw terms, values are the canonical label they are replaced with.
# * map_terms() lower-cases every term before the lookup, so only lower-case
#   keys can ever match.
# * Every key must appear exactly once: Python keeps the LAST value of a
#   repeated key in a dict literal without any warning, which previously made
#   "muscle deficiency" (a canonical label) silently remap to "motor deficits".
# * Each disease has a single canonical spelling (e.g. "parkinsons disease",
#   mirroring "alzheimers disease") so counts are not split across variants.
disease_mapping = {
    "violence/sexual abuse": "violence",
    "violent crime": "violence",
    "violent crime offenders without and with mental illness": "violence",
    "violent crimes": "violence",
    "violent loss": "violence",
    "violent losses increase": "violence",
    "violent or disruptive": "violence",
    "type 2 diabetes": "type 2 diabetes",
    "type 2 diabetes mellitus": "type 2 diabetes",
    "type-2 diabetes": "type 2 diabetes",
    "tumor": "tumor",
    "tumor necrosis": "tumor",
    "tumors": "tumor",
    "tumour": "tumor",
    "trauma": "trauma",
    "traumas": "trauma",
    "traumatic": "trauma",
    "traumatic brain injuries": "trauma",
    "traumatic brain injury": "trauma",
    "thrombocytosis": "thrombosis",
    "thromboembolic": "thrombosis",
    "thrombosis": "thrombosis",
    "thrombotic disease": "thrombosis",
    "substance abuse": "substance abuse",
    "substance addiction": "substance abuse",
    "substance-related": "substance abuse",
    "substance-related behaviors": "substance abuse",
    "substance-related disorders": "substance abuse",
    "stroke": "stroke",
    "stroke-like episodes": "stroke",
    "stroke-like migraine": "stroke",
    "stroke-related": "stroke",
    "stroke-related impairments": "stroke",
    "strokes": "stroke",
    "sleep and cognitive symptoms": "sleep disorder",
    "sleep anxiety": "sleep disorder",
    "sleep behavior disorder": "sleep disorder",
    "sleep bruxism": "sleep disorder",
    "sleep deficits": "sleep disorder",
    "sleep difficulties": "sleep disorder",
    "sleep disorder": "sleep disorder",
    "sleep disorders": "sleep disorder",
    "sleep disturbance": "sleep disorder",
    "sleep disturbance anxiety": "sleep disorder",
    "sleep disturbances": "sleep disorder",
    "sleep impairments": "sleep disorder",
    "sleep-disordered": "sleep disorder",
    "sleep-related errors": "sleep disorder",
    "sleep-related leg movements": "sleep disorder",
    "sleep-wake disorders": "sleep disorder",
    "sleepiness": "sleep disorder",
    "sleeping problems": "sleep disorder",
    "seizure": "seizure",
    "seizure-related": "seizure",
    "seizure-triggering": "seizure",
    "seizures": "seizure",
    "seizures/convulsions": "seizure",
    "schizoid personality disorders": "schizoid personality disorder",
    "schizophrenia": "schizophrenia",
    "scz": "schizophrenia",
    "schizophrenia spectrum disorders": "schizophrenia",
    "schizophrenia traits": "schizophrenia",
    "psychiatric": "psychiatric disorder",
    "psychiatric disorder": "psychiatric disorder",
    "psychiatric disorders": "psychiatric disorder",
    "psychiatric illness": "psychiatric disorder",
    "psychological distress": "psychological disorder",
    "psychopathic": "psychopathic disorder",
    "psychopathologies": "psychopathic disorder",
    "psychopathy": "psychopathic disorder",
    "psychoses": "psychotic disorder",
    "psychosis": "psychotic disorder",
    "psychosocial dysfunction": "psychotic disorder",
    "psychotic": "psychotic disorder",
    "psychotic disorder": "psychotic disorder",
    "psychotic disorders": "psychotic disorder",
    "psychotic ideation": "psychotic disorder",
    "psychotic illness": "psychotic disorder",
    "psychotic illnesses": "psychotic disorder",
    "psychotic symptoms": "psychotic disorder",
    "posttraumatic pleiosomnia": "post-traumatic stress disorder",
    "posttraumatic sleep": "post-traumatic stress disorder",
    "posttraumatic stress": "post-traumatic stress disorder",
    "posttraumatic stress disorder": "post-traumatic stress disorder",
    "ptsd": "post-traumatic stress disorder",
    "post-traumatic": "post-traumatic stress disorder",
    "post-traumatic sequelae": "post-traumatic stress disorder",
    "post-traumatic stress": "post-traumatic stress disorder",
    "post-traumatic stress disorder": "post-traumatic stress disorder",
    "post-traumatic stress disorders": "post-traumatic stress disorder",
    "post-traumatic symptoms": "post-traumatic stress disorder",
    "personality disorders": "personality disorder",
    "personality traits": "personality trait",
    # Parkinson's variants all share the canonical label "parkinsons disease"
    # (the same label used for "pd" and "parkinson's disease" further down).
    "parkinson": "parkinsons disease",
    "parkinson disease": "parkinsons disease",
    "parkinson's": "parkinsons disease",
    "parkinson's and alzheimer's disease": "parkinson's disease and alzheimer's disease",
    "parkinson's and alzheimer's diseases": "parkinson's disease and alzheimer's disease",
    "parkinson's disease (pd)-causing mutations": "parkinsons disease",
    "parkinson's disease dementia": "parkinsons disease",
    "parkinson's disease questionnaire-39 (pdq-39)": "parkinsons disease",
    "parkinson's disease rating scale": "parkinsons disease",
    "parkinson's disease-like disorders": "parkinsons disease",
    "parkinson's disease.": "parkinsons disease",
    "parkinson-like disorders": "parkinsons disease",
    "parkinsonian disorders": "parkinsons disease",
    "parkinsonian motor symptoms": "parkinsons disease",
    "parkinsonian syndromes": "parkinsons disease",
    "parkinsonism": "parkinsons disease",
    "panic": "panic attack",
    "panic anxiety": "panic attack",
    "panic disorder": "panic attack",
    "panic attacks": "panic attack",
    "od": "overdose",
    "overdose": "overdose",
    "overdose deaths": "overdose",
    "overdosed": "overdose",
    "overdoses": "overdose",
    "neuropsychiatric": "neuropsychiatric disorder",
    "neuropsychiatric abnormalities": "neuropsychiatric disorder",
    "neuropsychiatric and cognitive deficits": "neuropsychiatric disorder",
    "neuropsychiatric deficits": "neuropsychiatric disorder",
    "neuropsychiatric disease": "neuropsychiatric disorder",
    "neuropsychiatric disorders": "neuropsychiatric disorder",
    "neuropsychiatric syndrome": "neuropsychiatric disorder",
    "neuropsychiatric syndromes": "neuropsychiatric disorder",
    "neurodevelopmental": "neurodevelopmental disorder",
    "neurodevelopmental complications": "neurodevelopmental disorder",
    "neurodevelopmental diseases": "neurodevelopmental disorder",
    "neurodevelopmental disorder": "neurodevelopmental disorder",
    "neurodevelopmental disorders": "neurodevelopmental disorder",
    "neurodevelopmental disturbances": "neurodevelopmental disorder",
    "neurodevelopmental impairment": "neurodevelopmental disorder",
    "neurodevelopmental problems": "neurodevelopmental disorder",
    "neurodevelopmental sequela": "neurodevelopmental disorder",
    "neurodevelopmental sequelae": "neurodevelopmental disorder",
    "neurocognitive decision-making deficits": "neurocognitive disorder",
    "neurocognitive deficits": "neurocognitive disorder",
    "neurocognitive disorder": "neurocognitive disorder",
    "neurocognitive disorders": "neurocognitive disorder",
    "neurocognitive impairment": "neurocognitive disorder",
    "neurocognitive impairments": "neurocognitive disorder",
    "neurocognitive sequelae": "neurocognitive disorder",
    "neurodegeneration": "neurodegenerative disorder",
    "neurodegenerative": "neurodegenerative disorder",
    "neurodegenerative disease": "neurodegenerative disorder",
    "neurodegenerative diseases": "neurodegenerative disorder",
    "neurodegenerative disorder": "neurodegenerative disorder",
    "neurodegenerative disorders": "neurodegenerative disorder",
    "neurodegenerative gi disorders": "neurodegenerative disorder",
    "neurodegenerative lesions": "neurodegenerative disorder",
    "ms": "multiple sclerosis",
    "ms-related disability": "multiple sclerosis",
    "multiple sclerosis": "multiple sclerosis",
    "multiple sclerosis-related symptoms": "multiple sclerosis",
    "multiple sclerosis-specific symptoms": "multiple sclerosis",
    "motor": "motor deficits",
    "motor and sensory dysfunction": "motor deficits",
    "motor coordination impairment": "motor deficits",
    "motor deficits": "motor deficits",
    "muscle atrophy": "muscle deficiency",
    "muscular abnormality": "muscle deficiency",
    "muscular atrophy": "muscle deficiency",
    "muscular dystrophies": "muscle deficiency",
    "muscle cell degeneration": "muscle deficiency",
    "muscle contractions": "muscle deficiency",
    "muscle cramps": "muscle deficiency",
    "muscle degeneration": "muscle deficiency",
    "muscle endurance": "muscle deficiency",
    "muscle loss": "muscle deficiency",
    "muscle mass": "muscle deficiency",
    "muscle of juvenile dermatomyositis": "muscle deficiency",
    "muscle pain": "muscle deficiency",
    "muscle tendons": "muscle deficiency",
    "muscle wasting": "muscle deficiency",
    # Canonical label maps to itself; a second, conflicting
    # "muscle deficiency" -> "motor deficits" entry used to override this one.
    "muscle deficiency": "muscle deficiency",
    "muscle-wasting disease": "muscle deficiency",
    "motor delay": "motor deficits",
    "motor dysfunction": "motor deficits",
    "motor impairments": "motor deficits",
    "motor neuron disease": "motor deficits",
    "movement disorder": "motor deficits",
    "movement disorders": "motor deficits",
    "muscle weakness and atrophy": "motor deficits",
    "mood disorders": "mood disorder",
    "mood disturbance": "mood disorder",
    "mood/anxiety disorders": "mood disorder",
    "mood/apathy": "mood disorder",
    "mitochondrial abnormalities": "mitochondrial disorder",
    "mitochondrial alterations": "mitochondrial disorder",
    "mitochondrial damage": "mitochondrial disorder",
    "mitochondrial disorders": "mitochondrial disorder",
    "mitochondrial dysfunction": "mitochondrial disorder",
    "metabolic disease": "metabolic disorder",
    "metabolic disorders": "metabolic disorder",
    "metabolic disturbance": "metabolic disorder",
    "metabolic disturbances": "metabolic disorder",
    "metabolic dysfunction": "metabolic disorder",
    "metabolic dysfunction-associated steatotic liver disease": "metabolic disorder",
    "metabolic dysfuntion": "metabolic disorder",
    "metabolic syndrome": "metabolic disorder",
    "metabolic syndromes": "metabolic disorder",
    "mental disorder": "mental disorder",
    "mental disorders": "mental disorder",
    "mental disorders face various barriers on the road": "mental disorder",
    "mental health-related disorders": "mental disorder",
    "mental illness": "mental disorder",
    "mental quality of life": "mental disorder",
    "mental retardation": "mental disorder",
    # All memory-related variants share one canonical label.
    "memory defects": "memory deficiency",
    "memory deficit": "memory deficiency",
    "memory deficits": "memory deficiency",
    "memory disorders": "memory deficiency",
    "memory impairments": "memory deficiency",
    "memory loss": "memory deficiency",
    "liver damage": "liver disease",
    "liver disease": "liver disease",
    "liver diseases": "liver disease",
    "liver dysfunction": "liver disease",
    "liver fibrosis": "liver disease",
    "liver injury": "liver disease",
    "ibd": "ibd",
    "inflammatory bowel disease": "ibd",
    "inflammatory bowel disease-like": "ibd",
    "inflammatory bowel disease-like symptoms": "ibd",
    "infection": "infection",
    "infections": "infection",
    "infectious disease": "infection",
    "infectious diseases": "infection",
    "immune disorders": "immune disorder",
    "immune dysfunction": "immune disorder",
    "immune-mediated disease": "immune disorder",
    "immune-mediated diseases": "immune disorder",

    "huntington's disease": "huntington's disease",
    "huntington's disease and neurotraumatic disorders": "huntington's disease",
    "huntington's disease-like": "huntington's disease",
    "huntington's diseases": "huntington's disease",
    "huntington's neurodegenerative diseases": "huntington's disease",
    "hiv (pwh": "hiv",
    "hiv infection": "hiv",
    "hiv mortality and decreased retention": "hiv",
    "hiv serologies": "hiv",
    "hiv-associated ani": "hiv",
    "hiv-associated neurocognitive disorders": "hiv",
    "hiv-associated sensory neuropathy": "hiv",
    "hiv-related": "hiv",
    "heart disease": "heart disease",
    "heart failure": "heart disease",
    "hearing disorders": "hearing impairment",
    "hearing impairment": "hearing impairment",
    "hearing loss": "hearing impairment",
    "hearing loss (<0.05)": "hearing impairment",
    "guillain-barre syndrome": "guillain-barre syndrome",
    "guillain-barrè syndrome": "guillain-barre syndrome",
    "guillain-barré syndrome": "guillain-barre syndrome",
    "glioblastoma": "glioma/glioblastoma",
    "glioblastoma multiforme": "glioma/glioblastoma",
    "glioma": "glioma/glioblastoma",
    "glioma aggressiveness": "glioma/glioblastoma",
    "gliomas": "glioma/glioblastoma",

    "gastroenteritis": "gastrointestinal disorder",
    "gastrointestinal (gi) disorders": "gastrointestinal disorder",
    "gastrointestinal (gi) motility disorders": "gastrointestinal disorder",
    "gastrointestinal (gi) tract within 8": "gastrointestinal disorder",
    "gastrointestinal and musculoskeletal symptoms": "gastrointestinal disorder",
    "gastrointestinal bleeding": "gastrointestinal disorder",
    "gastrointestinal comorbidities": "gastrointestinal disorder",
    "gastrointestinal disorders": "gastrointestinal disorder",
    "gastrointestinal hemorrhage": "gastrointestinal disorder",
    "gastrointestinal tract": "gastrointestinal disorder",

    "gait ataxia": "gait impairment",
    "gait disturbance": "gait impairment",
    "gait disturbances": "gait impairment",
    "gait impairments": "gait impairment",
    "gait instability": "gait impairment",
    "fear-circuit": "fear-related disorder",
    "fear-related disorders": "fear-related disorder",
    "fear/anxiety": "fear-related disorder",
    "extraintestinal": "extraintestinal disease",
    "extraintestinal diseases": "extraintestinal disease",
    "extraintestinal manifestation of": "extraintestinal disease",
    "extraintestinal manifestations": "extraintestinal disease",
    "epilepsies": "epilepsy",
    "epilepsy": "epilepsy",
    "epilepsy syndrome": "epilepsy",
    "epileptic": "epilepsy",
    "epileptic encephalopathies": "epilepsy",
    "epileptic encephalopathy": "epilepsy",
    "epileptic se": "epilepsy",
    "epileptic seizures": "epilepsy",
    "dystonia": "dystonia",
    "dystonic dysarthria": "dystonia",
    "dystonic movements": "dystonia",
    "disordered eating": "eating disorder",
    "disordered eating behaviors-such": "eating disorder",
    "eating disorder": "eating disorder",
    "eating behavior": "eating disorder",
    "eating behaviors": "eating disorder",
    "eating difficulties": "eating disorder",
    "eating disorders": "eating disorder",
    "eating problems": "eating disorder",
    "eating disorder-like behavior": "eating disorder",
    "eating disorder-like behaviors": "eating disorder",
    "eating disorder-like symptoms": "eating disorder",
    "depressed": "depression",
    "depressed/anxious mood": "depression",
    "depression": "depression",
    "depression-like behaviors": "depression",
    "depression/anxiety": "depression",
    "depressive": "depression",
    "depressive disorder": "depression",
    "depressive disorders": "depression",
    "depressive symptoms": "depression",
    "depressive-like behavior": "depression",
    "depressive-like behaviors": "depression",
    "decline in cognitive abilities with age.": "dementia",
    "decline in cognitive function": "dementia",
    "degenerative diseases": "dementia",
    "dementias": "dementia",
    "dementia": "dementia",
    "dementia-like symptoms": "dementia",
    "dementia-like behavior": "dementia",
    "death": "death",
    "deaths": "death",
    "cryptogenic ischemic stroke": "stroke",
    "cryptogenic stroke": "stroke",
    "covid-19 infection": "covid-19",
    "covid-19 infections": "covid-19",
    "covid-19-related discrimination": "covid-19",
    "covid-19-related hypochondriasis": "covid-19",
    "covid-19-related neuropathology": "covid-19",
    "coronavirus disease": "covid-19",
    "coronavirus disease 2019": "covid-19",
    "congenital abnormalities": "congenital anomaly",
    "congenital anomalies": "congenital anomaly",
    "congenital anomaly": "congenital anomaly",
    "congenital disorders": "congenital disorder",
    "comorbid disease": "comorbid disease",
    "comorbid disruptive behavior disorder": "comorbid disease",
    "comorbid eye diseases": "comorbid disease",
    "comorbid genetic disease": "comorbid disease",
    "comorbid sleep-disordered breathing": "comorbid disease",
    "comorbid somatic conditions": "comorbid disease",
    "comorbid somatizing": "comorbid disease",
    "comorbid stressor-related disorders": "comorbid disease",
    "acneiform": "acne",
    "acneiform eruptions": "acne",
    "acute ischemic strokes": "stroke",
    "acute ischemic stroke": "stroke",
    "acute stroke": "stroke",
    "acute myocardial infarction": "heart attack",
    "ad": "alzheimers disease",
    "alzheimers disease": "alzheimers disease",
    "addiction-like behaviors": "addiction",
    "addictive behaviors": "addiction",
    "addiction": "addiction",
    "addictive behavior": "addiction",
    "adhd hyperactive/impulsivity symptoms": "adhd",
    "adhd-hyperactivity/impulsivity": "adhd",
    "adhd-inattention": "adhd",
    "adhd-like behaviors": "adhd",
    "adhd-like symptoms": "adhd",
    "adhd-pi": "adhd",
    "pd": "parkinsons disease",
    "parkinson's disease": "parkinsons disease",
    "parkinsons disease": "parkinsons disease",
    "attention deficit hyperactivity disorder": "adhd",
    # Previously mapped to "aggression", which does not describe this term.
    "attention deficit disorder": "adhd",
    "agoraphobic syndrome": "agoraphobia",
    "agoraphobia": "agoraphobia",
    "alcohol abuse": "alcoholism",
    "alcohol dependence": "alcoholism",
    "alcohol-associated disease": "alcoholism",
    "alcohol-related disorders": "alcoholism",
    "alcohol-related traits": "alcoholism",
    "alzheimer": "alzheimers disease",
    "alzheimer disease": "alzheimers disease",
    "alzheimer disease/frontotemporal": "alzheimers disease",
    "alzheimer's": "alzheimers disease",
    "alzheimer's and parkinson's disease": "alzheimers disease",
    "alzheimer's and parkinson's diseases": "alzheimers disease",
    "alzheimer's and related dementias": "alzheimers disease",
    "alzheimer's disease (ad)-derived scaffolds": "alzheimers disease",
    "alzheimer's disease (ad)-related gene expression.": "alzheimers disease",
    "alzheimer's disease and ischemic stroke": "alzheimers disease",
    "alzheimer's disease and vascular dementia": "alzheimers disease",
    "alzheimer's disease assessment scale-cognition 14 improved by 2.8 pts (<i>p</i> = 0.037)": "alzheimers disease",
    "alzheimer's disease assessment scale-cognitive section": "alzheimers disease",
    "alzheimer's disease clinical trials.": "alzheimers disease",
    "alzheimer's disease dementia": "alzheimers disease",
    "alzheimer's disease drug": "alzheimers disease",
    "alzheimer's disease neuroimaging initiative": "alzheimers disease",
    "alzheimer's disease or chronic traumatic encephalopathy": "alzheimers disease",
    "alzheimer's disease rating scale (behave-ad)": "alzheimers disease",
    "alzheimer's disease rats (ad": "alzheimers disease",
    "alzheimer's intervention.": "alzheimers disease",
    "alzheimer's disease": "alzheimers disease",
    "amnesia": "amnesia",
    "amnestic cognitive impairment": "amnesia",
    "amnestic mci": "amnesia",
    "amnestic mild cognitive impairment": "amnesia",
    "aneurismal subarachnoid hemorrhage": "aSAH",
    "aneurysmal sah": "aSAH",
    "aneurysmal subarachnoid hemorrhage": "aSAH",
    # The mixed-case key only matches callers that do not lower-case first;
    # map_terms() lower-cases, so it relies on the "asah" key below.
    "aSAH": "aSAH",
    "asah": "aSAH",
    "subarachnoid hemorrhage": "aSAH",
    "anxiety": "anxiety",
    "anxiety disorder": "anxiety",
    "anxiety disorder-7": "anxiety",
    "anxiety disorders": "anxiety",
    "anxiety-like behaviors": "anxiety",
    "anxiety/depression": "anxiety",
    "anxious/depressive": "anxiety",
    "apnea": "apnea",
    "apnea-hypopnea": "apnea",
    "apnoea": "apnea",
    "apnoea-hypopnea": "apnea",
    "apnoea-hypopnea syndrome": "apnea",
    "asd": "autism spectrum disorder",
    "asds": "autism spectrum disorder",
    "atherosclerotic": "atherosclerosis",
    "atherosclerotic lesions": "atherosclerosis",
    "atherosclerosis": "atherosclerosis",
    "atherosclerotic plaque": "atherosclerosis",
    "attention deficit": "adhd",
    "attention deficit/hyperactivity disorder": "adhd",
    "attention deficits": "adhd",
    "attention-deficit hyperactivity disorder": "adhd",
    "attention-deficit/hyperactivity disorder": "adhd",
    "attention-seeking": "adhd",
    "attention/memory": "adhd",
    "attentional deficits": "adhd",
    "autism": "autism spectrum disorder",
    "autism spectrum disorder": "autism spectrum disorder",
    "autism spectrum disorders": "autism spectrum disorder",
    "autism-like behaviors": "autism spectrum disorder",
    "autistic": "autism spectrum disorder",
    "autistic spectrum disorder": "autism spectrum disorder",
    "autistic symptoms": "autism spectrum disorder",
    "autistic traits": "autism spectrum disorder",
    "autoimmune disease": "autoimmune disease",
    "autoimmune disease of the central nervous system": "autoimmune disease",
    "autoimmune diseases": "autoimmune disease",
    "autoimmune disorder": "autoimmune disease",
    "autoimmune disorders": "autoimmune disease",
    "autosomal recessive": "autosomal recessive disease",
    "autosomal recessive disorder": "autosomal recessive disease",
    "autosomal recessive juvenile parkinson disease": "autosomal recessive disease",
    "autosomal recessive lethal ciliopathy": "autosomal recessive disease",
    "autosomal recessive movement and vision disorder": "autosomal recessive disease",
    "autosomal recessive neurodegenerative disorder": "autosomal recessive disease",
    "autosomal dominant": "autosomal dominant disease",
    "autosomal dominant disorder": "autosomal dominant disease",
    "autosomal dominant juvenile parkinson disease": "autosomal dominant disease",
    "autosomal dominant movement and vision disorder": "autosomal dominant disease",
    "autosomal dominant neurodegenerative disorder": "autosomal dominant disease",
    "autosomal dominant disease": "autosomal dominant disease",
    "behavior disorders": "behavioral disorder",
    "behavioral & cognitive dysfunction": "behavioral disorder",
    "behavioral abnormalities": "behavioral disorder",
    "behavioral and functional impairments": "behavioral disorder",
    "behavioral deficits": "behavioral disorder",
    "behavioral difficulties": "behavioral disorder",
    "behavioral disorders": "behavioral disorder",
    "behavioral impairments": "behavioral disorder",
    "behavioural abnormalities": "behavioral disorder",
    "behavioural disorders": "behavioral disorder",
    "behavioural impairment": "behavioral disorder",
    "behavioural variant frontotemporal dementia": "behavioral disorder",
    "bipolar": "bipolar disorder",
    "bipolar disorder": "bipolar disorder",
    "bipolar disorders": "bipolar disorder",
    "bipolar i disorder": "bipolar disorder",
    "bipolar ii disorder": "bipolar disorder",
    "brain malignancies": "brain tumor",
    "brain small": "brain tumor",
    "brain tumor": "brain tumor",
    "brain tumors": "brain tumor",
    "cardiovascular disease": "cardiovascular disease",
    "cardiovascular diseases": "cardiovascular disease",
    "cardiovascular disorders": "cardiovascular disease",
    "cardiovascular dysregulation": "cardiovascular disease",
    "ccm": "ccm",
    "ccm lesion": "ccm",
    "ccm lesions": "ccm",
    "cognitive": "cognitive decline",
    "cognitive and affective processes": "cognitive decline",
    "cognitive and behavioral symptoms": "cognitive decline",
    "cognitive and behavioural impairments": "cognitive decline",
    "cognitive and emotional dysregulation": "cognitive decline",
    "cognitive and motor functions": "cognitive decline",
    "cognitive and neuropsychiatric symptoms": "cognitive decline",
    "cognitive and psychiatric": "cognitive decline",
    "cognitive and psychiatric adverse": "cognitive decline",
    "cognitive and psychiatric adverse effects": "cognitive decline",
    "cognitive behavioural": "cognitive decline",
    "cognitive complaints": "cognitive decline",
    "cognitive decline": "cognitive decline",
    "cognitive deficits": "cognitive decline",
    "cognitive deterioration": "cognitive decline",
    "cognitive disengagement syndrome": "cognitive decline",
    "cognitive disorders": "cognitive decline",
    "cognitive dysfunction": "cognitive decline",
    "cognitive failures": "cognitive decline",
    "cognitive impairment": "cognitive decline",
    "cognitive impairments": "cognitive decline",
    "cognitive memory": "cognitive decline",
    "cognitive or psychiatric": "cognitive decline",
    "cognitive sequelae": "cognitive decline",
    "cognitive symptoms": "cognitive decline",
    "cognitive-motor dissociation": "cognitive decline",
}


In [9]:
# Lookup table that collapses raw chemical mentions (abbreviations, element
# symbols, spelling variants) onto one canonical name.
#
# * map_terms() lower-cases every term before the lookup, so a key must be
#   lower-case to match. Mixed-case keys are kept for callers that look terms
#   up without lower-casing, and each has a lower-case twin.
# * Every key appears exactly once: Python silently keeps the last value of a
#   repeated key in a dict literal.
chemical_mapping = {
    "zn": "zinc",
    "zinc": "zinc",
    "zinc improves": "zinc",
    "trp": "tryptophan",
    "trazadone": "trazodone",
    "trazodone": "trazodone",
    "tp5": "thymopoietin",
    "tp": "thymopoietin",
    "thymopoietin": "thymopoietin",
    "tmt": "trimethyltrienolone",
    "tmz": "temozolomide",
    "temozolomide": "temozolomide",
    "thc": "tetrahydrocannabinol",
    "thc cannabis": "tetrahydrocannabinol",
    "cannabis": "tetrahydrocannabinol",
    "cannabinol": "tetrahydrocannabinol",
    "cannabinoids": "tetrahydrocannabinol",
    "cannabinoids improves": "tetrahydrocannabinol",
    "tetrahydropyridine": "tetrahydropyridine",
    "mpt": "tetrahydropyridine",
    "1-methyl-4-phenyl-1,2,3,6-tetrahydropyridine": "tetrahydropyridine",
    # Leading fragment left when the full name above is split on its commas.
    "1-methyl-4-phenyl-1": "tetrahydropyridine",
    "5-HT": "serotonin",
    # Lower-case twin: without it "5-ht" stays unmapped and is later dropped by
    # clean_chemical_list() because it starts with a digit.
    "5-ht": "serotonin",
    "5-hydroxytryptamine": "serotonin",
    "serotonin": "serotonin",
    "serotoninergic": "serotonin",
    "serotoninergic effects": "serotonin",
    "ca": "calcium",
    "cu": "copper",
    "dopamine": "dopamine",
    "da": "dopamine",
    "dmpc": "synthetic phospholipid",
    "2-Dimyristoyl-sn-glycero-3-phosphocholine": "synthetic phospholipid",
    "2-dimyristoyl-sn-glycero-3-phosphocholine": "synthetic phospholipid",
    "fe": "iron",
    "fndc5": "fndc5",
    "fndc5 improves": "fndc5",
    "fnd": "fndc5",
    "hg": "mercury",
    "k": "potassium",
    "mg": "magnesium",
    "na": "sodium",
    "ni": "nickel",
    "pb": "lead",
    "pd": "palladium",
    "pt": "platinum",
    "la": "lanthanum",
    "li": "lithium",
    "mn": "manganese",
    "pgrn": "progranuline",
    "progranuline": "progranuline",
    "progranuline improves": "progranuline",
    "mo": "molybdenum",
    "ptz": "pentylenetetrazol",
    "ros": "reactive oxygen species",
    "ssri": "selective serotonin reuptake inhibitors",
    "ssris": "selective serotonin reuptake inhibitors",
    "stz": "streptozotocin",
    "t3": "triiodothyronine",
}

In [10]:
# Final words that end in "s" but are already singular (or identical in the
# plural); terms ending in one of these are returned unchanged.
_INVARIANT_FINAL_WORDS = frozenset({
    "aids", "caries", "diabetes", "herpes", "measles", "mumps",
    "rabies", "rickets", "scabies", "series", "shingles", "species",
})


def _singularize_term(term):
    """Reduce a plural ending on the last word of `term` to its singular form.

    This is a deliberately conservative heuristic, not a full English
    inflector: when in doubt the term is left as it is, because mangling a
    singular label ("thrombosis" -> "thrombosi") is worse than missing a plural.
    """
    # Very short tokens are typically abbreviations ("ms", "hiv"); leave them.
    if len(term) <= 3:
        return term
    final_word = re.split(r"[\s/\-]+", term)[-1]
    if final_word in _INVARIANT_FINAL_WORDS:
        return term
    # Possessives and Latin/Greek singular endings: crohn's, stress,
    # thrombosis, sclerosis, virus, tinnitus.
    if term.endswith(("'s", "\u2019s", "ss", "is", "us")):
        return term
    # dystrophies -> dystrophy; short words such as "ties" fall through to -s.
    if term.endswith("ies") and len(final_word) > 4:
        return term[:-3] + "y"
    # Plurals formed with "-es": losses, reflexes, rashes, patches.
    # "...aches" (headaches) is the plural of "...ache", so it only loses "s";
    # "...eaches"/"...oaches" (reaches, approaches) are regular "-ch" + "es".
    ache_plural = final_word.endswith("aches") and not final_word.endswith(("eaches", "oaches"))
    if term.endswith(("sses", "xes", "shes", "ches")) and not ache_plural:
        return term[:-2]
    # Everything else: seizures -> seizure, diseases -> disease.
    if term.endswith("s"):
        return term[:-1]
    return term


def singularize_terms(cell):
    """Lower-case, singularize and de-duplicate a delimited list of terms.

    Parameters
    ----------
    cell : str or missing value
        Terms separated by ';' or ','.

    Returns
    -------
    str or float
        The processed terms joined with '; ' in first-seen order, or ``np.nan``
        when `cell` is missing or contains no non-empty term.
    """
    if pd.isna(cell):
        return np.nan
    stripped = (piece.strip().lower() for piece in re.split(r'[;,]', cell))
    # Empty fragments (from ";;" or a trailing separator) are dropped so they
    # cannot show up as blank entries in the joined result.
    singular = (_singularize_term(term) for term in stripped if term)
    # dict.fromkeys removes duplicates while keeping first-seen order.
    unique_terms = list(dict.fromkeys(singular))
    return '; '.join(unique_terms) if unique_terms else np.nan

In [11]:
def clean_semicolon_column(col):
    """Normalise every cell of a delimited-list column.

    Each non-missing cell is split on ';' or ',', its pieces are stripped and
    lower-cased, blank pieces and repeats are discarded (first occurrence
    wins), and the survivors are re-joined with '; '. Missing cells, and cells
    with no non-blank piece, become ``np.nan``.

    Parameters
    ----------
    col : object exposing ``.apply`` (used here with DataFrame columns)

    Returns
    -------
    The result of ``col.apply`` with the per-cell normaliser.
    """
    def _normalise(raw):
        if pd.isna(raw):
            return np.nan
        # A dict keeps insertion order, so it doubles as an ordered set.
        ordered = {}
        for piece in re.split(r'[;,]', raw):
            token = piece.strip().lower()
            if token:
                ordered.setdefault(token, None)
        if not ordered:
            return np.nan
        return '; '.join(ordered)

    return col.apply(_normalise)

In [12]:
def map_terms(cell, mapping):
    """Replace each ';'-separated term in `cell` with its canonical label.

    Terms are stripped and lower-cased before being looked up in `mapping`;
    a term without an entry is kept as-is (in its lower-cased form). Repeated
    labels are collapsed to their first occurrence.

    Parameters
    ----------
    cell : str or missing value
    mapping : dict-like with a ``.get(key, default)`` method

    Returns
    -------
    `cell` itself when it is missing, ``np.nan`` when no non-blank term
    remains, otherwise the labels joined with '; '.
    """
    if pd.isna(cell):
        return cell
    # Dict used as an ordered set of canonical labels.
    canonical = {}
    for piece in cell.split(';'):
        key = piece.strip().lower()
        if not key:
            continue
        canonical.setdefault(mapping.get(key, key), None)
    if not canonical:
        return np.nan
    return '; '.join(canonical)

In [13]:
def clean_chemical_list(cell):
    """Filter a delimited list of chemical names down to plausible entries.

    The cell is split on ';' or ','; each piece is stripped and lower-cased
    and then kept only if it

    * is at least 3 characters long,
    * starts with an alphabetic character, and
    * contains at least one ASCII letter.

    Surviving names are de-duplicated (first occurrence wins) and joined with
    '; '. Missing input, or input with no surviving name, gives ``np.nan``.
    """
    if pd.isna(cell):
        return np.nan
    # Dict used as an ordered set of accepted names.
    accepted = {}
    for raw in re.split(r'[;,]', cell):
        name = raw.strip().lower()
        too_short = len(name) < 3
        if too_short or not name[0].isalpha():
            continue
        # Rejects names made up only of non-ASCII letters, digits or symbols.
        if re.search(r'[a-zA-Z]', name) is None:
            continue
        accepted.setdefault(name, None)
    if not accepted:
        return np.nan
    return '; '.join(accepted)

In [14]:
# --- Apply cleaning ---
# Categories: separator / case / duplicate normalisation only.
df['Categories'] = clean_semicolon_column(df['Categories'])

# Diseases: normalise, map onto canonical labels, then singularize.
df['Diseases'] = (
    clean_semicolon_column(df['Diseases'])
    .apply(lambda cell: map_terms(cell, disease_mapping))
    .apply(singularize_terms)
)

# Chemicals: same steps as Diseases, followed by the chemical-name filter.
df['Chemicals'] = (
    clean_semicolon_column(df['Chemicals'])
    .apply(lambda cell: map_terms(cell, chemical_mapping))
    .apply(singularize_terms)
    .apply(clean_chemical_list)
)

# Keep a row when at least one of Categories / Diseases is present
# (i.e. drop rows where both are empty), then renumber the index.
df_cleaned = df.loc[df['Categories'].notna() | df['Diseases'].notna()].reset_index(drop=True)

# Save cleaned data
df_cleaned.to_csv('preprocessing/cleaned_for_eda.csv', index=False)