In [1]:
!pip install pandas biopython requests pypdb


Collecting biopython
  Downloading biopython-1.85-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Collecting pypdb
  Downloading pypdb-2.4-py3-none-any.whl.metadata (3.0 kB)
Downloading biopython-1.85-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m23.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pypdb-2.4-py3-none-any.whl (40 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.6/40.6 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: biopython, pypdb
Successfully installed biopython-1.85 pypdb-2.4


### Kinase information

http://www.kinhub.org/kinases.html

In [2]:
from Bio import SwissProt
from Bio import PDB
from urllib.request import urlopen
from io import StringIO
import pandas as pd
import tqdm
import time

def get_pdb_ids_from_uniprot(uniprot_id):
    """Return the PDB identifiers cross-referenced by a UniProtKB entry.

    Downloads the flat-text record from the UniProt REST API and collects the
    accession from every ``DR   PDB; <ID>; ...`` cross-reference line.

    Args:
        uniprot_id: UniProtKB accession used to build the request URL.

    Returns:
        List of PDB IDs in the order they appear in the record; empty when the
        entry has no PDB cross-reference lines.

    Raises:
        Whatever ``urlopen`` raises when the request fails (the error is not
        caught here).
    """
    url = f"https://rest.uniprot.org/uniprotkb/{uniprot_id}.txt"
    # The context manager guarantees the HTTP connection is released.
    with urlopen(url) as response:
        record_lines = response.read().decode("utf-8").splitlines()
    time.sleep(0.1)  # short pause between consecutive requests

    pdb_ids = []
    for line in record_lines:
        fields = line.split()
        # Cross-reference lines look like:
        #   "DR   PDB; 1ABC; X-ray; 2.00 A; A=1-100."
        # Requiring the exact "DR" / "PDB;" prefix keeps unrelated lines that
        # merely mention "PDB;" from being counted.
        if len(fields) >= 3 and fields[0] == "DR" and fields[1] == "PDB;":
            pdb_ids.append(fields[2].rstrip(";"))
    return pdb_ids

if __name__ == "__main__":
    # Kinase table; the loop below reads its 'UniprotID' column.
    df = pd.read_csv('./DrugDiscovery/KinaseData.csv')
    structure_info_list = []

    for index, row in tqdm.tqdm(df.iterrows(), total=len(df)):
        uniprot_id = row['UniprotID']
        try:
            pdb_ids = get_pdb_ids_from_uniprot(uniprot_id)
        except Exception as exc:
            # Best effort: a failed lookup is recorded as "no structure", but
            # it is reported so it can be told apart from a genuine absence.
            # `except Exception` (not a bare except) lets Ctrl-C interrupt the loop.
            tqdm.tqdm.write(f"UniProt lookup failed for {uniprot_id}: {exc}")
            pdb_ids = []

        # True when at least one PDB cross-reference was found.
        structure_info_list.append(len(pdb_ids) > 0)

    df['Structure Info'] = structure_info_list
    df.to_csv('./KinaseData_StructureInfo.csv', index=False)
    print(df)

FileNotFoundError: [Errno 2] No such file or directory: './DrugDiscovery/KinaseData.csv'

### Note from PubTator3: "In order not to overload the PubTator3 server, we ask that users post no more than three requests per second."

In [None]:
import pandas as pd
import requests
import json
import time
import tqdm

def get_disease_info_from_pubtator(gene):
    """Query the PubTator3 relations API for diseases related to a gene.

    Requests relations of type ``stimulate`` between ``@GENE_<gene>`` and
    disease entities, then extracts a readable disease name from the
    ``source`` field of every returned item.

    Args:
        gene: Gene symbol inserted into the ``@GENE_...`` entity identifier.

    Returns:
        Alphabetically sorted list of unique disease names, with the
        ``@DISEASE_`` prefix removed and underscores replaced by spaces.

    Raises:
        requests.RequestException: on connection problems, timeouts or a
            non-success HTTP status.
    """
    # Request URL
    url = "https://www.ncbi.nlm.nih.gov/research/pubtator3-api/relations?e1=@GENE_%s&type=stimulate&e2=Disease"%(gene)
    try:
        # A timeout keeps one stalled request from hanging the whole loop.
        response = requests.get(url, timeout=30)
    finally:
        # Stay below three requests per second, even when the call fails.
        time.sleep(0.34)
    response.raise_for_status()

    data = response.json()  # parse the JSON response
    # Keep only the disease name from each item's 'source' field.
    diseases = set()
    for each_data in data:
        each_disease = each_data['source']
        each_disease = each_disease.replace('@DISEASE_', '')
        each_disease = each_disease.replace('_', ' ')
        diseases.add(each_disease)

    # Sorted so repeated runs produce the same order.
    return sorted(diseases)

df = pd.read_csv('./KinaseData_StructureInfo.csv')
# .copy() gives an independent frame, so adding a column below does not
# trigger pandas' SettingWithCopyWarning on a filtered view.
df_wo_structure = df[df['Structure Info']==False].copy()

disease_list = []
all_disease_list = []
for index, each_df in tqdm.tqdm(df_wo_structure.iterrows(), total=len(df_wo_structure)):
    gene = each_df['HGNC Name']

    try:
        diseases = get_disease_info_from_pubtator(gene)
    except Exception as exc:
        # Best effort: a failed query leaves this gene's disease field empty,
        # but the failure is reported instead of being silently swallowed.
        tqdm.tqdm.write(f"PubTator lookup failed for {gene}: {exc}")
        diseases = []

    all_disease_list += diseases
    disease_list.append(';'.join(diseases))


df_wo_structure['Disease Information'] = disease_list
df_wo_structure.to_csv('./KinaseData_WO_StructureInfo_DiseaseInfo.csv')

# Print every distinct disease name found across all genes.
all_disease_list = list(set(all_disease_list))
for disease in all_disease_list:
    print(disease)


Which diseases below are not genetic and are considered rare diseases, or not genetic and have a high unmet medical need?

Inflammation
Tertiary Lymphoid Structures
Diabetes Mellitus
Reperfusion Injury
Transfusion Reaction
Arthropathy progressive pseudorheumatoid of childhood
Osteosarcoma
Hereditary Breast and Ovarian Cancer Syndrome
Erythema Infectiosum
Death
Edema
Radiation Injuries
Noble Bass Sherman syndrome
Diabetic Retinopathy
Intracranial Hemorrhages
Enteropathy Associated T Cell Lymphoma
Long QT Syndrome
Oto palato digital syndrome type 2
Anemia Hemolytic
Pulmonary Fibrosis
Basal Ganglia Diseases
Silicosis
Cystitis
Hypothermia
Lymphoma B Cell
Carcinoma Renal Cell
Fibrosis
Prostatic Neoplasms Castration Resistant
Alveolitis Extrinsic Allergic
Neoplastic Syndromes Hereditary
Anemia Aplastic
Neurobehavioral Manifestations
Urinary Bladder Neoplasms
Subarachnoid Hemorrhage
Hyperglycemia
Idiopathic Noncirrhotic Portal Hypertension
Breast Neoplasms
Leber Congenital Amaurosis
Osteoglophonic dwarfism
Calcinosis
Ring Chromosome 20 Syndrome
Enterocolitis Necrotizing
Sepsis
Asthma
Autism Spectrum Disorder
Apical Hypertrophic Cardiomyopathy
Wounds and Injuries
Arthritis Psoriatic
Anemia
Cytokine Release Syndrome
Lymphatic Metastasis
Osteoarthritis
Demyelinating Diseases
Stress Disorders Post Traumatic
Carcinoma Ductal
Atherosclerosis
Systemic Inflammatory Response Syndrome
Uterine Cervical Neoplasms
Arthritis
Neoplasms
Atrial Fibrillation
Choline Deficiency
Neurodegenerative Diseases
Carcinoma Signet Ring Cell
Pain
Fever
Esophageal Squamous Cell Carcinoma
Microcystic adnexal carcinoma
Pulmonary Embolism
Hepatic Encephalopathy
Skin Diseases
Liver Failure
Squamous Intraepithelial Lesions
Ischemia
Multiple Trauma
Carcinoma Non Small Cell Lung
Iron Overload
Mitochondrial Myopathies
Meningioma
Insulin Resistance
Depressive Disorder
Rumination Syndrome
Hypertension
Prostatic Intraepithelial Neoplasia
Sleep Deprivation
Epilepsy Post Traumatic
Nervous System Diseases
Ischemic Attack Transient
Renal Insufficiency
Protoporphyria Erythropoietic
Myocardial Ischemia
Brain Injury Chronic
Melanoma
Squamous Cell Carcinoma of Head and Neck
Gliosis
Glomerulonephritis Membranous
Leukemia Lymphocytic Chronic B Cell
Hyperglycemic Hyperosmolar Nonketotic Coma
Lymphoma Follicular
Tarlov Cysts
Chromosome Aberrations
Wounds Stab
Pulmonary Arterial Hypertension
Cystic Fibrosis
Epstein Barr Virus Infections
Lung Neoplasms
Keratosis Actinic
Cerebral Infarction
Mitochondrial cytopathy
Burns
Diabetic Nephropathies
Miosis
Neoplasm Metastasis
Hyperplasia
Nasopharyngeal Carcinoma
Carcinoma Basal Cell
Carcinoma Hepatocellular
Multiple Myeloma
Exercise Induced Allergies
Jejunal Diseases
Brain Ischemia
Kidney Failure Chronic
Neuralgia
Acute Kidney Injury
Recurrence
Thymus Neoplasms
Abortion Spontaneous
Colitis
Sciatic Neuropathy
Anodontia
Hepatitis Chronic
Neurofibromatosis 1
Parkinsonian Disorders
Mastocytosis Systemic
Adenocarcinoma of Lung
Cell Transformation Neoplastic
Carcinoma Pancreatic Ductal
Status Asthmaticus
Personality Disorders
Pancreatic Neoplasms
Glioma
Cognition Disorders
Sarcoma Kaposi
Attention Deficit Disorder with Hyperactivity
Diabetes Mellitus Type 1
Carcinoma Squamous Cell
Heart Diseases
Diabetes Mellitus Type 2
Carcinogenesis
Harlequin type ichthyosis
Leukemia Eosinophilic Acute
Immunologic Deficiency Syndromes
Infarction
Small Cell Lung Carcinoma
Adenomyosis
Spherocytosis Type 1
Mitochondrial Diseases
Kidney Diseases
Neurotoxicity Syndromes
Brain Infarction
Adenomatous Polyps
Preeclamptic toxemia
Myocardial Infarction
Nerve Degeneration
Mullerian aplasia
Status Epilepticus
Myocardial Reperfusion Injury
Gallbladder Neoplasms
Obesity
Calcinosis Cutis
Lupus Erythematosus Systemic
Hepatitis B
Carcinoma Ovarian Epithelial
Hyperoxia
Neoplasms Basal Cell
Necrosis
Adenoma Oxyphilic
Vasculitis
Lung Injury
Carcinoma Embryonal
Ureteral Obstruction
Seizures
Esophageal Neoplasms
Ventricular Remodeling
Chromosomal Instability
Colitis Ulcerative
Parkinson Disease Secondary
Glucose Intolerance
Fibromyalgia
Glioblastoma
Fetal Diseases
Arthritis Rheumatoid
Magnesium Deficiency
Escherichia coli Infections
Non alcoholic Fatty Liver Disease
Brain Edema
Cognitive Dysfunction
Blindness
Myelitis Transverse
Cone Dystrophy
Anemia Sickle Cell
Hypoxia Ischemia Brain
Myopathies Structural Congenital
Leukemia Promyelocytic Acute
Renal Insufficiency Chronic
Factor X Deficiency
Cleft Palate
Fetal Growth Retardation
Cardiotoxicity
Pulmonary Disease Chronic Obstructive
Lipoma
Hepatitis C
Hypoxia Brain
Ventricular Dysfunction Left
Liver Neoplasms
Post Acute COVID 19 Syndrome
Drug Related Side Effects and Adverse Reactions
Scleroderma Systemic
Virus Diseases
Arthritis Experimental
Vascular Diseases
Mouth Diseases
Triple Negative Breast Neoplasms
Brain Concussion
Corneal Endothelial Cell Loss
Prodromal Symptoms
Heart Failure
Hyper IgM Immunodeficiency Syndrome
Adamantinoma
Adenoma
Hypoglycemia
Colorectal Neoplasms
Retinal Degeneration
Weight Gain
Huntington Disease
Psoriasis
Reticulocytosis
Bites and Stings
Ovarian Neoplasms
Cystitis Interstitial
Hyperinsulinism
Retinitis
Prostatic Neoplasms
Idiopathic Pulmonary Fibrosis
Malaria Cerebral
Arthralgia
Mandibular Nerve Injuries
Myotonic Dystrophy
Benign non infected urachal cyst
Esophagitis
Stomach Neoplasms
Epilepsy Partial with Variable Foci
Rectal Neoplasms
Lens Diseases
Leukemia Myeloid
Leukemia
Autoimmune Diseases
Brain Injuries Traumatic
16p11.2 Deletion Syndrome
Micrognathism
Behcet Syndrome
Retinal Detachment
Pre Eclampsia
Neutrophil Actin Dysfunction
Leukopenia
Celiac Disease
Malformations of Cortical Development Group I
Syringoma
Hydrops Fetalis
Hemorrhage
613882
Memory Disorders
Uterine Cervicitis
Leukemia Biphenotypic Acute
Hypoxia
Infections
Psychological Distress
Pancreatitis
Mosaic variegated aneuploidy syndrome
248310
Arthritis Infectious
Hyperhomocysteinemia
Cardiomyopathy Hypertrophic
Cardiomyopathy Dilated
Mastitis
Crohn Disease
Prostatic Hyperplasia
Seminoma
Bone Diseases
Infarction Middle Cerebral Artery
Adenocarcinoma
Abdominal Injuries
Learning Disabilities
Hyperimmunoglobulin G1(A1) Syndrome
Parkinson Disease

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from scipy.stats import shapiro

# Seeded local generator: the figures and p-values are reproducible on every
# run, and the global NumPy random state is left untouched.
rng = np.random.default_rng(42)

# Skewed population, and the means of 1000 samples of size 30 drawn from it.
population = rng.exponential(scale=1.0, size=10000)
sample_means = [np.mean(rng.choice(population, size=30)) for _ in range(1000)]

# Shapiro-Wilk normality test; only the first 5000 population values are tested.
pop_stat, pop_p = shapiro(population[:5000])
sample_stat, sample_p = shapiro(sample_means)

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

axes[0].hist(population, bins=50, density=True, alpha=0.7, color='skyblue')
axes[0].set_title('Population (Exponential)')
axes[0].set_xlabel('Value')
axes[0].set_ylabel('Density')

axes[1].hist(sample_means, bins=50, density=True, alpha=0.7, color='salmon')
axes[1].set_title('Sampling Distribution of the Mean')
axes[1].set_xlabel('Sample Mean')
axes[1].set_ylabel('Density')

plt.tight_layout()
plt.show()
print(pop_p)
print(sample_p)

In [None]:
# Imported here so the cell runs on a fresh kernel: the only other import of
# ttest_ind in this notebook is in a later cell.
from scipy.stats import ttest_ind

# Compute a t-value in each of 1000 repeated experiments and visualize the distribution
t_values = []

for _ in range(1000):
    group1 = np.random.normal(loc=0, scale=1, size=30)
    group2 = np.random.normal(loc=0.5, scale=1.2, size=30)
    t_statistic, _p_value = ttest_ind(group1, group2)
    t_values.append(t_statistic)

# Plot the distribution of t-values
plt.figure(figsize=(10, 6))
# The label gives plt.legend() an artist to show; without it the legend is empty.
plt.hist(t_values, bins=30, density=True, alpha=0.7, color='lightgreen', edgecolor='black',
         label='Simulated t-values')

# Figure details
plt.title('Distribution of t-values from 1000 Simulations')
plt.xlabel('t-value')
plt.ylabel('Density')
plt.legend()
plt.grid(True)
plt.show()

In [None]:
!wget --no-check-certificate "https://www.dropbox.com/scl/fi/533wpky7dz2kvcjqm4jjx/lec5_data.zip?rlkey=b2cko74izyin5wr05z3d99svw&st=x2rxrzha&dl=0" -O "lec5_data.zip"
!unzip lec5_data.zip

In [None]:
import pandas as pd

# Load Model.csv from the working directory (presumably extracted from lec5_data.zip — verify).
cellline_df = pd.read_csv("./Model.csv")
# Bare expression on the last line: Jupyter renders the DataFrame as the cell output.
cellline_df

In [None]:
# Load CRISPRGeneDependency.csv with its first column as the row index.
# Later cells treat the index as cell-line IDs and the columns as genes.
ess_prob_df = pd.read_csv("./CRISPRGeneDependency.csv", index_col=0)
# Bare expression on the last line: Jupyter renders the DataFrame as the cell output.
ess_prob_df

In [None]:
# Load mutations_NAsdropped.csv with its first column as the row index.
mutation_df = pd.read_csv("./mutations_NAsdropped.csv", index_col=0)
# Bare expression on the last line: Jupyter renders the DataFrame as the cell output.
mutation_df

In [None]:
from scipy.stats import ttest_ind

def find_significant_genes(essentiality_df, control_group, test_group, threshold=0.001, equal_var=True):
    """Find genes that are essential in the test group but not in the control group.

    A gene is reported when its mean value is below 0.5 in the control rows,
    above 0.5 in the test rows, and a two-sample t-test between the two
    groups gives a p-value below ``threshold``.

    Args:
        essentiality_df: DataFrame with one row per sample and one column per gene.
        control_group: Row labels of the control samples.
        test_group: Row labels of the test samples.
        threshold: p-value cut-off; a gene is kept when ``p < threshold``.
        equal_var: Passed to ``scipy.stats.ttest_ind``. ``True`` (default,
            the previous behaviour) runs Student's t-test; ``False`` runs
            Welch's t-test, which does not assume equal variances.

    Returns:
        List of column labels (genes) that satisfy all three conditions, in
        column order.

    Raises:
        KeyError: if a label in ``control_group`` or ``test_group`` is not in
            the index of ``essentiality_df`` (raised by ``.loc``).
    """
    genes = essentiality_df.columns
    control_df = essentiality_df.loc[control_group]
    test_df = essentiality_df.loc[test_group]

    significant_genes = []

    for gene in genes:
        # Drop missing values up front so the t-test sees the same
        # observations as .mean(), which already skips NaN. Otherwise a single
        # NaN makes the p-value NaN and the gene is silently discarded.
        control_prob = control_df[gene].dropna()
        test_prob = test_df[gene].dropna()

        # A t-test needs at least two observations per group.
        if len(control_prob) < 2 or len(test_prob) < 2:
            continue

        control_mean = control_prob.mean()
        test_mean = test_prob.mean()

        if control_mean < 0.5 and test_mean > 0.5:
            _, p_value = ttest_ind(control_prob, test_prob, equal_var=equal_var)

            if p_value < threshold:
                significant_genes.append(gene)

    return significant_genes


# NOTE(review): `breast_mutation_df` and `breast_cancer_cellline_ids` are not
# defined in any cell visible in this notebook — they must exist in the kernel
# before this cell runs; confirm where they come from.

# Keep only the mutation rows flagged as likely loss-of-function (LOF).
is_lof = breast_mutation_df['likely_lof'] == True
lof_mutations_df = breast_mutation_df[is_lof]
lof_genes = lof_mutations_df['gene'].unique()
essentiality_cell_lines = ess_prob_df.index.tolist()

# For every LOF gene, compare cell lines carrying the LOF mutation against the rest.
for gene, gene_mutation_df in lof_mutations_df.groupby('gene'):
    # Only cell lines that also have essentiality data can be compared.
    lof_cell_lines = set(gene_mutation_df['depmap_id']) & set(essentiality_cell_lines)
    non_lof_cell_lines = set(breast_cancer_cellline_ids) - lof_cell_lines
    non_lof_cell_lines &= set(essentiality_cell_lines)

    # Require at least three LOF cell lines before testing.
    if len(lof_cell_lines) < 3:
        continue

    candidate_genes = find_significant_genes(
        ess_prob_df,
        control_group=list(non_lof_cell_lines),
        test_group=list(lof_cell_lines)
    )

    print(f'Potential target genes for LOF gene: {gene}')
    print(candidate_genes)

In [None]:
import pandas as pd

# Load the Genomics of Drug Sensitivity in Cancer (GDSC) table and narrow it
# down in three successive steps, each applied to the result of the previous one:
#   1. keep rows whose drug target is exactly 'PARP1, PARP2' (PARP inhibitors),
#   2. keep rows with feature_pval <= 0.05,
#   3. drop features whose name starts with 'cna' (copy number alteration).
df = (
    pd.read_csv('./DrugDiscovery/Data-GDSC-OV_ANOVA.csv')
    .loc[lambda frame: frame['Drug target'] == 'PARP1, PARP2']
    .loc[lambda frame: frame['feature_pval'] <= 0.05]
    .loc[lambda frame: ~frame['Feature Name'].str.startswith('cna')]
)

# Report the distinct feature names that survived the filters.
print('Feature name')
print(list(set(df['Feature Name'])))

# Distinct PARP inhibitor drug names, reused by later cells.
Target_drugs = list(set(df['Drug name']))
print('PARP inhibitors')
print(Target_drugs)

In [None]:
# Gene whose mutation status splits the cell lines in this analysis.
target_gene = 'BRCA1'

# Genetic variant table for ovarian cancer cell lines (GDSC); shown in full below.
df2 = pd.read_csv('./DrugDiscovery/Data-GDSC-OV_Genetic_features_variant.csv')
print(df2)

# Rows describing the mutation feature of the target gene, e.g. 'BRCA1_mut'.
feature_label = f'{target_gene}_mut'
tmp_df = df2[df2['Genetic Feature'] == feature_label]

# Split those rows by mutation flag: 1 = mutated, 0 = not mutated.
mutation_flag = tmp_df['IS Mutated']
tmp_df_pos = tmp_df[mutation_flag == 1]
tmp_df_neg = tmp_df[mutation_flag == 0]

# Distinct cell-line names in each group.
mut_pos_cells = list(set(tmp_df_pos['Cell Line Name']))
mut_neg_cells = list(set(tmp_df_neg['Cell Line Name']))

# Show the mutated group first, then the non-mutated group.
print(mut_pos_cells)
print(mut_neg_cells)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load IC50 data from CSV file
ic50_df = pd.read_csv('./DrugDiscovery/Data-GDSC-OV_IC_Mon Sep 18 04_05_59 2023.csv')
# Keep only the selected inhibitors; .copy() makes the result an independent
# frame so the column added below does not trigger SettingWithCopyWarning.
ic50_df = ic50_df[ic50_df['Drug Name'].isin(Target_drugs)].copy()

# Convert IC50 from log scale to linear scale by applying the exponential
# (assumes the 'IC50' column holds natural-log values — TODO confirm).
ic50_df['IC50 new'] = np.exp(ic50_df['IC50'])

group_labels = ['%s mut positive'%(target_gene), '%s mut negative'%(target_gene)]

# One boxplot per drug, comparing mutation-positive and mutation-negative cell lines
for each_drug, each_df in ic50_df.groupby('Drug Name'):
    # IC50 values for cell lines with / without the mutation
    pos_ic50_values = each_df.loc[each_df['Cell Line Name'].isin(mut_pos_cells), 'IC50 new'].values
    neg_ic50_values = each_df.loc[each_df['Cell Line Name'].isin(mut_neg_cells), 'IC50 new'].values

    fig, ax = plt.subplots()
    ax.boxplot([pos_ic50_values, neg_ic50_values])
    # Tick labels are set explicitly (boxes sit at x = 1, 2) instead of using
    # boxplot's `labels=` keyword, which newer Matplotlib releases deprecate.
    ax.set_xticks([1, 2])
    ax.set_xticklabels(group_labels)
    ax.set_title('%s'%(each_drug))
    ax.set_xlabel('Group')
    ax.set_ylabel('IC50 (uM)')
    plt.show()

In [None]:
import os
import json
import pandas as pd
from Bio import Entrez
from openai import OpenAI
from typing import List, Dict
import time
import re

# Contact email attached to NCBI Entrez requests; replace the placeholder with your own address.
Entrez.email = "your_email@example.com"
# OpenAI client shared by evaluate_article(); the key is read from the OPENAI_API_KEY environment variable.
openai_client = OpenAI(api_key=os.getenv("OPENAI_API_KEY", "input")) # falls back to the literal string "input" when the variable is unset

def fetch_pubmed_abstracts(disease: str, gene: str, max_results: int = 5) -> List[Dict]:
    """
    Search PubMed for articles containing both disease and gene terms.

    Both terms are searched in the Title/Abstract field. All matching records
    are fetched in one batched ``efetch`` request.

    Args:
        disease: Disease term for the query.
        gene: Gene term for the query.
        max_results: Maximum number of PMIDs requested from ``esearch``.

    Returns:
        A list of dicts with keys ``pmid``, ``title`` and ``abstract``. The
        abstract is the space-joined text of every ``AbstractText`` section
        (structured abstracts have several), or ``""`` when the record has no
        abstract. Returns ``[]`` when nothing matches or when any error
        occurs; the error is printed, not raised.
    """
    try:
        query = f"{disease}[Title/Abstract] AND {gene}[Title/Abstract]"
        handle = Entrez.esearch(db="pubmed", term=query, retmax=max_results)
        try:
            record = Entrez.read(handle)
        finally:
            # Close the handle even when parsing fails.
            handle.close()

        pmids = list(record["IdList"])
        if not pmids:
            return []

        # One batched request instead of one efetch per PMID keeps the number
        # of calls to NCBI low.
        handle = Entrez.efetch(db="pubmed", id=",".join(pmids), retmode="xml")
        try:
            fetched = Entrez.read(handle)
        finally:
            handle.close()

        results = []
        # Only entries under 'PubmedArticle' are processed; anything else in
        # the response is skipped rather than aborting the whole fetch.
        for article in fetched.get('PubmedArticle', []):
            citation = article['MedlineCitation']
            title = citation['Article']['ArticleTitle']
            abstract_list = citation['Article'].get('Abstract', {}).get('AbstractText', [])
            # Join every section so structured abstracts are not cut down to
            # their first part.
            abstract = " ".join(str(section) for section in abstract_list)
            results.append({"pmid": str(citation['PMID']), "title": title, "abstract": abstract})
        return results
    except Exception as e:
        print(f"Error fetching PubMed abstracts for {disease} and {gene}: {e}")
        return []

def parse_openai_response(response_content):
    """Extract the first JSON object embedded in a model reply.

    The reply may wrap the JSON in extra text (prose, markdown code fences,
    a second object). Every ``{`` is tried in turn as the start of an object,
    and the first one that decodes is returned; anything after it is ignored.

    Args:
        response_content: Raw text returned by the model.

    Returns:
        The decoded JSON object (a ``dict``).

    Raises:
        json.JSONDecodeError: the text contains ``{`` but no decodable object.
        ValueError: the text contains no ``{`` at all.
    """
    decoder = json.JSONDecoder()
    first_error = None

    start = response_content.find('{')
    while start != -1:
        try:
            # raw_decode stops at the end of the object, so trailing text or
            # further braces do not break parsing.
            parsed, _end = decoder.raw_decode(response_content, start)
            return parsed
        except json.JSONDecodeError as e:
            if first_error is None:
                first_error = e
            start = response_content.find('{', start + 1)

    if first_error is not None:
        raise json.JSONDecodeError(
            f"Failed to parse JSON: {first_error}", response_content, first_error.pos
        )
    raise ValueError("No valid JSON found in response")

def evaluate_article(disease: str, gene: str, title: str, abstract: str) -> Dict:
    """Ask the chat model whether an abstract supports the gene as a therapeutic target.

    Builds a prompt from the four arguments, sends it through the module-level
    ``openai_client`` and parses the reply with ``parse_openai_response``.

    Args:
        disease: Disease name inserted into the prompt.
        gene: Gene name inserted into the prompt.
        title: Article title inserted into the prompt.
        abstract: Abstract text inserted into the prompt.

    Returns:
        The object parsed from the reply. The prompt asks the model for the
        keys ``summary``, ``is_potential_target`` and ``mechanism``; their
        presence is not checked here.

    Raises:
        Whatever ``parse_openai_response`` raises when the reply cannot be
        parsed, plus any error raised by the OpenAI client call.
    """
    # Doubled braces ({{ }}) in the f-string below produce literal braces in the prompt.
    prompt = f"""
Disease: {disease}
Gene: {gene}
Article Title: {title}
Abstract: {abstract}

Instructions: Analyze the provided abstract carefully and respond in the specified JSON format. Focus *only* on information explicitly stated or strongly implied within this specific abstract.

1.  **Summarize:** Provide a concise, one-sentence summary describing the *specific main finding* reported in the abstract concerning the relationship between the gene '{gene}' and the disease '{disease}'. **This summary must be derived directly from the abstract's content and should not be generic.**
2.  **Identify Potential Therapeutic Target:** Determine if the abstract presents *specific findings* suggesting the gene '{gene}' is a potential therapeutic target for '{disease}'.
    * Set `is_potential_target` to `true` **only if** the abstract describes results where:
        * Modulating the gene's activity demonstrably affects disease pathology, or disease model outcomes.
        **Crucially, this often involves evidence presented at the cellular (in vitro) or animal (in vivo) level, such as showing that *gene knock-down or knock-out leads to a demonstrable therapeutic effect* (e.g., reduced disease phenotype, improved cell viability, slowed disease progression in models).
        ** Other forms of modulation (like pharmacological inhibition) showing similar effects also qualify.
    * **Important:** Simply reporting a correlation (e.g., gene expression levels differ in patients) or describing the gene's general biological function is **not sufficient** to mark it as a potential target *unless* the abstract directly links this to a potential therapeutic intervention strategy supported by experimental evidence presented within the abstract. There must be a clear indication of therapeutic relevance derived from specific results.
    * Base this determination strictly on the content of the provided abstract.
3.  **Describe Mechanism (if applicable):**
    * If `is_potential_target` is `true`, describe the *specific mechanism* mentioned or strongly implied *in the abstract* that explains *how* modulating '{gene}' could lead to a therapeutic benefit for '{disease}'.
    * **This description must be based on details found within the abstract.** If the abstract supports `is_potential_target` being true but *does not* specify the underlying mechanism, state "Mechanism not detailed in the abstract."
    * If `is_potential_target` is `false`, leave the `mechanism` field as an empty string ("").
    * **Do not use generic placeholders like '{{gene}}' or '{{disease}}' in your mechanism description.**

Respond in strict JSON format with the following keys:
- summary: [string] (The specific one-sentence summary based on the abstract)
- is_potential_target: [boolean] (True only if the abstract provides specific findings supporting therapeutic potential, particularly evidence like knock-down/out effects in models, False otherwise)
- mechanism: [string] (Specific mechanism from the abstract if `is_potential_target` is true and mechanism is described, "Mechanism not detailed in the abstract." if true but not described, or "" if false)

Example Output (Target with Mechanism - Knockdown):
{{
  "summary": "The abstract demonstrates that siRNA-mediated knock-down of Gene Alpha in Disease X cell lines reduces cell proliferation and induces apoptosis.",
  "is_potential_target": true,
  "mechanism": "Knock-down of Gene Alpha disrupts the Survival Pathway Y, leading to reduced proliferation and apoptosis of Disease X cells, according to the abstract."
}}

Example Output (Target with Mechanism - Knockout):
{{
  "summary": "This study shows that genetic knock-out of Gene Beta ameliorates disease symptoms and reduces pathological markers in a mouse model of Disease Z.",
  "is_potential_target": true,
  "mechanism": "The abstract suggests Gene Beta knock-out restores normal signaling pathway Q, which is dysregulated in Disease Z."
}}

Example Output (Target, Mechanism Not Detailed):
{{
  "summary": "This abstract reports that pharmacological inhibition of Enzyme C significantly improved outcomes in an animal model of Disease D.",
  "is_potential_target": true,
  "mechanism": "Mechanism not detailed in the abstract."
}}

Example Output (Not a Target - Correlation Only):
{{
  "summary": "The abstract describes a genome-wide association study identifying a SNP near Gene E that is associated with increased risk for Disease F.",
  "is_potential_target": false,
  "mechanism": ""
}}

Example Output (Not a Target - Functional Study Only):
{{
  "summary": "This research investigates the role of Protein G in regulating cellular trafficking pathways within neurons, without specific experimental links to Disease H treatment effects.",
  "is_potential_target": false,
  "mechanism": ""
}}
    """

    response = openai_client.chat.completions.create(
    model="gpt-4.1-mini",  # hard-coded model name; replace with the desired model
    messages=[
        {"role": "system", "content": "You are an expert biomedical researcher."},
        {"role": "user", "content": prompt}
    ],
    temperature=0
    )

    content = response.choices[0].message.content.strip()

    # Extract and decode the JSON object contained in the reply
    parsed_content = parse_openai_response(content)
    return parsed_content

def analyze_genes(disease: str, genes: List[str], max_articles_per_gene: int = 3,
                  output_path: str = "results.csv") -> pd.DataFrame:
    """
    Main function to fetch abstracts, evaluate each, and compile results.

    For every gene, PubMed abstracts are fetched with
    ``fetch_pubmed_abstracts`` and each one is scored with
    ``evaluate_article``. An article whose evaluation fails (API error,
    unparsable reply, missing key) is reported and skipped, so one bad reply
    does not discard the records already collected.

    Args:
        disease: Disease name used for the search and the evaluation prompt.
        genes: Gene names to analyse.
        max_articles_per_gene: Maximum number of abstracts fetched per gene.
        output_path: CSV file the results are written to (default
            ``"results.csv"``).

    Returns:
        DataFrame with the columns Disease, Gene, PMID, Title,
        IsPotentialTarget, Summary and Mechanism, one row per successfully
        evaluated article. The same columns are present when there are no rows.
    """
    columns = ["Disease", "Gene", "PMID", "Title", "IsPotentialTarget", "Summary", "Mechanism"]
    records = []
    for gene in genes:
        articles = fetch_pubmed_abstracts(disease, gene, max_results=max_articles_per_gene)
        print('No. of articles : ', len(articles))
        for art in articles:
            try:
                ev = evaluate_article(disease, gene, art['title'], art['abstract'])
                records.append({
                    "Disease": disease,
                    "Gene": gene,
                    "PMID": art['pmid'],
                    "Title": art['title'],
                    "IsPotentialTarget": ev["is_potential_target"],
                    "Summary": ev["summary"],
                    "Mechanism": ev["mechanism"]
                })
            except Exception as e:
                print(f"Skipping PMID {art.get('pmid')} for {gene}: {e}")
            finally:
                # Pause between model calls, whether or not the call succeeded.
                time.sleep(0.5)

    # Explicit columns keep the schema stable even when no record was collected.
    df = pd.DataFrame(records, columns=columns)
    try:
        df.to_csv(output_path, index=False)
    except Exception as e:
        print(f"Error saving results to CSV: {e}")
    return df

if __name__ == "__main__":
    # Example run: evaluate up to three abstracts for each of three genes.
    disease_name = "Alzheimer disease"
    candidate_genes = ["APP", "PSEN1", "PSEN2"]
    df_results = analyze_genes(
        disease=disease_name,
        genes=candidate_genes,
        max_articles_per_gene=3,
    )
    print(df_results)

In [None]:
import datamol as dm
from molfeat.trans.base import MoleculeTransformer
from molfeat.calc.pharmacophore import Pharmacophore2D
from molfeat.trans.fp import FPVecTransformer

# Example molecule; the stray trailing tab that was inside the string literal has been removed.
smiles = "O=C(NNc1cc(Br)c([O-])c(Br)c1)c1ccccc1"
# sanitize and standardize your molecules if needed

# ECFP fingerprint
transformer = FPVecTransformer(kind='ecfp', dtype=float)
features = transformer(smiles)
print('ECFP')
print(features.shape)
print(features)

# MACCS keys fingerprint
transformer = FPVecTransformer(kind='maccs', dtype=float)
features = transformer(smiles)
print('MACCS')
print(features.shape)
print(features)