In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

In [32]:
import pandas as pd

# Load the data
df = pd.read_parquet('cases.parquet')

# Explode the 'cases' list into separate rows
df_exploded = df.explode('cases').reset_index(drop=True)

# Normalize the dictionary inside each row of the 'cases' column
df_cases = pd.json_normalize(df_exploded['cases'])

# Add the corresponding article_id to each case
df_cases['article_id'] = df_exploded['article_id']

# Show the result
print(df_cases.head())


    age        case_id                                          case_text  \
0  53.0  PMC3738355_01  A 53-year-old woman presented with a 10-year h...   
1  69.0  PMC5015624_01  A 69-year-old Caucasian female with coronary a...   
2  60.0  PMC6381877_01  A 60-year-old male smoker presented with persi...   
3  41.0  PMC5912312_01  A 41-year-old female with a past medical histo...   
4  51.0  PMC5912312_02  A 51-year-old male with a history of SCAD pres...   

   gender  article_id  
0  Female  PMC3738355  
1  Female  PMC5015624  
2    Male  PMC6381877  
3  Female  PMC5912312  
4    Male  PMC5912312  


In [33]:
df_cases

Unnamed: 0,age,case_id,case_text,gender,article_id
0,53.0,PMC3738355_01,A 53-year-old woman presented with a 10-year h...,Female,PMC3738355
1,69.0,PMC5015624_01,A 69-year-old Caucasian female with coronary a...,Female,PMC5015624
2,60.0,PMC6381877_01,A 60-year-old male smoker presented with persi...,Male,PMC6381877
3,41.0,PMC5912312_01,A 41-year-old female with a past medical histo...,Female,PMC5912312
4,51.0,PMC5912312_02,A 51-year-old male with a history of SCAD pres...,Male,PMC5912312
...,...,...,...,...,...
93811,58.0,PMC11817015_01,We present the case of a 58-year-old female pa...,Female,PMC11817015
93812,60.0,PMC11804267_01,A 60-year-old female presented to surgery outp...,Female,PMC11804267
93813,57.0,PMC11694135_01,"The patient, a 57-year-old White woman with ty...",Female,PMC11694135
93814,50.0,PMC11694668_01,A 50-year-old male amateur soccer player (trai...,Male,PMC11694668


In [34]:
import re


# Example regex patterns to look for diagnosis
patterns = [
    r'diagnosed with ([^.]+)\.',               # diagnosed with [condition].
    r'diagnosis: ([^.]+)\.',                   # diagnosis: [condition].
    r'consistent with ([^.]+)\.',              # consistent with [condition].
    r'was found to have ([^.]+)\.',            # was found to have [condition].
    r'revealed a ([^.]+)\.',                   # revealed a [condition].
    r'suggestive of ([^.]+)\.'                 # suggestive of [condition].
]

def extract_diagnosis(text):
    for pattern in patterns:
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            return match.group(1).strip()
    return None

# Apply to the case_text column
df_cases['diagnosis_extracted'] = df_cases['case_text'].apply(extract_diagnosis)

# Show rows where a diagnosis was found
print(df_cases[['case_id', 'diagnosis_extracted']].dropna().head(10))


          case_id                                diagnosis_extracted
0   PMC3738355_01                                                 10
1   PMC5015624_01                   central retinal artery occlusion
2   PMC6381877_01  large lung mass, extending to posterior chest ...
3   PMC5912312_01  SCAD of the left main (LM), left anterior desc...
5   PMC5912312_03                                           ischemia
9   PMC5912312_07                                    a troponin of 0
12  PMC5287946_01                         ptosis of the right eyelid
13  PMC9106225_01  well-defined, nontender, smooth-surfaced, roug...
15  PMC6186336_01  New York Heart Association class II and Americ...
16  PMC9937515_01  Glasgow coma scale (GCS) score of 11 (eye: 4, ...


In [35]:
df_cases

Unnamed: 0,age,case_id,case_text,gender,article_id,diagnosis_extracted
0,53.0,PMC3738355_01,A 53-year-old woman presented with a 10-year h...,Female,PMC3738355,10
1,69.0,PMC5015624_01,A 69-year-old Caucasian female with coronary a...,Female,PMC5015624,central retinal artery occlusion
2,60.0,PMC6381877_01,A 60-year-old male smoker presented with persi...,Male,PMC6381877,"large lung mass, extending to posterior chest ..."
3,41.0,PMC5912312_01,A 41-year-old female with a past medical histo...,Female,PMC5912312,"SCAD of the left main (LM), left anterior desc..."
4,51.0,PMC5912312_02,A 51-year-old male with a history of SCAD pres...,Male,PMC5912312,
...,...,...,...,...,...,...
93811,58.0,PMC11817015_01,We present the case of a 58-year-old female pa...,Female,PMC11817015,rheumatoid arthritis at the age of 40 and is c...
93812,60.0,PMC11804267_01,A 60-year-old female presented to surgery outp...,Female,PMC11804267,
93813,57.0,PMC11694135_01,"The patient, a 57-year-old White woman with ty...",Female,PMC11694135,Gottron papules
93814,50.0,PMC11694668_01,A 50-year-old male amateur soccer player (trai...,Male,PMC11694668,very early achievement of a high HR: the athle...
