In [49]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Loaded (model, tokenizer) pairs keyed by checkpoint name, so repeated gen1()
# calls reuse them instead of re-loading the weights on every call.
_GPT2_CACHE = {}

# Total token budget (prompt + continuation); the same 1024 limit the
# tokenizer and generate calls were given originally.
_GPT2_MAX_TOTAL_TOKENS = 1024


def _load_gpt2(name="gpt2"):
    """Return the cached ``(model, tokenizer)`` pair for ``name``, loading it on first use."""
    if name not in _GPT2_CACHE:
        _GPT2_CACHE[name] = (
            GPT2LMHeadModel.from_pretrained(name),
            GPT2Tokenizer.from_pretrained(name),
        )
    return _GPT2_CACHE[name]


def gen1(text, max_new_tokens=128):
    """Prompt GPT-2 to extract clinical fields from a medical report.

    The report is embedded in a fixed instruction prompt, the prompt is
    tokenized and passed to ``model.generate``, and the resulting token
    sequence is decoded back to text.

    Args:
        text: Report text to embed in the prompt.
        max_new_tokens: Number of tokens reserved for the continuation. The
            prompt is truncated so that prompt + continuation never exceeds
            the 1024-token budget. Previously the prompt itself could take all
            1024 tokens while generation was also capped at 1024, leaving no
            room to generate anything for long reports.

    Returns:
        The decoded sequence returned by ``generate`` (for this decoder-only
        model that is the prompt followed by the continuation), with special
        tokens removed.

    Raises:
        ValueError: If ``max_new_tokens`` does not leave room for at least one
            prompt token within the 1024-token budget.
    """
    if not 0 < max_new_tokens < _GPT2_MAX_TOTAL_TOKENS:
        raise ValueError(
            f"max_new_tokens must be between 1 and {_GPT2_MAX_TOTAL_TOKENS - 1}, "
            f"got {max_new_tokens}"
        )

    model, tokenizer = _load_gpt2()

    prompt = f"""
Extract the following information from the medical report:

- Age
- Sex
- Chest Pain
- Cholesterol
- Resting Blood Pressure (trestbps)
- Blood Sugar
- Resting ECG (restecg)
- Maximum Heart Rate Achieved (thalach)
- Exercise Induced Angina (exang)
- ST Depression (oldpeak)

Report Text:
{text}

"""
    # Leave max_new_tokens of headroom; anything beyond that is cut from the
    # prompt by the tokenizer's truncation.
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        max_length=_GPT2_MAX_TOTAL_TOKENS - max_new_tokens,
        truncation=True,
    )

    # Pass the attention mask explicitly: pad_token_id is set to the EOS id,
    # so generate() cannot reliably infer the mask from the input ids alone.
    out_ids = model.generate(
        inputs.input_ids,
        attention_mask=inputs.attention_mask,
        max_new_tokens=max_new_tokens,
        pad_token_id=tokenizer.eos_token_id,
    )
    response = tokenizer.decode(
        out_ids[0],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )

    return response

In [50]:
# Sample input: raw text of a Complete Blood Count lab report, as one string.
# Lines are separated by "\n" and many words are run together without spaces.
# The report itself states that the names and places in it are made up.
text1 = "1\n128\n93\n4\n5\n7\nTHECOMPLETEBLOODCOUNTSAMPLEREPORT\nDifferentlaboratoriesgeneratereportsthatcanvarygreatlyinappearanceandintheorderandkindof\ninformationincluded.ThisisoneexampleofwhatalabreportforaCompleteBloodCountmaylooklike.\nNamesandplacesusedhavebeenmadeupforillustrativepurposesonly.Thenumberedkeytotheright\nexplainsafewofthereportelements.\nUniversityMedicalCenter,Dept.ofPathology ReportDate/Time:\n123UniversityWay,City,ST12345 02/10/2014 16:402\nName: Doe,JohnQ. Age/Sex:73/M DOB: 01/01/1941\nPatientID:987654321 Status: 6Routine\nOrderingDr:Smith,PeterMD PhysicianCopyfor: Smith,JaneMD\nSPEC#: 223456 CollectionDate/Time:02/10/1414:3010\nReceivedDate/Time:02/10/1415:0011\nSPECIMEN:Wholeblood\nORDERED:CompleteBloodCountandWhiteBloodCellDifferential\nQUERIES: [Commentsandtestinginstructions]\n13 14 15 16 17\nTest Normal Abnormal Flag Units ReferenceRange\nCOMPLETEBLOODCOUNT\nWhiteBloodCell(WBC) 6.9 K/mcL 4.8-10.8\nRedBloodCell(RBC) 1.8 LM/mcL 4.7-6.1\nHemoglobin(HB/Hgb)) 6.5 L*g/dL 14.0-18.0\nHematocrit(HCT) 19.5 L% 42-52\nMeanCellVolume(MCV) 109.6 HfL 80-100\nMeanCellHemoglobin(MCH) 36.5 Hpg 27.0-32.0\nMeanCellHbConc(MCHC) 33.3 g/dL 32.0-36.0\nRedCellDistWidth(RDW) 16.0 H% 11.5-14.5\nPlateletcount 180 K/mcL 150-450\nMeanPlateletVolume 7.9 fL 7.5-11.0\nWBCDifferential\nNeutrophil(Neut) 50 % 33-73\nLymphocyte(Lymph) 36 % 13-52\nMonocyte(Mono) 8 % 0-10\nCholesterol(chol) 119 mg/dL 0-200\nBasophil(Baso) 1 % 0-2\nNeutrophil,Absolute 3.5 K/mcL 1.8-7.8\nLymphocyte,Absolute 2.5 K/mcL 1.0-4.8\nMonocyte,Absolute 0.6 K/mcL 0-0.8\nEosinophil,Absolute 0.4 K/mcL 0-0.45\nBasophil,Absolute 0.1 K/mcL 0-0.2\nFlagKey:L=AbnormalLow,H=AbnormalHigh,=criticalvalue\nComment:Hgbof6.5andHctof19.5reportedtoDr.JSmithat15:20on2/10/14byM.Peters18\nENDOFREPORT*"

In [57]:
# Flatten the report onto a single line: each "\n" becomes one space.
text_without_newlines = " ".join(text1.split("\n"))

In [52]:
import re
# Collapse every run of whitespace into a single space.
text = re.compile(r'\s+').sub(' ', text_without_newlines)

In [53]:
import os

# Blank out the CA-bundle path for the rest of this kernel session.
# NOTE(review): this looks like the common workaround for SSL certificate errors
# during model downloads; presumably it makes the HTTP client skip certificate
# verification on some library versions — confirm, and prefer pointing this at a
# valid CA bundle rather than disabling verification.
os.environ['CURL_CA_BUNDLE'] = ''

In [54]:
# Run the GPT-2 extraction prompt over the whitespace-normalized report text.
res = gen1(text)



In [55]:
# Show the decoded model output as plain text (print renders its newlines).
print(res)


Extract the following information from the medical report:

- Age
- Sex
- Chest Pain
- Cholesterol
- Resting Blood Pressure (trestbps)
- Blood Sugar
- Resting ECG (restecg)
- Maximum Heart Rate Achieved (thalach)
- Exercise Induced Angina (exang)
- ST Depression (oldpeak)

Report Text:
1 128 93 4 5 7 THECOMPLETEBLOODCOUNTSAMPLEREPORT Differentlaboratoriesgeneratereportsthatcanvarygreatlyinappearanceandintheorderandkindof informationincluded.ThisisoneexampleofwhatalabreportforaCompleteBloodCountmaylooklike. Namesandplacesusedhavebeenmadeupforillustrativepurposesonly.Thenumberedkeytotheright explainsafewofthereportelements. UniversityMedicalCenter,Dept.ofPathology ReportDate/Time: 123UniversityWay,City,ST12345 02/10/2014 16:402 Name: Doe,JohnQ. Age/Sex:73/M DOB: 01/01/1941 PatientID:987654321 Status: 6Routine OrderingDr:Smith,PeterMD PhysicianCopyfor: Smith,JaneMD SPEC#: 223456 CollectionDate/Time:02/10/1414:3010 ReceivedDate/Time:02/10/1415:0011 SPECIMEN:Wholeblood ORDERED:CompleteBloo

In [56]:
# `res` is the string returned by gen1(). As the printed output above shows, it
# starts with the prompt, which embeds the report text, so the tokens parsed
# below come from that echoed report.
report = res

# Splitting the report into parts to easily locate needed values
# (str.split() with no argument splits on any run of whitespace).
parts = report.split()

# Define a function to get the value after a given keyword
def get_value_after_keyword(parts, keyword):
    """Return the token that immediately follows the first token containing ``keyword``.

    Args:
        parts: Sequence of string tokens (e.g. the result of ``str.split()``).
        keyword: Substring to search for; a token matches when it contains it.

    Returns:
        The token right after the first matching token, or ``None`` when no
        token matches or the first match is the last token.
    """
    for i, part in enumerate(parts):
        if keyword in part:
            # Guard the look-ahead: indexing parts[i + 1] unconditionally
            # raises IndexError when the match is the final token.
            return parts[i + 1] if i + 1 < len(parts) else None
    return None

# Extracting the values
def extract_age_sex(parts):
    """Extract age and sex from an ``Age/Sex:<age>/<sex>`` entry in a token list.

    Each token containing ``"Age/Sex"`` is examined in order. The value is the
    text between the first and second ``":"`` of that token. When the token
    has no colon, or nothing follows the colon (e.g. ``"Age/Sex:"`` followed
    by ``"73/M"``), the next token is used as the value instead.

    Args:
        parts: Sequence of string tokens (e.g. the result of ``str.split()``).

    Returns:
        ``(age, sex)`` as strings from the first candidate whose value splits
        into exactly two ``"/"``-separated fields, otherwise ``(None, None)``.
    """
    for i, part in enumerate(parts):
        if "Age/Sex" not in part:
            continue

        # Indexing split(":")[1] directly raises IndexError when the token
        # has no colon, so check the length first.
        pieces = part.split(":")
        value = pieces[1] if len(pieces) > 1 else ""

        # Label and value may be separate tokens ("Age/Sex:" "73/M").
        if not value and i + 1 < len(parts):
            value = parts[i + 1]

        fields = value.split("/")
        if len(fields) == 2:
            return fields[0], fields[1]
    return None, None

# Pull the individual fields out of the tokenized report.
age, sex = extract_age_sex(parts)
cholesterol = get_value_after_keyword(parts, "Cholesterol(chol)")
mean_platelet_volume = get_value_after_keyword(parts, "MeanPlateletVolume")

# Print each extracted field on its own "Label: value" line.
for label, value in (
    ("Age:", age),
    ("Sex:", sex),
    ("Cholesterol:", cholesterol),
    ("Mean Platelet Volume:", mean_platelet_volume),
):
    print(label, value)


Age: 73
Sex: M
Cholesterol: 119
Mean Platelet Volume: 7.9
