In [1]:
# Function to preprocess the data
def clean_text(text):
    """Clean and normalize the input text."""
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'[^a-z0-9\s]', '', text)  # Remove punctuation
    return text

In [3]:
import re
from transformers import BartForConditionalGeneration, BartTokenizer

def summarize_text(text):
    """Summarize the input text using BART."""
    # Load the BART model and tokenizer
    model_name = "facebook/bart-large-cnn"
    model = BartForConditionalGeneration.from_pretrained(model_name)
    tokenizer = BartTokenizer.from_pretrained(model_name)

    # Tokenize the input text
    inputs = tokenizer(text, return_tensors="pt", max_length=1024, truncation=True)
    summary_ids = model.generate(inputs["input_ids"], num_beams=4, min_length=400, max_length=500, early_stopping=True)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary


# Function to extract the attributes based on their respective regular expressions
def extract_demographics(text):
    """Extract demographics from the text using regular expressions."""
    demographics = {
        'age': None,
        'gender': None,
        'race': None,
        'education': None,
        'occupation': None,
        'living_situation': None,
        'income_level': None,
        'access_to_healthcare': None,
        'relationship_status': None,
    }

    # Regular expression patterns for age and gender
    age_pattern = r'(\d{1,3})\s*(?:years?|yr|year|yo|old)'
    gender_pattern = r'\b(male|female|man|woman|girl|boy|gentleman|lady)\b'
    race_pattern = r'\b(white|black|african american|asian|hispanic|latino|native american|other)\b'
    education_pattern = r'\b(high school|bachelor|master|phd|graduate|undergraduate|college)\b'
    occupation_pattern = r'\b(doctor|nurse|teacher|engineer|worker|manager|student|retired|unemployed|other)\b'
    living_situation_pattern = r'\b(lives alone|with family|lives with her daughter|lives with husband|with partner|in assisted living|homeless|other)\b'
    income_level_pattern = r'\b(low|middle|high|upper class|lower class|working class|other)\b'
    access_to_healthcare_pattern = r'\b(insured|uninsured|medicaid|medicare|private insurance|no access|other)\b'
    relationship_status_pattern = r'\b(single|married|divorced|widowed|in a relationship|recently separated from her husband|other)\b'

    # Search for age
    age_match = re.search(age_pattern, text, re.IGNORECASE)
    demographics['age'] = int(age_match.group(1)) if age_match else None

    # Search for gender
    gender_match = re.search(gender_pattern, text, re.IGNORECASE)
    demographics['gender'] = gender_match.group(0) if gender_match else None

    # Search for race
    race_match = re.search(race_pattern, text, re.IGNORECASE)
    demographics['race'] = race_match.group(0) if race_match else None

    # Search for education
    education_match = re.search(education_pattern, text, re.IGNORECASE)
    demographics['education'] = education_match.group(0) if education_match else None

    # Search for occupation
    occupation_match = re.search(occupation_pattern, text, re.IGNORECASE)
    demographics['occupation'] = occupation_match.group(0) if occupation_match else None

    # Search for living situation
    living_situation_match = re.search(living_situation_pattern, text, re.IGNORECASE)
    demographics['living_situation'] = living_situation_match.group(0) if living_situation_match else None

    # Search for income level
    income_level_match = re.search(income_level_pattern, text, re.IGNORECASE)
    demographics['income_level'] = income_level_match.group(0) if income_level_match else None

    # Search for access to healthcare
    access_to_healthcare_match = re.search(access_to_healthcare_pattern, text, re.IGNORECASE)
    demographics['access_to_healthcare'] = access_to_healthcare_match.group(0) if access_to_healthcare_match else None

    # Search for relationship status
    relationship_status_match = re.search(relationship_status_pattern, text, re.IGNORECASE)
    demographics['relationship_status'] = relationship_status_match.group(0) if relationship_status_match else None

    return demographics

# Example usage
if __name__ == "__main__":
    new_summary = """004668411
CTMC
68299235
763052
9/29/1993 12:00:00 AM
Discharge Summary
Signed
DIS
Admission Date :
09/29/1993
Report Status :
Signed
Discharge Date :
10/04/1993
HISTORY OF PRESENT ILLNESS :
The patient is a 28-year-old woman who is HIV positive for two years .
She presented with left upper quadrant pain as well as nausea and vomiting which is a long-standing complaint .
She was diagnosed in 1991 during the birth of her child .
She claims she does not know why she is HIV positive .
She is from Maryland , apparently had no blood transfusions before the birth of her children so it is presumed heterosexual transmission .
At that time , she also had cat scratch fever and she had resection of an abscess in the left lower extremity .
She has not used any anti retroviral therapy since then , because of pancytopenia and vomiting on DDI .
She has complaints of nausea and vomiting as well as left upper quadrant pain on and off getting progressively worse over the past month .
She has had similar pain intermittently for last year .
She described the pain as a burning pain which is positional , worse when she walks or does any type of exercise .
She has no relief from antacids or H2 blockers .
In 10/92 , she had a CT scan which showed fatty infiltration of her liver diffusely with a 1 cm cyst in the right lobe of the liver .
She had a normal pancreas at that time , however , hyperdense kidneys .
Her alkaline phosphatase was slightly elevated but otherwise relatively normal .
Her amylase was mildly elevated but has been down since then .
The patient has had progressive failure to thrive and steady weight loss .
She was brought in for an esophagogastroduodenoscopy on 9/26 but she basically was not sufficiently sedated and readmitted at this time for a GI work-up as well as an evaluation of new abscess in her left lower calf and right medial lower extremity quadriceps muscle .
She was also admitted to be connected up with social services for HIV patients .
PAST MEDICAL HISTORY :
As above .
ALLERGIES :
BACTRIM .
MEDICATIONS :
On admission included Percocet , Prinovil , Dapsone , Mycelex troches .
SOCIAL HISTORY :
The patient was recently separated from her husband .
She lives with her daughter .
She does not drink , use IV drugs or smoke .
PHYSICAL EXAMINATION :
On admission revealed a cachetic woman in no acute distress with stable vital signs .
She was afebrile .
She was not orthostatic .
Blood pressure 110/80 .
HEENT exam was within normal limits .
Lungs were clear to auscultation and percussion bilaterally .
Cardiovascular exam revealed a regular rate and rhythm without murmur .
Abdomen was soft , nontender , nondistended with positive bowel sounds .
There was no hepatosplenomegaly .
Extremities revealed a 2 x 3 cm tender mass in the lateral left calf , medial 1 cm mass above her knee .
There was no evidence of edema .
LABORATORY DATA :
On admission included BUN / creatinine of 33/2.1 .
Sodium 141 .
Potassium 4.2 .
Hematocrit 23 .
White blood cell count was 2.1 with 56 polys and 1 band .
Platelet count 411,000 .
Amylase 143 .
Lipase was elevated to 600 .
ESR was greater than 140 .
Alkaline phosphatase 190 .
ALT 52 .
AST 65 .
Beta hCG was negative .
Urinalysis was positive for protein .
Bilirubin 0.4 .
Chest x-ray revealed clear lung fields .
There was no evidence of rib fracture .
HOSPITAL COURSE :
The patient was admitted and many cultures were sent which were all negative .
She did not have any of her pain in the hospital .
On the third hospital day , she did have some pain and was treated with Percocet .
She went for a debridement of her left calf lesion on 10/2/93 and was started empirically on IV ceftriaxone which was changed to po doxycycline on the day of discharge .
A follow-up CT scan was done which did not show any evidence for splenomegaly or hepatomegaly .
The 1 cm cyst which was seen in 10/92 was still present .
There was a question of a cyst in her kidney with a stone right below the cyst , although this did not seem to be clinically significant .
DISPOSITION :
The patient was discharged to home in stable condition .
Cultures were pending on her aspirate and will be treated with po doxycycline .
Dictated By :
JIMCHARL B. BUN , M.D. OC33
Attending :
I BUN , M.D. GR67 EF283/9675
Batch :
2027
Index No. BOKMII88JZ
D :
10/03/93
T :
10/07/93
    """


    summary = summarize_text(new_summary)


    summary = clean_text(summary)
    demographics = extract_demographics(summary)
    print("Extracted Demographics:")
    for key, value in demographics.items():
        print(f"{key.capitalize()}: {value}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Extracted Demographics:
Age: 28
Gender: woman
Race: None
Education: None
Occupation: None
Living_situation: lives with her daughter
Income_level: None
Access_to_healthcare: None
Relationship_status: None


For one summary just put summary at example usage

In [6]:
import re
import json
from transformers import BartForConditionalGeneration, BartTokenizer

def summarize_text(text):
    """Summarize the input text using BART."""
    # Load the BART model and tokenizer
    model_name = "facebook/bart-large-cnn"
    model = BartForConditionalGeneration.from_pretrained(model_name)
    tokenizer = BartTokenizer.from_pretrained(model_name)

    # Tokenize the input text
    inputs = tokenizer(text, return_tensors="pt", max_length=1024, truncation=True)
    summary_ids = model.generate(inputs["input_ids"], num_beams=4, min_length=400, max_length=500, early_stopping=True)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

def extract_demographics(text):
    """Extract demographics from the text using regular expressions."""
    demographics = {
        'age': None,
        'gender': None,
        'race': None,
        'education': None,
        'occupation': None,
        'living_situation': None,
        'income_level': None,
        'access_to_healthcare': None,
        'relationship_status': None,
    }

    # Regular expression patterns for age and gender
    age_pattern = r'(\d{1,3})\s*(?:years?|yr|year|yo|old)'
    gender_pattern = r'\b(male|female|man|woman|girl|boy|gentleman|lady)\b'
    race_pattern = r'\b(white|black|african american|asian|hispanic|latino|native american|other)\b'
    education_pattern = r'\b(high school|bachelor|master|phd|graduate|undergraduate|college)\b'
    occupation_pattern = r'\b(doctor|nurse|teacher|engineer|worker|manager|student|retired|unemployed|other)\b'
    living_situation_pattern = r'\b(lives alone|with family|lives with her daughter|lives with husband|with partner|in assisted living|homeless|other)\b'
    income_level_pattern = r'\b(low|middle|high|upper class|lower class|working class|other)\b'
    access_to_healthcare_pattern = r'\b(insured|uninsured|medicaid|medicare|private insurance|no access|other)\b'
    relationship_status_pattern = r'\b(single|married|divorced|widowed|in a relationship|recently separated from her husband|other)\b'

    # Search for age
    age_match = re.search(age_pattern, text, re.IGNORECASE)
    demographics['age'] = int(age_match.group(1)) if age_match else None

    # Search for gender
    gender_match = re.search(gender_pattern, text, re.IGNORECASE)
    demographics['gender'] = gender_match.group(0) if gender_match else None

    # Search for race
    race_match = re.search(race_pattern, text, re.IGNORECASE)
    demographics['race'] = race_match.group(0) if race_match else None

    # Search for education
    education_match = re.search(education_pattern, text, re.IGNORECASE)
    demographics['education'] = education_match.group(0) if education_match else None

    # Search for occupation
    occupation_match = re.search(occupation_pattern, text, re.IGNORECASE)
    demographics['occupation'] = occupation_match.group(0) if occupation_match else None

    # Search for living situation
    living_situation_match = re.search(living_situation_pattern, text, re.IGNORECASE)
    demographics['living_situation'] = living_situation_match.group(0) if living_situation_match else None

    # Search for income level
    income_level_match = re.search(income_level_pattern, text, re.IGNORECASE)
    demographics['income_level'] = income_level_match.group(0) if income_level_match else None

    # Search for access to healthcare
    access_to_healthcare_match = re.search(access_to_healthcare_pattern, text, re.IGNORECASE)
    demographics['access_to_healthcare'] = access_to_healthcare_match.group(0) if access_to_healthcare_match else None

    # Search for relationship status
    relationship_status_match = re.search(relationship_status_pattern, text, re.IGNORECASE)
    demographics['relationship_status'] = relationship_status_match.group(0) if relationship_status_match else None

    return demographics

# Example usage
if __name__ == "__main__":
    new_summary = """004668411
CTMC
68299235
763052
9/29/1993 12:00:00 AM
Discharge Summary
Signed
DIS
Admission Date :
09/29/1993
Report Status :
Signed
Discharge Date :
10/04/1993
HISTORY OF PRESENT ILLNESS :
The patient is a 28-year-old woman who is HIV positive for two years .
She presented with left upper quadrant pain as well as nausea and vomiting which is a long-standing complaint .
She was diagnosed in 1991 during the birth of her child .
She claims she does not know why she is HIV positive .
She is from Maryland , apparently had no blood transfusions before the birth of her children so it is presumed heterosexual transmission .
At that time , she also had cat scratch fever and she had resection of an abscess in the left lower extremity .
She has not used any anti retroviral therapy since then , because of pancytopenia and vomiting on DDI .
She has complaints of nausea and vomiting as well as left upper quadrant pain on and off getting progressively worse over the past month .
She has had similar pain intermittently for last year .
She described the pain as a burning pain which is positional , worse when she walks or does any type of exercise .
She has no relief from antacids or H2 blockers .
In 10/92 , she had a CT scan which showed fatty infiltration of her liver diffusely with a 1 cm cyst in the right lobe of the liver .
She had a normal pancreas at that time , however , hyperdense kidneys .
Her alkaline phosphatase was slightly elevated but otherwise relatively normal .
Her amylase was mildly elevated but has been down since then .
The patient has had progressive failure to thrive and steady weight loss .
She was brought in for an esophagogastroduodenoscopy on 9/26 but she basically was not sufficiently sedated and readmitted at this time for a GI work-up as well as an evaluation of new abscess in her left lower calf and right medial lower extremity quadriceps muscle .
She was also admitted to be connected up with social services for HIV patients .
PAST MEDICAL HISTORY :
As above .
ALLERGIES :
BACTRIM .
MEDICATIONS :
On admission included Percocet , Prinovil , Dapsone , Mycelex troches .
SOCIAL HISTORY :
The patient was recently separated from her husband .
She lives with her daughter .
She does not drink , use IV drugs or smoke .
PHYSICAL EXAMINATION :
On admission revealed a cachetic woman in no acute distress with stable vital signs .
She was afebrile .
She was not orthostatic .
Blood pressure 110/80 .
HEENT exam was within normal limits .
Lungs were clear to auscultation and percussion bilaterally .
Cardiovascular exam revealed a regular rate and rhythm without murmur .
Abdomen was soft , nontender , nondistended with positive bowel sounds .
There was no hepatosplenomegaly .
Extremities revealed a 2 x 3 cm tender mass in the lateral left calf , medial 1 cm mass above her knee .
There was no evidence of edema .
LABORATORY DATA :
On admission included BUN / creatinine of 33/2.1 .
Sodium 141 .
Potassium 4.2 .
Hematocrit 23 .
White blood cell count was 2.1 with 56 polys and 1 band .
Platelet count 411,000 .
Amylase 143 .
Lipase was elevated to 600 .
ESR was greater than 140 .
Alkaline phosphatase 190 .
ALT 52 .
AST 65 .
Beta hCG was negative .
Urinalysis was positive for protein .
Bilirubin 0.4 .
Chest x-ray revealed clear lung fields .
There was no evidence of rib fracture .
HOSPITAL COURSE :
The patient was admitted and many cultures were sent which were all negative .
She did not have any of her pain in the hospital .
On the third hospital day , she did have some pain and was treated with Percocet .
She went for a debridement of her left calf lesion on 10/2/93 and was started empirically on IV ceftriaxone which was changed to po doxycycline on the day of discharge .
A follow-up CT scan was done which did not show any evidence for splenomegaly or hepatomegaly .
The 1 cm cyst which was seen in 10/92 was still present .
There was a question of a cyst in her kidney with a stone right below the cyst , although this did not seem to be clinically significant .
DISPOSITION :
The patient was discharged to home in stable condition .
Cultures were pending on her aspirate and will be treated with po doxycycline .
Dictated By :
JIMCHARL B. BUN , M.D. OC33
Attending :
I BUN , M.D. GR67 EF283/9675
Batch :
2027
Index No. BUN493"""

    summary = summarize_text(new_summary)
    summary = clean_text(summary)
    extracted_demographics = extract_demographics(summary)
    demographics_json = json.dumps(extracted_demographics, indent=4)
    print(demographics_json)


{
    "age": 28,
    "gender": "woman",
    "race": null,
    "education": null,
    "occupation": null,
    "living_situation": "lives with her daughter",
    "income_level": null,
    "access_to_healthcare": null,
    "relationship_status": null
}


In [19]:
# This function below takes the summary as input and output the result for that summary
def summarize_and_extract_demographics(summary):
    """Summarize text and extract demographics in JSON format."""
    summary = summarize_text(summary)
    demographics = extract_demographics(summary)
    return json.dumps(demographics, indent=4)

# Function call for a specified summary
def get_result(text):
  result = summarize_and_extract_demographics(summary)
  print(result)



In [20]:
# Example usage

text= input('')
print(text)

John Smith is a 45-year-old male who has been admitted to the hospital for a routine health check-up following a recent spike in his blood pressure. He works as a project manager at a well-known construction firm, where he has been employed for over 15 years. John is known for his strong leadership skills and ability to manage multiple projects simultaneously, which has earned him several accolades within his company. In terms of his personal life, John is married to Lisa, a 42-year-old school teacher. They have been married for 20 years and have two children, aged 10 and 15. The family enjoys spending quality time together, often engaging in outdoor activities such as hiking and biking during the weekends. John’s supportive spouse, Lisa, plays a crucial role in maintaining a healthy lifestyle for the family, often preparing nutritious meals and encouraging regular exercise. Despite his busy work schedule, John makes it a priority to attend family events and school functions, demonstra

In [21]:
get_result(text)

{
    "age": null,
    "gender": null,
    "race": null,
    "education": null,
    "occupation": null,
    "living_situation": null,
    "income_level": "Middle",
    "access_to_healthcare": null,
    "relationship_status": null
}


    T5 model

In [None]:
# T5 MODEL IMPLEMENTATION FOR THE ATTRIBUTE EXTRACTION

import pandas as pd
import json
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import T5Tokenizer, T5ForConditionalGeneration, AdamW

# Step 1: Load Your Data
file_path = '/content/drive/MyDrive/7 aug/Clinical_Dataset (2).xlsx'
data = pd.read_excel(file_path)

# Step 2: Define a Function to Fix JSON Format
def fix_json_format(x):
    if isinstance(x, str):
        x = x.replace("'", '"')
        x = x.replace('"s ', "'s ")
        x = x.replace('d/c" home', 'd/c home')
        x = x.replace('"s office', "'s office")
        x = x.replace(' " ', ' ')
        return x
    return x


data['sdoh_proceessed'] = data['sdoh_proceessed'].apply(fix_json_format)

# Step 3: Define a Function to Safely Load JSON
def safe_json_loads(x):
    try:
        if isinstance(x, dict):
            return x
        elif isinstance(x, str):
            return json.loads(x)
        else:
            return {}
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON: {e} for value: {x}")
        return {}

# Step 4: Prepare the Input and Output Data
X = data['Discharge Summary'].values
y = data['sdoh_proceessed'].apply(safe_json_loads).values

# Convert y to JSON string format if needed
y_json_strings = [json.dumps(item) for item in y]

# Step 5: Initialize the Tokenizer
tokenizer = T5Tokenizer.from_pretrained('t5-small')

# Step 6: Define the Chunking Function
def chunk_text(text, tokenizer, max_length=512, stride=256):
    tokens = tokenizer.encode(text, truncation=False)
    chunks = []
    for i in range(0, len(tokens), stride):
        chunk = tokens[i:i + max_length]
        if len(chunk) < max_length:
            chunk += [tokenizer.pad_token_id] * (max_length - len(chunk))
        chunks.append(chunk)
    return chunks

# Step 8: Chunk Inputs Using the Chunking Function
X_chunked = []
for summary in X:
    chunks = chunk_text(summary, tokenizer)
    X_chunked.extend(chunks)

# Step 7: Tokenize Outputs
y_tokenized = [tokenizer.encode(label, padding='max_length', truncation=True, max_length=512) for label in y_json_strings]

# Step 8: Create a Custom Dataset Class
class SDOHDataset(Dataset):
    def __init__(self, inputs, outputs):
        self.inputs = inputs
        self.outputs = outputs

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        return {
            'input_ids': torch.tensor(self.inputs[idx], dtype=torch.long),
            'attention_mask': torch.tensor([1 if i != tokenizer.pad_token_id else 0 for i in self.inputs[idx]], dtype=torch.long),
            'labels': torch.tensor(self.outputs[idx % len(self.outputs)], dtype=torch.long)  # Use modulo to cycle through outputs
        }

# Step 9: Create the Dataset and DataLoader
dataset = SDOHDataset(X_chunked, y_tokenized)
dataloader = DataLoader(dataset, batch_size=8, shuffle=True)

# Step 10: Train the Model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = T5ForConditionalGeneration.from_pretrained('t5-small').to(device)
optimizer = AdamW(model.parameters(), lr=5e-5)

# Training loop
model.train()
for epoch in range(10):  # Number of epochs
    for batch in dataloader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        print(f"Epoch: {epoch}, Loss: {loss.item()}")

# Step 11: Inference
model.eval()
new_summary = "Your new discharge summary text here."  # Replace with your test summary
input_chunks = chunk_text(new_summary, tokenizer)

# Generate predictions for each chunk
predictions = []
for chunk in input_chunks:
    input_ids = torch.tensor(chunk).unsqueeze(0).to(device)  # Add batch dimension and move to device
    with torch.no_grad():
        outputs = model.generate(input_ids)
        result = tokenizer.decode(outputs[0], skip_special_tokens=True)
        predictions.append(result)

# Combine predictions
final_prediction = " ".join(predictions)
print(final_prediction)

Error decoding JSON: Expecting ',' delimiter: line 1 column 102 (char 101) for value: {"age": "49y", "gender": "", "race": "", "education": "", "occupation": "", "living_situation": "d/c"d home", "income_level": "", "access_to_healthcare": "", "relationship_status": ""}


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Token indices sequence length is longer than the specified maximum sequence length for this model (1796 > 512). Running this sequence through the model will result in indexing errors


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Epoch: 0, Loss: 0.24264861643314362
Epoch: 0, Loss: 0.22368082404136658
Epoch: 0, Loss: 0.2363816648721695
Epoch: 0, Loss: 0.24816574156284332
Epoch: 0, Loss: 0.2465936690568924
Epoch: 0, Loss: 0.21368855237960815
Epoch: 0, Loss: 0.20904693007469177
Epoch: 0, Loss: 0.19734910130500793
Epoch: 0, Loss: 0.22359493374824524
Epoch: 0, Loss: 0.24259771406650543
Epoch: 0, Loss: 0.24473652243614197
Epoch: 0, Loss: 0.19862601161003113
Epoch: 0, Loss: 0.21829527616500854
Epoch: 0, Loss: 0.26773685216903687
Epoch: 0, Loss: 0.26567012071609497
Epoch: 0, Loss: 0.47003790736198425
Epoch: 0, Loss: 0.23449642956256866
Epoch: 0, Loss: 0.2417486310005188
Epoch: 0, Loss: 0.2607777416706085
Epoch: 0, Loss: 0.20177814364433289
Epoch: 0, Loss: 0.26656171679496765
Epoch: 0, Loss: 0.2322312444448471
Epoch: 0, Loss: 0.24530015885829926
Epoch: 0, Loss: 0.1979166865348816
Epoch: 0, Loss: 0.6823865175247192
Epoch: 0, Loss: 0.35225051641464233
Epoch:



"age": "65", "gender": "M", "race


In [None]:
import os

# Step 1: Define the directory where you want to save the model
output_dir = '/content/drive/MyDrive/saved_model/'

# Step 2: Create the directory if it doesn't exist
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Step 3: Save the model and tokenizer
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

print(f"Model and tokenizer saved to {output_dir}")

Model and tokenizer saved to /content/drive/MyDrive/saved_model/


In [None]:
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration
import json

# Step 1: Load the Pre-trained Model and Tokenizer
model_path = '/content/drive/MyDrive/saved_model'
tokenizer = T5Tokenizer.from_pretrained(model_path)
model = T5ForConditionalGeneration.from_pretrained(model_path)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Step 2: Define the Discharge Summary
new_summary = """i am boy and my age 24"""

# Step 3: Chunk the Input Text
def chunk_text(text, tokenizer, max_length=512, stride=256):
    tokens = tokenizer.encode(text, truncation=False)
    chunks = []
    for i in range(0, len(tokens), stride):
        chunk = tokens[i:i + max_length]
        if len(chunk) < max_length:
            chunk += [tokenizer.pad_token_id] * (max_length - len(chunk))
        chunks.append(chunk)
    return chunks

input_chunks = chunk_text(new_summary, tokenizer)

# Step 4: Generate Predictions
predictions = []
batch_size = 4

for i in range(0, len(input_chunks), batch_size):
    batch = input_chunks[i:i + batch_size]
    input_ids = torch.tensor(batch).to(device)

    with torch.no_grad():
        outputs = model.generate(
            input_ids,
            max_new_tokens=100,
            repetition_penalty=1.2,
            no_repeat_ngram_size=2,
        )
        results = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
        predictions.extend(results)

# Step 5: Combine Predictions and Format Output
final_prediction = " ".join(predictions)

# Step 6: Format the Output
output = {
    'age': '',
    'gender': '',
    'race': '',
    'education': '',
    'occupation': '',
    'living_situation': '',
    'income_level': '',
    'access_to_healthcare': '',
    'relationship_status': ''
}

# Example parsing logic (you may need to modify this based on your actual output)
for line in final_prediction.split(','):
    key_value = line.split(':')
    if len(key_value) == 2:
        key = key_value[0].strip().replace('"', '')
        value = key_value[1].strip().replace('"', '')
        if key in output:
            output[key] = value

# Print the formatted output
print(json.dumps(output, indent=4))

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


{
    "age": "65",
    "gender": "",
    "race": "",
    "education": "",
    "occupation": "",
    "living_situation": "",
    "income_level": "",
    "access_to_healthcare": "",
    "relationship_status": ""
}


In [None]:
!pip install spacy fuzzywuzzy
!python -m spacy download en_core_web_sm

Collecting fuzzywuzzy
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl.metadata (4.9 kB)
Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Installing collected packages: fuzzywuzzy
Successfully installed fuzzywuzzy-0.18.0
Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m40.9 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
!pip install transformers torch



In [None]:
import re
from transformers import BartForConditionalGeneration, BartTokenizer

def summarize_text(text):
    # Load the pre-trained BART model and tokenizer
    model_name = "facebook/bart-large-cnn"
    model = BartForConditionalGeneration.from_pretrained(model_name)
    tokenizer = BartTokenizer.from_pretrained(model_name)

    # Tokenize the input text
    inputs = tokenizer(text, return_tensors="pt", max_length=1024, truncation=True)

    # Generate summary
    summary_ids = model.generate(inputs["input_ids"], num_beams=4, min_length=30, max_length=200, early_stopping=True)

    # Decode the summary
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

def extract_age(text):
    """Extract age from the text using regular expressions."""
    # Look for patterns like "65 year old", "70 yo", etc.
    age_pattern = r'(\d{1,3})\s*(?:years?|yr|yo|old)'
    match = re.search(age_pattern, text, re.IGNORECASE)
    if match:
        return int(match.group(1))
    return None

# Example usage
if __name__ == "__main__":
    new_summary = """405868244 YC
70449076
5586564
11/16/2005 12:00:00 AM
POSTPARTUM
DIS
Admission Date :
11/16/2005
Report Status :
Discharge Date :
11/19/2005
****** FINAL DISCHARGE ORDERS ******
WARESIN , SAGAY 702-84-43-7 L06
Room :
DBM62-8758
Service :
OBS
DISCHARGE PATIENT ON :
11/18/05 AT 02:00 PM
CONTINGENT UPON
cnm evaluation
WILL D / C ORDER BE USED AS THE D / C SUMMARY :
YES
Attending :
OBSTETRICS SERVICE , YC , M.D.
DISPOSITION :
Home
DISCHARGE MEDICATIONS :
DOCUSATE SODIUM 100 MG PO BID PRN Constipation IBUPROFEN 400-600 MG PO Q6H PRN Pain
Food / Drug Interaction Instruction
Take with food PRENATAL MULTIVITAMINS ( STUARTNATAL ) 1 TAB PO QD MAALOX-TABLETS QUICK DISSOLVE / CHEWABLE 1-2 TAB PO Q6H PRN Upset Stomach
DIET :
No Restrictions
ACTIVITY :
Resume regular exercise
FOLLOW UP APPOINTMENT ( S ) :
BROOKSIDE 2 wks ,
ALLERGY :
NKA ADMIT DIAGNOSIS :
pregnancy , term
PRINCIPAL DISCHARGE DIAGNOSIS ;
Responsible After Study for Causing Admission )
POSTPARTUM OTHER DIAGNOSIS ;
Conditions , Infections , Complications , affecting Treatment / Stay postpartum
OPERATIONS AND PROCEDURES :
Vaginal delivery :
Spontaneous
Laceration :
Perineal , 1st degree
Placenta :
Delivery :
Spontaneous
Condition :
Normal
Certified Nurse Midwife :
NEAD , MANERTNY , CNM
OTHER TREATMENTS / PROCEDURES ( NOT IN O.R. )
repair of first degree perineal laceration
BRIEF RESUME OF HOSPITAL COURSE :
G2 T1 P0 A0 L1 Estimated
EDC :
12/02/05
Maternal transfer :
No
Initial newborn exam :
No observed abnormalities
Baby 1
MRN :
26793023
Delivery date :
11/16/05 02:21 PM
Apgars 9,9
Weight :
7lb 4.8 oz 3311 grams
Sex :
Female
ADDITIONAL COMMENTS :
DISCHARGE CONDITION :
Stable
TO DO / PLAN :
No dictated summary
ENTERED BY :
JESCCOT , LA P. , C.N.M ( YW8 )
11/17/05 08:24 AM
****** END OF DISCHARGE ORDERS ******"""

    # Summarize the text
    summary = summarize_text(new_summary)
    print("Summary:")
    print(summary)

    # Extract age from the summary
    age = extract_age(summary)
    if age is not None:
        print(f"The patient's age is {age}.")
    else:
        print("Age not found in the summary.")

Summary:
Admission Date : 11/16/2005 12:00:00am. Discharge Date: 11/19/2005 1:30:00pm.
Age not found in the summary.


In [None]:
import re
from transformers import BartForConditionalGeneration, BartTokenizer

def summarize_text(text):
    # Load the pre-trained BART model and tokenizer
    model_name = "facebook/bart-large-cnn"
    model = BartForConditionalGeneration.from_pretrained(model_name)
    tokenizer = BartTokenizer.from_pretrained(model_name)

    # Tokenize the input text
    inputs = tokenizer(text, return_tensors="pt", max_length=1024, truncation=True)

    # Generate summary
    summary_ids = model.generate(inputs["input_ids"], num_beams=4, min_length=30, max_length=200, early_stopping=True)

    # Decode the summary
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

def extract_age_and_gender(text):
    """Extract age and gender from the text using regular expressions."""
    # Regular expression patterns for age and gender
    age_pattern = r'(\d{1,3})\s*(?:years?|yr|yo|old)'
    gender_pattern = r'\b(male|female|man|woman|girl|boy|gentleman|lady)\b'

    # Search for age
    age_match = re.search(age_pattern, text, re.IGNORECASE)
    age = int(age_match.group(1)) if age_match else None

    # Search for gender
    gender_match = re.search(gender_pattern, text, re.IGNORECASE)
    gender = gender_match.group(0) if gender_match else None

    return age, gender

# Example usage
if __name__ == "__main__":
    new_summary = """T405868244 YC
70449076
5586564
11/16/2005 12:00:00 AM
POSTPARTUM
DIS
Admission Date :
11/16/2005
Report Status :
Discharge Date :
11/19/2005
****** FINAL DISCHARGE ORDERS ******
WARESIN , SAGAY 702-84-43-7 L06
Room :
DBM62-8758
Service :
OBS
DISCHARGE PATIENT ON :
11/18/05 AT 02:00 PM
CONTINGENT UPON
cnm evaluation
WILL D / C ORDER BE USED AS THE D / C SUMMARY :
YES
Attending :
OBSTETRICS SERVICE , YC , M.D.
DISPOSITION :
Home
DISCHARGE MEDICATIONS :
DOCUSATE SODIUM 100 MG PO BID PRN Constipation IBUPROFEN 400-600 MG PO Q6H PRN Pain
Food / Drug Interaction Instruction
Take with food PRENATAL MULTIVITAMINS ( STUARTNATAL ) 1 TAB PO QD MAALOX-TABLETS QUICK DISSOLVE / CHEWABLE 1-2 TAB PO Q6H PRN Upset Stomach
DIET :
No Restrictions
ACTIVITY :
Resume regular exercise
FOLLOW UP APPOINTMENT ( S ) :
BROOKSIDE 2 wks ,
ALLERGY :
NKA ADMIT DIAGNOSIS :
pregnancy , term
PRINCIPAL DISCHARGE DIAGNOSIS ;
Responsible After Study for Causing Admission )
POSTPARTUM OTHER DIAGNOSIS ;
Conditions , Infections , Complications , affecting Treatment / Stay postpartum
OPERATIONS AND PROCEDURES :
Vaginal delivery :
Spontaneous
Laceration :
Perineal , 1st degree
Placenta :
Delivery :
Spontaneous
Condition :
Normal
Certified Nurse Midwife :
NEAD , MANERTNY , CNM
OTHER TREATMENTS / PROCEDURES ( NOT IN O.R. )
repair of first degree perineal laceration
BRIEF RESUME OF HOSPITAL COURSE :
G2 T1 P0 A0 L1 Estimated
EDC :
12/02/05
Maternal transfer :
No
Initial newborn exam :
No observed abnormalities
Baby 1
MRN :
26793023
Delivery date :
11/16/05 02:21 PM
Apgars 9,9
Weight :
7lb 4.8 oz 3311 grams
Sex :
Female
ADDITIONAL COMMENTS :
DISCHARGE CONDITION :
Stable
TO DO / PLAN :
No dictated summary
ENTERED BY :
JESCCOT , LA P. , C.N.M ( YW8 )
11/17/05 08:24 AM
****** END OF DISCHARGE ORDERS ******
    """

    # Summarize the text (optional)
    summary = summarize_text(new_summary)
    print("Summary:")
    print(summary)

    # Extract age and gender from the summary
    age, gender = extract_age_and_gender(summary)
    print(f"Extracted Age: {age}, Extracted Gender: {gender}")

Summary:
Postpartum admission date is 11/16/2005. Postpartum discharge dates are 11/19/2005 and 11/20/2005, respectively. The patient will be referred to a midwife.
Extracted Age: None, Extracted Gender: None
