In [5]:
import pandas as pd
import numpy as np
from sklearn.utils import resample

# Read the labelled clinical notes into a DataFrame and preview the first rows.
# NOTE(review): the preview shows an 'Unnamed: 0' column, which looks like an
# index that was written out when the CSV was saved -- confirm whether it
# should be dropped (e.g. index_col=0) before it propagates downstream.
df = pd.read_csv(
    filepath_or_buffer='../data/cleaned_data/classified_clinical_notes.csv',
)
df.head()

Unnamed: 0,notes,classification
0,"anaes in to see pt, report called to receiving...",0
1,Patient reports speaking/understanding both in...,0
2,"Ambulated to bathroom with assistance, voided ...",0
3,Problem: Knowledge Deficit Goal: Understands d...,1
4,Pt states feeling occasional vaginal pressure....,0


In [7]:
# Tally how many notes carry each label in the 'classification' column:
# 0 -> nonrelated, 1 -> breastfeeding, 2 -> bottlefeeding, 3 -> both.
nonrelated_amount = df['classification'].eq(0).sum()
breastfeeding_amount = df['classification'].eq(1).sum()
bottlefeeding_amount = df['classification'].eq(2).sum()
both_amount = df['classification'].eq(3).sum()

# Printed in label order 0, 1, 2, 3.
print(nonrelated_amount, breastfeeding_amount, bottlefeeding_amount, both_amount)

8691 847 141 320
notes                                                                                                                                                                                                                                                                                classification
I reviewed the patient's progress and care provided during the 24 hours for which this note is meant.  I agree with the evaluation and plan as written in the resident's note.  I was/am available to the patient and her care team.  [**NAME**]                                     0                 19
I have reviewed the rationale for discharge. I agree with the resident/mid -wife's/advanced practice nurse's note and f/u plans for care.  [**NAME**]                                                                                                                                0                 19
I reviewed the patient's history, physical exam findings and plan of care with the resident see

In [4]:
# Size of the rarest class, plus 1.5x that size truncated to an integer.
# NOTE(review): target_size is not referenced by the later cells visible here
# (the undersampling cell uses its own hard-coded target) -- confirm intent.
smallest_class_size = min(
    nonrelated_amount,
    breastfeeding_amount,
    bottlefeeding_amount,
    both_amount,
)
target_size = int(1.5 * smallest_class_size)
print(smallest_class_size, target_size)

141 211


In [9]:
# Number of nonrelated (label 0) notes to keep after undersampling.
target_nonrelated_size = 700

# Undersample the nonrelated class without replacement.
# resample(replace=False) raises if asked for more rows than exist, so the
# request is capped at the number of nonrelated rows actually available.
df_nonrelated = df[df['classification'] == 0]
df_nonrelated = resample(df_nonrelated,
                         replace=False,  # No replacement for undersampling
                         n_samples=min(target_nonrelated_size, len(df_nonrelated)),
                         random_state=42)  # fixed seed -> reproducible subset

# Keep the other classes as they are
df_breastfeeding = df[df['classification'] == 1]
df_bottlefeeding = df[df['classification'] == 2]
df_both = df[df['classification'] == 3]

# Concatenate the datasets
df_balanced = pd.concat([df_nonrelated, df_breastfeeding, df_bottlefeeding, df_both])

# Shuffle the dataset to mix the classes, then renumber the index from 0
df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

# A bare `df_balanced.head()` in the middle of a cell is evaluated but never
# rendered (only a cell's last expression is shown), so it was removed.
# Preview the frame in its own cell if needed.

# Print the new class distribution to confirm balancing
print(df_balanced['classification'].value_counts())

classification
1    847
0    700
3    320
2    141
Name: count, dtype: int64


In [10]:
# spaCy is used below for removing stop words and lemmatizing text.
import spacy
# Load the 'en_core_web_lg' pipeline once at module level; preprocess_text
# reads this `nlp` object every time it is called.
nlp = spacy.load('en_core_web_lg')
def preprocess_text(text):
    """Drop stop words from ``text`` and replace each remaining token by its lemma.

    The text is run through the module-level spaCy pipeline ``nlp``; every
    token whose ``is_stop`` flag is false contributes its ``lemma_``, and the
    lemmas are joined with single spaces.

    Args:
        text: The note to process. Anything that is not a ``str`` (for example
            the float NaN pandas uses for an empty CSV cell, or ``None``) is
            treated as a missing note.

    Returns:
        The space-joined lemmas as a ``str``; ``''`` for a missing note.
    """
    # The spaCy pipeline only accepts text; passing NaN/None would raise, so
    # missing notes short-circuit to an empty string instead.
    if not isinstance(text, str):
        return ''

    doc = nlp(text)
    # Remove stop words and lemmatize
    tokens = [token.lemma_ for token in doc if not token.is_stop]

    # Join tokens back into a string
    return ' '.join(tokens)

# Remove stop words and lemmatize every note.
df_balanced['notes'] = df_balanced['notes'].apply(preprocess_text)

# Lowercase the text, then remove punctuation (every character that is
# neither a word character nor whitespace). The pattern is a raw string so
# that \w and \s reach the regex engine instead of being parsed as (invalid)
# Python string escapes.
df_balanced['notes'] = (
    df_balanced['notes']
    .str.lower()
    .str.replace(r'[^\w\s]', '', regex=True)
)

# Write the cleaned notes to disk without the DataFrame index.
df_balanced.to_csv('../data/cleaned_data/final_notes.csv', index=False)