# 📋 01 - Clinical Notes EDA & Preprocessing

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import string
import re
from wordcloud import WordCloud
from nltk.corpus import stopwords
# Fetch the NLTK stop-word corpus so that stopwords.words() can load it.
nltk.download('stopwords')
# Keep the English stop words in a set: clean_text tests membership once per
# token, and set lookups are O(1).
stop_words = {word for word in stopwords.words('english')}

## 🗂 Load Dataset

In [None]:
# Read the notes CSV; "created_at" is parsed into datetimes at load time.
df = pd.read_csv(
    "../data/synthetic_clinical_notes.csv",
    parse_dates=["created_at"],
)
# Preview the first rows (rendered as the cell output).
df.head()

## 📊 Note Type & Department Distribution

In [None]:
# Draw one count plot per categorical column, each in its own figure.
# Tick labels are rotated 45 degrees so category names do not overlap.
for count_column, plot_title in (
    ('note_type', "Distribution of Note Types"),
    ('department', "Department-wise Notes"),
):
    sns.countplot(data=df, x=count_column)
    plt.xticks(rotation=45)
    plt.title(plot_title)
    plt.show()

## 🧼 Text Cleaning Function

In [None]:
def clean_text(text, stopword_set=None):
    """Normalise a free-text note into a space-separated string of tokens.

    Steps: turn every whitespace character into a plain space, delete all
    characters other than ASCII letters and spaces, lower-case, split on
    whitespace, then drop stop words and tokens of two characters or fewer.

    Note that non-letter characters are deleted rather than replaced, so
    "follow-up" becomes the single token "followup".

    Args:
        text: The raw note. Any non-string value (for example the NaN that
            pandas uses for an empty CSV cell, or None) is treated as an
            empty note.
        stopword_set: Optional collection of lower-case words to remove.
            When None, the module-level ``stop_words`` set is used.

    Returns:
        The cleaned tokens joined by single spaces; "" when nothing remains.
    """
    # Missing values reach this function as float NaN / None via Series.apply;
    # re.sub would raise TypeError on them.
    if not isinstance(text, str):
        return ""
    if stopword_set is None:
        stopword_set = stop_words

    # Replace *all* whitespace (tabs, carriage returns, ...) with a space
    # first; otherwise the letters-only filter below would delete it and fuse
    # the neighbouring words together.
    text = re.sub(r"\s", " ", text)
    text = re.sub(r"[^a-zA-Z ]", "", text)
    tokens = text.lower().split()
    return " ".join(
        w for w in tokens if len(w) > 2 and w not in stopword_set
    )

# Run clean_text over every raw note and store the result in a new column.
raw_notes = df['note_text']
df['clean_note'] = raw_notes.apply(clean_text)

# Show raw and cleaned text side by side for the first three notes.
preview_columns = ['note_text', 'clean_note']
df[preview_columns].head(3)

## ☁️ Word Cloud of All Notes

In [None]:
# Concatenate every cleaned note into one corpus string for the word cloud.
all_words = " ".join(df['clean_note'])

# Configure the cloud, then build it from the corpus.
wordcloud = WordCloud(
    width=800,
    height=400,
    background_color='white',
).generate(all_words)

# Render the cloud as an image with the axes hidden.
plt.figure(figsize=(15, 7))
plt.imshow(wordcloud, interpolation='bilinear')
plt.title("Common Terms in Clinical Notes")
plt.axis("off")
plt.show()