# Treat Clinical notes

**COLAB link** https://colab.research.google.com/github/samsung-ai-course/6-7-edition/blob/main/NLP/Computers%20dont%20read%20numbers/clinical_notes.ipynb

In [None]:
# Standard library imports
import re
import requests 
from tqdm import tqdm 

# Data manipulation libraries
import pandas as pd
import numpy as np

# Data visualization libraries
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from PIL import Image


# NLTK imports
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK resources this notebook relies on (only needs to run once
# per environment).
for resource in (
    'punkt',
    'punkt_tab',
    'stopwords',
    'wordnet',
    'averaged_perceptron_tagger',
):
    nltk.download(resource)


# Read files into a Data Frame

In [None]:
# Raw-file location of the course material on GitHub.
base_url = "https://raw.githubusercontent.com/samsung-ai-course/6-7-edition/refs/heads/main/NLP/Computers%20dont%20read%20numbers/"

# Fetch the index file; each line is used below as the name of one note file.
response = requests.get(base_url + "directories.txt", timeout=30)
response.raise_for_status()  # fail loudly instead of parsing an error page as a file list
# Skip blank entries (e.g. the empty string a trailing newline produces).
files_list = [name.strip() for name in response.text.split("\n") if name.strip()]

notes = []
# Run this cell only once: GitHub may block the requests if it is re-run many times.
for file in tqdm(files_list[:15]):
    note = requests.get(base_url + "data/" + file, timeout=30)
    note.raise_for_status()  # never store an HTTP error body as if it were a note
    notes.append(note.text)

In [None]:
# Bare expression: in a notebook this displays the downloaded note texts for inspection.
notes

In [4]:
# Everything in this cell is already implemented; nothing needs to be filled in.
# Replacement of placeholder values with "None" (exercise, already solved).
def replace_placeholders(text, placeholder="___", replacement="None"):
    """Return *text* with every occurrence of *placeholder* swapped for *replacement*.

    Args:
        text: The string to process.
        placeholder: Substring to look for (defaults to ``"___"``).
        replacement: String substituted for each occurrence (defaults to
            the literal string ``"None"``).

    Returns:
        A new string; *text* itself is not modified.
    """
    substituted = text.replace(placeholder, replacement)
    return substituted

# Extracting key information using regex
def extract_information(text):
    """Pull the known sections of a clinical note out of *text* with regexes.

    Each field has one regular expression with a single capture group. The
    patterns are searched with ``re.DOTALL`` so a captured section may span
    several lines.

    Args:
        text: Full text of one clinical note.

    Returns:
        A dict mapping every field name to the stripped captured text, or
        to the string ``"None"`` when the field's pattern does not match.
    """
    field_patterns = {
        "Name": r"Name:\s+(.*?)\s+Unit No",
        "Unit No": r"Unit No:\s+(.*?)\n",
        "Admission Date": r"Admission Date:\s+(.*?)\s+Discharge Date",
        "Discharge Date": r"Discharge Date:\s+(.*?)\n",
        "Date of Birth": r"Date of Birth:\s+(.*?)\s+Sex",
        "Sex": r"Sex:\s+(\w)",
        "Service": r"Service:\s+(.*?)\n",
        "Allergies": r"Allergies:\s+(.*?)\n",
        "Chief Complaint": r"Chief Complaint:\n(.*?)\n",
        "Major Surgical or Invasive Procedure": r"Major Surgical or Invasive Procedure:\n(.*?)\n",
        "History of Present Illness": r"History of Present Illness:\n(.*?)\n\n",
        "Review of systems": r"Review of systems:\s+(.*?)\n\n",
        "Past Medical History": r"Past Medical History:\n(.*?)\n\n",
        "Social History": r"Social History:\n(.*?)\n",
        "Family History": r"Family History:\n(.*?)\n\n",
        "Physical Exam": r"Physical Exam:\n(.*?)\n\n",
        "Brief Hospital Course": r"Brief Hospital Course:\n(.*?)\n\n",
        "Discharge Diagnosis": r"Discharge Diagnosis:\n(.*?)\n\n",
        "Discharge Condition": r"Discharge Condition:\n(.*?)\n\n",
        "Discharge Instructions": r"Discharge Instructions:\n(.*?)\n\n",
    }

    def first_capture(regex):
        # "None" (the string) marks a field whose pattern did not match.
        found = re.search(regex, text, re.DOTALL)
        if found is None:
            return "None"
        return found.group(1).strip()

    return {field: first_capture(regex) for field, regex in field_patterns.items()}
def structure_data(clinical_text):
    """Turn one raw clinical note into a dict of extracted fields.

    The note first has its placeholders replaced (``replace_placeholders``
    with its default arguments), then the fields are extracted with
    ``extract_information``.

    Args:
        clinical_text: Raw text of a single clinical note.

    Returns:
        The dict produced by ``extract_information``.
    """
    return extract_information(replace_placeholders(clinical_text))

# One dict of extracted fields per downloaded note, in download order.
structure_notes = list(map(structure_data, notes))


# Null values treatment

In [None]:
# Build one row per note. Missing fields carry the string 'None', so convert
# them to real NaN values that pandas' null handling recognises.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
df = pd.DataFrame(structure_notes).replace('None', np.nan)
df.head(3)

In [None]:
# Number of missing (NaN) entries in each column.
df.isna().sum()

In [11]:
# Keep only the columns that have no NaN in any row.
df_treated = df.dropna(axis="columns")

# Complete it yourself

In [12]:
# Start building the corpus: join every note's discharge diagnosis into one
# space-separated string.
diagnosis_texts = df_treated['Discharge Diagnosis'].tolist()
discharged_diagnosis = ' '.join(diagnosis_texts)

In [None]:

# Function to clean individual lines
def clean_line(line):
    """Clean a single line of text.

    Removes HTML-like tags, drops non-ASCII characters and strips leading
    and trailing whitespace.

    Args:
        line: One line of text (no processing is done across lines).

    Returns:
        The cleaned line; may be the empty string.
    """
    # Remove HTML-like tags. A tag must start with a letter (optionally
    # preceded by "/"), so comparisons such as "<120 and >80" are kept.
    line = re.sub(r'</?[A-Za-z][^<>]*>', '', line)
    # Remove unwanted characters (e.g., non-ASCII characters)
    line = re.sub(r'[^\x00-\x7F]+', '', line)
    # Strip leading/trailing whitespace
    return line.strip()

def speaking_text_cleaner(speaking_text):
    """Clean *speaking_text* line by line and drop lines that end up empty.

    Args:
        speaking_text: Text that may span several lines.

    Returns:
        The cleaned, non-empty lines joined with ``"\\n"``.
    """
    candidates = (clean_line(raw) for raw in speaking_text.splitlines())
    # Lines that are empty after cleaning are left out of the result.
    return "\n".join(kept for kept in candidates if kept)

# Apply the line-by-line cleaner to the joined discharge-diagnosis text.
cleaned_discharged_diagnosis = speaking_text_cleaner(discharged_diagnosis)

In [8]:
def corpus_cleaner(cleaned_text, n_min=4):
    """Turn cleaned text into a list of normalised tokens.

    Steps, in order: replace non-word characters with spaces, remove
    digits, tokenize, drop short tokens, lower-case, remove English
    stopwords, lemmatize.

    Args:
        cleaned_text: Text to process.
        n_min: Length threshold; only tokens strictly longer than this
            many characters are kept.

    Returns:
        List of lemmatized, lower-cased tokens in their original order.
    """
    lemmatizer = WordNetLemmatizer()
    # Build the stopword set once; calling stopwords.words() for every token
    # re-creates the whole list each time.
    stop_words = set(stopwords.words('english'))

    pre = re.sub(r'\W', ' ', cleaned_text)                  # Non-word characters -> space.
    pre = re.sub(r'\d+', ' ', pre)                          # Remove numbers.
    tokens = nltk.word_tokenize(pre)                        # Tokenize into words.
    tokens = [x.lower() for x in tokens if len(x) > n_min]  # Minimum length, then lowercase.
    tokens = [x for x in tokens if x not in stop_words]     # Remove stopwords.
    return [lemmatizer.lemmatize(x) for x in tokens]        # Lemmatize.

In [None]:
corpus_clean  = corpus_cleaner( cleaned_discharged_diagnosis )

# Count the words

In [None]:
from collections import Counter

# Choose the words you want to exclude from the analysis.
remove_words = ['primary', 'secondary', 'going', 'diagnosis', 'right']
# Filtered corpus; this name is what the word-cloud cell joins into a sentence.
treated_corpus = [x for x in corpus_clean if x not in remove_words]
# Bar chart of the 50 most frequent remaining words (ascending order), so the
# removed words no longer appear in the plot.
pd.Series(Counter(treated_corpus)).sort_values()[-50:].plot(kind='bar')

# Word Cloud

In [None]:
# The word cloud is generated from one string, so join the tokens with spaces.
a_long_sentence = ' '.join(treated_corpus)

# Customize the output: white background, at most 70 words, grey colour map.
wc = WordCloud(
    background_color='white',
    max_words=70,
    colormap='gist_gray',
)
wc.generate(a_long_sentence)
# wc.words_   # Check for the top ranking words.

plt.figure(figsize=(10, 10))
plt.imshow(wc, interpolation='bilinear')
plt.axis("off")  # Turn off the axes.
# plt.savefig('clinical_notes.png', bbox_inches='tight')
plt.show()