In [1]:
import os
import re

import pandas as pd
import pytesseract
from PIL import Image

# Location of the Tesseract executable used by pytesseract. The TESSERACT_CMD
# environment variable overrides the default Windows install path, so the
# notebook can run on machines where Tesseract is installed elsewhere.
pytesseract.pytesseract.tesseract_cmd = os.environ.get(
    "TESSERACT_CMD", r"C:\Program Files\Tesseract-OCR\tesseract.exe"
)

In [2]:
# Path of the report image that is opened and run through OCR in the next cell.
image_path = 'test images/patient_3_report.png'

In [3]:
# Run OCR on the report image. The context manager guarantees the underlying
# file handle is closed as soon as the text has been extracted.
with Image.open(image_path) as img:
    extracted_text = pytesseract.image_to_string(img)

# Remove "Hospital Name:" and the rest of that line, collapse runs of newlines
# into a single newline, and trim surrounding whitespace.
cleaned_text = re.sub(r"Hospital Name:.*", "", extracted_text)
cleaned_text = re.sub(r"\n+", "\n", cleaned_text)
cleaned_text = cleaned_text.strip()

# Replace pipe characters with a capital "I" (presumably a frequent OCR
# misread of "I" in these reports -- confirm against the source images).
cleaned_text = cleaned_text.replace('|', 'I')

# Drop every run of non-ASCII characters.
cleaned_text = re.sub(r"[^\x00-\x7F]+", '', cleaned_text)

# Keep only the text after the last "Symptoms:" label when it is present;
# otherwise fall back to the whole cleaned text.
if "Symptoms:" in cleaned_text:
    description = cleaned_text.split("Symptoms:")[-1].strip()
else:
    description = cleaned_text.strip()

# Flatten the description onto a single line.
description = description.replace('\n', ' ').strip()

In [4]:
# Display the description extracted from the OCR text.
description

"I'm sweating a lot and can't catch my breath. My throat is full of phlegm and I feel awful. My heart is racing and my chest hurts. I'm coughing up brown stuff."

### SVM Model

In [5]:
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the persisted objects for the SVM section (going by the file names: a
# TF-IDF vectorizer, a label encoder and an SVC model).
# NOTE(review): joblib.load unpickles the file and can execute arbitrary code,
# so only load artifacts that come from a trusted source.
tfidf = joblib.load('tfidf_vectorizer.joblib')
label_encoder = joblib.load('label_encoder.joblib')
best_svc_model = joblib.load('best_svc_model.joblib')

In [6]:
# Load the clinical stop-word list, one entry per line, into a set for fast
# membership tests. The encoding is given explicitly so the file is decoded the
# same way on every platform instead of falling back to the locale default.
with open('clinical-stopwords.txt', 'r', encoding='utf-8') as f:
    clinical_stopwords = set(f.read().splitlines())

In [7]:
import re
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
# Single WordNet lemmatizer instance, used by preprocess_text below.
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Normalise free text into a space-joined string of lemmatized tokens.

    Steps, in order: delete standalone runs of one or two word characters,
    delete every character that is neither a word character nor whitespace,
    lowercase, tokenize, discard tokens found in ``clinical_stopwords``, and
    lemmatize the remaining tokens.

    Args:
        text: Raw input string.

    Returns:
        The surviving lemmas joined by single spaces.
    """
    # Short-word removal runs before punctuation stripping, so both halves of
    # a contraction such as "I'm" are removed and nothing of it remains.
    without_short_words = re.sub(r'\b\w{1,2}\b', '', text)
    without_punctuation = re.sub(r'[^\w\s]', '', without_short_words)

    lemmas = []
    for token in word_tokenize(without_punctuation.lower()):
        if token in clinical_stopwords:
            continue
        lemmas.append(lemmatizer.lemmatize(token))
    return ' '.join(lemmas)

In [8]:
# Preprocess the description and vectorize it with the loaded `tfidf` object;
# the text is wrapped in a one-element list because a single document is
# being transformed.
description_tfidf = tfidf.transform([preprocess_text(description)])

In [9]:
# Predict the encoded label. `.toarray()` converts the transformed features to
# a dense array first (presumably because the model was fitted on dense
# input -- TODO confirm).
predicted_label_encoded = best_svc_model.predict(description_tfidf.toarray())

In [10]:
# Map the encoded prediction back to its original label and display the
# single result.
predicted_disease = label_encoder.inverse_transform(predicted_label_encoded)
predicted_disease[0]

'Pneumonia'

### BERT Model

In [11]:
import torch
from transformers import BertForSequenceClassification, BertTokenizerFast
import joblib

In [12]:
# Load the sequence-classification model and its tokenizer from local
# directories.
model = BertForSequenceClassification.from_pretrained('./bert/model')

tokenizer = BertTokenizerFast.from_pretrained('./bert/tokenizer')

# NOTE(review): this rebinds `label_encoder`, the same name the SVM section
# uses for its own encoder. Re-running the SVM prediction cells after this
# cell would decode with this encoder instead.
# NOTE(review): joblib.load unpickles the file and can execute arbitrary code,
# so only load artifacts that come from a trusted source.
label_encoder = joblib.load('./bert/label_encoder.pkl')

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [13]:
def predict_class(text):
    """Classify ``text`` with the loaded BERT model and return the decoded label.

    The text is tokenized (truncated to at most 512 tokens), passed through
    ``model`` without gradient tracking, and the index of the largest logit is
    mapped back to a label via ``label_encoder``.

    Args:
        text: The input text to classify.

    Returns:
        The label produced by ``label_encoder.inverse_transform`` for the
        highest-scoring class index.
    """
    encoded = tokenizer(
        text,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512,
    )

    # Inference only, so skip autograd bookkeeping.
    with torch.no_grad():
        logits = model(**encoded).logits

    # .item() needs a single-element tensor, i.e. exactly one input text.
    best_idx = logits.argmax(dim=-1).item()
    return label_encoder.inverse_transform([best_idx])[0]


In [14]:
# Show the description again before it is passed to the BERT classifier.
description

"I'm sweating a lot and can't catch my breath. My throat is full of phlegm and I feel awful. My heart is racing and my chest hurts. I'm coughing up brown stuff."

In [15]:
# Classify the description with the BERT model and print the decoded label.
predicted_class = predict_class(description)
print(f"Predicted Class: {predicted_class}")

Predicted Class: pneumonia
