# Medical Disease Prediction from the Image of Doctor's Diagnosis using NLP

#### Sample Image of Doctor's Diagnosis
![image info](./dataset/image_dataset_train/patient_40_report.png)

In [1]:
import os
import re
import shutil

import numpy as np
import pandas as pd
import pytesseract
from PIL import Image

# Locate the Tesseract binary without pinning the notebook to one machine:
# an explicit TESSERACT_CMD environment variable wins, then whatever
# `tesseract` executable is on PATH, and finally the original Windows
# install location as a last resort.
pytesseract.pytesseract.tesseract_cmd = (
    os.environ.get("TESSERACT_CMD")
    or shutil.which("tesseract")
    or r"C:\Program Files\Tesseract-OCR\tesseract.exe"
)

## Reading Image Path and Disease CSV

In [2]:
# Each CSV maps an image path (relative to dataset/) to its disease label.
disease_data_train = pd.read_csv('dataset/disease_data_train.csv')
disease_data_test = pd.read_csv('dataset/disease_data_test.csv')

In [3]:
# Preview the first rows of the training index (image path and disease label).
disease_data_train.head()

Unnamed: 0,file_name,disease
0,image_dataset_train/patient_0_report.png,Cervical Spondylosis
1,image_dataset_train/patient_1_report.png,Impetigo
2,image_dataset_train/patient_2_report.png,Urinary Tract Infection
3,image_dataset_train/patient_3_report.png,Arthritis
4,image_dataset_train/patient_4_report.png,Dengue


## Function for Image Processing
##### Open the image and perform OCR to extract text
##### Remove lines with certain patterns like "Hospital Name" and blank lines
##### Replace common OCR misinterpretations (like | as I)
##### Remove non-ASCII characters
##### Clean up the description to remove unwanted artifacts like line breaks or extra spaces
##### Add the extracted description and disease to the reconstructed data list

In [4]:
def _clean_ocr_text(extracted_text):
    """Reduce the raw OCR text of one report to a single-line symptom description."""
    # Drop the "Hospital Name: ..." header line (`.` stops at the line break).
    cleaned_text = re.sub(r"Hospital Name:.*", "", extracted_text)
    # Collapse runs of blank lines, then trim the edges.
    cleaned_text = re.sub(r"\n+", "\n", cleaned_text).strip()
    # Tesseract frequently reads the pronoun "I" as a vertical bar.
    cleaned_text = cleaned_text.replace('|', 'I')
    # Remove any non-ASCII characters produced by the OCR.
    cleaned_text = re.sub(r"[^\x00-\x7F]+", '', cleaned_text)
    # Keep only what follows the last "Symptoms:" marker; when the marker is
    # absent, split() returns the whole text, so no separate branch is needed.
    description = cleaned_text.split("Symptoms:")[-1].strip()
    # Flatten the remaining line breaks into spaces.
    return description.replace('\n', ' ').strip()


def get_dataframe_from_images(disease_data):
    """OCR every report image listed in ``disease_data`` into a text dataset.

    Parameters
    ----------
    disease_data : pandas.DataFrame
        Must contain a ``file_name`` column (image path relative to
        ``dataset/``) and a ``disease`` column (the label).

    Returns
    -------
    pandas.DataFrame
        One row per input row with the columns ``description`` (cleaned OCR
        text) and ``disease``. The two columns are present even when
        ``disease_data`` has no rows.
    """
    reconstructed_data = []
    for file_name, disease in zip(disease_data['file_name'], disease_data['disease']):
        image_path = 'dataset/' + file_name

        # The context manager releases the file handle as soon as OCR is done,
        # instead of keeping one open image per row until garbage collection.
        with Image.open(image_path) as img:
            extracted_text = pytesseract.image_to_string(img)

        reconstructed_data.append({
            'description': _clean_ocr_text(extracted_text),
            'disease': disease
        })

    # Explicit columns keep the schema stable for an empty input.
    return pd.DataFrame(reconstructed_data, columns=['description', 'disease'])

## Final Reconstructed DataFrame

In [5]:
# Run the OCR pipeline over both splits (train first, then test).
train_df, test_df = (
    get_dataframe_from_images(split_index)
    for split_index in (disease_data_train, disease_data_test)
)

In [6]:
# Save the OCR results so later work can start from the CSVs.
for ocr_frame, csv_name in ((train_df, 'train.csv'), (test_df, 'test.csv')):
    ocr_frame.to_csv(csv_name, index=False)

In [7]:
# Inspect the reconstructed training set (OCR description and disease label).
train_df

Unnamed: 0,description,disease
0,I've been having a lot of pain in my neck and ...,Cervical Spondylosis
1,I have a rash on my face that is getting worse...,Impetigo
2,I have been urinating blood. I sometimes feel ...,Urinary Tract Infection
3,I have been having trouble with my muscles and...,Arthritis
4,I have been feeling really sick. My body hurts...,Dengue
...,...,...
848,My veins are bulging and painful. I can't stan...,Varicose Veins
849,I have been having headaches for a while now. ...,Migraine
850,I have a rash on my face that is very painful ...,Impetigo
851,I have a stuffy nose and nasal congestion. I s...,Allergy


In [8]:
# Inspect the reconstructed test set (OCR description and disease label).
test_df

Unnamed: 0,description,disease
0,I have a burning sensation in my stomach that ...,Peptic Ulcer Disease
1,I have a hard time swallowing and I feel like ...,Peptic Ulcer Disease
2,"I've been having headaches and migraines, and ...",Drug Reaction
3,I'm sweating a lot and can't catch my breath. ...,Pneumonia
4,"I've been scratching myself a lot lately, and ...",Fungal Infection
...,...,...
207,I have been experiencing muscle pain that make...,Dengue
208,"I have red, irritated skin on my arms, face, a...",Psoriasis
209,"I've been having a hard time breathing, and I'...",Bronchial Asthma
210,I've been coughing a lot for a few days now. I...,Bronchial Asthma


### Checking Null Values

In [8]:
# Number of missing descriptions in the train and test frames, in that order.
tuple(split_df['description'].isnull().sum() for split_df in (train_df, test_df))

(0, 0)

### Loading Clinical Stopwords

In [9]:
# Read the stopword list with an explicit encoding so the result does not
# depend on the platform's default codec. Entries are trimmed and blank lines
# are skipped so stray whitespace in the file cannot produce unusable entries.
with open('clinical-stopwords.txt', 'r', encoding='utf-8') as f:
    clinical_stopwords = {
        word.strip() for word in f.read().splitlines() if word.strip()
    }

In [10]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
# Shared lemmatizer instance used by preprocess_text below.
# NOTE(review): this notebook never calls nltk.download(); the tokenizer and
# WordNet data presumably have to be installed already - confirm on a fresh
# environment.
lemmatizer = WordNetLemmatizer()

### Preprocessing Text using NLTK Tokenizer and Lemmatizer

In [11]:
import re
def preprocess_text(text):
    """Normalize one description into a space-joined string of lemmas.

    Steps, in order: drop words of one or two characters, strip punctuation,
    lowercase and tokenize, discard clinical stopwords, lemmatize what is left.
    """
    without_short_words = re.sub(r'\b\w{1,2}\b', '', text)
    without_punctuation = re.sub(r'[^\w\s]', '', without_short_words)

    kept_lemmas = []
    for token in word_tokenize(without_punctuation.lower()):
        if token in clinical_stopwords:
            continue
        kept_lemmas.append(lemmatizer.lemmatize(token))

    return ' '.join(kept_lemmas)

In [12]:
# Replace the raw OCR descriptions with their normalized form in both splits.
for split_df in (train_df, test_df):
    split_df['description'] = split_df['description'].apply(preprocess_text)

### Label Encoding

In [13]:
from sklearn.preprocessing import LabelEncoder
# Learn the disease -> integer mapping from the training labels only, then
# apply that same mapping to both splits.
label_encoder = LabelEncoder().fit(train_df['disease'])
train_df['disease'] = label_encoder.transform(train_df['disease'])
test_df['disease'] = label_encoder.transform(test_df['disease'])

In [14]:
# Features are the preprocessed descriptions; targets are the encoded diseases.
X_train, y_train = train_df['description'], train_df['disease']
X_test, y_test = test_df['description'], test_df['disease']

## TF-IDF Vectorization

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Stopwords were already removed during preprocessing, hence stop_words=None.
tfidf = TfidfVectorizer(
    max_features=10000,
    stop_words=None,
    sublinear_tf=True,
)

# Fit the vocabulary on the training text only and densify the result.
train_tfidf_sparse = tfidf.fit_transform(X_train)
X_train_tfidf = train_tfidf_sparse.toarray()

## Loading ML Models

In [16]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier

In [17]:
# Candidate classifiers compared by cross-validation below.
# Every estimator with a stochastic training procedure gets a fixed seed
# (the same 42 used for SMOTE and the CV splitter) so the comparison table is
# reproducible from one run to the next.
RANDOM_STATE = 42

models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, class_weight='balanced'),
    "SVM": SVC(class_weight='balanced'),
    "Random Forest": RandomForestClassifier(class_weight='balanced', random_state=RANDOM_STATE),
    "Naive Bayes": MultinomialNB(),
    "Decision Tree": DecisionTreeClassifier(class_weight='balanced', random_state=RANDOM_STATE),
    "KNN": KNeighborsClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(random_state=RANDOM_STATE),
    "XGBoost": XGBClassifier(random_state=RANDOM_STATE)
}

## SMOTE Oversampling

In [18]:
from imblearn.over_sampling import SMOTE
# Oversample the training matrix with synthetic samples generated by SMOTE.
# NOTE(review): resampling happens before the cross-validation splits made
# later in this notebook, so validation folds can contain synthetic points
# interpolated from samples that sit in the training folds. The CV scores are
# therefore likely optimistic; consider resampling inside each fold instead.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_tfidf, y_train)

## Function to initialize StratifiedKFold and fit the models
### Returns Accuracy, Precision, Recall, F1 Score

In [19]:
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support

def train_and_evaluate_model(model, X_train_resampled, y_train_resampled,
                             n_splits=5, random_state=42):
    """Cross-validate ``model`` and return its fold-averaged metrics.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``. It is refit in place
        on every fold, so after the call it holds the fit from the last fold.
    X_train_resampled : array-like of shape (n_samples, n_features)
        Feature matrix. A pandas DataFrame is converted to a NumPy array.
    y_train_resampled : array-like of shape (n_samples,)
        Class labels. A pandas Series is accepted whatever its index is.
    n_splits : int, default=5
        Number of stratified folds.
    random_state : int, default=42
        Seed for the shuffled fold assignment.

    Returns
    -------
    tuple
        ``(accuracy, precision, recall, f1)`` averaged over the folds.
        Precision, recall and F1 are support-weighted across classes.
    """
    # The splitter yields positional indices. Converting pandas containers to
    # arrays makes `[...]` positional too; on a Series it would otherwise be a
    # label lookup that breaks whenever the index is not 0..n-1.
    if hasattr(X_train_resampled, 'to_numpy'):
        features = X_train_resampled.to_numpy()
    else:
        features = X_train_resampled
    labels = np.asarray(y_train_resampled)

    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    fold_scores = []
    for train_idx, val_idx in cv.split(features, labels):
        model.fit(features[train_idx], labels[train_idx])
        y_pred = model.predict(features[val_idx])

        accuracy = accuracy_score(labels[val_idx], y_pred)
        # zero_division=1 is passed through unchanged from the original code.
        precision, recall, f1, _ = precision_recall_fscore_support(
            labels[val_idx], y_pred, average='weighted', zero_division=1
        )
        fold_scores.append((accuracy, precision, recall, f1))

    # Column-wise mean over the folds, one value per metric.
    avg_accuracy, avg_precision, avg_recall, avg_f1 = np.mean(fold_scores, axis=0)

    return avg_accuracy, avg_precision, avg_recall, avg_f1

## Fitting the model

In [20]:
metric_names = ('Accuracy', 'Precision', 'Recall', 'F1 Score')

# Cross-validate every candidate and store its averaged metrics by model name.
results = {}
for model_name, model in models.items():
    fold_averages = train_and_evaluate_model(model, X_train_resampled, y_train_resampled)
    results[model_name] = dict(zip(metric_names, fold_averages))

# One row per model, one column per metric.
results_df = pd.DataFrame(results).T
print(results_df)

# The winner is the model with the highest cross-validated F1 score.
best_model_name = results_df['F1 Score'].idxmax()
print(f"Best Model: {best_model_name}")
best_model = models[best_model_name]

                     Accuracy  Precision    Recall  F1 Score
Logistic Regression  0.914773   0.922896  0.914773  0.914106
SVM                  0.934091   0.945592  0.934091  0.935417
Random Forest        0.919318   0.925460  0.919318  0.916475
Naive Bayes          0.889773   0.899550  0.889773  0.882697
Decision Tree        0.738636   0.772455  0.738636  0.737456
KNN                  0.888636   0.898312  0.888636  0.886014
Gradient Boosting    0.801136   0.833957  0.801136  0.807270
XGBoost              0.820455   0.834563  0.820455  0.818659
Best Model: SVM


### SVM is the best-performing model (highest cross-validated F1 score)

## Hyperparameter tuning

### 1. For SVM Classifier

In [21]:
from sklearn.model_selection import GridSearchCV

# `gamma` has no effect on the linear kernel, so the linear candidates get
# their own sub-grid. A single flat grid would fit every linear candidate once
# per gamma value and produce duplicate results.
svc_param_grid = [
    {'kernel': ['linear'], 'C': [0.1, 1, 10]},
    {'kernel': ['rbf', 'poly'], 'C': [0.1, 1, 10], 'gamma': ['scale', 'auto']},
]

svc = SVC(class_weight='balanced')

stratified_kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Exhaustive search scored by weighted F1, using all available cores.
grid_search = GridSearchCV(svc, svc_param_grid, cv=stratified_kfold, scoring='f1_weighted', n_jobs=-1)
grid_search.fit(X_train_resampled, y_train_resampled)

best_svc_model = grid_search.best_estimator_
print(f"Best parameters for SVC: {grid_search.best_params_}")

Best parameters for SVC: {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}


In [22]:
# Vectorize the test descriptions with the vocabulary fitted on the training
# set, then score the tuned SVC on them.
X_test_tfidf = tfidf.transform(X_test).toarray()
y_test_pred = best_svc_model.predict(X_test_tfidf)
test_accuracy = accuracy_score(y_true=y_test, y_pred=y_test_pred)
print(f"Test Set Accuracy of SVC after Hyperparameter tuning: {test_accuracy}")

Test Set Accuracy of SVC after Hyperparameter tuning: 0.9716981132075472


#### Test Set Accuracy of SVM Classifier = 0.97

### 2. For Random Forest Classifier

In [23]:
rf_param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'class_weight': ['balanced']
}

# Seed the forest so the grid search and the selected model are reproducible;
# without it every run bootstraps and samples features differently.
rf = RandomForestClassifier(random_state=42)

stratified_kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Exhaustive search scored by weighted F1, using all available cores.
grid_search_rf = GridSearchCV(rf, rf_param_grid, cv=stratified_kfold, scoring='f1_weighted', n_jobs=-1)
grid_search_rf.fit(X_train_resampled, y_train_resampled)

best_rf_model = grid_search_rf.best_estimator_
print(f"Best parameters for Random Forest: {grid_search_rf.best_params_}")

Best parameters for Random Forest: {'class_weight': 'balanced', 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}


In [24]:
# Vectorize the test descriptions with the vocabulary fitted on the training
# set, then score the tuned Random Forest on them.
X_test_tfidf = tfidf.transform(X_test).toarray()
y_test_pred = best_rf_model.predict(X_test_tfidf)
test_accuracy = accuracy_score(y_true=y_test, y_pred=y_test_pred)
print(f"Test Set Accuracy of Random Forest after Hyperparameter tuning: {test_accuracy}")

Test Set Accuracy of Random Forest after Hyperparameter tuning: 0.9292452830188679


#### Test Set Accuracy of Random Forest Classifier = 0.93

## The Hyperparameter tuned SVM Classifier Model is chosen

In [25]:
# Re-predict with the tuned SVC: y_test_pred was last overwritten by the
# Random Forest evaluation cell above.
y_test_pred = best_svc_model.predict(X_test_tfidf)
acc_svc = accuracy_score(y_test, y_test_pred)
# Support-weighted precision/recall/F1 across all disease classes.
prec_svc, rec_svc, f1_svc, _ = precision_recall_fscore_support(y_test, y_test_pred, average='weighted', zero_division=1)

print(f"Accuracy of SVM Classifier Best Model on Test Set: {acc_svc}")
print(f"Precision of SVM Classifier Best Model on Test Set: {prec_svc}")
print(f"Recall of SVM Classifier Best Model on Test Set: {rec_svc}")
print(f"F1 Score of SVM Classifier Best Model on Test Set: {f1_svc}")

from sklearn.metrics import classification_report
print("\nClassification Report\n")
# Map the integer labels back to disease names so the per-class report is readable.
y_test_decoded = label_encoder.inverse_transform(y_test)
y_test_pred_decoded = label_encoder.inverse_transform(y_test_pred)
report = classification_report(y_test_decoded, y_test_pred_decoded, zero_division=1)
print(report)
# Last expression of the cell: displays the selected estimator.
best_svc_model

Accuracy of SVM Classifier Best Model on Test Set: 0.9716981132075472
Precision of SVM Classifier Best Model on Test Set: 0.9744520678482943
Recall of SVM Classifier Best Model on Test Set: 0.9716981132075472
F1 Score of SVM Classifier Best Model on Test Set: 0.9716199595719426

Classification Report

                                 precision    recall  f1-score   support

                        Allergy       0.91      1.00      0.95        10
                      Arthritis       1.00      1.00      1.00        10
               Bronchial Asthma       0.91      1.00      0.95        10
           Cervical Spondylosis       1.00      1.00      1.00        10
                    Chicken Pox       1.00      0.90      0.95        10
                    Common Cold       1.00      1.00      1.00        10
                         Dengue       1.00      0.90      0.95        10
                       Diabetes       1.00      1.00      1.00        10
                  Drug Reaction       0

#### Saving the SVM model, Label Encoder, TF-IDF Vectorizer to implement in Flask

In [26]:
import joblib

# Persist the three fitted objects for the Flask app: the TF-IDF vectorizer,
# the label encoder, and the tuned SVC.
artifacts = (
    (tfidf, 'tfidf_vectorizer.joblib'),
    (label_encoder, 'label_encoder.joblib'),
    (best_svc_model, 'best_svc_model.joblib'),
)
for fitted_object, target_file in artifacts:
    joblib.dump(fitted_object, target_file)

print("Saved Successfully")

Saved Successfully


## References
### 1. https://www.kaggle.com/datasets/dpm3333/patient-symptom-report-and-disease
### 2. https://github.com/kavgan/clinical-concepts