### Importing Libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

import nltk
resources = ['stopwords','punkt','wordnet','averaged_perceptron_tagger']
nltk.download(resources)
from nltk.corpus import stopwords

import seaborn as sns

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [24]:
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

## Data Collection

In [3]:
data = pd.read_csv('/content/Symptom2Disease.csv')

In [4]:
data = data.drop('Unnamed: 0', axis = 1)
data

Unnamed: 0,label,text
0,Psoriasis,I have been experiencing a skin rash on my arm...
1,Psoriasis,"My skin has been peeling, especially on my kne..."
2,Psoriasis,I have been experiencing joint pain in my fing...
3,Psoriasis,"There is a silver like dusting on my skin, esp..."
4,Psoriasis,"My nails have small dents or pits in them, and..."
...,...,...
1195,diabetes,I'm shaking and trembling all over. I've lost ...
1196,diabetes,"Particularly in the crevices of my skin, I hav..."
1197,diabetes,I regularly experience these intense urges and...
1198,diabetes,"I have trouble breathing, especially outside. ..."


### Data Preprocessing

In [5]:
data.isnull().sum()

label    0
text     0
dtype: int64

In [6]:
data.label.value_counts()

Psoriasis                          50
Varicose Veins                     50
peptic ulcer disease               50
drug reaction                      50
gastroesophageal reflux disease    50
allergy                            50
urinary tract infection            50
Malaria                            50
Jaundice                           50
Cervical spondylosis               50
Migraine                           50
Hypertension                       50
Bronchial Asthma                   50
Acne                               50
Arthritis                          50
Dimorphic Hemorrhoids              50
Pneumonia                          50
Common Cold                        50
Fungal infection                   50
Dengue                             50
Impetigo                           50
Chicken pox                        50
Typhoid                            50
diabetes                           50
Name: label, dtype: int64

In [7]:
data['text'] = data['text'].str.replace('[^a-zA-Z\s]','',regex=True)
data

Unnamed: 0,label,text
0,Psoriasis,I have been experiencing a skin rash on my arm...
1,Psoriasis,My skin has been peeling especially on my knee...
2,Psoriasis,I have been experiencing joint pain in my fing...
3,Psoriasis,There is a silver like dusting on my skin espe...
4,Psoriasis,My nails have small dents or pits in them and ...
...,...,...
1195,diabetes,Im shaking and trembling all over Ive lost my ...
1196,diabetes,Particularly in the crevices of my skin I have...
1197,diabetes,I regularly experience these intense urges and...
1198,diabetes,I have trouble breathing especially outside I ...


In [11]:
from nltk.corpus import stopwords

stop_words = stopwords.words('english')

def remove_stopwords(sentence):
    sentence = sentence.lower()
    words = sentence.split()
    fil_words = [word for word in words if word not in stop_words]
    new_sentence = ' '.join(fil_words)
    return new_sentence

data['text'] = data['text'].apply(remove_stopwords)


In [13]:
from nltk.tokenize import word_tokenize

data['tokens'] = data['text'].apply(word_tokenize)

data.head()

Unnamed: 0,label,text,tokens
0,Psoriasis,experiencing skin rash arms legs torso past we...,"[experiencing, skin, rash, arms, legs, torso, ..."
1,Psoriasis,skin peeling especially knees elbows scalp pee...,"[skin, peeling, especially, knees, elbows, sca..."
2,Psoriasis,experiencing joint pain fingers wrists knees p...,"[experiencing, joint, pain, fingers, wrists, k..."
3,Psoriasis,silver like dusting skin especially lower back...,"[silver, like, dusting, skin, especially, lowe..."
4,Psoriasis,nails small dents pits often feel inflammatory...,"[nails, small, dents, pits, often, feel, infla..."


### Label encoding

In [16]:
import numpy as np

encoded_labels = np.array([1, 2, 3, 1, 4, 5, 3, 2])

unique_values = np.unique(encoded_labels)

# Get the counts of the unique labels
counts = np.unique(encoded_labels, return_counts=True)[1]

# Create a dictionary of unique labels and their counts
unique_labels_and_counts = dict(zip(unique_values, counts))

print(unique_labels_and_counts)


{0: 50, 1: 50, 2: 50, 3: 50, 4: 50, 5: 50, 6: 50, 7: 50, 8: 50, 9: 50, 10: 50, 11: 50, 12: 50, 13: 50, 14: 50, 15: 50, 16: 50, 17: 50, 18: 50, 19: 50, 20: 50, 21: 50, 22: 50, 23: 50}


In [17]:
encoded_labels

array([15, 15, 15, ..., 19, 19, 19])

In [18]:
label_encoder.inverse_transform([15,16,13])

array(['Psoriasis', 'Typhoid', 'Migraine'], dtype=object)

### Feature Engineering

In [37]:
# Lemmatization


from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from collections import defaultdict
from nltk.corpus import wordnet as wn

lemmatizer = WordNetLemmatizer()

tag_map = defaultdict(lambda : wn.NOUN)
tag_map['J']: wn.ADJ
tag_map['V']: wn.VERB
tag_map['R']: wn.ADV



def lemmatize_token_list(token_list):

    # creating tagged tokens
    tagged_tokens = pos_tag(token_list)
    final_words = []

    # lemmatization
    for i in range(len(token_list)):
        token , tag = tagged_tokens[i]
        word_final = lemmatizer.lemmatize(token, tag_map[tag[0]])
        final_words.append(word_final)

    return final_words

# applying text preprocessing to tokens column

data['lemma'] = data['tokens'].apply(lemmatize_token_list)
data.head()

Unnamed: 0,label,text,tokens,lemma
0,Psoriasis,experiencing skin rash arms legs torso past we...,"[experiencing, skin, rash, arms, legs, torso, ...","[experiencing, skin, rash, arm, leg, torso, pa..."
1,Psoriasis,skin peeling especially knees elbows scalp pee...,"[skin, peeling, especially, knees, elbows, sca...","[skin, peeling, especially, knee, elbow, scalp..."
2,Psoriasis,experiencing joint pain fingers wrists knees p...,"[experiencing, joint, pain, fingers, wrists, k...","[experiencing, joint, pain, finger, wrist, kne..."
3,Psoriasis,silver like dusting skin especially lower back...,"[silver, like, dusting, skin, especially, lowe...","[silver, like, dusting, skin, especially, lowe..."
4,Psoriasis,nails small dents pits often feel inflammatory...,"[nails, small, dents, pits, often, feel, infla...","[nail, small, dent, pit, often, feel, inflamma..."


In [42]:
X = data['lemma']
def join_func(X):
    text = X
    next_text = ' '.join(text)
    return next_text

X = X.apply(join_func)

In [43]:
X

0       experiencing skin rash arm leg torso past week...
1       skin peeling especially knee elbow scalp peeli...
2       experiencing joint pain finger wrist knee pain...
3       silver like dusting skin especially lower back...
4       nail small dent pit often feel inflammatory te...
                              ...                        
1195    im shaking trembling ive lost sense taste smel...
1196    particularly crevice skin skin rash irritation...
1197    regularly experience intense urge want urinate...
1198    trouble breathing especially outside start fee...
1199    constantly sneeze dry cough infection dont see...
Name: lemma, Length: 1200, dtype: object

In [44]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()

# Fit and transform the documents to obtain the TF-IDF matrix
X_matrix = vectorizer.fit_transform(X)

In [47]:


# Get the feature names (terms) in the TF-IDF matrix
feature_names = vectorizer.get_feature_names_out()

# Convert the TF-IDF matrix to a dense array for easier inspection (optional)
tfidf_matrix_dense = X_matrix.toarray()



TF-IDF Matrix:
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]

Feature Names:
['abdomen' 'abdominal' 'ability' ... 'yesterday' 'yet' 'zone']


## Model Buildng

In [50]:
model = LogisticRegression(multi_class='ovr')

# Train the model on the train data
model.fit(X_matrix, encoded_labels)


#### Prediction single disease

In [71]:
def predict_label(complaints):
    input = vectorizer.transform([complaints])
    pred = model. predict(input)
    label = label_encoder.inverse_transform(pred)
    return label

In [73]:
predict_label('fever with burning sensation while urinating')

array(['urinary tract infection'], dtype=object)

#### Predicting differential diagnosis

In [78]:
input = X[1]
input_vec = vectorizer.transform([input])

In [79]:

predicted_probabilities = model.predict_proba(input_vec)
predicted_probabilities

array([[0.01625251, 0.0110576 , 0.01408377, 0.01232546, 0.01930615,
        0.01438042, 0.02603217, 0.01023909, 0.02175402, 0.01617691,
        0.02333256, 0.00970228, 0.00706186, 0.01575444, 0.01196937,
        0.5071625 , 0.02135046, 0.0232565 , 0.02503545, 0.04649795,
        0.04129536, 0.03742118, 0.03982267, 0.02872929]])

In [81]:

top_5_class_indices = np.argsort(predicted_probabilities[0])[::-1][:5]
top_5_class_indices

array([15, 19, 20, 22, 21])

In [83]:

top_5_class_labels = label_encoder.inverse_transform(top_5_class_indices)
top_5_class_labels

array(['Psoriasis', 'diabetes', 'drug reaction', 'peptic ulcer disease',
       'gastroesophageal reflux disease'], dtype=object)