# Medical Diagnosis System Using Naïve Bayes Classification
### EMAD ARMITI
### Thursday - 21/5/2020



<div class="alert alert-block alert-info">
<b>An overview of the data </b>
</div>

In [64]:
import pandas as pd
import math
import numpy as np
import operator
from itertools import islice

In [65]:
original_data=pd.read_csv('dis_symp_updated.csv')

In [66]:
original_data.head()

Unnamed: 0,Disease,Count of Disease Occurrence,Symptom
0,UMLS:C0020538_hypertensive disease,3363.0,UMLS:C0008031_pain chest
1,,,UMLS:C0392680_shortness of breath
2,,,UMLS:C0012833_dizziness
3,,,UMLS:C0004093_asthenia
4,,,UMLS:C0085639_fall




<div class="alert alert-block alert-info">
<b>Data engineering </b>
</div>

In [67]:
original_data[original_data["Symptom"]!=original_data["Symptom"]]

Unnamed: 0,Disease,Count of Disease Occurrence,Symptom
1681,,,


In [68]:
original_data.drop(index=1681,inplace=True)

In [69]:
data=pd.DataFrame(columns=original_data.Symptom.unique().tolist())

In [70]:
data.insert(0, 'Disease', [])
data.insert(1, 'Count', [])

In [71]:
data.head()

Unnamed: 0,Disease,Count,UMLS:C0008031_pain chest,UMLS:C0392680_shortness of breath,UMLS:C0012833_dizziness,UMLS:C0004093_asthenia,UMLS:C0085639_fall,UMLS:C0039070_syncope,UMLS:C0042571_vertigo,UMLS:C0038990_sweat^UMLS:C0700590_sweating increased,...,UMLS:C0474505_feces in rectum,UMLS:C0240805_prodrome,UMLS:C0020639_hypoproteinemia,UMLS:C0556346_alcohol binge episode,UMLS:C0000727_abdomen acute,UMLS:C0740844_air fluid level,UMLS:C0425491_catching breath,UMLS:C0456091_large-for-dates fetus,UMLS:C0231441_immobile,UMLS:C0455204_homicidal thoughts


In [76]:
dic={}
disease=0
for i in original_data.iterrows():
    
    if i[1][0]==i[1][0]:
        if len(dic)!=0:
            row=[]
            row.append(disease)
            row.append(dic[disease][0])
            for t in original_data.Symptom.unique().tolist():
                if t in dic[disease]:
                    row.append(1)
                else:
                    row.append(0)
            data=data.append(pd.Series(row, index=data.columns ), ignore_index=True)
            dic={}
            
        disease=i[1][0]
        dic[disease]=[i[1][1]]
        dic[disease].append(i[1][2])
       
    else:
        if i[1][2] not in dic[disease]:
            dic[disease].append(i[1][2] )

In [73]:
data.head()

Unnamed: 0,Disease,Count,UMLS:C0008031_pain chest,UMLS:C0392680_shortness of breath,UMLS:C0012833_dizziness,UMLS:C0004093_asthenia,UMLS:C0085639_fall,UMLS:C0039070_syncope,UMLS:C0042571_vertigo,UMLS:C0038990_sweat^UMLS:C0700590_sweating increased,...,UMLS:C0474505_feces in rectum,UMLS:C0240805_prodrome,UMLS:C0020639_hypoproteinemia,UMLS:C0556346_alcohol binge episode,UMLS:C0000727_abdomen acute,UMLS:C0740844_air fluid level,UMLS:C0425491_catching breath,UMLS:C0456091_large-for-dates fetus,UMLS:C0231441_immobile,UMLS:C0455204_homicidal thoughts
0,UMLS:C0020538_hypertensive disease,3363.0,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,UMLS:C0020538_hypertensive disease,2150.0,1,1,1,1,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
2,UMLS:C0011847_diabetes,1421.0,1,1,0,1,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
3,UMLS:C0011847_diabetes,950.0,1,1,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,UMLS:C0011570_depression mental^UMLS:C0011581_...,1337.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [74]:
data.shape

(142, 403)



<div class="alert alert-block alert-info">
<b>	Estimate the prior </b>
</div>

In [49]:
samples_counts=data['Count'].sum()

In [50]:
data_=data.loc[:, ['Disease','Count']]

In [51]:
prior=data_.groupby(['Disease'])['Count'].apply(lambda x : x.sum()/samples_counts)

In [52]:
prior.head()

Disease
UMLS:C0001175_acquired immuno-deficiency syndrome^UMLS:C0019682_HIV^UMLS:C0019693_hiv infections    0.009218
UMLS:C0001418_adenocarcinoma                                                                        0.004372
UMLS:C0001511_adhesion                                                                              0.001870
UMLS:C0001973_chronic alcoholic intoxication                                                        0.001844
UMLS:C0002395_Alzheimer's disease                                                                   0.002660
Name: Count, dtype: float64



<div class="alert alert-block alert-info">
<b>	Estimate the likelihood </b>
</div>

In [53]:
likelihood=data.loc[data.index.repeat(data.Count)]

In [54]:
likelihood.shape

(37970, 403)

In [55]:
likelihood=likelihood.drop(columns='Count')

In [56]:
likelihood.head()

Unnamed: 0,Disease,UMLS:C0008031_pain chest,UMLS:C0392680_shortness of breath,UMLS:C0012833_dizziness,UMLS:C0004093_asthenia,UMLS:C0085639_fall,UMLS:C0039070_syncope,UMLS:C0042571_vertigo,UMLS:C0038990_sweat^UMLS:C0700590_sweating increased,UMLS:C0030252_palpitation,...,UMLS:C0474505_feces in rectum,UMLS:C0240805_prodrome,UMLS:C0020639_hypoproteinemia,UMLS:C0556346_alcohol binge episode,UMLS:C0000727_abdomen acute,UMLS:C0740844_air fluid level,UMLS:C0425491_catching breath,UMLS:C0456091_large-for-dates fetus,UMLS:C0231441_immobile,UMLS:C0455204_homicidal thoughts
0,UMLS:C0020538_hypertensive disease,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
0,UMLS:C0020538_hypertensive disease,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
0,UMLS:C0020538_hypertensive disease,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
0,UMLS:C0020538_hypertensive disease,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
0,UMLS:C0020538_hypertensive disease,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0


In [57]:
likelihood=likelihood.groupby(['Disease'])[likelihood.columns[1:]].apply(lambda x : (x.sum()+1)/(x.count()+2)).reset_index(inplace = False) 

In [58]:
likelihood.head()

Unnamed: 0,Disease,UMLS:C0008031_pain chest,UMLS:C0392680_shortness of breath,UMLS:C0012833_dizziness,UMLS:C0004093_asthenia,UMLS:C0085639_fall,UMLS:C0039070_syncope,UMLS:C0042571_vertigo,UMLS:C0038990_sweat^UMLS:C0700590_sweating increased,UMLS:C0030252_palpitation,...,UMLS:C0474505_feces in rectum,UMLS:C0240805_prodrome,UMLS:C0020639_hypoproteinemia,UMLS:C0556346_alcohol binge episode,UMLS:C0000727_abdomen acute,UMLS:C0740844_air fluid level,UMLS:C0425491_catching breath,UMLS:C0456091_large-for-dates fetus,UMLS:C0231441_immobile,UMLS:C0455204_homicidal thoughts
0,UMLS:C0001175_acquired immuno-deficiency syndr...,0.002841,0.002841,0.002841,0.002841,0.002841,0.002841,0.002841,0.002841,0.002841,...,0.002841,0.002841,0.002841,0.002841,0.002841,0.002841,0.002841,0.002841,0.002841,0.002841
1,UMLS:C0001418_adenocarcinoma,0.005952,0.005952,0.005952,0.005952,0.005952,0.005952,0.005952,0.005952,0.005952,...,0.005952,0.005952,0.005952,0.005952,0.005952,0.005952,0.005952,0.005952,0.005952,0.005952
2,UMLS:C0001511_adhesion,0.013699,0.986301,0.013699,0.013699,0.013699,0.013699,0.013699,0.013699,0.013699,...,0.013699,0.013699,0.013699,0.013699,0.013699,0.013699,0.013699,0.794521,0.794521,0.013699
3,UMLS:C0001973_chronic alcoholic intoxication,0.013889,0.986111,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,...,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889
4,UMLS:C0002395_Alzheimer's disease,0.009709,0.009709,0.009709,0.009709,0.009709,0.009709,0.009709,0.009709,0.009709,...,0.009709,0.009709,0.009709,0.009709,0.009709,0.009709,0.009709,0.009709,0.009709,0.009709




<div class="alert alert-block alert-info">
<b>	Inference </b>
</div>

In [59]:
def prod(name,value,symptoms):
    summ=1
    for i in range(0,len(name)):
        if name[i] in symptoms :
            summ*=value[i]
        else:
            summ*=1-value[i]
    return summ

In [60]:
def inference(symptoms):
    
    inferences=likelihood.apply(lambda x :  prod(x.iloc[1:].index,x.iloc[1:],symptoms)* prior.loc[x['Disease']] ,axis=1).reset_index().rename(columns={0: "value", "index": "disease"})
    inferences['disease']=likelihood.Disease.unique().tolist()
    norm_factor=inferences['value'].sum()
    inferences['normalized_value']=inferences.apply(lambda x :  round(100*x['value']/norm_factor,5),axis=1)
    return inferences.sort_values(by=['normalized_value'],ascending=False)



In [61]:
tested_symptoms=[
'UMLS:C0032617_polyuria',
'UMLS:C0085602_polydypsia',
'UMLS:C0392680_shortness of breath',
'UMLS:C0008031_pain chest',
'UMLS:C0728899_intoxication',
'UMLS:C0424068_verbal auditory hallucinations',
'UMLS:C0455769_energy increased',
'UMLS:C1299586_difficulty',
'UMLS:C0028084_nightmare',
'UMLS:C0235198_unable to concentrate']

In [62]:
test=inference(tested_symptoms)

In [63]:
test.head(10)

Unnamed: 0,disease,value,normalized_value
29,UMLS:C0011127_decubitus ulcer,6.383314e-29,51.70712
70,UMLS:C0022658_kidney disease,5.961821e-29,48.29288
0,UMLS:C0001175_acquired immuno-deficiency syndr...,2.4088089999999997e-64,0.0
93,UMLS:C0031039_effusion pericardial^UMLS:C12539...,2.739662e-39,0.0
88,UMLS:C0029456_osteoporosis,1.2480989999999999e-57,0.0
89,UMLS:C0030305_pancreatitis,2.570562e-46,0.0
90,UMLS:C0030312_pancytopenia,8.886642e-37,0.0
91,UMLS:C0030567_parkinson disease,8.469926e-56,0.0
92,UMLS:C0030920_ulcer peptic,1.597482e-47,0.0
95,UMLS:C0032285_pneumonia,8.267459e-84,0.0




<div class="alert alert-block alert-info">
<b>Improve inference</b>
</div>

In [79]:
def improve_inference(symptoms,previous_inference):

   
    disease=previous_inference[previous_inference.normalized_value!=0].disease.tolist()
    temp=likelihood[likelihood.Disease.isin(disease)].drop(columns=symptoms)
    
    tuples=temp.iloc[:,1:].apply(lambda x: (x.name,sorted(x.tolist(),reverse=True)[0]-sorted(x.tolist(),reverse=True)[1]),axis=0)
    tuples=sorted(tuples, key=lambda x: x[1],reverse=True)
    
    
    results={}
    for i in range(0,5):
        new_symptoms=symptoms.copy()
        new_symptoms.append(tuples[i][0])
        new_inference=inference(new_symptoms)
        results[tuples[i][0]]=new_inference.reset_index().loc[:0,['disease','normalized_value']].to_records(index=False)
    return results



In [80]:
result1=improve_inference(tested_symptoms,test)
for i in result1:
    print(i,'\n','---> ',result1[i][0][0],' : ', result1[i][0][1],'\n\n')

UMLS:C0020461_hyperkalemia 
 --->  UMLS:C0022658_kidney disease  :  99.97434 


UMLS:C0232995_gravida 0 
 --->  UMLS:C0022658_kidney disease  :  99.97434 


UMLS:C0232257_systolic murmur 
 --->  UMLS:C0011127_decubitus ulcer  :  99.97761 


UMLS:C0151706_bleeding of vagina 
 --->  UMLS:C0022658_kidney disease  :  99.97434 


UMLS:C0871754_frail 
 --->  UMLS:C0011127_decubitus ulcer  :  99.97761 






<div class="alert alert-block alert-info">
<b>Improve inference - brute force </b>
</div>

In [33]:
def heuristic(inference):
    result= inference['value_with_symptom'].tolist()
    return sorted(result,reverse=True)[0]-sorted(result,reverse=True)[1]
    
     

In [34]:
def improve_inference_brute_force(symptoms,previous_inference):

   
    result={}
    for symptom in original_data.Symptom.unique().tolist():
        if symptom not in symptoms:
            
            inferences=previous_inference.copy()
            inferences['value_with_symptom']=inferences.apply(lambda x : x['value']*likelihood[likelihood.Disease==x['disease']][symptom][x.iloc[:0].name],axis=1)
            result[symptom]=heuristic(inferences)
            
    result=sorted(result.items(), key=lambda x: x[1], reverse=True)
    
    results={}
    for i in range(0,5):
        new_symptoms=symptoms.copy()
        new_symptoms.append(result[i][0])
        new_inference=inference(new_symptoms)
        results[result[i][0]]=new_inference.reset_index().loc[:1,['disease','normalized_value']].to_records(index=False)
    return results
 



In [35]:
results=improve_inference_brute_force(tested_symptoms,test)

In [36]:
for i in results:
    print(i,'\n','---> ',results[i][0][0],' : ', results[i][0][1],'\n','---> ',results[i][1][0],' : ', results[i][1][1],'\n\n','\n\n')   

UMLS:C0232257_systolic murmur 
 --->  UMLS:C0011127_decubitus ulcer  :  99.97761 
 --->  UMLS:C0022658_kidney disease  :  0.02239 

 


UMLS:C0871754_frail 
 --->  UMLS:C0011127_decubitus ulcer  :  99.97761 
 --->  UMLS:C0022658_kidney disease  :  0.02239 

 


UMLS:C0020461_hyperkalemia 
 --->  UMLS:C0022658_kidney disease  :  99.97434 
 --->  UMLS:C0011127_decubitus ulcer  :  0.02566 

 


UMLS:C0232995_gravida 0 
 --->  UMLS:C0022658_kidney disease  :  99.97434 
 --->  UMLS:C0011127_decubitus ulcer  :  0.02566 

 


UMLS:C0151706_bleeding of vagina 
 --->  UMLS:C0022658_kidney disease  :  99.97434 
 --->  UMLS:C0011127_decubitus ulcer  :  0.02566 

 


