In [None]:
#ghadeer abualrub
import pandas as pd
import numpy as np
import copy

pd.set_option('mode.chained_assignment','warn')

In [None]:
class NBModel:

    '''
    Naive Bayes model that predicts diseases from a list of symptoms.

    Training is done once from the data in the csv file (read_data -> build_df ->
    arrange_df -> calculate_prior -> calculate_probabilities); prediction can then be
    done several times by providing the model a list of symptom column names.

        self.df = main dataframe which holds the data (raw, then cleaned/exploded)
        self.featuresCount = number of distinct symptoms found in the data
        self.classesCount = number of distinct diseases found in the data
        self.priors = p(Disease) for every disease (pandas Series after calculate_prior)

        self.DSdict = dictionary with diseases as keys and their symptoms as values
        self.k = laplace smoothing factor
        self.prior_df = dataframe with each disease and its (unsmoothed) count of
        occurrence, used to calculate the prior probability

        self.probabilities_df = dataframe with one row per disease, one likelihood column
        per symptom ('Symptom_<name>') followed by the 'prior' and 'Disease' columns
        self.dictOfSympConfidence = dictionary with each symptom given to the most recent
        predict() call mapped to [best confidence, disease reaching that confidence]

        self.confAllSymp = dictionary with the confidence of every symptom considered by
        the most recent calculate_new_symp() / calculate_sequence_symp() call
        self.product = unnormalised posterior scores of the most recent predict() call
    '''

    # Columns of ``probabilities_df`` that are bookkeeping and not symptom likelihoods.
    _NON_SYMPTOM_COLUMNS = ('prior', 'Disease')

    def __init__(self, k = 1):
        '''create an untrained model; k is the laplace smoothing factor'''
        self.df = None
        self.featuresCount = 0
        self.classesCount = 0
        self.priors = []
        # contains diseases as keys and their symptoms as values
        self.DSdict = dict()
        self.k = k
        self.prior_df = None
        self.probabilities_df = None
        self.dictOfSympConfidence = dict()
        self.confAllSymp = dict()


    def read_data(self, filename):
        '''read data from csv file and store it as dataframe'''
        self.df = pd.read_csv(filename)

    def build_df(self, variables):
        '''fill the gaps of the raw dataframe.

           Every missing value is replaced by 0 and then, for each column named in
           `variables`, every 0 is replaced by the closest non-zero value above it
           (forward fill). Leading zeros that have nothing above them stay 0.
        '''
        frame = self.df.fillna(0)
        for var in variables:
            column = frame[var]
            # Whole-column assignment instead of ``frame[var].iloc[i] = ...``: the chained
            # form writes into a temporary object under pandas Copy-on-Write and is lost.
            frame[var] = column.mask(column == 0).ffill().fillna(0)
        self.df = frame


    def arrange_df(self, variables):
        ''' split the combined names of the columns in `variables` and clean them.

            For every column: rows holding 0 are removed, values are split on '^' into one
            row per item, and only the text after the first '_' is kept (text after a
            second '_' is discarded; a value without '_' raises IndexError).
            Afterwards the symptoms and diseases counts are recorded and the dictionary
            which maps each disease to its symptoms is rebuilt.
        '''
        #formatting
        frame = self.df
        for var in variables:
            # copy() so the column assignments below never target a slice of self.df
            frame = frame[frame[var] != 0].copy()
            frame[var] = frame[var].apply(lambda x: x.split('^'))
            # drop=True avoids the helper 'index'/'level_0' columns, so any number of
            # variables can be processed
            frame = frame.explode(var).reset_index(drop=True)
            frame[var] = frame[var].apply(lambda x: x.split('_')[1])
        self.df = frame

        #record counts and build dictionary
        self.featuresCount = self.df['Symptom'].unique().size
        self.classesCount = self.df['Disease'].unique().size
        self.build_DSdict()


    def build_DSdict(self):
        '''
        build a dictionary which maps each disease to the array of its distinct symptoms
        '''
        # start from scratch so that diseases of a previous run do not survive
        self.DSdict = dict()
        for disease in self.df['Disease'].unique():
            # select the column by name, not by position
            self.DSdict[disease] = self.df.loc[self.df['Disease'] == disease, 'Symptom'].unique()


    def calculate_prior(self, variables): # P(D)
        '''
        this function calculates the prior probabilities for all diseases as they are the
        target variables, using K as a laplace smoothing factor
        p(disease) = (c(disease) + K) / (c(samples) + K * |diseases|)

        variables = [symptom column, disease column, count column]

        self.prior_df keeps the plain counts; the smoothing is applied only to
        self.priors, so the counts are not smoothed a second time when they are used as
        c(disease) for the likelihoods.
        '''
        symptom_col, disease_col, count_col = variables[0], variables[1], variables[2]

        #df for disease with its count of occurrence
        pairs = self.df.drop(columns=[symptom_col]).drop_duplicates(keep='first')

        # group by disease and add up its two largest distinct counts
        # NOTE(review): only the two largest counts per disease are summed - confirm intended
        self.prior_df = pairs.groupby(disease_col)[count_col].apply(lambda g: g.nlargest(2).sum()).reset_index()

        # calculate prior probability for each class of target (disease)
        counts = self.prior_df[count_col]
        total_samples = counts.sum()
        self.priors = (counts + self.k) / (total_samples + self.k * self.classesCount)


    def calculate_probabilities(self):
        '''
        this function calculates the likelihood probabilities p(symptom|disease)
        p(symptom|disease) = (c(symptom,disease) + K) / (c(disease) + K * |symptoms|)

        c(disease) is read from self.prior_df, c(symptom,disease) is c(disease) when the
        symptom is listed for the disease and 0 otherwise, |symptoms| is
        self.featuresCount. The result is stored in self.probabilities_df; self.df is
        left untouched so the function can be called again.
        '''
        count_col = 'Count of Disease Occurrence'

        # disease x symptom indicator matrix; the unnamed symptom column 'Symptom_' is
        # removed when it exists
        features = self.df.drop(columns=[count_col], errors='ignore')
        df_sparse = pd.get_dummies(features, columns=['Symptom'])
        df_sparse = df_sparse.drop(columns=['Symptom_'], errors='ignore').drop_duplicates()
        df_sparse = df_sparse.groupby('Disease').sum().reset_index()

        # rows of prior_df and df_sparse are both sorted by disease (groupby), so the
        # counts are applied by position
        symptom_cols = [col for col in df_sparse.columns if col != 'Disease']
        disease_counts = self.prior_df[count_col].to_numpy(dtype=float)
        weighted = df_sparse[symptom_cols].mul(disease_counts, axis=0)

        # smoothing is applied to the symptom columns only, 'prior' is added afterwards
        # so it can not be overwritten
        denominator = disease_counts + self.k * self.featuresCount
        likelihood = weighted.add(self.k).div(denominator, axis=0)

        likelihood['prior'] = np.asarray(self.priors, dtype=float)
        likelihood['Disease'] = df_sparse['Disease']
        self.probabilities_df = likelihood


    def _symptom_columns(self):
        '''names of the likelihood columns of probabilities_df, in table order'''
        return [col for col in self.probabilities_df.columns
                if col not in self._NON_SYMPTOM_COLUMNS]

    @staticmethod
    def _top_symptoms(confidences, top_n):
        '''(symptom, confidence) pairs sorted by decreasing confidence, first top_n kept'''
        ranked = sorted(confidences.items(), key=lambda item: item[1], reverse=True)
        return ranked[:top_n]


    def predict(self,SymptomsList,topK):
        '''
        this function takes a list of symptom column names (e.g. 'Symptom_nausea') and
        returns the 'Disease' entries of the topK diseases with the largest
        prior * product of likelihoods.

        for each given symptom it also records in self.dictOfSympConfidence the largest
        value of likelihood * prior and the disease which reaches it.

        raises KeyError when a symptom is not a column of the probabilities table
        '''
        table = self.probabilities_df
        unknown = [symptom for symptom in SymptomsList if symptom not in table.columns]
        if unknown:
            raise KeyError('unknown symptom column(s): {}'.format(unknown))

        prior = table['prior']
        # work on a copy, otherwise the in-place multiplication can modify the stored priors
        self.product = prior.copy()
        # only the symptoms of this call are reported
        self.dictOfSympConfidence = dict()
        for symptom in SymptomsList:
            likelihood = table[symptom]
            self.product = self.product * likelihood
            confidence = likelihood * prior
            best_row = confidence.nlargest(1).index
            self.dictOfSympConfidence[symptom] = [confidence.max(), table.loc[best_row, 'Disease']]

        return table.loc[self.product.nlargest(topK).index, 'Disease']


    def calculate_new_symp(self, SymptomsList, diseasesList, top_n = 10):
        '''
        suggest new symptoms to ask about, to be more sure of our results.

        for every symptom which is not in SymptomsList the confidence is the largest
        likelihood * prior over the diseases of diseasesList; the top_n
        (symptom, confidence) pairs with the largest confidence are returned.
        returns an empty list when none of the diseases is known to the model.
        '''
        table = self.probabilities_df
        candidates = table[table['Disease'].isin(diseasesList)]
        given = set(SymptomsList)

        confidences = dict()
        if not candidates.empty:
            for symptom in self._symptom_columns():
                if symptom in given:
                    continue
                # best value over all listed diseases, not only the last one
                confidences[symptom] = (candidates[symptom] * candidates['prior']).max()

        # replace (do not extend) the stored values so results of earlier calls do not leak
        self.confAllSymp = confidences
        return self._top_symptoms(confidences, top_n)

    def calculate_sequence_symp(self, top_n = 10):

        '''
        this function calculates the confidence value (largest likelihood * prior over all
        diseases) for all symptoms we have, to determine which symptoms must be considered
        first in diagnosis; the top_n (symptom, confidence) pairs are returned sorted by
        decreasing confidence
        '''
        table = self.probabilities_df
        confidences = dict()
        for symptom in self._symptom_columns():
            confidences[symptom] = (table[symptom] * table['prior']).max()

        self.confAllSymp = confidences
        return self._top_symptoms(confidences, top_n)
        
        
    
          

In [None]:
# Train the model: load the csv, forward-fill the disease/count columns, split the
# combined disease and symptom names into one row each, then compute the priors.
nbm = NBModel(k=1)
nbm.read_data('dis_symp.csv')
nbm.build_df(['Disease', 'Count of Disease Occurrence'])
nbm.arrange_df(['Disease', 'Symptom'])
nbm.calculate_prior(['Symptom', 'Disease', 'Count of Disease Occurrence'])
# show the cleaned (disease, count, symptom) table
nbm.df


In [None]:
# Prior probability p(disease) of every disease, as computed by calculate_prior().
nbm.priors

In [None]:
# Per-disease count table that calculate_prior() built to derive the priors.
nbm.prior_df

In [None]:
# Build the disease x symptom likelihood table; the result is stored in nbm.probabilities_df.
nbm.calculate_probabilities()

In [None]:
# One row per disease: 'Symptom_<name>' likelihood columns plus the 'prior' and 'Disease' columns.
nbm.probabilities_df


In [None]:
# Symptoms observed for the example patient; each entry is a column name of
# nbm.probabilities_df.
SymptomsList = [
    'Symptom_pain chest',
    'Symptom_fall',
    'Symptom_asthenia',
    'Symptom_shortness of breath',
    'Symptom_dizziness',
    'Symptom_welt',
    'Symptom_syncope',
    'Symptom_vertigo',
    'Symptom_sweating increased',
    'Symptom_angina pectoris',
    'Symptom_pressure chest',
    'Symptom_palpitation',
    'Symptom_nausea',
]
# the 11 most probable diseases for these symptoms
nbm.predict(SymptomsList, 11)



In [None]:
# Disease names kept for reference (11 entries).
diseaseList = [
    'hypertensive disease',
    'myocardial infarction',
    'hyperlipidemia',
    'ischemia',
    'gastroesophageal reflux disease',
    'diabetes',
    'hypercholesterolemia',
    'coronary arteriosclerosis',
    'coronary heart disease',
    'stenosis aortic valve',
    'paroxysmal dyspnea',
]

In [None]:
# Confidence recorded by predict(): symptom -> [largest likelihood * prior, disease reaching it].
nbm.dictOfSympConfidence

In [None]:
# Suggest further symptoms (not yet in SymptomsList) to check for four candidate diseases.
nbm.calculate_new_symp(
    SymptomsList,
    [
        'hypertensive disease',
        'myocardial infarction',
        'hyperlipidemia',
        'ischemia',
    ],
)

In [None]:
# Second example: the 10 most probable diseases for another set of symptoms.
nbm.predict(
    [
        'Symptom_anorexia',
        'Symptom_asymptomatic',
        'Symptom_bradycardia',
        'Symptom_chest discomfort',
        'Symptom_drowsiness',
        'Symptom_dyspnea',
        'Symptom_rale',
        'Symptom_sinus rhythm',
        'Symptom_sleepy',
        'Symptom_sweat',
    ],
    10,
)

In [None]:
# Rank symptoms by their largest likelihood * prior and show the highest-ranked ones.
nbm.calculate_sequence_symp()