In [1]:
from pgmpy.factors.discrete import TabularCPD
from pgmpy.models import BayesianModel

In [2]:
# a different model
model2 = BayesianModel([('schooltype', 'deanlist'), 
                        ('schooltype', 'top20'), 
                        ('schooltype', 'gpa3'), 
                        ('schooltype', 'acttest'), 
                        ('deanlist', 'eligibility'), 
                        ('top20', 'eligibility'), 
                        ('gpa3', 'eligibility'), 
                        ('acttest', 'eligibility'), 
                        ('age', 'eligibility'), 
                        ('schooltype', 'eligibility')])  # fruit -> tasty <- size


# rule set 1
def check_goodstudent(st, dl, t20, gpa3, acttest):
    if ((st==0) | (st==1) | (st==2)):
        if ((dl=='yes') | (t20=='yes') | (gpa3=='yes')):
            return 'yes'
        elif ((dl=='no') & (t20=='no') & (gpa3=='no')):  #all of them are no
            return 'no'
        else:
            return 'unknown'
    elif st==3:   # home school
        if (acttest=='yes'):
            return 'yes'
        elif (acttest=='no'):
            return 'no'
        else:  # don't know acctest result
            return 'unknown'
    else:
        return 'no'
    
    
    
# rule set 2
def check_eligibility(gs, age):
    if ((gs=='yes') & (age=='yes')):
        return 'yes'
    elif (age=='no'):
        return 'no'
    elif ((gs=='unknown') | (age=='unknown')):
        return 'unknown'
    else:
        return 'no'
    
    
    

# construct a 2-d array storing the probability mapped to rules
var_true =[]
var_false =[]
var_unknown =[]

for st in range(0,5):  # 0 - Fulltime high school   1 - College  2- Trade School . 3- Homeschool . 4-Others
    for dl in ['yes', 'no', 'unknown']:  # 0 - Yes, on deanlist   1 - not on deanlist
        for t20 in ['yes', 'no', 'unknown']:   # 0 - Yes, is top 20%,  1 - not in top 20%
            for gpa3 in ['yes', 'no', 'unknown']:  # 0 - yes,  1 . -no
                for acttest in ['yes', 'no', 'unknown']:  # 0 - yes,  1 - no
                    for age in ['yes', 'no', 'unknown']: # 0 below 25,  1 - above 25
                        #print (check_goodstudent(st, dl, t20, gpa3, acttest))
                        if check_eligibility(check_goodstudent(st, dl, t20, gpa3, acttest),age)=='yes':
                            var_true.extend([1])
                            var_false.extend([0])
                            var_unknown.extend([0])
                        elif check_eligibility(check_goodstudent(st, dl, t20, gpa3, acttest),age)=='no':
                            var_true.extend([0])
                            var_false.extend([1])
                            var_unknown.extend([0])
                        else:
                            var_true.extend([0])
                            var_false.extend([0])
                            var_unknown.extend([1])           
                            
                            
# Defining individual CPDs.
# SchoolType   5 states   
# ACTtest      2 states    0-Yes, 1-No, 2-Unknown
# GPA          2 states    0-Yes, 1-No, 2-Unknown
# Top20        2 states    0-Yes, 1-No, 2-Unknown
# Deanlist     2 states    0-Yes, 1-No, 2-Unknown

cpd_schooltype = TabularCPD(variable='schooltype', variable_card=5, values=[[0.3, 0.3, 0.2, 0.15, 0.05]])
cpd_deanlist = TabularCPD(variable='deanlist', variable_card=3, values=[[0.45, 0.45, 0.1]])
cpd_top20 = TabularCPD(variable='top20', variable_card=3, values=[[0.45, 0.45, 0.1]])
cpd_gpa3 = TabularCPD(variable='gpa3', variable_card=3, values=[[0.45, 0.45, 0.1]])
cpd_acttest = TabularCPD(variable='acttest', variable_card=3, values=[[0.45, 0.45, 0.1]])
cpd_age = TabularCPD(variable='age', variable_card=3, values=[[0.5, 0.5, 0]])


cpd_deanlist = TabularCPD(variable='deanlist', variable_card=3, 
                   values=[[0.1,  0.15, 0.2,  0,  0],
                           [0.75, 0.75, 0.7,  0.8,  1],
                           [0.15,  0.1,  0.1,  0.2,  0]],
                  evidence=['schooltype'],
                  evidence_card=[5])


cpd_top20 = TabularCPD(variable='top20', variable_card=3, 
                   values=[[0.2, 0.2, 0.2,  0,  0],
                           [0.7, 0.7, 0.7,  1,  1],
                           [0.1, 0.1, 0.1,  0,  0]],
                   evidence=['schooltype'],
                   evidence_card=[5])


cpd_gpa3 = TabularCPD(variable='gpa3', variable_card=3, 
                   values=[[0.3, 0.2, 0.1,  0,  0],
                           [0.6, 0.7, 0.8,  1,  1],
                           [0.1, 0.1, 0.1,  0,  0]],
                   evidence=['schooltype'],
                   evidence_card=[5])


cpd_acttest = TabularCPD(variable='acttest', variable_card=3, 
                   values=[[0,   0,   0,    0.4,  0],
                           [0.8, 0.8, 0.8,  0.6,  1],
                           [0.2, 0.2, 0.2,  0,    0]],
                   evidence=['schooltype'],
                   evidence_card=[5])


cpd_eligibility = TabularCPD(variable='eligibility', variable_card=3, 
                   values=[var_true, var_false, var_unknown],
                   evidence=['schooltype','deanlist','top20','gpa3','acttest','age'],
                   evidence_card=[5,3,3,3,3,3])


model2.add_cpds(cpd_schooltype, cpd_deanlist, cpd_top20, cpd_gpa3, cpd_acttest, cpd_age, cpd_eligibility)


                 
                
                



In [17]:
# rank entropy
# evidence is a dictionary with known states

import math
from operator import itemgetter
from collections import OrderedDict
from pgmpy.inference import VariableElimination
import copy

infer = VariableElimination(model2)


#print(infer.query(['eligibility']) ['eligibility'])

def conditional_entropy(variable, my_evidence):
    cond_entropy = 0
    raw_p = infer.query([variable]) [variable].values
    card = len(raw_p)
    temp_ev = copy.deepcopy(my_evidence)
    
    #print "length is "+str(card)
    for v in range(0,card):
        # update the variable value to v
        temp_ev[variable] = v
        #  H(Z|x_j)*p(x_j)
        cond_entropy = cond_entropy+raw_p[v]* evidence_entropy(temp_ev)

    return cond_entropy

def evidence_entropy(my_evidence):
    dis_fact = infer.query(['eligibility'], evidence=my_evidence) ['eligibility'].values
    entropy = 0
    for v in dis_fact:
        entropy =entropy - v*math.log(v+1E-10)
    return entropy

def calculate_gain(variable, my_evidence):
    H_z= evidence_entropy(my_evidence)
    H_z_x = conditional_entropy(variable, my_evidence)
#     print('old entropy' + str(H_z))
#     print('new entropy' + str(H_z_x))
    gain = H_z - H_z_x
    return gain

def rank_variable(my_evidence, variable_list):
    ranked_q = {}
    for var in variable_list:
        if var not in my_evidence:
            #print(var)
            #print(calculate_gain(var,my_evidence))
            ranked_q[var]=calculate_gain(var,my_evidence)
    return OrderedDict(sorted(ranked_q.items(), key=itemgetter(1),reverse=True))

#print(known_evidence)
#print(calculate_gain('age', known_evidence))

#variable_list = ['schooltype','deanlist','top20','gpa3','acttest','age']
#print(rank_variable(known_evidence,variable_list))


In [18]:
infer = VariableElimination(model2)

In [18]:
variable_list = ['schooltype','deanlist','top20','gpa3','acttest','age']
known_evidence={'deanlist':0}
#question_order=rank_variable(known_evidence,variable_list)

In [19]:
import cProfile
cProfile.run('rank_variable(known_evidence,variable_list)')

  phi.values = phi.values / phi.values.sum()


         46253738 function calls (44743204 primitive calls) in 66.809 seconds

   Ordered by: standard name

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        5    0.000    0.000   54.328   10.866 <ipython-input-17-12effb8d3ae3>:15(conditional_entropy)
       22    0.000    0.000   59.246    2.693 <ipython-input-17-12effb8d3ae3>:30(evidence_entropy)
        5    0.000    0.000   66.809   13.362 <ipython-input-17-12effb8d3ae3>:37(calculate_gain)
        1    0.000    0.000   66.809   66.809 <ipython-input-17-12effb8d3ae3>:45(rank_variable)
        1    0.000    0.000   66.809   66.809 <string>:1(<module>)
     2035    0.001    0.000    0.001    0.000 DiscreteFactor.py:114(scope)
       63    0.000    0.000    0.001    0.000 DiscreteFactor.py:131(get_cardinality)
       63    0.000    0.000    0.000    0.000 DiscreteFactor.py:159(<dictcomp>)
      150    0.003    0.000    0.011    0.000 DiscreteFactor.py:229(marginalize)
     1937    0.023    0.000    0.064 

In [45]:
question_order.items()

[('age', 0.32508297329144825),
 ('schooltype', 0.057212165318134889),
 ('top20', 5.5511151231257827e-17),
 ('gpa3', 0.0),
 ('acttest', -5.5511151231257827e-17)]

In [64]:
known_evidence={'age':1}
dis_fact = infer.query(['eligibility'], evidence=known_evidence) ['eligibility'].values
question_order=rank_variable(known_evidence,variable_list)

In [65]:
dis_fact

array([ 0.,  1.,  0.])

In [48]:
question_order.items()

[('schooltype', 0.03577623147404041),
 ('acttest', 2.5849394142282115e-26),
 ('deanlist', 1.2924697071141057e-26),
 ('gpa3', 0.0),
 ('top20', -1.2924697071141057e-26)]

In [42]:
variable_list = ['schooltype','deanlist','top20','gpa3','acttest','age']
known_evidence={'deanlist':0}


def get_nextquestion(my_evidence, variable_list):
    return rank_variable(known_evidence,variable_list).items()[0]

get_nextquestion(known_evidence, variable_list)

('age', 0.32508297329144825)

In [43]:
model2

<pgmpy.models.BayesianModel.BayesianModel at 0x7fc99c6ce1d0>

In [61]:
var_true

[1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
