In [1]:
# packages
import seaborn as sns
import pandas as pd
import numpy as np

# load the penguins example dataset bundled with seaborn
peng = sns.load_dataset('penguins', cache=True, data_home=None)
print(peng.shape)

# number of equal-width bins each continuous variable is cut into
levels = 2

# names of the continuous (float64) columns that will be made categorical
cont_cols = peng.select_dtypes('float64').columns.tolist()

# replace every continuous column by its binned, categorical version;
# all other columns are carried over unchanged
peng = peng.assign(**{name: pd.cut(peng[name], levels) for name in cont_cols})

(344, 7)


In [2]:
# joint probs for only two variables
def joint_probs(DF, index, cols, decimals=3):
    """
    Estimate the joint probability table of categorical variables.

    Parameters
    ----------
    DF : pandas.DataFrame
        Data with one row per observation.
    index : list
        Column name(s) of DF laid out along the rows of the result.
    cols : list
        Column name(s) of DF laid out along the columns of the result.
    decimals : int or None, default 3
        Number of decimals the probabilities are rounded to. Pass None to
        get unrounded values, e.g. when the table feeds further calculations.

    Returns
    -------
    pandas.DataFrame
        Relative frequency of every (index, cols) combination; combinations
        that never occur are reported as 0.

    Notes
    -----
    Rows with a missing value in any of the selected columns are excluded,
    and the counts are normalised by the number of rows that were actually
    counted. The table therefore sums to 1 (up to rounding) even when the
    data contains missing values.
    """
    all_cols = index + cols

    # Drop incomplete rows explicitly so the counted rows and the
    # normalising constant below are guaranteed to agree.
    complete = DF[all_cols].dropna()

    joint_counts = pd.pivot_table(
        complete, index=index, columns=cols, aggfunc='size'
    ).fillna(0)

    # Normalise by the rows that entered the table, not by DF.shape[0]:
    # the latter also counts rows with missing keys and would make the
    # probabilities sum to less than 1.
    n_counted = joint_counts.to_numpy().sum()
    joint_prob = joint_counts / n_counted

    if decimals is None:
        return joint_prob
    return joint_prob.round(decimals)

In [3]:
# joint distribution of species (rows) and island (columns)
JP = joint_probs(DF=peng, index=['species'], cols=['island'])
print(JP, '\n')

island     Biscoe  Dream  Torgersen
species                            
Adelie      0.128  0.163      0.151
Chinstrap   0.000  0.198      0.000
Gentoo      0.360  0.000      0.000 



In [4]:
# conditional probs

def cond_prob_dist(joint_probs):
    """
    Turn a joint probability table into the conditional distribution P(A | B).

    Applies P(A | B) = P(A and B) / P(B)
    (https://en.wikipedia.org/wiki/Conditional_probability) with

        A = {index   of joint_probs} = {a1, a2, .. an}
        B = {columns of joint_probs} = {b1, b2, .. bm}

    Parameters
    ----------
    joint_probs : pandas.DataFrame
        Joint probability table; rows are the outcomes of A and columns
        the outcomes of B. The input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        P(A | B) rounded to 3 decimals. Row labels are rewritten as
        'a<i> = <label>' and column labels as 'b<j> = <label>'.
    """
    # marginal P(B): the total of every column of the joint table
    marginal_b = joint_probs.sum(axis=0)

    # divide each column by its own total (aligned on the column labels)
    conditional = joint_probs.div(marginal_b, axis='columns')

    # tag the labels with their position in A / B
    conditional.columns = [
        f'b{pos} = {label}'
        for pos, label in enumerate(conditional.columns, start=1)
    ]
    conditional.index = [
        f'a{pos} = {label}'
        for pos, label in enumerate(conditional.index, start=1)
    ]

    return conditional.round(3)

In [5]:
# P(species | island): every island column is normalised by its own total
print(cond_prob_dist(JP), '\n\n')

                b1 = Biscoe  b2 = Dream  b3 = Torgersen
a1 = Adelie           0.262       0.452             1.0
a2 = Chinstrap        0.000       0.548             0.0
a3 = Gentoo           0.738       0.000             0.0 




In [6]:
# P(island | species): condition on the other variable by transposing the
# joint table, then transpose back so species stay on the rows.
# Because of the double transpose the 'a'/'b' label prefixes end up swapped
# (rows carry 'b', columns carry 'a').
island_given_species = cond_prob_dist(JP.T)
print(island_given_species.T)

                a1 = Biscoe  a2 = Dream  a3 = Torgersen
b1 = Adelie            0.29       0.369           0.342
b2 = Chinstrap         0.00       1.000           0.000
b3 = Gentoo            1.00       0.000           0.000
