In [8]:
import numpy  as np
import pandas as pd
import scipy  as sp

In [9]:
path       = 'c:/Users/byecs/Documents/GitHub/BlogWyrm/Posts/2020/July/Aristotle2Digital/'
data       = 'Gemstone Data.txt'
attributes = ['color','size','shape']
df         = pd.read_csv(path+data,names=attributes+['authenticity'])

In [10]:
def find_distinct_attributes(df,attributes_lst):
    distinct_attributes = {}
    for attribute in attributes_lst:
        distinct_attributes[attribute] = set(df[attribute])

    return distinct_attributes

In [11]:
def calculate_Laplace_smoothed_marginals(df,distinct_attributes):
    #initialize the marginals
    marginals = {}
    for attribute_type in distinct_attributes:
        for attribute in distinct_attributes[attribute_type]:
            marginals[attribute] = np.array([1,1])  #initializing to [1,1] implements Laplace smoothing
            
    for attribute_type in distinct_attributes:
        for attribute, authenticity in zip(df[attribute_type],df['authenticity']):
            marginals[attribute][authenticity] += 1
            
    return marginals    

In [12]:
def characterize_data_set(df):
    fake_label           = 0
    true_label           = 1
    summary              = {}
    authenticity_data    = df['authenticity']
    counts               = authenticity_data.value_counts()
    
    summary['num terms'] = counts[fake_label] + counts[true_label]
    summary['num fake']  = counts[fake_label]
    summary['num true']  = counts[true_label]
    
    return summary

In [13]:
def calculate_evidence(distinct_attributes,smoothed_marginals,summary,sample_attributes):
    fake_label           = 0
    true_label           = 1
    num_attributes       = len(distinct_attributes)
    
    prob_fake            = summary['num fake']/summary['num terms']
    prob_true            = summary['num true']/summary['num terms']
    smoothed_num_fake    = summary['num fake'] + num_attributes
    smoothed_num_true    = summary['num true'] + num_attributes
    
    sample_evidence_fake = 1
    for attribute in sample_attributes:
        sample_evidence_fake *= smoothed_marginals[attribute][fake_label]/smoothed_num_fake
    sample_evidence_fake *= prob_fake
    
    sample_evidence_true = 1
    for attribute in sample_attributes:
        sample_evidence_true *= smoothed_marginals[attribute][true_label]/smoothed_num_true
    sample_evidence_true *= prob_true
    
    normalization = sample_evidence_fake + sample_evidence_true
    
    return sample_evidence_fake/normalization, sample_evidence_true/normalization

In [14]:
distinct_attributes = find_distinct_attributes(df,attributes)
smoothed_marginals  = calculate_Laplace_smoothed_marginals(df,distinct_attributes)
summary             = characterize_data_set(df)

In [15]:
calculate_evidence(distinct_attributes,smoothed_marginals,summary,['Cyan','Small','Twisted'])

(0.3854670113521412, 0.6145329886478589)

In [83]:

df['color'].value_counts()

Dune    20
Aqua     9
Cyan     6
Blue     5
Name: color, dtype: int64