In [22]:
from itertools import product
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [5]:
# Load the train and test splits, then stack them row-wise into one frame.
tr = pd.read_csv("input/cmc_train.csv")
ts = pd.read_csv("input/cmc_test.csv")
df = pd.concat([tr, ts], axis=0)

# Recode the numeric 'contraceptive' codes into readable labels in one pass.
df['contraceptive'] = df['contraceptive'].replace(
    {1: 'no-use', 2: 'long-term', 3: 'short-term'}
)

In [6]:
def is_var(data, name):
    """Tell whether ``name`` is one of the column labels of ``data``."""
    known_columns = data.columns
    return name in known_columns

In [9]:
def get_domain(data, var_name):
    """Return the distinct values found in column ``var_name`` of ``data``.

    The result is whatever ``Series.unique()`` produces for that column.
    When ``var_name`` is not a column of ``data`` the function returns ``None``.
    """
    if not is_var(data, var_name):
        return None
    column = data[var_name]
    return column.unique()

In [19]:
def prob_dist(data, var_name):
    """Relative-frequency table for a single variable.

    Returns a DataFrame with a ``'p'`` column (count of each observed value
    divided by the sum of all counts) and a ``var_name`` column holding the
    labels ``'<var_name>_<value>'``. Returns ``None`` when ``var_name`` is not
    a column of ``data``.
    """
    if not is_var(data, var_name):
        return None

    column = data[var_name]
    domain = get_domain(data, var_name)

    # One label and one row count per observed value, in domain order.
    labels = ['{}_{}'.format(var_name, value) for value in domain]
    counts = [data[column == value].shape[0] for value in domain]
    n_counted = sum(counts)

    return pd.DataFrame({
        'p': [count / n_counted for count in counts],
        var_name: labels,
    })

# Example: distribution of wifes_age over the training split.
prob_dist(tr, 'wifes_age')

Unnamed: 0,p,wifes_age
0,0.237436,wifes_age_1
1,0.251275,wifes_age_4
2,0.264385,wifes_age_3
3,0.246905,wifes_age_2


In [98]:
def get_events_names(events):
    """Turn ``(var_name, var_domain)`` pairs into ``'<var_name>_<var_domain>'`` labels.

    Returns a list with one label per pair, in input order.
    """
    names = []
    for var_name, var_domain in events:
        names.append('{}_{}'.format(var_name, var_domain))
    return names

def get_events_expr(events):
    """Build a ``DataFrame.query`` expression matching every given event.

    Parameters
    ----------
    events : iterable of (var_name, var_domain) pairs
        Each pair becomes one ``(var_name == var_domain)`` clause.

    Returns
    -------
    str
        The clauses joined with ``' & '``; an empty string when ``events``
        is empty.
    """
    clauses = []
    for var_name, var_domain in events:
        if isinstance(var_domain, str):
            # A bare string would be parsed by query() as identifiers or
            # arithmetic (e.g. no-use -> `no - use`), so emit it as a quoted
            # literal. Non-string values keep their plain formatting.
            var_domain = repr(str(var_domain))
        clauses.append('({} == {})'.format(var_name, var_domain))
    return ' & '.join(clauses)

In [51]:
def joint_dist(data, var_names):
    """Joint relative-frequency table over several variables.

    Parameters
    ----------
    data : DataFrame
    var_names : list of str
        Column names to combine.

    Returns
    -------
    DataFrame or None
        One row per element of the Cartesian product of the variables'
        observed domains. Column ``'p'`` holds the row count for that
        combination divided by the sum of all counts; every other column is
        named after a variable and holds its ``'<var_name>_<value>'`` label.
        ``None`` when ``var_names`` is not a list or names an unknown column.
    """
    if not isinstance(var_names, list):
        return None

    if not all(is_var(data, var_name) for var_name in var_names):
        return None

    prob_key = 'p'
    table = {prob_key: []}

    vars_domains = []
    for var_name in var_names:
        table[var_name] = []
        var_domain = get_domain(data, var_name)
        vars_domains.append(list(product([var_name], var_domain)))

    for events in product(*vars_domains):
        # Each event carries its own variable name, so file the label under
        # exactly that key. Locating the key by substring match would send
        # labels of any variable whose name contains the letter 'p' into the
        # probability column.
        for var_name, var_domain in events:
            table[var_name].append('{}_{}'.format(var_name, var_domain))

        expr = get_events_expr(events)
        table[prob_key].append(data.query(expr).shape[0])

    # Normalise the raw counts into probabilities.
    total = sum(table[prob_key])
    table[prob_key] = [frequency / total for frequency in table[prob_key]]

    return pd.DataFrame(table)

# Example: joint distribution of four variables over the training split.
joint_dist(tr, ['wifes_age', 'sol', 'wifes_rel', 'n_children'])

Unnamed: 0,n_children,p,sol,wifes_age,wifes_rel
0,n_children_3,0.010925,sol_3,wifes_age_1,wifes_rel_1
1,n_children_5,0.000728,sol_3,wifes_age_1,wifes_rel_1
2,n_children_0,0.006555,sol_3,wifes_age_1,wifes_rel_1
3,n_children_1,0.031318,sol_3,wifes_age_1,wifes_rel_1
4,n_children_2,0.021122,sol_3,wifes_age_1,wifes_rel_1
5,n_children_4,0.004370,sol_3,wifes_age_1,wifes_rel_1
6,n_children_3,0.000000,sol_3,wifes_age_1,wifes_rel_0
7,n_children_5,0.000000,sol_3,wifes_age_1,wifes_rel_0
8,n_children_0,0.000000,sol_3,wifes_age_1,wifes_rel_0
9,n_children_1,0.002185,sol_3,wifes_age_1,wifes_rel_0


In [86]:
def cond_dist(data, hypothesis, evidencies):
    """Conditional relative-frequency table of ``hypothesis`` given evidence variables.

    Parameters
    ----------
    data : DataFrame
    hypothesis : str
        Column whose distribution is computed.
    evidencies : list of str
        Columns that are conditioned on.

    Returns
    -------
    DataFrame or None
        For every combination of observed evidence values, one row per
        observed hypothesis value. Column ``'p'`` is the share of rows
        matching the evidence that also have that hypothesis value, or NaN
        when no row matches the evidence combination. The remaining columns
        hold ``'<var_name>_<value>'`` labels. ``None`` when ``hypothesis`` is
        not a column, ``evidencies`` is not a list, or it names an unknown
        column.
    """
    if not is_var(data, hypothesis):
        return None

    if not isinstance(evidencies, list):
        return None

    if not all(is_var(data, var_name) for var_name in evidencies):
        return None

    prob_key = 'p'
    table = {prob_key: [], hypothesis: []}

    vars_domains = []
    for var_name in evidencies:
        table[var_name] = []
        var_domain = get_domain(data, var_name)
        vars_domains.append(list(product([var_name], var_domain)))

    # The hypothesis domain does not depend on the evidence; compute it once.
    hypothesis_domain = get_domain(data, hypothesis)

    for events in product(*vars_domains):
        matching = data.query(get_events_expr(events))
        total = matching.shape[0]

        for var_domain in hypothesis_domain:
            table[hypothesis].append('{}_{}'.format(hypothesis, var_domain))

            # File each evidence label under its exact variable name rather
            # than searching the keys by substring, which misroutes labels of
            # variables containing 'p' into the probability column.
            for evidence_name, evidence_domain in events:
                table[evidence_name].append(
                    '{}_{}'.format(evidence_name, evidence_domain)
                )

            frequency = matching[matching[hypothesis] == var_domain].shape[0]
            # An evidence combination with no rows has no defined conditional
            # probability; record NaN instead of dividing by zero.
            table[prob_key].append(frequency / total if total else float('nan'))

    return pd.DataFrame(table)

# Example: distribution of wifes_age conditioned on wifes_working and wifes_rel (training split).
cond_dist(tr, 'wifes_age', ['wifes_working', 'wifes_rel'])

Unnamed: 0,p,wifes_age,wifes_rel,wifes_working
0,0.288136,wifes_age_1,wifes_rel_1,wifes_working_1
1,0.238418,wifes_age_4,wifes_rel_1,wifes_working_1
2,0.228249,wifes_age_3,wifes_rel_1,wifes_working_1
3,0.245198,wifes_age_2,wifes_rel_1,wifes_working_1
4,0.099291,wifes_age_1,wifes_rel_0,wifes_working_1
5,0.368794,wifes_age_4,wifes_rel_0,wifes_working_1
6,0.361702,wifes_age_3,wifes_rel_0,wifes_working_1
7,0.170213,wifes_age_2,wifes_rel_0,wifes_working_1
8,0.19573,wifes_age_1,wifes_rel_1,wifes_working_0
9,0.238434,wifes_age_4,wifes_rel_1,wifes_working_0


In [96]:
def marginal_dist(data, var_names):
    """Marginal distribution of each variable, derived from their joint table.

    Parameters
    ----------
    data : DataFrame
    var_names : list of str
        Columns passed on to ``joint_dist``.

    Returns
    -------
    list of DataFrame, or None
        One frame per entry of ``var_names`` (same order), each with the
        variable's label column and the summed ``'p'`` column. ``None`` when
        ``joint_dist`` rejects the input.
    """
    prob_key = 'p'
    joint = joint_dist(data, var_names)
    if joint is None:
        # joint_dist signals invalid input by returning None; follow the same
        # convention instead of failing on `None.groupby`.
        return None

    return [
        joint.groupby(var_name)[prob_key].sum().reset_index()
        for var_name in var_names
    ]

# Example: per-variable marginals of husbands_edu and husbands_occ (training split).
marginal_dist(tr, ['husbands_edu', 'husbands_occ'])

Unnamed: 0,husbands_edu,p
0,husbands_edu_1,0.029862
1,husbands_edu_2,0.123816
2,husbands_edu_3,0.237436
3,husbands_edu_4,0.608886


In [121]:
def is_domain_of(data, var_name, var_domain):
    """Tell whether ``var_domain`` is one of the values observed for column ``var_name``.

    Returns ``False`` without inspecting any values when ``var_name`` is not a
    column of ``data``.
    """
    if not is_var(data, var_name):
        return False
    observed_values = get_domain(data, var_name)
    return var_domain in observed_values

In [123]:
def get_var_domain(event):
    """Split an event label such as ``'wifes_rel_1'`` into ``('wifes_rel', 1)``.

    Everything before the last underscore is the variable name; the text after
    it is converted with ``int``, so a non-integer suffix raises ``ValueError``.
    """
    var_name, _, raw_domain = event.rpartition('_')
    return var_name, int(raw_domain)

In [124]:
def prob(data, event):
    """Probability of a single event label, e.g. ``'wifes_rel_1'``.

    Parameters
    ----------
    data : DataFrame
    event : str
        Label of the form ``'<var_name>_<integer value>'``.

    Returns
    -------
    float or None
        The ``'p'`` entry of ``prob_dist`` for that label, or ``None`` when
        the variable is not a column of ``data`` or the value was never
        observed for it.
    """
    prob_key = 'p'
    var_name, var_domain = get_var_domain(event)

    if not is_domain_of(data, var_name, var_domain):
        return None

    dist = prob_dist(data, var_name)
    # Take the match by position: the filtered row keeps its label from the
    # full table, so looking up label 0 only works for the table's first row.
    return dist.loc[dist[var_name] == event, prob_key].iloc[0]

# Example: P(wifes_rel = 1) on the training split.
prob(tr, 'wifes_rel_1')

0.8492352512745812

In [166]:
def cond_prob(data, hypothesis, evidencies):
    """Conditional probability of one event given a set of evidence events.

    Parameters
    ----------
    data : DataFrame
    hypothesis : str
        Event label of the form ``'<var_name>_<integer value>'``.
    evidencies : iterable of str
        Event labels, in the same form, that are conditioned on.

    Returns
    -------
    float or None
        The ``'p'`` entry of ``cond_dist`` for the requested combination.
        ``None`` when the hypothesis or any evidence refers to an unknown
        column or to a value never observed for it, or when the table has no
        row for the requested labels.
    """
    hypothesis_var_name, hypothesis_var_domain = get_var_domain(hypothesis)
    if not is_domain_of(data, hypothesis_var_name, hypothesis_var_domain):
        return None

    prob_key = 'p'
    evidencies_vars = []
    # Row filter over the label columns of the conditional table.
    clauses = ["({} == '{}')".format(hypothesis_var_name, hypothesis)]
    for evidence in list(evidencies):
        evidence_var_name, evidence_var_domain = get_var_domain(evidence)
        # Validate evidence the same way as the hypothesis, so an unknown
        # column or unobserved value gives None rather than a later failure.
        if not is_domain_of(data, evidence_var_name, evidence_var_domain):
            return None
        evidencies_vars.append(evidence_var_name)
        clauses.append("({} == '{}')".format(evidence_var_name, evidence))

    dist = cond_dist(data, hypothesis_var_name, evidencies_vars)

    match = dist.query(' & '.join(clauses))
    if match.empty:
        # The table's labels did not line up with the requested event names.
        return None
    return match[prob_key].iloc[0]
    
# Example: P(wifes_working = 1 | wifes_rel = 1, wifes_edu = 1) on the training split.
cond_prob(tr, 'wifes_working_1', ['wifes_rel_1', 'wifes_edu_1'])

0.7647058823529411