In [1]:
import numpy as np
import pandas as pd

In [2]:
data_location = "../data/compas-scores-two-years.csv"
compas_score_full = pd.read_csv(data_location)

def data_prep(compas_score):
        compas_score["sex"].replace({'Male': 1, 'Female': 0}, inplace=True)
        # compas_score["is_recid"].replace({0: False, 1: True}, inplace=True)
        compas_score = compas_score.drop(
            ['last', 'first', 'out_custody', 'in_custody', 'c_offense_date', 'decile_score.1', 'priors_count.1',
             'c_case_number', 'start', 'end', 'event', 'screening_date', 'c_case_number',
             'juv_other_count', 'juv_misd_count', 'juv_fel_count', 'r_days_from_arrest', 'id', 'r_charge_degree',
             'r_offense_date', 'vr_case_number', 'r_case_number', 'r_jail_out', 'c_arrest_date', 'r_charge_desc',
             'r_jail_in', 'violent_recid', 'vr_charge_degree', 'vr_offense_date', 'vr_charge_desc'], axis=1)
        compas_score['c_jail_in'] = pd.to_datetime(compas_score['c_jail_in'])
        compas_score['c_jail_out'] = pd.to_datetime(compas_score['c_jail_out'])
        compas_score['days_in_jail'] = round(abs((compas_score['c_jail_out'] - compas_score['c_jail_in']).dt.days),2)
        # TODO: mean modeling should only be done after train test splitting not before
        for col in compas_score.columns:
            if compas_score[col].dtype == "object":
                compas_score[col] = compas_score[col].fillna("UNKNOWN")
            else:
                compas_score[col] = compas_score[col].fillna(compas_score[col].mean())

        compas_score = compas_score[compas_score['days_b_screening_arrest'] <= 30]
        compas_score = compas_score[compas_score['days_b_screening_arrest'] >= -30]
        compas_score = compas_score[compas_score['is_recid'] != -1]
        compas_score = compas_score[compas_score['c_charge_degree'] != 'O']
        compas_score = compas_score[compas_score['score_text'] != 'N/A']
        # X = compas_score[['days_in_jail', 'age', 'decile_score', 'priors_count', 'c_days_from_compas', 'is_violent_recid',
        #             'v_decile_score']]
        # y = compas_score['is_recid']
        data = compas_score[['days_in_jail', 'age', 'decile_score', 'priors_count', 'c_days_from_compas', 'is_violent_recid',
                    'v_decile_score','is_recid']]
        return data

# 

In [3]:
pdf_data = data_prep(compas_score_full)
pdf_data.head()

Unnamed: 0,days_in_jail,age,decile_score,priors_count,c_days_from_compas,is_violent_recid,v_decile_score,is_recid
0,0.0,69,1,0,1.0,0,1,0
1,10.0,34,3,0,1.0,1,1,1
2,1.0,24,4,4,1.0,0,3,1
3,18.295063,23,8,1,1.0,0,6,0
4,18.295063,43,1,2,76.0,0,1,0


In [31]:
pdf_data.shape

(6479, 8)

# 'pgmpy' implementation

In [4]:
from pgmpy.models import BayesianModel, BayesianNetwork
from pgmpy.estimators import MaximumLikelihoodEstimator

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Define the structure of the Bayesian network
structure = [("days_in_jail", "is_recid"), ("age", "is_recid"), ("decile_score", "is_recid"), ("priors_count", "is_recid"), ("c_days_from_compas", "is_recid"),("is_violent_recid", "is_recid"), ("v_decile_score", "is_recid")]
b_net = BayesianNetwork(structure)

In [54]:
# b_net.check_model()
pdf_data_sample = pdf_data.head(50)
pdf_data_sample.shape

(50, 8)

In [55]:
# Assume df is your dataset with 5 features (X1, X2, X3, X4, X5) and 1 target (Y)
# Estimate the parameters of the model using Maximum Likelihood Estimation

b_net.fit(pdf_data_sample, estimator=MaximumLikelihoodEstimator) #R maximm likelihood might not be right##

In [40]:
b_net.get_cpds()

[<TabularCPD representing P(days_in_jail:9) at 0x29c4a51d0>,
 <TabularCPD representing P(is_recid:2 | age:14, c_days_from_compas:5, days_in_jail:9, decile_score:7, is_violent_recid:2, priors_count:10, v_decile_score:6) at 0x29c125310>,
 <TabularCPD representing P(age:14) at 0x29c38c310>,
 <TabularCPD representing P(decile_score:7) at 0x29bff0bd0>,
 <TabularCPD representing P(priors_count:10) at 0x29c720a50>,
 <TabularCPD representing P(c_days_from_compas:5) at 0x29c71ea10>,
 <TabularCPD representing P(is_violent_recid:2) at 0x29c71f110>,
 <TabularCPD representing P(v_decile_score:6) at 0x29c71dcd0>]

In [6]:
import time 

In [7]:
# Time the experiments 
print("Time taken to train Baysian Network")
data_size = [10,20, 50,100, 1000, 5000]
for i in data_size:
    start_time = time.time()
    pdf_data_sample = pdf_data.head(i)
    b_net = BayesianNetwork(structure)
    b_net.fit(pdf_data_sample, estimator=MaximumLikelihoodEstimator) #R maximm likelihood might not be right##
    print(f"\n data samples: {i}, time taken: {time.time()-start_time}")

Time taken to train Baysian Network

 data samples: 10, time taken: 3.363227128982544

 data samples: 20, time taken: 1.9421329498291016

 data samples: 50, time taken: 28.358377933502197

 data samples: 100, time taken: 165.75863695144653


TerminatedWorkerError: A worker process managed by the executor was unexpectedly terminated. This could be caused by a segmentation fault while calling the function or by an excessive memory usage causing the Operating System to kill the worker.

The exit codes of the workers are {SIGKILL(-9)}

In [57]:
# print("CPDs:")
# for variable in b_net.get_cpds():
#     print(variable)``
#     # print(b_net.get_cpds(variable))

CPDs:
+---------------------------------+------+
| days_in_jail(0.0)               | 0.3  |
+---------------------------------+------+
| days_in_jail(1.0)               | 0.32 |
+---------------------------------+------+
| days_in_jail(2.0)               | 0.04 |
+---------------------------------+------+
| days_in_jail(3.0)               | 0.06 |
+---------------------------------+------+
| days_in_jail(4.0)               | 0.02 |
+---------------------------------+------+
| days_in_jail(6.0)               | 0.04 |
+---------------------------------+------+
| days_in_jail(10.0)              | 0.02 |
+---------------------------------+------+
| days_in_jail(11.0)              | 0.02 |
+---------------------------------+------+
| days_in_jail(14.0)              | 0.02 |
+---------------------------------+------+
| days_in_jail(17.0)              | 0.02 |
+---------------------------------+------+
| days_in_jail(18.29506297958593) | 0.04 |
+---------------------------------+------+
| day

KeyboardInterrupt: 

In [41]:
predict_data = pdf_data_sample.copy()
predict_data.head()

Unnamed: 0,days_in_jail,age,decile_score,priors_count,c_days_from_compas,is_violent_recid,v_decile_score,is_recid
0,0.0,69,1,0,1.0,0,1,0
1,10.0,34,3,0,1.0,1,1,1
2,1.0,24,4,4,1.0,0,3,1
3,18.295063,23,8,1,1.0,0,6,0
4,18.295063,43,1,2,76.0,0,1,0


In [42]:
# inference on the same data as training 
pdf_data_sample = pdf_data.head(20)
predict_data = pdf_data_sample.copy()
predict_data.drop('is_recid', axis=1, inplace=True)
y_pred = b_net.predict(predict_data)
y_pred_prob = b_net.predict_probability(predict_data)

  0%|          | 0/19 [00:00<?, ?it/s]

100%|██████████| 19/19 [00:00<00:00, 63.72it/s]


In [45]:
y_pred 

Unnamed: 0,is_recid
0,0
1,1
2,1
3,0
4,0
5,0
6,1
7,0
8,0
9,0


In [46]:
y_pred_prob

Unnamed: 0,is_recid_0,is_recid_1
0,1.0,0.0
1,0.0,1.0
2,0.0,1.0
3,1.0,0.0
4,1.0,0.0
5,1.0,0.0
6,0.0,1.0
7,1.0,0.0
8,1.0,0.0
10,1.0,0.0


In [48]:
pdf_data_sample = pdf_data.head(30)
predict_data = pdf_data_sample.copy()
predict_data.drop('is_recid', axis=1, inplace=True)
y_pred = b_net.predict(predict_data)
y_pred_prob = b_net.predict_probability(predict_data)

100%|██████████| 29/29 [00:02<00:00,  9.86it/s]


IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

In [None]:
y_pred_prob

# 'pomegranate-1.0.3' implementation

In [68]:
from pomegranate.distributions import Categorical
from pomegranate.distributions import ConditionalCategorical, DiscreteDistribution
from pomegranate.bayesian_network import BayesianNetwork

ImportError: cannot import name 'DiscreteDistribution' from 'pomegranate.distributions' (/Users/rijulizer/miniconda3/lib/python3.11/site-packages/pomegranate/distributions/__init__.py)

In [60]:
pdf_data.head()

Unnamed: 0,days_in_jail,age,decile_score,priors_count,c_days_from_compas,is_violent_recid,v_decile_score,is_recid
0,0.0,69,1,0,1.0,0,1,0
1,10.0,34,3,0,1.0,1,1,1
2,1.0,24,4,4,1.0,0,3,1
3,18.295063,23,8,1,1.0,0,6,0
4,18.295063,43,1,2,76.0,0,1,0


In [64]:
pdf_data_sample = pdf_data.head(50)
X = pdf_data_sample[['days_in_jail', 'age', 'decile_score', 'priors_count', 'c_days_from_compas', 'is_violent_recid',
                    'v_decile_score']]
Y = pdf_data_sample['is_recid']

In [67]:
# Define the structure of the Bayesian network
model = BayesianNetwork()

# Add nodes (variables) to the network
for i in range(X.shape[1]):
    model.add_node(DiscreteDistribution.from_samples(X[:, i], name=f'X{i}'))

model.add_node(DiscreteDistribution.from_samples(Y, name='Y'))

# Add edges (dependencies) to the network
for i in range(X.shape[1]):
    model.add_edge(f'X{i}', 'Y')

# Bake the network
model.bake()

NameError: name 'DiscreteDistribution' is not defined

In [58]:


# Perform inference (get the probability distribution for the target variable given evidence)
# In this example, we provide values for the first three features (X0, X1, X2)
evidence = [0, 1, 0, None, None]  # None represents variables without evidence
predicted_distribution = model.predict_proba([evidence])

# Print or use the results as needed
print("Predicted Probability Distribution for Y:", predicted_distribution)

# Open Questions

In [None]:
#Questioins:
# do we experiment with defining the graph(DAG) or is it ok with the curret structure
# how to incorporate the concept of distance in BN (Repeating the dataset? does it affect)
# how to get the interpretability factor? (There are no model weights to express feature importance)

# How to resourceful is the library any suggestions on library and implementation?
# How to handle the problem with unknown feature values