In [113]:
import pandas as pd
import matplotlib.pyplot as plt

# Load the review sheet, keep only the rows that have a value in "Remove PII",
# and give the three reviewed columns short snake_case names.
data = (
    pd.read_excel("responses.xlsx")
    .dropna(subset=["Remove PII"])
    .rename(
        columns={
            "Remove PII": "remove_pii",
            "Symptoms, Reason to call, Rec": "get_symptoms",
            "Summarized": "summary",
        }
    )
)

# The three renamed columns; later cells treat them as pass/fail flags.
error_columns = ["remove_pii", "get_symptoms", "summary"]

In [114]:
# Interpret the three review columns as booleans.
# NOTE(review): astype('bool') uses plain truthiness, so NaN and any non-empty
# string (including "False") become True -- confirm these columns only hold
# real True/False or 0/1 values in the sheet.
data[error_columns] = data[error_columns].astype('bool')

# True only for rows in which all three columns are True.
data['no_errors'] = data[error_columns].all(axis=1)

# Preview as the cell's final expression so the notebook actually renders it;
# a bare expression in the middle of a cell is evaluated and then discarded.
data[error_columns + ['no_errors']].head()

In [115]:
class Node:
    """A named node that keeps a list of its parent nodes.

    ``name`` is the value given to the constructor; ``parents`` starts empty
    and grows through :meth:`add_parent`.
    """

    def __init__(self, name):
        self.parents = []  # filled in insertion order by add_parent()
        self.name = name

    def add_parent(self, parent):
        """Append ``parent`` to this node's ``parents`` list."""
        self.parents += [parent]
    
def create_graph():
    """Build the three-node chain remove_pii -> get_symptoms -> summary.

    Returns:
        dict mapping each node name to its ``Node`` (in chain order). Every
        node except the first has the preceding node as its only parent.
    """
    chain = ("remove_pii", "get_symptoms", "summary")
    nodes = {label: Node(label) for label in chain}

    # Link each step to the one directly before it in the chain.
    for upstream, downstream in zip(chain, chain[1:]):
        nodes[downstream].add_parent(nodes[upstream])

    return nodes

In [116]:
# Build the node-name -> Node mapping that the analysis cells below read from.
graph = create_graph()

In [235]:
import json
# Run the calculation: for every node that has a parent, compare how often the
# node fails when its first parent failed versus when that parent passed.
probabilities = {}

for node_name, node in reversed(graph.items()):
    # Skip nodes with no parents
    if not node.parents:
        continue

    # Only the first parent is looked at in this cell.
    parent = node.parents[0].name

    # Build each row mask once and reuse it.
    parent_failed = data[parent] == False
    parent_passed = data[parent] == True
    node_failed = data[node_name] == False

    # Conditional failure rates. The mean of a boolean selection is the share
    # of True values; an empty selection (the parent never failed / never
    # passed) yields NaN instead of raising ZeroDivisionError.
    fails_given_parent_fails = float(node_failed[parent_failed].mean())
    fails_given_parent_passes = float(node_failed[parent_passed].mean())

    # Check whether the node fails more with a failing parent or a passing one,
    # and attribute the problem to the parent or to the node itself accordingly.
    if pd.isna(fails_given_parent_fails) or pd.isna(fails_given_parent_passes):
        # One of the two groups has no rows, so there is nothing to compare.
        problem = "undetermined"
    elif fails_given_parent_passes > fails_given_parent_fails:
        problem = "self"
    else:
        problem = "parent"

    # Store the probabilities (None -> JSON null when a rate is undefined).
    probabilities[node_name] = {
        "prob_of_node_failing_with_parent_failing": (
            None if pd.isna(fails_given_parent_fails) else round(fails_given_parent_fails, 6)
        ),
        "Prob_of_node_failing_with_parent_passing": (
            None if pd.isna(fails_given_parent_passes) else round(fails_given_parent_passes, 6)
        ),
        "Problem": problem,
    }

print(json.dumps(probabilities, indent=4))


{
    "summary": {
        "prob_of_node_failing_with_parent_failing": 0.789474,
        "Prob_of_node_failing_with_parent_passing": 0.02,
        "Problem": "parent"
    },
    "get_symptoms": {
        "prob_of_node_failing_with_parent_failing": 0.4,
        "Prob_of_node_failing_with_parent_passing": 0.179487,
        "Problem": "parent"
    }
}


## Algorithm

In [236]:
def calculate_probabilities(node, data, dependency):
    """Return how often ``node`` fails when ``dependency`` fails vs. passes.

    Args:
        node: Name of the column in ``data`` holding the node's results.
        data: DataFrame with one column per node. A value equal to ``False``
            counts as a failure and a value equal to ``True`` as a pass.
        dependency: Object whose ``name`` attribute is the dependency's column
            name in ``data``.

    Returns:
        Tuple ``(p_fail_given_dep_fails, p_fail_given_dep_passes)``. An entry
        is NaN when the corresponding group (dependency failed / dependency
        passed) contains no rows.
    """
    node_failed = data[node] == False
    dep_failed = data[dependency.name] == False
    dep_passed = data[dependency.name] == True

    def _failure_share(group):
        # Share of rows in `group` where the node failed; NaN for an empty
        # group so callers get a defined value without a 0/0 RuntimeWarning.
        size = int(group.sum())
        if size == 0:
            return float("nan")
        return int((node_failed & group).sum()) / size

    return _failure_share(dep_failed), _failure_share(dep_passed)

def calculate_impact(node, data, dependencies):
    """Compute failure-rate differences for ``node`` and each dependency.

    Args:
        node: Name of the column in ``data`` holding the node's results.
        data: DataFrame with one column per node; values equal to ``False``
            count as failures.
        dependencies: Iterable of objects exposing a ``name`` attribute that
            is a column of ``data``.

    Returns:
        dict with one entry per dependency name,
        ``P(node fails | dep fails) - P(node fails | dep passes)``, plus an
        entry keyed by ``node``,
        ``P(node fails | any dep fails) - P(node fails | no dep fails)``.
        All values are rounded to 6 decimals; a value is NaN when one of the
        groups it compares is empty.
    """
    impacts = {}

    for dep in dependencies:
        fails_if_dep_fails, fails_if_dep_passes = calculate_probabilities(node, data, dep)
        impacts[dep.name] = round(fails_if_dep_fails - fails_if_dep_passes, 6)

    dep_columns = data[[dep.name for dep in dependencies]]
    node_failed = data[node] == False

    # "No dependency fails" means every dependency column is truthy; "any
    # dependency fails" is its complement. (Testing `.any(axis=1) == False`
    # would select only the rows where *all* dependencies fail.)
    no_dep_fails = dep_columns.all(axis=1)
    any_dep_fails = ~no_dep_fails

    def _failure_share(group):
        # Share of rows in `group` where the node failed; NaN for an empty group.
        size = int(group.sum())
        if size == 0:
            return float("nan")
        return int((node_failed & group).sum()) / size

    # Entry stored under the node's own name.
    # NOTE(review): with exactly one boolean dependency this compares the same
    # two groups as that dependency's entry, so both values are equal.
    impact_node = _failure_share(any_dep_fails) - _failure_share(no_dep_fails)
    impacts[node] = round(impact_node, 6)

    return impacts

def improve_system(node: str, data, graph):
    """Walk up the dependency graph to find the node to focus on.

    Starting at ``node``, the dependency with the largest impact (as returned
    by ``calculate_impact``) is selected. If the node's own impact is strictly
    greater than that dependency's impact the walk stops at ``node``;
    otherwise it continues from the selected dependency. Progress is printed
    at every step.

    Args:
        node: Name of the node to start from; must be a key of ``graph``.
        data: DataFrame passed through to ``calculate_impact``.
        graph: Mapping of node name to an object with ``name`` and ``parents``
            attributes.

    Returns:
        The bare node name when the walk reaches a node without dependencies,
        or a ``(node_name, impact)`` tuple when it stops at a node whose own
        impact exceeds that of its strongest dependency.
    """
    dependencies = list(graph[node].parents)

    print(f"Node: {node}")
    print(f"dependencies: {[dep.name for dep in dependencies]}")

    if not dependencies:
        return node  # No dependencies, focus on improving the node itself

    impacts = calculate_impact(node, data, dependencies)

    # Choose among the dependency objects rather than the keys of `impacts`:
    # the keys are plain strings (no `.name`) and also include `node` itself.
    D_max = max(dependencies, key=lambda dep: impacts[dep.name])

    if impacts[node] > impacts[D_max.name]:
        return node, impacts[node]  # Focus on improving the node itself

    print("-"*100)
    return improve_system(node=D_max.name, data=data, graph=graph)

# Start the walk at the 'summary' node and print whatever improve_system returns.
print(f"Focus on node: {improve_system(node='summary', data=data, graph=graph)}")


Node: summary
dependencies: ['get_symptoms']
0.8094736842105263
----------------------------------------------------------------------------------------------------
Node: get_symptoms
dependencies: ['remove_pii']
0.5794871794871795
----------------------------------------------------------------------------------------------------
Node: remove_pii
dependencies: []
Focus on node: remove_pii
