In [9]:
import pandas as pd
import matplotlib.pyplot as plt

data = pd.read_excel("responses.xlsx")
data = data[~data['Remove PII -4'].isna()]


data = data.rename(columns={
                        'Remove PII - 1': 'pii_node_1',
                        'Remove PII-2': 'pii_node_2',
                        'Remove PII-3': 'pii_node_3',
                        'Remove PII -4': 'pii_node_4',
                        "Symptoms, Reason to call, Rec": "get_symptoms",
                        "Summarized": "summary"
                        })

error_columns = ['pii_node_1', 'pii_node_2', 'pii_node_3', 'pii_node_4', 'get_symptoms', 'summary']

In [10]:
data[error_columns] = data[error_columns].astype('bool')

data[error_columns].head()

data['no_errors'] = data[error_columns].all(axis=1)

In [14]:
class Node:
    def __init__(self, name):
        self.name = name
        self.parents = []

    def add_parent(self, parent):
        self.parents.append(parent)
    
def create_graph():
    nodes = {
        "pii_node_1": Node("pii_node_1"),
        "pii_node_2": Node("pii_node_2"),
        "pii_node_3": Node("pii_node_3"),
        "pii_node_4": Node("pii_node_4"),
        "get_symptoms": Node("get_symptoms"),
        "summary": Node("summary")
    }

    nodes['pii_node_2'].add_parent(nodes['pii_node_1'])
    nodes['pii_node_3'].add_parent(nodes['pii_node_2'])
    nodes['pii_node_4'].add_parent(nodes['pii_node_3'])
    nodes['get_symptoms'].add_parent(nodes['pii_node_4'])
    nodes['summary'].add_parent(nodes['get_symptoms'])

    return nodes

In [15]:
graph = create_graph()

In [19]:
import json
# Run the calculation
probabilities = {}

for node_name, node in reversed(graph.items()):
    # Skip nodes with no parents
    if not node.parents:
        continue
        
    parent = node.parents[0].name
    
    # Calculate conditional probabilities
    prob_node_fails_given_parent_fails = len(data[(data[node_name] == False) & (data[parent] == False)]) / len(data[data[parent] == False])
    prob_node_fails_given_parent_passes = len(data[(data[node_name] == False) & (data[parent] == True)]) / len(data[data[parent] == True])
    
    # Calculate overall impact
    probability_node_fails_if_parent_failing = prob_node_fails_given_parent_fails
    probability_node_fails_if_parent_passing = prob_node_fails_given_parent_passes

    # Store the probabilities
    probabilities[node_name] = {
        "prob_of_node_failing_with_parent_failing": round(probability_node_fails_if_parent_failing, 6),
        "Prob_of_node_failing_with_parent_passing": round(probability_node_fails_if_parent_passing, 6),
        # Check whether the node fails more because of failing parent or passing parent. 
        # With that calculated, write the problem as the node itself or the parent
        "Problem": f"{'self' if probability_node_fails_if_parent_passing > probability_node_fails_if_parent_failing else 'parent'}"
    }

print(json.dumps(probabilities, indent=4))


{
    "summary": {
        "prob_of_node_failing_with_parent_failing": 0.833333,
        "Prob_of_node_failing_with_parent_passing": 0.0,
        "Problem": "parent"
    },
    "get_symptoms": {
        "prob_of_node_failing_with_parent_failing": 0.3,
        "Prob_of_node_failing_with_parent_passing": 0.075,
        "Problem": "parent"
    },
    "pii_node_4": {
        "prob_of_node_failing_with_parent_failing": 1.0,
        "Prob_of_node_failing_with_parent_passing": 0.0,
        "Problem": "parent"
    },
    "pii_node_3": {
        "prob_of_node_failing_with_parent_failing": 1.0,
        "Prob_of_node_failing_with_parent_passing": 0.02439,
        "Problem": "parent"
    },
    "pii_node_2": {
        "prob_of_node_failing_with_parent_failing": 1.0,
        "Prob_of_node_failing_with_parent_passing": 0.02381,
        "Problem": "parent"
    }
}


## Algorithm

In [31]:
def calculate_node_probability_given_dependency(node, data, dependency):        
    # Calculate conditional probabilities
    prob_node_fails_given_parent_fails = (data[(data[node] == False) & (data[dependency.name] == False)].shape[0]) / (data[dependency.name] == False).sum()
    prob_node_fails_given_parent_passes = (data[(data[node] == False) & (data[dependency.name] == True)].shape[0]) / (data[dependency.name] == True).sum()
    
    return prob_node_fails_given_parent_fails, prob_node_fails_given_parent_passes

def calculate_node_probability(node, data, dependencies):
    prob_node_fails_given_no_dep_fails = (data[(data[node] == False) & (data[[i.name for i in dependencies]].all(axis=1) == True)].shape[0]) / (data[[i.name for i in dependencies]].all(axis=1) == True).sum()
    return prob_node_fails_given_no_dep_fails

def calculate_impact(node, data, dependencies):
    impacts = {}
    
    for dep in dependencies:
        prob_node_fails_given_dep_fails, prob_node_fails_given_dep_passes = calculate_node_probability_given_dependency(node, data, dep)
        impacts[dep.name] = round(prob_node_fails_given_dep_fails - prob_node_fails_given_dep_passes, 6)
    
    # Calculate conditional probabilities for the node itself
    # prob_node_fails_given_any_dep_fails = (data[(data[node] == False) & (data[[i.name for i in dependencies]].any(axis=1) == False)].shape[0]) / (data[[i.name for i in dependencies]].any(axis=1) == False).sum()
    prob_node_fails_given_no_dep_fails = calculate_node_probability(node, data, dependencies)
    
    # Compute the impact of the node itself
    impacts[node] = round(prob_node_fails_given_no_dep_fails, 6)
    
    return impacts

def improve_system(node: str, data, graph, records):
    dependencies = [parent for parent in graph[node].parents]
    
    records = {"node": node, "dependencies": dependencies[0].name}
    print(f"Node: {node}")
    print(f"dependencies: {[i.name for i in dependencies]}")
    
    if not dependencies:
        return node  # No dependencies, focus on improving the node itself
    
    impacts = calculate_impact(node, data, dependencies)

    if len(dependencies) == 1:
        D_max = dependencies[0]
    else:
        D_max = max(impacts, key=impacts.get)
    
    if impacts[node] > impacts[D_max.name]:
        return node, impacts[node]  # Focus on improving the node itself

    # print("-"*100)
    # return improve_system(node=D_max.name, data=data, graph=graph, records)

print(f"Focus on node: {improve_system(node='summary', data=data, graph=graph, records={})}")


Node: summary
dependencies: ['get_symptoms']
Focus on node: None


In [57]:
def improve_system(node: str, data, graph, records=None):
    if records is None:
        records = []

    dependencies = [parent for parent in graph[node].parents]
    
    current_record = {"node": node, "dependencies": [d.name for d in dependencies]}
    records.append(current_record)
    
    if not dependencies:
        return node, records  # No dependencies, focus on improving the node itself
    
    impacts = calculate_impact(node, data, dependencies)
    current_record["impacts"] = impacts
    
    if len(dependencies) == 1:
        D_max = dependencies[0]
    else:
        D_max = max(impacts, key=impacts.get)
    
    if impacts[node] > impacts[D_max.name]:
        return node, records  # Focus on improving the node itself
    
    # Recursive call
    return improve_system(node=D_max.name, data=data, graph=graph, records=records)

# Usage
focus_node, all_records = improve_system(node='summary', data=data, graph=graph)
print(f"Focus on node: {focus_node}")

Focus on node: pii_node_1


In [58]:
impact_table = []
for record in all_records:
    node = record['node']
    dependencies = record.get('dependencies', [])
    impacts = record.get('impacts', {})

    dependency = dependencies[0] if dependencies else None
    
    impact_table.append({
        "node": node,
        "dependency": dependency,
        "self_impact": impacts.get(node) if impacts else None,
        "dependency_impact": impacts.get(dependency) if impacts else None
    })
pd.DataFrame(impact_table)

Unnamed: 0,node,dependency,self_impact,dependency_impact
0,summary,get_symptoms,0.0,0.833333
1,get_symptoms,pii_node_4,0.075,0.225
2,pii_node_4,pii_node_3,0.0,1.0
3,pii_node_3,pii_node_2,0.02439,0.97561
4,pii_node_2,pii_node_1,0.02381,0.97619
5,pii_node_1,,,
