In [114]:
import pandas as pd
import matplotlib.pyplot as plt

# Flag columns for the three nodes under review.
error_columns = ['Remove PII', 'Symptoms, Reason to call, Rec', 'Summarized']

# Load the responses and keep only the rows that have a value in the
# 'Remove PII' column.
data = pd.read_excel("responses.xlsx")
data = data[data['Remove PII'].notna()]

In [115]:
# Cast the node flag columns to booleans.
# NOTE(review): astype(bool) follows truthiness, so a missing value or any
# non-empty string would become True - confirm the spreadsheet stores these
# columns as real booleans (or 0/1).
data[error_columns] = data[error_columns].astype(bool)

### Error rates for each node

In [116]:
def error_rate_for_node(df, node_name):
    """Return the relative frequency of each value in one node column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the node column.
    node_name : str
        Name of the column to summarise.

    Returns
    -------
    pandas.Series
        Proportion of rows per distinct value, as produced by
        ``value_counts(normalize=True)`` (missing values are not counted).
    """
    # NOTE(review): the name says "error rate", but the whole distribution is
    # returned; callers have to read off the entry that marks an error.
    return df[node_name].value_counts(normalize=True)

In [117]:
# Value distribution of the first flag column ('Remove PII').
print(f"PII node error rate: {error_rate_for_node(data, error_columns[0])}")

PII node error rate: Remove PII
True     0.565217
False    0.434783
Name: proportion, dtype: float64


#### 43% of the responses still contained PII (the `False` share above), so this is a problematic node.

In [118]:
# Value distribution of the second flag column ('Symptoms, Reason to call, Rec').
print(f"Main Reason node error rate: {error_rate_for_node(data, error_columns[1])}")

Main Reason node error rate: Symptoms, Reason to call, Rec
True     0.724638
False    0.275362
Name: proportion, dtype: float64


In [119]:
# Value distribution of the third flag column ('Summarized').
print(f"Summarizer node error rate: {error_rate_for_node(data, error_columns[2])}")

Summarizer node error rate: Summarized
True     0.768116
False    0.231884
Name: proportion, dtype: float64


In [120]:
# NOTE: despite their names, both columns are logical ORs of the node flags:
#   all_three_correct       is False only when all three flags are False
#   node_2_and_3_incorrect  is False only when the second and third flags are both False
data['all_three_correct'] = data[error_columns].any(axis=1)

data['node_2_and_3_incorrect'] = data[error_columns[1:]].any(axis=1)

In [121]:
# The False share is the fraction of rows in which all three node flags are
# False (the column is the OR of the three flags).
data['all_three_correct'].value_counts(normalize=True)

all_three_correct
True     0.826087
False    0.173913
Name: proportion, dtype: float64

#### It seems that 17% of the time all three nodes break down. That could be because PII leaked through the chain, or because the symptoms or main reason for the call were off in addition to the PII.

In [122]:
# The False share is the fraction of rows in which the second and third node
# flags are both False (the column is the OR of those two flags).
data['node_2_and_3_incorrect'].value_counts(normalize=True)

node_2_and_3_incorrect
True     0.782609
False    0.217391
Name: proportion, dtype: float64

#### Nodes 2 and 3 are certainly linked, and their errors move together.

In [123]:
# 'Notes' entries for the rows where the second and third node flags are both False.
data.loc[~data['node_2_and_3_incorrect'], 'Notes']

4      PII and main reason. As a result summary is off.
5               PII leaked into recs through the chain 
9      PII and main reason. As a result summary is off.
12    PII and main reason. Summary also doesn't capt...
19              PII leaked into recs through the chain 
33     PII and main reason. As a result summary is off.
35    PII and main reason. Summary also doesn't capt...
36    PII and main reason. Summary also doesn't capt...
45               PII, "some symptoms", summary off too 
47                                                  NaN
49              PII leaked into recs through the chain 
53              PII leaked into recs through the chain 
55    Main reason for the call leaked through to the...
60     PII and main reason. As a result summary is off.
68    Main reason for the call leaked through to the...
Name: Notes, dtype: object

#### It seems there are instances where PII leaks through the entire chain. More commonly, though, if the main reason is wrong, the summary is wrong too.

In [139]:
# Switch the three flag columns to short names.
# NOTE: `error_columns` still holds the old column names after this cell.
data = data.rename(
    columns={
        "Remove PII": "Node1",
        "Symptoms, Reason to call, Rec": "Node2",
        "Summarized": "Node3",
    }
)

# Calculate Conditional Probabilities

In [240]:
# Each entry describes one conditional probability to estimate:
#   "given"   - the column that is conditioned on
#   "targets" - the columns whose joint outcome is measured within those rows
# The trailing comment on each line gives the shorthand P(targets | given).
# Entries that are commented out are not evaluated.
conditions = [
    {"targets": ["Node1", "Node2"], "given": "Node1"}, # P(1, 2 | 1)
    {"targets": ["Node1", "Node3"], "given": "Node1"}, # P(1, 3 | 1)
    {"targets": ["Node1", "Node2", "Node3"], "given": "Node1"}, #  P(1, 2, 3 | 1)
    {"targets": ["Node2", "Node3"], "given": "Node2"}, # P(2, 3 | 2)
    # {"targets": ["Node2", "Node1"], "given": "Node2"} , # P(2, 1 | 2)
    # {"targets": ["Node1", "Node2", "Node3"], "given": "Node2"}, # P(1, 2, 3 | 2)
    # {"targets": ["Node1", "Node3"], "given": "Node3"}, # P(1, 3 | 3)
    # {"targets": ["Node2", "Node3"], "given": "Node3"}, # P(2, 3 | 3)
    # {"targets": ["Node1", "Node2", "Node3"], "given": "Node3"} # P(1, 2, 3 | 3)
]

In [241]:
import numpy as np

def weighted_conditional_prob(data, conditions):
    """Estimate conditional failure probabilities and weight them by support.

    A value of ``False`` in a column is treated as a failure of that node.
    For every condition, the probability is the share of rows in which all
    ``targets`` columns are ``False`` among the rows in which the ``given``
    column is ``False``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing every column named in ``conditions``.
    conditions : list of dict
        Each dict has a ``'targets'`` list of column names and a ``'given'``
        column name.

    Returns
    -------
    results : list of float
        Conditional probability per condition; ``nan`` when no row has the
        ``given`` column equal to ``False`` (the ratio is undefined there).
    weighted_probs : numpy.ndarray
        ``results`` multiplied element-wise by ``weights``.
    weights : numpy.ndarray
        Each condition's conditioning-row count divided by the sum of those
        counts over all conditions (all zeros when that sum is zero).
    sample_sizes : list of int
        Number of rows with the ``given`` column equal to ``False``.
    total_samples : int
        Number of rows in ``data``.
    """
    results = []
    sample_sizes = []
    total_samples = len(data)

    for condition in conditions:
        targets = condition['targets']
        given = condition['given']

        # `== False` (rather than `~`) is kept so that non-boolean columns are
        # matched exactly as before.
        given_mask = data[given] == False  # noqa: E712
        given_samples = int(given_mask.sum())

        # Rows where the conditioning column and every target column failed.
        joint_mask = given_mask
        for target in targets:
            joint_mask = joint_mask & (data[target] == False)  # noqa: E712

        if given_samples:
            probability = int(joint_mask.sum()) / given_samples
        else:
            # No supporting rows: report "undefined" instead of dividing by zero.
            probability = float('nan')

        results.append(probability)
        sample_sizes.append(given_samples)

    sizes = np.array(sample_sizes, dtype=float)
    size_total = sizes.sum()
    if size_total > 0:
        weights = sizes / size_total
    else:
        weights = np.zeros_like(sizes)

    # Apply weights to probabilities
    weighted_probs = np.array(results) * weights

    return results, weighted_probs, weights, sample_sizes, total_samples

In [244]:
# Evaluate every configured condition.
(original_probs, weighted_probs, weights,
 sample_sizes, total_samples) = weighted_conditional_prob(data, conditions)

# One row per condition; the label prefixes each column name with '-'.
results_df = pd.DataFrame({
    'condition': [
        f"P({', '.join(f'-{t}' for t in c['targets'])} | -{c['given']})"
        for c in conditions
    ],
    'original_probability': original_probs,
    'weighted_probability': weighted_probs,
    'weight': weights,
    'sample_size': sample_sizes,
    'total_samples': total_samples
})

# Render the float columns as fixed six-decimal strings.
# NOTE: these columns hold text from here on, so any later sort or comparison
# on them is lexicographic rather than numeric.
results_df = results_df.assign(**{
    column: results_df[column].map('{:.6f}'.format)
    for column in ('original_probability', 'weighted_probability', 'weight')
})

In [245]:
# 'weighted_probability' holds formatted strings at this point, so convert it
# back to numbers for the ordering instead of relying on string comparison.
results_df.sort_values("weighted_probability", ascending=False, key=pd.to_numeric)

Unnamed: 0,condition,original_probability,weighted_probability,weight,sample_size,total_samples
3,"P(-Node2, -Node3 | -Node2)",0.789474,0.137615,0.174312,19,69
0,"P(-Node1, -Node2 | -Node1)",0.4,0.110092,0.275229,30,69
1,"P(-Node1, -Node3 | -Node1)",0.4,0.110092,0.275229,30,69
2,"P(-Node1, -Node2, -Node3 | -Node1)",0.4,0.110092,0.275229,30,69
