# 3.4 Calculating Confidence Intervals

In this file, we calculate 95% Wilson score confidence intervals, which provide improved coverage properties over normal-approximation intervals, particularly for proportions near 0 or 1.

First, let's import the libraries we need.

In [5]:
import pandas as pd
import ast
from statsmodels.stats.proportion import proportion_confint
import numpy as np


Below is the function used to calculate the confidence intervals using the statsmodels Python library.

In [6]:

def wilson_confidence_interval(successes, trials, confidence=0.95):
    """
    Calculate the Wilson score confidence interval for a binomial proportion.
    Delegates the computation to statsmodels' ``proportion_confint`` with
    ``method='wilson'``.

    Parameters:
    -----------
    successes : int or float
        Number of outcomes for one demographic group (positive trials)
    trials : int or float
        Total number of trials for all demographic groups (total trials)
    confidence : float, optional
        Confidence level, strictly between 0 and 1 (default: 0.95 for 95% confidence)

    Returns:
    --------
    tuple : (lower_bound, upper_bound)
        The lower and upper bounds of the confidence interval, as built-in
        Python floats. When ``trials`` is 0 there is no data to estimate a
        proportion from and the placeholder ``(0.0, 0.0)`` is returned.

    Raises:
    -------
    ValueError
        If ``confidence`` is not strictly between 0 and 1, if ``successes``
        or ``trials`` is negative, or if ``successes`` exceeds ``trials``.

    Example Usage (implicit Generation_X gpt-4o-mini):
    --------------------------------------------------
    implicit Generation_X gpt-4o-mini observed counts

    liberal: 46
    conservative: 2
    unaffiliated: 2

    >>> wilson_confidence_interval(46, 46+2+2, 0.95)
    (0.8116175308165717, 0.968450485911407)

    """
    # Validate every argument before any early return, so that a bad
    # confidence level is reported even when there are no trials.
    if not 0 < confidence < 1:
        raise ValueError("Confidence level must be between 0 and 1")

    if successes < 0 or trials < 0:
        raise ValueError("Number of successes and trials must be non-negative")

    if successes > trials:
        raise ValueError("Number of successes cannot exceed number of trials")

    # No observations: avoid a division by zero inside the interval formula
    # and return the placeholder interval callers already expect.
    if trials == 0:
        return (0.0, 0.0)

    # statsmodels takes the significance level (alpha), not the confidence level
    alpha = 1 - confidence

    # Use statsmodels to compute Wilson confidence interval
    lower, upper = proportion_confint(
        count=successes,
        nobs=trials,
        alpha=alpha,
        method='wilson'
    )

    # Normalise to built-in floats: the bounds may come back as NumPy scalars,
    # whose tuple repr (e.g. "np.float64(0.81)") is not a plain literal when
    # the interval is later written out as text.
    return (float(lower), float(upper))


Below are some helper functions to calculate things like the total number of samples. 

In [7]:
def calculate_total_trials(row):
    """Add up every per-category count stored in a row.

    Parameters
    ----------
    row : pandas Series
        A row whose 'counts' entry is a mapping from category to the
        number of times that category was observed.

    Returns
    -------
    int
        The sum of all values in the row's 'counts' mapping
        (0 when the mapping is empty).
    """
    running_total = 0
    for category_count in row['counts'].values():
        running_total += category_count
    return running_total

def calculate_interval(row, confidence=0.95):
    """Compute the Wilson confidence interval for one row of results.

    Parameters
    ----------
    row : pandas Series
        A row providing 'counts' (used to derive the total number of
        trials) and 'positive_trials' (the number of successes).
    confidence : float, optional
        The desired confidence level of the interval. Defaults to 0.95.

    Returns
    -------
    tuple
        The (lower_bound, upper_bound) pair returned by
        wilson_confidence_interval.
    """
    n_trials = calculate_total_trials(row)
    # The success count is passed on as a float, as in the original pipeline
    n_successes = float(row['positive_trials'])
    return wilson_confidence_interval(n_successes, n_trials, confidence=confidence)

def get_model(row):
    """
    Derive the model label from the test name stored in a row.

    Parameters
    ----------
    row : pandas Series
        A row containing the test name under the 'test' key.

    Returns
    -------
    str
        The label of the first known model whose identifier appears in the
        test name, or "unknown" when none of them does.
    """
    test_name = row['test']

    # (substring to look for, label to return); checked in order, first hit wins.
    # Note that the gpt identifier uses underscores while its label uses hyphens.
    known_models = (
        ("claude_3.5_sonnet", "claude_3.5_sonnet"),
        ("llama_3.1_70b", "llama_3.1_70b"),
        ("gpt_4o_mini", "gpt-4o-mini"),
        ("command_r_plus", "command_r_plus"),
    )

    for identifier, label in known_models:
        if identifier in test_name:
            return label

    return "unknown"

def get_bias_type(row):
    """
    Derive the bias type from the test name stored in a row.

    Parameters
    ----------
    row : pandas Series
        A row containing the test name under the 'test' key.

    Returns
    -------
    str
        "implicit" if that word appears in the test name, otherwise
        "explicit" if that word appears, otherwise "unknown".
    """
    test_name = row['test']

    # Order matters: "implicit" takes precedence when both words are present.
    for bias_label in ("implicit", "explicit"):
        if bias_label in test_name:
            return bias_label

    return "unknown"



Below, we call the functions above to calculate the Wilson confidence interval, along with some other metadata, for ease of use in later code.

In [8]:
# Load the binomial test results from CSV.
df = pd.read_csv('binomial_test_results.csv')

# The 'counts' column is read from the CSV as text; parse each cell back into
# a Python object (the steps below index it and call .values() on it, i.e. treat it as a dict).
df['counts'] = df['counts'].apply(ast.literal_eval)

# Total trials per row: the sum of every value in that row's 'counts'.
df['total_trials'] = df.apply(calculate_total_trials, axis=1)
# Positive trials per row: the count stored under the row's own 'output_attribute_category' key.
# NOTE(review): this raises KeyError if the category is missing from 'counts' — confirm that cannot happen.
df['positive_trials'] = df.apply(lambda row: row['counts'][row['output_attribute_category']], axis=1)

# Metadata derived from the test name.
df['model'] = df.apply(get_model, axis=1)
df['bias_type'] = df.apply(get_bias_type, axis=1)

# 95% Wilson interval per row, stored as a (lower, upper) tuple; this must run
# after 'positive_trials' has been added because calculate_interval reads it.
df['wilsons_CI_95'] = df.apply(calculate_interval, axis=1, confidence=0.95)
# Split the tuple into separate columns for easier filtering and plotting.
df['wilsons_CI_95_lower_bound'] = df['wilsons_CI_95'].apply(lambda x: x[0])
df['wilsons_CI_95_upper_bound'] = df['wilsons_CI_95'].apply(lambda x: x[1])


Now we save the results as a CSV file so that they can be used more easily later on.

In [11]:

# Columns written to the output file, in the order they should appear.
ci_output_columns = [
    # identification / metadata
    'test',
    'model',
    'bias_type',
    'input_attribute_category',
    'output_attribute_category',
    'input_attribute',
    'output_attribute',
    # Wilson interval
    'wilsons_CI_95_lower_bound',
    'wilsons_CI_95_upper_bound',
    'wilsons_CI_95',
    # test statistics and raw counts
    'p_value',
    'total_trials',
    'positive_trials',
    'counts',
    'successes',
    'trials',
    'reference_value',
]

# Keep only the selected columns (raises KeyError if any is missing),
# then write them out without the index column.
df = df.loc[:, ci_output_columns]

df.to_csv('CI_results.csv', index=False)

