In [8]:
from itertools import product, chain, combinations
from collections import Counter
import numpy as np
import pandas as pd


In [9]:

class Container:
    """
    Bare attribute holder for disproportionality-analysis inputs and results.

    The constructor creates at most one attribute (``param``); everything else
    is attached from outside. ``convert`` in this file, for example, sets
    ``contingency``, ``data`` and ``N`` on the instance it returns.
    """

    def __init__(self, params=False):
        """
        Arguments:
            params (bool): When truthy, start the instance with an empty
                           ``param`` dictionary. Otherwise no attribute is set.
        """
        if not params:
            return
        self.param = {}

    def export(self, name, index=False):
        """
        Write the ``signals`` and ``all_signals`` attributes to an Excel workbook.

        Both attributes are read from the instance but are not created by this
        class, so they must have been assigned before this method is called.

        Arguments:
            name (str): Path of the workbook handed to ``pd.ExcelWriter``.

            index (bool): Forwarded to ``to_excel``; whether to write the
                          DataFrame index into each sheet.
        """
        # (sheet name, attribute name) pairs, written in this order.
        sheets = (("Signals", "signals"), ("all_data", "all_signals"))
        with pd.ExcelWriter(name) as writer:
            for sheet_name, attribute in sheets:
                frame = getattr(self, attribute)
                frame.to_excel(writer, sheet_name=sheet_name, index=index)

def convert(
    data_frame,
    margin_threshold=1,
    product_label="name",
    count_label="count",
    ae_label="AE",
    count_unique_ids=False,
    id_label="id",
):
    """
    Convert a Pandas dataframe object into a container class for use
    with the disproportionality analyses. Column names in the DataFrame
    must include or be specified in the arguments:
        "name" -- A brand/generic name for the product. This module
                    expects that you have already cleaned the data
                    so there is only one name associated with a class.
        "AE" -- The adverse event(s) associated with a drug/device.
        "count" -- The number of AEs associated with that drug/device
                    and AE. You can input a sheet with single counts
                    (i.e. duplicate rows) or pre-aggregated counts
        "id" -- Only needed when ``count_unique_ids`` is True: the
                    identifier whose distinct values are counted.

    Arguments:
        data_frame (Pandas DataFrame): The Pandas DataFrame object

        margin_threshold (int): The threshold for counts. Products/AEs whose
                             contingency-table margin is lower are removed
                             from the contingency table. Note that the
                             unique-id counts are computed from the
                             unfiltered ``data_frame``.

        product_label (str): Name of the column holding the product names.

        count_label (str): Name of the column holding the event counts.
                           The contingency table is always built from it,
                           whichever counting mode is selected.

        ae_label (str): Name of the column holding the adverse event names.

        count_unique_ids (bool): If False (default), counts are the cell values
                                 and row/column sums of the contingency table.
                                 If True, counts are the number of distinct
                                 ``id_label`` values per product/AE pair, per
                                 product and per AE, so an id is counted once
                                 in each of them.

        id_label (str): Name of the id column used when ``count_unique_ids``
                        is True. Defaults to "id".

    Returns:
        DC (Container object): A container object that holds the necessary
                               components for DA:
            contingency -- the product x AE contingency table
            data -- flattened table with the columns "events",
                    "product_aes", "count_across_brands", "ae_name" and
                    "product_name" (same names in both counting modes)
            N -- the sum of the "events" column of ``data``

    Raises:
        KeyError: If ``count_unique_ids`` is True and ``data_frame`` has no
                  ``id_label`` column.

    """
    data_cont = compute_contingency(
        data_frame, product_label, count_label, ae_label, margin_threshold
    )

    if count_unique_ids:
        # Compute the flattened table directly from the data
        # This is how OpenEBGM makes counts
        if id_label not in data_frame.columns:
            raise KeyError(
                f"count_unique_ids=True requires an id column named {id_label!r}"
            )

        # number of distinct ids for each product/ae pair
        pair_counts = (
            data_frame.groupby([product_label, ae_label])[id_label]
            .nunique()
            .reset_index(name="events")
        )
        # number of distinct ids in which each product appears
        product_marg = (
            data_frame.groupby([product_label])[id_label]
            .nunique()
            .reset_index(name="product_aes")
        )
        # number of distinct ids in which each ae appears
        ae_marg = (
            data_frame.groupby([ae_label])[id_label]
            .nunique()
            .reset_index(name="count_across_brands")
        )

        data_df = pair_counts.merge(product_marg, on=product_label, how="inner")
        data_df = data_df.merge(ae_marg, on=ae_label, how="inner")
        # Use the same column names as the contingency-based branch so the
        # returned table has one schema regardless of the counting mode.
        data_df = data_df.rename(
            columns={ae_label: "ae_name", product_label: "product_name"}
        )
        data_df = data_df[
            ["events", "product_aes", "count_across_brands", "ae_name", "product_name"]
        ]
    else:
        # Compute the flattened table from the contingency table.
        col_sums = np.sum(data_cont, axis=0)
        row_sums = np.sum(data_cont, axis=1)
        data_df = count(data_cont, row_sums, col_sums)

    # Initialize the container object and assign the data
    DC = Container()
    DC.contingency = data_cont
    DC.data = data_df
    DC.N = data_df["events"].sum()
    return DC


def compute_contingency(
    data_frame, product_label, count_label, ae_label, margin_threshold
):
    """Build the product x adverse-event contingency table used for DA.

    Args:
        data_frame (pd.DataFrame): Count data for the drug/device and events.
        product_label (str): Label of the column containing the product names.
        count_label (str): Label of the column containing the event counts.
        ae_label (str): Label of the column containing the adverse event names.
        margin_threshold (int): Minimum margin total. Products (rows) and
            adverse events (columns) whose total count is below this value
            are removed.

    Returns:
        pd.DataFrame: Summed counts with products as rows and adverse events
        as columns; missing pairs are filled with 0.
    """
    table = pd.pivot_table(
        data_frame,
        index=product_label,
        columns=ae_label,
        values=count_label,
        aggfunc="sum",
        fill_value=0,
    )

    # Both margins are evaluated on the full table, before anything is dropped.
    product_totals = table.sum(axis=1)
    event_totals = table.sum(axis=0)

    sparse_products = table.index[np.flatnonzero(product_totals < margin_threshold)]
    sparse_events = table.columns[np.flatnonzero(event_totals < margin_threshold)]

    return table.drop(index=sparse_products, columns=sparse_events)

def count(data, rows, cols):
    """
    Flatten a contingency table into one row per non-zero product/event cell.

    Arguments:
        data (Pandas DataFrame): A contingency table with products as the
                                 index and adverse events as the columns

        rows: Per-product totals, looked up by product (index) label

        cols: Per-event totals, looked up by adverse event (column) label

    Returns:
        df: A Pandas DataFrame with the columns "events" (cell value),
            "product_aes" (total for the product), "count_across_brands"
            (total for the event), "ae_name" and "product_name". Cells that
            are not greater than zero are left out.

    """
    events = []
    product_totals = []
    event_totals = []
    ae_names = []
    product_names = []

    # Walk the table event by event, and within each event product by product.
    for ae in data.columns:
        ae_column = data[ae]
        for prod in data.index:
            cell = ae_column[prod]
            if not (cell > 0):
                continue
            event_totals.append(cols[ae])
            product_totals.append(rows[prod])
            events.append(cell)
            product_names.append(prod)
            ae_names.append(ae)

    return pd.DataFrame(
        {
            "events": events,
            "product_aes": product_totals,
            "count_across_brands": event_totals,
            "ae_name": ae_names,
            "product_name": product_names,
        }
    )

In [10]:
# Toy dataset meant to expose the counting difference examined below:
# id 1 reports adverse event "AA" for all five products (A-E), while ids 2-6
# each report it for exactly one product. Every row carries a count of 1, so
# "AA" has 10 rows in total but only 6 distinct ids.
patient_data = {
    'id': [1] * 5 + [2, 3, 4, 5, 6],
    'var1': list('ABCDE') * 2,
    'var2': ['AA'] * 10,
}
test_df = pd.DataFrame(patient_data).assign(counts=1)

# Keep a copy on disk (without the index column).
test_df.to_csv('difference.csv', index=False)

In [11]:
# Display the toy dataset: one row per (id, product, adverse event) report.
test_df

Unnamed: 0,id,var1,var2,counts
0,1,A,AA,1
1,1,B,AA,1
2,1,C,AA,1
3,1,D,AA,1
4,1,E,AA,1
5,2,A,AA,1
6,3,B,AA,1
7,4,C,AA,1
8,5,D,AA,1
9,6,E,AA,1


In [12]:

# Run the contingency-table based conversion (the default counting mode).
converted = convert(test_df, product_label="var1", count_label="counts", ae_label="var2")

# Build a display copy with human-readable headers. The rename is NOT done in
# place: `converted.data` is the container's own table, and renaming it in place
# would strip the "events" / "product_aes" / "count_across_brands" columns from
# the container itself.
summarydata = converted.data.rename(columns={
    'product_aes': '# product appears in database',
    'count_across_brands': '# AE appears in database',
    'events' : '# of times this drug/AE combo appears'
})

In [13]:
# Display the counts returned by convert(), using the readable column headers.
summarydata

Unnamed: 0,# of times this drug/AE combo appears,# product appears in database,# AE appears in database,ae_name,product_name
0,2,2,10,AA,A
1,2,2,10,AA,B
2,2,2,10,AA,C
3,2,2,10,AA,D
4,2,2,10,AA,E


In [14]:
### Essentially how Vigipy counts how often things appear:
### margins are the row/column sums of the contingency table.

# Open question: should v1_marg and v2_marg be comparable to these
# row_sums and col_sums?
data_cont = compute_contingency(test_df, 'var1', 'counts', 'var2', 1)

# Column totals: one per adverse event.
col_sums = (
    data_cont.sum(axis=0)
    .rename_axis('var2')
    .reset_index(name='N_v2')
    .sort_values(by='N_v2', ascending=False)
)

# Row totals: one per product.
row_sums = (
    data_cont.sum(axis=1)
    .rename_axis('var1')
    .reset_index(name='N_v1')
    .sort_values(by='N_v1', ascending=False)
)

col_sums


Unnamed: 0,var2,N_v2
0,AA,10


In [15]:
# Essentially what OpenEBGM does: every count is a number of distinct ids.
id_by_pair = test_df.groupby(['var1', 'var2'])['id']
id_by_product = test_df.groupby(['var1'])['id']
id_by_event = test_df.groupby(['var2'])['id']

actual = id_by_pair.nunique().reset_index(name='N')
v1_marg = id_by_product.nunique().reset_index(name='N_v1')
v2_marg = id_by_event.nunique().reset_index(name='N_v2')

# Attach both margins to each pair, then order and label the columns like the
# Vigipy-style summary so the two tables can be compared side by side.
final = (
    actual
    .merge(v1_marg, on='var1', how='left')
    .merge(v2_marg, on='var2', how='left')
    .loc[:, ['N', 'N_v1', 'N_v2', 'var2', 'var1']]
    .rename(columns={
        'var1': 'product_name',
        'var2': 'ae_name',
        'N' : '# of times this drug/AE combo appears',
        'N_v1': '# product appears in database',
        'N_v2': '# AE appears in database',
    })
)
final


Unnamed: 0,# of times this drug/AE combo appears,# product appears in database,# AE appears in database,ae_name,product_name
0,2,2,6,AA,A
1,2,2,6,AA,B
2,2,2,6,AA,C
3,2,2,6,AA,D
4,2,2,6,AA,E
