# Multi-Item Processing
  
This is a notebook to play around with a multi-item processing algorithm.  
This algorithm is essentially the core difference between GPS and MGPS methods.  
These changes are to be implemented within the `Vigipy` library.  

   
The aim is to:
- Write an implementation in Python, using pandas  
- Optimise it using PySpark, because we anticipate the pandas version being slow  
  


In [14]:
from vigipy import *
import pandas as pd
import numpy as np

In [15]:
def basic_test():
    """Run the multi-item pipeline on a ten-row toy dataset and print the result.

    Builds a small frame of (id, name, AE) rows, adds a ``count`` column set to
    1, prints it, passes it to ``convert_multi_item_pipeline`` and prints the
    ``data`` attribute of the object that call returns. Returns nothing.
    """
    # Column roles handed to the pipeline: report id, product name, adverse event.
    id_label, prod_label, ae_label = 'id', 'name', 'AE'

    # Really, really basic model dataset: each row is one product:AE mention
    # reported under a given id.
    toy_reports = pd.DataFrame(
        {
            "id": [1, 1, 1, 1, 1, 2, 3, 4, 4, 4],
            "name": ["d", "a", "c", "b", "e", "a", "b", "c", "a", "b"],
            "AE": ["alpha", "alpha", "alpha", "alpha", "beta", "beta", "gamma", "delta", "alpha", "alpha"],
        }
    )
    toy_reports['count'] = 1

    print("RAW DATA")
    print(toy_reports)
    print("\n")

    # The pipeline takes the per-id drug:AE rows and reshapes them into a new
    # data structure; only its `.data` attribute is shown here.
    print("PROCESSED DATA")
    processed = convert_multi_item_pipeline(toy_reports, id_label, prod_label, ae_label)
    print(processed.data)

In [16]:
def caers_test(
    caers_dataset="C:/Users/damlteam/Documents/vigipy_devops/vigipy/example_notebooks/test_datasets/caers_dataset.csv",
):
    """Load the CAERS CSV and run it through the multi-item pipeline.

    Parameters
    ----------
    caers_dataset : str or path-like, optional
        Location of the CAERS CSV. Defaults to the path this notebook was
        originally written against, so existing no-argument calls still work;
        pass a different path to run the notebook on another machine.

    Returns
    -------
    The object returned by ``convert_multi_item_pipeline`` for the
    de-duplicated frame.
    """
    id_label = 'id'
    ae_label = 'AE'
    prod_label = 'name'

    # Map the generic CSV column names onto the labels used in this notebook
    # in a single, non-mutating rename.
    df = pd.read_csv(caers_dataset, header=0).rename(
        columns={'var1': prod_label, 'var2': ae_label}
    )

    # Keep one row per (id, product, AE) combination.
    df = df.drop_duplicates(subset=[id_label, prod_label, ae_label], keep='first')

    # NOTE(review): tuple_size_limit=1 presumably restricts the pipeline to
    # single-product tuples — confirm against the vigipy documentation.
    return convert_multi_item_pipeline(df, id_label, prod_label, ae_label, tuple_size_limit=1)

def caers_single_test(
    caers_dataset="C:/Users/damlteam/Documents/vigipy_devops/vigipy/example_notebooks/test_datasets/caers_dataset.csv",
):
    """Load the CAERS CSV and convert it with vigipy's ``convert``.

    Parameters
    ----------
    caers_dataset : str or path-like, optional
        Location of the CAERS CSV. Defaults to the path this notebook was
        originally written against, so existing no-argument calls still work;
        pass a different path to run the notebook on another machine.

    Returns
    -------
    The object returned by ``convert(df, count_unique_ids=True)``.
    """
    df = (
        pd.read_csv(caers_dataset, header=0)
        # Map the generic CSV column names onto the labels used in this notebook.
        .rename(columns={'var1': 'name', 'var2': 'AE'})
        # Keep one row per (id, product, AE) combination.
        .drop_duplicates(subset=['id', 'name', 'AE'], keep='first')
        # Every remaining row gets a count of 1.
        .assign(count=1)
    )

    return convert(df, count_unique_ids=True)

In [17]:
# Run the toy example; it prints the raw frame followed by the processed frame.
basic_test()


RAW DATA
   id name     AE  count
0   1    d  alpha      1
1   1    a  alpha      1
2   1    c  alpha      1
3   1    b  alpha      1
4   1    e   beta      1
5   2    a   beta      1
6   3    b  gamma      1
7   4    c  delta      1
8   4    a  alpha      1
9   4    b  alpha      1


PROCESSED DATA
    events  product_aes  count_across_brands     AE         name
0        2            3                    2  alpha            a
1        1            3                    2   beta            a
2        2            2                    2  alpha  a    |    b
3        1            1                    2  alpha  a    |    c
4        1            1                    2  alpha  a    |    d
5        2            3                    2  alpha            b
6        1            3                    1  gamma            b
7        1            1                    2  alpha  b    |    c
8        1            1                    2  alpha  b    |    d
9        1            2                    2  alp

In [18]:
# CAERS data passed through convert_multi_item_pipeline (called with tuple_size_limit=1).
caers_multi = caers_test()

In [19]:
# CAERS data passed through convert(..., count_unique_ids=True), for comparison with caers_multi.
caers_single = caers_single_test()

In [20]:
# Working assumption: truncation is needed to get a sensible answer out of the GPS fits below.
# The library's built-in functions are used because something is slightly odd about the other ones.
# EPS is the float64 machine epsilon.
# NOTE(review): EPS is not referenced by any cell visible in this notebook — confirm it is still needed.
EPS = np.finfo(np.float64).eps

In [29]:
# Print the gps() signature and docstring for reference when choosing the arguments used below.
# NOTE(review): interactive leftover — consider removing it (and its long output) from the final notebook.
help(gps)

Help on function gps in module vigipy.GPS.GPS:

gps(container, relative_risk=1, min_events=1, decision_metric='rank', decision_thres=0.05, ranking_statistic='log2', truncate=False, truncate_thres=1, prior_init={'alpha1': 0.2041, 'beta1': 0.05816, 'alpha2': 1.415, 'beta2': 1.838, 'w': 0.0969}, prior_param=None, expected_method='mantel-haentzel', method_alpha=1, minimization_method='CG', minimization_bounds=((np.float32(1.1920929e-07), 20), (np.float32(1.1920929e-07), 10), (np.float32(1.1920929e-07), 20), (np.float32(1.1920929e-07), 10), (0, 1)), minimization_options=None, message=False, opt_likelihood=False, number_of_iterations=1000, tol_value=0.0001, sim_anneal=False, product_label='name', ae_label='AE')
    Perform disproportionality analysis using the Multi-Item Gamma Poisson Shrinker (GPS) algorithm.
    
    This function implements a gamma-poisson shrinker algorithm for analyzing adverse event 
    data and detecting disproportionality signals. It optimizes hyperparameters, calcu

In [31]:
# Fit GPS on the single-item CAERS container, with truncation enabled at a threshold of 1.
# NOTE(review): the starting priors below are identical to the "OPTIMISED PRIORS REACHED"
# values printed by the Nelder-Mead multi-item run later in this notebook, so this CG run
# starts at that optimum. This may be why CG reports a precision-loss warning — confirm.
results = gps(
    caers_single,
    min_events=1,
    truncate=True,
    truncate_thres=1,
    prior_init={
        "alpha1": 3.25598676,
        "beta1": 0.39999063,
        "alpha2": 2.02374327,
        "beta2": 1.90612694,
        "w": 0.06530685,
    },
    minimization_method='CG',
    opt_likelihood=True,
    message=True,
)

BEGINNING HYPERPARAMETER OPTIMISATION
OPTIMISED PRIORS REACHED:  [3.25598677 0.39999061 2.02374353 1.9061273  0.0653071 ]
OPTIMISED FUNCTION VALUE =  4162.455710380101
Desired error not necessarily achieved due to precision loss.
CALCULATING EBGM SCORES
CALCULATING QUANTILES


In [22]:
# Build a compact single-item report: drop the diagnostic columns, order the
# rows by the `log2` column (largest first), and export the top six rows.
report = results.all_signals
report = report.drop(columns=['fdr', 'FNR', 'product margin', 'event margin', 'Se', 'Sp', 'p_value', 'count/expected'])
# sort_values returns a new frame rather than sorting in place, so the result
# must be assigned back; otherwise head(6) is taken from the unsorted table.
report = report.sort_values(by='log2', ascending=False)
outreport = report.head(6)
outreport.to_csv('single_output.csv')

In [23]:
# Display the six-row single-item report that was written to single_output.csv.
outreport

Unnamed: 0,Product,Adverse Event,Count,Expected Count,log2,LowerBound
0,REUMOFAN PLUS,WEIGHT INCREASED,16.0,0.406436,4.539834,15.686438
1,REUMOFAN PLUS,IMMOBILE,6.0,0.078665,4.191967,10.161142
2,HYDROXYCUT REGULAR RAPID RELEASE CAPLETS,EMOTIONAL DISTRESS,19.0,0.896901,4.0684,11.646773
3,"EMERGEN-C (ASCORBIC ACID, B-COMPLEX, ELECTROLY...",COUGH,6.0,0.144815,4.002404,8.882635
4,HYDROXYCUT HARDCORE CAPSULES,MULTIPLE INJURIES,5.0,0.092372,3.966696,8.271694
5,FLINSTONES COMPLETE MULTIVITAMINS CHEWABLE TABLET,OVERDOSE,5.0,0.096544,3.95391,8.191228


In [24]:
# Fit GPS on the multi-item CAERS container, with truncation enabled at a threshold of 1.
# The hyperparameter search starts from round-number priors and uses Nelder-Mead.
results_multi = gps(
    caers_multi,
    min_events=1,
    truncate=True,
    truncate_thres=1,
    prior_init={
        "alpha1": 2,
        "beta1": 1,
        "alpha2": 2,
        "beta2": 1,
        "w": 0.1,
    },
    minimization_method="Nelder-Mead",
    opt_likelihood=True,
    message=True,
)

BEGINNING HYPERPARAMETER OPTIMISATION
OPTIMISED PRIORS REACHED:  [3.25598676 0.39999063 2.02374327 1.90612694 0.06530685]
OPTIMISED FUNCTION VALUE =  4162.455710380694
Optimization terminated successfully.
CALCULATING EBGM SCORES
CALCULATING QUANTILES
GENERATING REPORT


In [25]:
# Build a compact multi-item report: drop the diagnostic columns, order the
# rows by the `log2` column (largest first), and export the top six rows.
report2 = results_multi.all_signals
report2 = report2.drop(columns=['fdr', 'FNR', 'product margin', 'event margin', 'Se', 'Sp', 'p_value', 'count/expected'])
# sort_values returns a new frame rather than sorting in place, so the result
# must be assigned back; otherwise head(6) is taken from the unsorted table.
report2 = report2.sort_values(by='log2', ascending=False)
outreport2 = report2.head(6)
outreport2.to_csv('multi_output.csv')

In [26]:
# Display the six-row multi-item report that was written to multi_output.csv.
outreport2

Unnamed: 0,Product,Adverse Event,Count,Expected Count,log2,LowerBound
0,REUMOFAN PLUS,WEIGHT INCREASED,16.0,0.406436,4.539834,15.686438
1,REUMOFAN PLUS,IMMOBILE,6.0,0.078665,4.191967,10.161142
2,HYDROXYCUT REGULAR RAPID RELEASE CAPLETS,EMOTIONAL DISTRESS,19.0,0.896901,4.0684,11.646773
3,"EMERGEN-C (ASCORBIC ACID, B-COMPLEX, ELECTROLY...",COUGH,6.0,0.144815,4.002404,8.882635
4,HYDROXYCUT HARDCORE CAPSULES,MULTIPLE INJURIES,5.0,0.092372,3.966696,8.271694
5,FLINSTONES COMPLETE MULTIVITAMINS CHEWABLE TABLET,OVERDOSE,5.0,0.096544,3.95391,8.191228
