In [1]:
from vigipy import *
import pandas as pd
import numpy as np 
import time as time

In [2]:
# Load the pre-computed squashed table; its N, E and weight columns are
# handed to the squashing data-prep step further down.
squashed = pd.read_csv(
    "C:/Users/farrowg/Documents/vigipy_devops/data/squashed_data.csv"
)
squashed

Unnamed: 0.1,Unnamed: 0,N,E,weight
0,1,1,0.000298,31
1,2,1,0.000298,31
2,3,1,0.000298,31
3,4,1,0.000298,31
4,5,1,0.000298,31
...,...,...,...,...
1133,1134,27,7.526818,1
1134,1135,28,5.053635,1
1135,1136,42,11.290226,1
1136,1137,53,14.300954,1


In [3]:
# Load the ordinary (unsquashed) CAERS data so we can check that the
# optimisation behaves the way we expect it to.
caers_dataset = "C:/Users/farrowg/Documents/vigipy_devops/data/caers_data.csv"  # put your own path to the dataset

# One pipeline from raw CSV to the de-duplicated report table:
#   1. read the file, taking the first row as the header
#   2. rename var1 -> name and var2 -> AE
#   3. add a constant count of 1 to every row
#   4. keep only the first row for each (id, name, AE) combination
df = (
    pd.read_csv(caers_dataset, header=0)
    .rename(columns={"var1": "name", "var2": "AE"})
    .assign(count=1)
    .drop_duplicates(subset=["id", "name", "AE"], keep="first")
)
df

Unnamed: 0.1,Unnamed: 0,id,name,AE,strat1,count
0,1,147289,PREVAGEN,BRAIN NEOPLASM,Female,1
1,2,147289,PREVAGEN,CEREBROVASCULAR ACCIDENT,Female,1
2,3,147289,PREVAGEN,RENAL DISORDER,Female,1
3,4,147289,PREVAGEN,GOUT,Female,1
4,5,147289,PREVAGEN,HYPERTENSION,Female,1
...,...,...,...,...,...,...
20151,20153,160591,"CENTRUM SILVER WOMEN'S 50 PLUS (MULTIMINERALS,...",CHOKING,Female,1
20152,20154,160592,"CENTRUM SILVER WOMEN'S 50+ (MULTIMINERALS, MUL...",CHOKING,Female,1
20153,20155,160592,"CENTRUM SILVER WOMEN'S 50+ (MULTIMINERALS, MUL...",PALPITATIONS,Female,1
20154,20156,160592,"CENTRUM SILVER WOMEN'S 50+ (MULTIMINERALS, MUL...",DYSPHAGIA,Female,1


In [4]:
# Build the container that gps() consumes from the de-duplicated reports,
# converting in the openEBGM way (always optimised).
vigipy_data = convert(
    df,
    count_unique_ids=True,
)

In [5]:
# Machine epsilon for float64: a tiny strictly-positive number used as the
# lower bound for minimisation, because the values cannot go below zero.
EPS = np.finfo(float).eps

# The MHRA EBGM cut-off is 2.5; store it on the log2 scale.
log2_ebgm = np.log2(2.5)

In [6]:
## Baseline GPS run on the unsquashed data (squashing=False), timed end to end.
## not all of these parameters in GPS are needed!
# Use the monotonic, high-resolution perf_counter clock for the elapsed-time
# measurement: unlike time.time() it cannot jump if the system clock is adjusted.
time1 = time.perf_counter()
gps_obj = gps(
    vigipy_data, # container that was processed with convert
    relative_risk=1, # relative risk parameter
    min_events=3, # minimum number of events for something to be considered a signal 
    decision_metric='rank', # what should we rank signals by? 'rank' means by the ranking_statistic later 
    decision_thres=log2_ebgm, # minimum value of the ranking statistic for something to be considered a signal
    ranking_statistic='log2', # which ranking statistic to use, here log2 means the log2 of the EBGM score
    truncate=True, # whether to truncate or not
    truncate_thres=1,  # threshold for truncation
    prior_init={"alpha1": 0.2041, "beta1": 0.05816, "alpha2": 1.415, "beta2": 1.838, "w": 0.0969}, # initial guesses for priors
    prior_param=None, # feed in an array if you want to just use those priors and do no optimisation
    expected_method="mantel-haentzel", # method for calculating expected counts
    method_alpha=1, # parameter for other methods of calculating expected counts
    minimization_method="Nelder-Mead", # which minimisation algorithm from scipy to use!
    minimization_bounds=((EPS, 20), (EPS, 10), (EPS, 20), (EPS, 10), (EPS, 1)), # bounds on minimisation: will only be applied to certain algorithms
    minimization_options=None, # any supplementary options for the minimiser
    message=True, # whether to be verbose and print messages
    opt_likelihood=True, # whether to use the optimised versions of likelihood functions (really should always be true)
    number_of_iterations=1000, # number of iterations of the optimiser
    tol_value=1.0e-6, # tolerance for the optimiser
    sim_anneal=False, # whether to use simulated annealing optimisation: very experimental
    product_label='name', # the name of the column in the processed data that contains the product names
    ae_label='AE', # the name of the column in the processed data that contains the AE names
    squashing=False, # no squashing here: this is the reference run on the full data
    callback_freq=10 # presumably how often the optimiser callback fires -- TODO confirm against vigipy
)
# Run the pipeline stages in order: data prep -> prior optimisation ->
# EBGM scores -> report -> final results.
gps_obj.initial_data_prep()
gps_obj.optimize()
gps_obj.ebgm_calculate()
gps_obj.report_generate()
results = gps_obj.output_results()
time2 = time.perf_counter()
print("TIME TAKEN TO PRODUCE DATA = ", time2-time1)

BEGINNING INITIAL DATA PREP
BEGINNING HYPERPARAMETER OPTIMISATION
OPTIMISED PRIORS REACHED:  [3.25597525 0.39999047 2.02375183 1.906132   0.06530732]
OPTIMISED FUNCTION VALUE =  4162.455710379452
Optimization terminated successfully.
CALCULATING EBGM SCORES
CALCULATING QUANTILES
GENERATING REPORT
PRODUCING FINAL REPORT
TIME TAKEN TO PRODUCE DATA =  28.219666242599487


In [7]:
## Second GPS object: identical settings to the unsquashed run, except squashing=True.
## (not all of these parameters in GPS are needed!)
time1 = time.time()
gps_obj_sq = gps(
    # --- input data and the columns that identify products / adverse events ---
    vigipy_data,  # container produced by convert
    product_label='name',  # column of the processed data holding product names
    ae_label='AE',  # column of the processed data holding AE names

    # --- expected counts ---
    expected_method="mantel-haentzel",  # how expected counts are calculated
    method_alpha=1,  # parameter for the other expected-count methods

    # --- signal decision rule ---
    relative_risk=1,  # relative risk parameter
    min_events=3,  # fewest events for something to be considered a signal
    decision_metric='rank',  # 'rank' = decide using the ranking_statistic below
    ranking_statistic='log2',  # log2 of the EBGM score
    decision_thres=log2_ebgm,  # minimum ranking statistic for a signal

    # --- truncation ---
    truncate=True,  # whether to truncate or not
    truncate_thres=1,  # threshold for truncation

    # --- priors ---
    prior_init={"alpha1": 0.2041, "beta1": 0.05816, "alpha2": 1.415, "beta2": 1.838, "w": 0.0969},  # starting guesses
    prior_param=None,  # pass an array to use fixed priors and skip optimisation

    # --- optimiser ---
    minimization_method="Nelder-Mead",  # scipy minimisation algorithm
    minimization_bounds=((EPS, 20), (EPS, 10), (EPS, 20), (EPS, 10), (EPS, 1)),  # only applied by certain algorithms
    minimization_options=None,  # supplementary options for the minimiser
    number_of_iterations=1000,  # iterations of the optimiser
    tol_value=1.0e-6,  # optimiser tolerance
    sim_anneal=False,  # simulated annealing: very experimental
    opt_likelihood=True,  # optimised likelihood functions (really should always be true)
    callback_freq=10,

    # --- run mode ---
    message=True,  # be verbose and print messages
    squashing=True,  # squashed run: initial_data_prep_squashing must be called afterwards
)
gps_obj_sq.initial_data_prep()

BEGINNING INITIAL DATA PREP
You are using squashing: make sure you now call initial_data_prep_squashing


In [8]:
# Supply the N, E and weight columns of the squashed table to the
# squashing-specific data prep step.
gps_obj_sq.initial_data_prep_squashing(
    squashed["N"],
    squashed["E"],
    squashed["weight"],
)

In [9]:
# Run the hyperparameter (prior) optimisation on the squashed GPS object; the
# optimised priors and function value it prints can be compared with the unsquashed run.
gps_obj_sq.optimize()

BEGINNING HYPERPARAMETER OPTIMISATION
YOU ARE USING SQUASHING: USE WITH CAUTION
OPTIMISED PRIORS REACHED:  [3.26235787 0.4004554  2.02412004 1.90608627 0.06523564]
OPTIMISED FUNCTION VALUE =  4161.33165291664
Optimization terminated successfully.
