In [1]:
%matplotlib inline
import nest_asyncio
nest_asyncio.apply()

import multiprocessing
multiprocessing.set_start_method("fork")

import pickle
import os
import math
import numpy as np
import pandas as pd
import scipy as sp
from scipy import stats
import pylab as pb
import matplotlib.pyplot as plt
import random
import seaborn as sns
import math
import statsmodels
import stan
import arviz as az
import cmdstanpy
from cmdstanpy import cmdstan_path, CmdStanModel

# Preprocessing

In [8]:
# Mapping of relevant trials
df = pd.read_csv(os.path.join(
    "Hashed_IMPACT_Patient_Figures_Pull_2021-05-17.csv"
), low_memory=False)

# pd.set_option('display.max_rows', 50)
pd.set_option("display.max_rows", None, "display.max_columns", None)

# filtered_df : data only consisting of good trials and phase 3 trials
filtered_df = df.loc[df['study_number'].isin(['AS0005', 'AS0011', 'EP0091', 'MG0002', 'PS0008', 'PS0010','PS0015','RA0098'])
                    & df['study_phase'].isin(['Phase III'])
                    ]

filtered_df1 = filtered_df[['study_number', 'site_initiation_visit_(a)', 'patient_figures_updated_date', 'study_country_fpfv_(a)', 'fpfv_(a)', 'lpfv_(a)', 'country_description',
                            'site_number','site_status_w/hcs','entered_screening','entered_treatment','patient_figure_type']]

filtered_df1 = filtered_df1.sort_values(by=['study_number','country_description','site_number', 'entered_screening'])

filtered_df1 = filtered_df1.loc[df['site_status_w/hcs'].isin(['SIV','FPFV','LPFV','LPLV','Study site closed'])
                               &df['patient_figure_type'].isin(['C'])]


filtered_df1['patient_figures_updated_date'] = filtered_df1['patient_figures_updated_date'].apply(pd.to_datetime, infer_datetime_format=True)
filtered_df1['fpfv_(a)'] = pd.to_datetime(filtered_df1['fpfv_(a)'], infer_datetime_format=True)  
filtered_df1['lpfv_(a)'] = pd.to_datetime(filtered_df1['lpfv_(a)'], infer_datetime_format=True)  
filtered_df1['study_country_fpfv_(a)'] = pd.to_datetime(filtered_df1['study_country_fpfv_(a)'], infer_datetime_format=True)  
filtered_df1['site_initiation_visit_(a)'] = pd.to_datetime(filtered_df1['site_initiation_visit_(a)'], infer_datetime_format=True)  
filtered_df1 = filtered_df1.drop(columns = ['site_status_w/hcs','patient_figure_type'])

# Datediff : number of days since the initation of this site
filtered_df1['Datediff'] = filtered_df1['patient_figures_updated_date'] - filtered_df1['site_initiation_visit_(a)']

# Days : Datediff as integer
filtered_df1['site_days'] = (filtered_df1['Datediff'] / np.timedelta64(1, 'D')).astype(int)
filtered_df1 = filtered_df1.drop(columns = ['Datediff'])
filtered_df1 = filtered_df1.sort_values(by=['country_description','site_number', 'site_days'])

# Remove rows where the number of entered patients don't change
filtered_df1['number_recruited'] = filtered_df1['entered_screening'].diff()
filtered_df1 = filtered_df1[filtered_df1.number_recruited > 0]

filtered_df1['week'] = filtered_df1['site_days']/7
filtered_df1['week'] = filtered_df1['week'].apply(np.floor)

# Remove negative site_days, i.e. observations before the site was activated
filtered_df1 = filtered_df1[filtered_df1['site_days'] >= 0]

filtered_df1['dates'] = (((filtered_df1['patient_figures_updated_date']).dt.dayofyear)/7).apply(np.floor)

df3 = filtered_df1.groupby(['country_description','site_number'])['site_initiation_visit_(a)'].min()
df3 = pd.DataFrame(df3)
df3.reset_index(inplace=True)

country_idx = df3['country_description']
site_idx = df3['site_number']

dfs_to_join = []
for i,j in zip(country_idx, site_idx):
    max_date = filtered_df1.loc[(filtered_df1.country_description== i) & (filtered_df1.site_number== j),'week'].max()
    temp_df = pd.DataFrame({'week': np.arange(0, max_date, 1)})
    temp_df['country_description'] = i
    temp_df['site_number'] = j
    dfs_to_join.append(temp_df)
df_to_join = pd.concat(dfs_to_join)


processed_data_df = df_to_join.merge(filtered_df1, how='outer', on=['country_description', 'site_number','week']).fillna(0)
processed_data_df = processed_data_df.sort_values(by=['country_description','site_number', 'week'])

processed_data_df['site_days'] += (processed_data_df['site_days']==0)*7*processed_data_df['week']

processed_data_df = processed_data_df.drop(columns = ['study_number','fpfv_(a)','lpfv_(a)','study_country_fpfv_(a)','site_initiation_visit_(a)','site_days'])

list_of_col_names = ['country_description','site_number','patient_figures_updated_date','entered_screening','entered_treatment','number_recruited','diff','week','dates']
df3 = processed_data_df.filter(list_of_col_names)
df3.reset_index(inplace=True)
df3 = df3.drop(columns=['index'])

prev_idx = -1
for i in range(len(df3)):
    if df3.iloc[i]['dates'] != 0:
        cur_idx = i
        num_operation = cur_idx-prev_idx -1
        for j in range(num_operation):
            num_to_subtract = df3.iloc[i]['dates']-(j+1)
            if num_to_subtract < 0:
                frac = -math.floor(num_to_subtract/52)
                if num_to_subtract%52 != 0 : 
                    num_to_subtract = num_to_subtract+frac*52
                    df3.at[i - (j+1), 'dates']= num_to_subtract
                else : 
                    num_to_subtract = num_to_subtract+(frac+1)*52
                    df3.at[i - (j+1), 'dates']= num_to_subtract
                    
            elif num_to_subtract == 0:
                frac = 1
                num_to_subtract = num_to_subtract+frac*52
                df3.at[i - (j+1), 'dates']= num_to_subtract
            else :
                df3.at[i - (j+1), 'dates']= num_to_subtract
        prev_idx = i

df3.reset_index(inplace=True)
df3 = df3.drop(columns=['index'])
        
prev_num = -1
site_num = -1
week_num = -1
drop_list = []
for i in range(len(df3)):
    if df3.iloc[i]['dates'] == prev_num:
        if df3.iloc[i]['week'] == week_num:
            if df3.iloc[i]['site_number'] == site_num:
                    num_to_add = df3.iloc[i-1]['number_recruited']
                    df3.at[i, 'number_recruited'] = df3.iloc[i]['number_recruited'] + num_to_add
                    drop_list.append(i-1)
    prev_num = df3.iloc[i]['dates']
    site_num = df3.iloc[i]['site_number'] 
    week_num = df3.iloc[i]['week'] 
df3 = df3.drop(drop_list)

df3.reset_index(inplace=True)
df3 = df3.drop(columns=['index'])
        
site_means = df3.groupby(['country_description','site_number'])['number_recruited'].mean()
site_var = df3.groupby(['country_description','site_number'])['number_recruited'].var()
site_count = df3.groupby(['country_description','site_number'])['number_recruited'].count()

country_mean = df3.groupby(['country_description'])['number_recruited'].mean()
country_var = df3.groupby(['country_description'])['number_recruited'].var()
country_count = df3.groupby(['country_description'])['number_recruited'].count()

df4 = pd.DataFrame([country_mean, country_var, country_count]).T
df4 = df4.set_axis(['mean', 'variance', 'count'], axis=1)
indices = list(df4.index.values)



df5 = pd.DataFrame([site_means, site_var, site_count]).T
df5 = df5.set_axis(['mean', 'variance', 'count'], axis=1)
df5 = df5[df5['count']>20]


df3['week']=df3['week']+1
countries_to_remove = ['Bulgaria', 'Romania', 'Australia', 'Canada', 'United States of America', 'Germany', 'Poland', 'South Korea', 'Russian Federation']
df3 = df3[~df3["country_description"].isin(countries_to_remove)]
df = df3[['country_description','site_number', 'number_recruited', 'week']]

df.columns=['country',  'site', 'number_recruited', 'week']
df["country_number"] = ""
df["site_number"] = ""
df = df.reset_index()

N = len(df)
c_num, s_num = 0,0
last_c = 'ddadsfgas'
last_s = 'sdfsdf'

for i in range(N):
    if df.country[i] == last_c :
        if df.site[i] == last_s :
            df.at[i, 'country_number'] = c_num
            df.at[i, 'site_number'] = s_num
        else : 
            df.at[i, 'country_number'] = c_num
            last_s = df.site[i]
            s_num += 1
            df.at[i, 'site_number'] = s_num
    else : 
        last_c = df.country[i]
        last_s = df.site[i]
        c_num += 1
        s_num += 1
        df.at[i, 'country_number'] = c_num
        df.at[i, 'site_number'] = s_num

            
df = df.drop(columns=['site','index'])
df = df.reindex(['country_number','country','site_number','number_recruited','week'], axis=1)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["country_number"] = ""
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["site_number"] = ""


In [5]:
# Number of countries
totalN = len(df)
J = len(df.country_number.unique())
S = len(df.site_number.unique())
country = np.reshape((np.array([df.country_number])),(totalN,))
site = (np.reshape((np.array([df.site_number])),(totalN,)))
N = list(df.groupby(['country_number']).count().max(axis=1))
maxN = max(df.groupby(['country_number']).count().max(axis=1))

In [6]:
df[df['country_number']==0]
dfs_to_join_x = []
dfs_to_join_y = []
dfs_to_join_s = []

for j in range(J):
    temp_df = df[df['country_number']==j+1]
    temp_df_x = pd.DataFrame(temp_df['week'])
    temp_df_y = pd.DataFrame(temp_df['number_recruited'])
    temp_df_s = pd.DataFrame(temp_df['site_number'])

    temp_df_x = temp_df_x.reset_index(drop=True)
    temp_df_y = temp_df_y.reset_index(drop=True)
    temp_df_s = temp_df_s.reset_index(drop=True)

    dfs_to_join_x.append(temp_df_x)
    dfs_to_join_y.append(temp_df_y)
    dfs_to_join_s.append(temp_df_s)

df_to_join_x = pd.concat(dfs_to_join_x, axis=1)
df_to_join_y = pd.concat(dfs_to_join_y, axis=1)
df_to_join_s = pd.concat(dfs_to_join_s, axis=1)


df_to_join_x = df_to_join_x.fillna(0)
df_to_join_y = df_to_join_y.fillna(0)
df_to_join_s = df_to_join_s.fillna(0)

y = np.array(df_to_join_y.T).astype(int)
x = np.array(df_to_join_x.T)
site = np.array(df_to_join_s.T).astype(int)
n_tilde = 200

In [7]:
real_data = {'J': J, 'maxN': maxN, 'N': N, 'y': y, 'x': x, 'S':S, 'site':site, 'n_tilde':n_tilde}

In [None]:
x_tilde = np.zeros((J,n_tilde))
for i in range(J):
    x_tilde[i] = np.linspace(1, np.array(df.groupby(['country_number'], sort=False)['week'].max())[i]+5, n_tilde)