In [1]:
import pandas as pd 
import altair as alt

# Raw trial records as read from disk.
df = pd.read_csv("clinical_trial.csv")

print('df info')
df.info()
print()

# Tally how often each distinct row (group/covid combination) occurs, then
# turn the tally's index levels back into ordinary columns.
print('df2 info')
combo_tally = df.value_counts()
df2 = pd.DataFrame(combo_tally).reset_index()
df2.info()
print()

# The tally column can come out labelled with the integer 0; give it a
# readable name. The rename is a no-op if no column is labelled 0.
print('df3 info')
df3 = df2.rename(columns={0: "count"})
df3.info()

df info
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   group   30000 non-null  object
 1   covid   30000 non-null  bool  
dtypes: bool(1), object(1)
memory usage: 263.8+ KB

df2 info
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   group   4 non-null      object
 1   covid   4 non-null      bool  
 2   0       4 non-null      int64 
dtypes: bool(1), int64(1), object(1)
memory usage: 196.0+ bytes

df3 info
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   group   4 non-null      object
 1   covid   4 non-null      bool  
 2   count   4 non-null      int64 
dtypes: bool(1), int64(1), object(1)
memory u

In [2]:
import numpy as np

def sample(p_covid, N, rng=None):
    '''
    Return a binary vector of length N where a 1 indicates the person got COVID.

    Each person is independently assigned to get COVID with probability p_covid.

    Parameters
    ----------
    p_covid : float
        Probability, between 0 and 1 inclusive, that one person gets COVID.
    N : int
        Number of people to simulate.
    rng : numpy.random.Generator or numpy.random.RandomState, optional
        Source of randomness. Pass a seeded generator, e.g.
        ``np.random.default_rng(42)``, to make the draw reproducible. When
        omitted, numpy's global random state is used, exactly as before.

    Returns
    -------
    numpy.ndarray
        Array of length N containing only 0s and 1s.

    Raises
    ------
    ValueError
        If p_covid is not a probability in [0, 1].
    '''
    # Written so that NaN also fails the check (every comparison with NaN is False).
    if not 0 <= p_covid <= 1:
        raise ValueError(f"p_covid must be between 0 and 1, got {p_covid!r}")

    # The module-level np.random functions and Generator/RandomState objects
    # all expose a compatible choice(a, size=..., p=...) method.
    source = np.random if rng is None else rng
    return source.choice([0, 1], size=N, p=[1 - p_covid, p_covid])
    
def make_data_frame(p_covid_treat, p_covid_control, totalN):
    '''
    Simulate a two-arm trial and summarise it as a count table like df3.

    The treatment and control groups each receive totalN/2 people (rounded
    down, so an odd totalN leaves one person out). Every person's outcome is
    drawn at random with the sample function, using p_covid_treat for the
    treatment group and p_covid_control for the control group.

    Parameters
    ----------
    p_covid_treat : float
        Probability that a person in the treatment group gets covid.
    p_covid_control : float
        Probability that a person in the control group gets covid.
    totalN : int
        Total number of people across both groups.

    Returns
    -------
    pandas.DataFrame
        Four rows, one per (group, covid) combination, with the columns
        group, covid and count in that order, matching df3.
    '''
    # Both arms are the same size; compute it once with integer division.
    n_per_arm = int(totalN // 2)

    # sample from the treatment and control groups
    treatment = sample(p_covid_treat, n_per_arm)
    control = sample(p_covid_control, n_per_arm)

    # sample() marks a covid case with 1, so summing a vector counts its
    # cases, e.g. np.sum([0, 1, 0, 0, 1, 1]) = 3
    N_covid_control = np.sum(control)
    N_no_covid_control = control.size - N_covid_control

    N_covid_treatment = np.sum(treatment)
    N_no_covid_treatment = treatment.size - N_covid_treatment

    # Keys are listed in df3's column order so the result is laid out the
    # same way as the real trial summary.
    return pd.DataFrame(
        {
            "group": ["treatment", "treatment", "control", "control"],
            "covid": [True, False, True, False],
            "count": [
                N_covid_treatment,
                N_no_covid_treatment,
                N_covid_control,
                N_no_covid_control,
            ],
        }
    )

def make_plot_2(df):
    '''
    Make a bar chart of the number of patients who got covid in the
    treatment and control groups.

    Only the rows with covid == True are plotted, one bar per group. The bar
    (or bars, on a tie) with the largest count is coloured differently via
    the is_max field, and hovering over a bar shows its count.

    You can assume the dataframe comes from make_data_frame, i.e. it has the
    columns group, covid and count.

    Parameters
    ----------
    df : pandas.DataFrame
        Count table with columns group, covid and count.

    Returns
    -------
    altair.Chart
        The bar chart, titled "# covid".
    '''
    # Select rows and columns in a single .loc call and add the flag with
    # .assign, so the new column is never written onto a slice of df (the
    # chained-indexing pattern that raises pandas' SettingWithCopyWarning).
    covid_rows = df.loc[df["covid"] == True, ["group", "count"]]
    covid_rows = covid_rows.assign(
        is_max=covid_rows["count"] == covid_rows["count"].max()
    )

    return alt.Chart(covid_rows).mark_bar().encode(
        x='group',
        y='count',
        color="is_max",
        tooltip="count"
    ).properties(title="# covid", width=20, height=50)

In [3]:
# Eight independent simulated trials with totalN=100, one small chart each.
all_ = [
    make_plot_2(
        make_data_frame(p_covid_treat=.1, p_covid_control=.11, totalN=100)
    )
    for _ in range(8)
]

# Lay the charts out side by side as the cell's displayed result.
alt.hconcat(*all_)

In [4]:
# Eight independent simulated trials with totalN=200, one small chart each.
all_ = [
    make_plot_2(
        make_data_frame(p_covid_treat=.1, p_covid_control=.11, totalN=200)
    )
    for _ in range(8)
]

# Lay the charts out side by side as the cell's displayed result.
alt.hconcat(*all_)

In [5]:
# Eight independent simulated trials with totalN=300, one small chart each.
all_ = [
    make_plot_2(
        make_data_frame(p_covid_treat=.1, p_covid_control=.11, totalN=300)
    )
    for _ in range(8)
]

# Lay the charts out side by side as the cell's displayed result.
alt.hconcat(*all_)

In [6]:
# Eight independent simulated trials with totalN=1000, one small chart each.
all_ = [
    make_plot_2(
        make_data_frame(p_covid_treat=.1, p_covid_control=.11, totalN=1000)
    )
    for _ in range(8)
]

# Lay the charts out side by side as the cell's displayed result.
alt.hconcat(*all_)

In [7]:
# Eight independent simulated trials with totalN=3000, one small chart each.
all_ = [
    make_plot_2(
        make_data_frame(p_covid_treat=.1, p_covid_control=.11, totalN=3000)
    )
    for _ in range(8)
]

# Lay the charts out side by side as the cell's displayed result.
alt.hconcat(*all_)

In [8]:
# Eight independent simulated trials with totalN=5000, one small chart each.
all_ = [
    make_plot_2(
        make_data_frame(p_covid_treat=.1, p_covid_control=.11, totalN=5000)
    )
    for _ in range(8)
]

# Lay the charts out side by side as the cell's displayed result.
alt.hconcat(*all_)

In [9]:
# Eight independent simulated trials with totalN=10000, one small chart each.
all_ = [
    make_plot_2(
        make_data_frame(p_covid_treat=.1, p_covid_control=.11, totalN=10000)
    )
    for _ in range(8)
]

# Lay the charts out side by side as the cell's displayed result.
alt.hconcat(*all_)

In [10]:
# Eight independent simulated trials with totalN=15000, one small chart each.
all_ = [
    make_plot_2(
        make_data_frame(p_covid_treat=.1, p_covid_control=.11, totalN=15000)
    )
    for _ in range(8)
]

# Lay the charts out side by side as the cell's displayed result.
alt.hconcat(*all_)

In [11]:
# Eight independent simulated trials with totalN=20000, one small chart each.
all_ = [
    make_plot_2(
        make_data_frame(p_covid_treat=.1, p_covid_control=.11, totalN=20000)
    )
    for _ in range(8)
]

# Lay the charts out side by side as the cell's displayed result.
alt.hconcat(*all_)

In [12]:
# Eight independent simulated trials with totalN=25000, one small chart each.
all_ = [
    make_plot_2(
        make_data_frame(p_covid_treat=.1, p_covid_control=.11, totalN=25000)
    )
    for _ in range(8)
]

# Lay the charts out side by side as the cell's displayed result.
alt.hconcat(*all_)