# Stratifying an experiment
## Block randomization and Stratifying by high_annual_return

In [1]:
import pandas as pd

In [2]:
# Load the investment-returns dataset (path is relative to the notebook's
# working directory) and preview the first five rows.
# NOTE(review): the preview shows an 'Unnamed: 0' column, which looks like a
# row index that was written into the CSV -- confirm, and consider reading it
# with index_col=0 if so.
df = pd.read_csv(filepath_or_buffer='investment_returns.csv')
df.head(n=5)

Unnamed: 0,Strategy_Type,Annual_Return
0,Quantitative,10.597379
1,Quantitative,1.656248
2,Quantitative,9.2021
3,Quantitative,3.980213
4,Quantitative,8.08672


In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) of the returns.
df.loc[:, 'Annual_Return'].describe()

count    840.000000
mean       7.028955
std        5.094456
min       -7.414640
25%        3.675554
50%        6.941097
75%       10.544015
max       21.702863
Name: Annual_Return, dtype: float64

#### Vamos a clasificar como "high annual return" aquellos registros cuyo Annual_Return sea mayor al cuantil 0.75

In [4]:
# Threshold for a "high" return: the 0.75 quantile of Annual_Return.
q_75 = df['Annual_Return'].quantile(q=0.75)

# Flag rows strictly above the threshold. The flag is stored as a float
# (1.0 = high, 0.0 = not high); rows that do not compare greater than the
# threshold (including missing returns) get 0.0.
df['high_annual_return'] = df['Annual_Return'].gt(q_75).astype(float)
df.head()

Unnamed: 0,Strategy_Type,Annual_Return,high_annual_return
0,Quantitative,10.597379,1.0
1,Quantitative,1.656248,0.0
2,Quantitative,9.2021,0.0
3,Quantitative,3.980213,0.0
4,Quantitative,8.08672,0.0


#### Realizamos random assignment of subjects

In [5]:
# Simple random assignment: draw half of the rows (without replacement, fixed
# seed for reproducibility) as the treatment group; every row whose index label
# was not drawn forms the control group.
treatment_group = df.sample(frac=0.5, random_state=42, replace=False)
control_group = df.drop(treatment_group.index)

# Side-by-side summary statistics of Annual_Return for the two groups.
compare_df_random = pd.concat(
    [
        treatment_group['Annual_Return'].describe(),
        control_group['Annual_Return'].describe(),
    ],
    axis=1,
)
compare_df_random.columns = ['group1', 'group2']
print(compare_df_random)

# Count of high-return records per group. The f-strings are double-quoted on
# the outside so the single-quoted column key inside the braces also parses on
# Python versions older than 3.12 (reusing the same quote type inside an
# f-string is a SyntaxError there).
print(f"treatment group high_annual_return {treatment_group['high_annual_return'].sum()}")
print(f"control group high_annual_return {control_group['high_annual_return'].sum()}")


           group1      group2
count  420.000000  420.000000
mean     6.973577    7.084332
std      5.298716    4.887367
min     -7.414640   -6.437859
25%      3.532934    3.760035
50%      6.784724    7.086165
75%     10.612788   10.521503
max     21.702863   21.325721
treatment group high_annual_return 107.0
control group high_annual_return 103.0


Nótese que las estadísticas descriptivas del grupo 1 y del grupo 2 son muy similares. Sin embargo, al revisar cuántos casos de high_annual_return hay en cada grupo, encontramos una diferencia de 4 casos (107 vs. 103). Si queremos garantizar que high_annual_return quede distribuido por igual entre ambos grupos, podemos apoyarnos en block randomization and stratifying by high_annual_return.

#### Realizamos block randomization and stratifying

In [6]:
def _assign_treatment_control(stratum, random_state=42):
    """Randomly split one stratum into a treatment half and a control half.

    Parameters
    ----------
    stratum : pandas.DataFrame
        Rows belonging to a single block.
    random_state : int, default 42
        Seed forwarded to ``DataFrame.sample`` so the split is reproducible.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(treatment, control)``. ``treatment`` is a 50% sample drawn without
        replacement; ``control`` is every row of ``stratum`` whose index label
        was not drawn. Both carry a ``T_C`` column naming their group.
    """
    treatment = stratum.sample(frac=0.5, random_state=random_state, replace=False)
    treatment['T_C'] = 'treatment'

    control = stratum.drop(treatment.index)
    control['T_C'] = 'control'
    return treatment, control


# Create the first block (high-return records) and assign Treatment / Control
strata_1 = df.loc[df['high_annual_return'] == 1].copy()
strata_1['block'] = 1
strata_1_g1, strata_1_g2 = _assign_treatment_control(strata_1)

# Create the second block (remaining records) and assign Treatment / Control
strata_2 = df.loc[df['high_annual_return'] == 0].copy()
strata_2['block'] = 2
strata_2_g1, strata_2_g2 = _assign_treatment_control(strata_2)

# Concatenate the grouping work and check the group sizes inside each block
wealth_data_stratified = pd.concat([strata_1_g1, strata_1_g2, strata_2_g1, strata_2_g2])
print(wealth_data_stratified.groupby(['block', 'T_C', 'high_annual_return']).size())


block  T_C        high_annual_return
1      control    1.0                   105
       treatment  1.0                   105
2      control    0.0                   315
       treatment  0.0                   315
dtype: int64


#### Comparamos los estadísticos de ambos grupos

In [7]:
# Rebuild the overall treatment / control groups from the per-block halves.
treatment_group = pd.concat([strata_1_g1, strata_2_g1])
control_group = pd.concat([strata_1_g2, strata_2_g2])

# Side-by-side summary statistics of Annual_Return for the stratified groups.
compare_stratified = pd.concat(
    [
        treatment_group['Annual_Return'].describe(),
        control_group['Annual_Return'].describe(),
    ],
    axis=1,
)
compare_stratified.columns = ['treatment', 'control']

print(compare_stratified, '\n')

# Count of high-return records per group. The f-strings are double-quoted on
# the outside so the single-quoted column key inside the braces also parses on
# Python versions older than 3.12 (reusing the same quote type inside an
# f-string is a SyntaxError there).
print(f"treatment group high_annual_return {treatment_group['high_annual_return'].sum()}")
print(f"control group high_annual_return {control_group['high_annual_return'].sum()}")


        treatment     control
count  420.000000  420.000000
mean     6.974632    7.083277
std      5.158643    5.035015
min     -7.355186   -7.414640
25%      3.481729    3.745739
50%      6.589750    7.100828
75%     10.541967   10.544015
max     21.702863   21.043119 

treatment group high_annual_return 105.0
control group high_annual_return 105.0
