# stratified_sampling_example

Imports: standard-library modules and the third-party packages used in this notebook

In [1]:
import datetime
import itertools
import contextlib
import numpy as np
import pandas as pd
import statsmodels.api as sm
from scipy.stats import uniform
import matplotlib.pyplot as plt
import pandas.io.formats.format as pf
from IPython.display import clear_output

from dateutil.relativedelta import relativedelta

# Register pandas' date/time converters with matplotlib.
# Called without arguments on purpose: newer pandas releases no longer accept
# the `explicit` keyword, and on the older releases that do accept it the
# default is already `explicit=True`, so the effect is the same.
pd.plotting.register_matplotlib_converters()

User-defined display format

In [2]:
@contextlib.contextmanager
def custom_formatting():
    """Temporarily change how pandas renders floats and integers.

    While the ``with`` block is active:

    * ``pd.options.display.float_format`` is ``'{:0,.2f}'.format``
      (thousands separator, two decimals);
    * ``pf.IntArrayFormatter`` is replaced by a subclass whose default
      formatter is ``'{:,d}'.format`` (thousands separator).

    Both settings are restored when the block exits, including when it exits
    because of an exception.
    """
    orig_float_format=pd.options.display.float_format
    orig_int_format=pf.IntArrayFormatter

    # Build the replacement formatter before touching any global state, so a
    # failure here cannot leave pandas half-patched.
    class IntArrayFormatter(pf.GenericArrayFormatter):
        """Integer formatter that falls back to a comma-grouped format."""
        def _format_strings(self):
            # An explicitly supplied formatter still wins over the default.
            formatter=self.formatter or '{:,d}'.format
            return [formatter(x) for x in self.values]

    pd.options.display.float_format='{:0,.2f}'.format
    pf.IntArrayFormatter=IntArrayFormatter
    try:
        yield
    finally:
        # Always undo the patching, even if the body of the `with` raised.
        pd.options.display.float_format=orig_float_format
        pf.IntArrayFormatter=orig_int_format

Suppose that in a company there are the following staff:

male, part-time: 18

male, full-time: 90

female, part-time: 63

female, full-time: 9

total: 180

In [3]:
# Fix the global NumPy seed so that Restart & Run All reproduces the `random`
# column below and every later `.sample()` call that passes no random_state.
np.random.seed(0)

# One row per staff member. Each (i_fem, i_full) pair is repeated by its
# head-count: male full-time 90, male part-time 18, female full-time 9,
# female part-time 63.
strata_rows=np.array([[0, 1], [0, 0], [1, 1], [1, 0]])
strata_sizes=[90, 18, 9, 63]
df=pd.DataFrame(np.repeat(strata_rows, strata_sizes, axis=0),
                columns=['i_fem', 'i_full'])

df['random']=uniform.rvs(size=df.shape[0])

display(df.head())
df.info()

Unnamed: 0,i_fem,i_full,random
0,0,1,0.181351
1,0,1,0.902976
2,0,1,0.53435
3,0,1,0.530606
4,0,1,0.92402


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180 entries, 0 to 179
Data columns (total 3 columns):
i_fem     180 non-null int32
i_full    180 non-null int32
random    180 non-null float64
dtypes: float64(1), int32(2)
memory usage: 2.9 KB


The sizes and proportions of each group are given by

In [4]:
# Head-count per stratum (built-in group size instead of a per-group apply)
# and each stratum's share of the whole population.
p=(
    df
    .groupby(by=['i_fem', 'i_full'])
    .size()
    .to_frame(name='n')
    .assign(p=lambda counts: counts['n']/df.shape[0])
)

with custom_formatting():
    display(p)
p.info()

Unnamed: 0_level_0,Unnamed: 1_level_0,n,p
i_fem,i_full,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0,18,0.1
0,1,90,0.5
1,0,63,0.35
1,1,9,0.05


<class 'pandas.core.frame.DataFrame'>
MultiIndex: 4 entries, (0, 0) to (1, 1)
Data columns (total 2 columns):
n    4 non-null int64
p    4 non-null float64
dtypes: float64(1), int64(1)
memory usage: 213.0 bytes


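We are asked to take a sample of 40 staff, stratified according to the above categories. Drawing the same fraction, 40/180, from every stratum gives 4, 20, 14 and 2 staff respectively, so each stratum's share of the sample equals its share of the population.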

In [5]:
# Split the staff into strata, then draw 40/180 of each stratum without
# replacement. Only the row labels are needed, so each stratum is reduced to
# an index-only frame before sampling.
strata=df.groupby(by=['i_fem', 'i_full'], group_keys=False)
sample_idx=strata.apply(
    lambda stratum: stratum.drop(stratum.columns, axis=1).sample(frac=40/180, replace=False)
).index

# Look the sampled labels up in the full table to show the selected staff.
display(df.loc[sample_idx])
df.loc[sample_idx].info()

Unnamed: 0,i_fem,i_full,random
99,0,0,0.057943
90,0,0,0.267931
95,0,0,0.883345
92,0,0,0.566618
76,0,1,0.181666
70,0,1,0.077103
52,0,1,0.239745
59,0,1,0.59344
56,0,1,0.53776
88,0,1,0.833522


<class 'pandas.core.frame.DataFrame'>
Int64Index: 40 entries, 99 to 115
Data columns (total 3 columns):
i_fem     40 non-null int32
i_full    40 non-null int32
random    40 non-null float64
dtypes: float64(1), int32(2)
memory usage: 960.0 bytes


The sizes and proportions of each sampled stratum are given by

In [6]:
# Head-count per stratum within the sample (built-in group size instead of a
# per-group apply) and each stratum's share of the sampled rows.
sample_p=(
    df.loc[sample_idx]
    .groupby(by=['i_fem', 'i_full'])
    .size()
    .to_frame(name='n')
    # df has a unique RangeIndex, so len(sample_idx) is the number of sampled rows.
    .assign(p=lambda counts: counts['n']/len(sample_idx))
)

with custom_formatting():
    display(sample_p)
sample_p.info()

Unnamed: 0_level_0,Unnamed: 1_level_0,n,p
i_fem,i_full,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0,4,0.1
0,1,20,0.5
1,0,14,0.35
1,1,2,0.05


<class 'pandas.core.frame.DataFrame'>
MultiIndex: 4 entries, (0, 0) to (1, 1)
Data columns (total 2 columns):
n    4 non-null int64
p    4 non-null float64
dtypes: float64(1), int64(1)
memory usage: 213.0 bytes


Furthermore, we can wrap this procedure in a function

In [7]:
def stratified_sample_idx(df, v_strata, sample_pct=0.1, random_state=None):
    """Return the index labels of a proportionate stratified random sample.

    The rows of ``df`` are grouped by ``v_strata`` and the same fraction
    ``sample_pct`` is drawn, without replacement, from every group.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to sample from.
    v_strata : label or list of labels
        Column(s) handed to ``DataFrame.groupby`` to define the strata.
    sample_pct : float, default 0.1
        Fraction of each stratum to draw; passed to ``DataFrame.sample`` as
        ``frac``.
    random_state : int, numpy.random.RandomState or None, default None
        Seed or generator for a reproducible draw. An integer is turned into
        a single ``RandomState`` that is shared by all strata, so the strata
        are not each sampled from the same restarted stream. ``None`` keeps
        the previous behaviour (pandas' default random source).

    Returns
    -------
    pandas.Index
        Labels of the sampled rows; use ``df.loc[...]`` to retrieve them.
    """
    if isinstance(random_state, (int, np.integer)):
        random_state=np.random.RandomState(random_state)

    def _sample_labels(stratum):
        # Only the labels matter, so sample from an index-only frame.
        return (
            stratum
            .drop(columns=stratum.columns)
            .sample(frac=sample_pct, replace=False, random_state=random_state)
        )

    sample_idx=(
        df
        .groupby(by=v_strata, group_keys=False)
        .apply(_sample_labels)
        .index
    )

    return sample_idx

In [8]:
# Repeat the 40-out-of-180 draw, this time through the reusable helper, and
# materialise the selected rows once for display and inspection.
sample_idx=stratified_sample_idx(
    df=df,
    v_strata=['i_fem', 'i_full'],
    sample_pct=40/180,
)
sample_df=df.loc[sample_idx]

display(sample_df)
sample_df.info()

Unnamed: 0,i_fem,i_full,random
107,0,0,0.406026
93,0,0,0.059787
104,0,0,0.769413
100,0,0,0.194781
9,0,1,0.38983
89,0,1,0.601636
41,0,1,0.800301
47,0,1,0.661724
16,0,1,0.604671
51,0,1,0.467038


<class 'pandas.core.frame.DataFrame'>
Int64Index: 40 entries, 107 to 116
Data columns (total 3 columns):
i_fem     40 non-null int32
i_full    40 non-null int32
random    40 non-null float64
dtypes: float64(1), int32(2)
memory usage: 960.0 bytes


In [9]:
# Head-count per stratum within the helper-drawn sample (built-in group size
# instead of a per-group apply) and each stratum's share of that sample.
sample_df_p=(
    sample_df
    .groupby(by=['i_fem', 'i_full'])
    .size()
    .to_frame(name='n')
    .assign(p=lambda counts: counts['n']/sample_df.shape[0])
)

with custom_formatting():
    display(sample_df_p)
sample_df_p.info()

Unnamed: 0_level_0,Unnamed: 1_level_0,n,p
i_fem,i_full,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0,4,0.1
0,1,20,0.5
1,0,14,0.35
1,1,2,0.05


<class 'pandas.core.frame.DataFrame'>
MultiIndex: 4 entries, (0, 0) to (1, 1)
Data columns (total 2 columns):
n    4 non-null int64
p    4 non-null float64
dtypes: float64(1), int64(1)
memory usage: 213.0 bytes
