# stratified_sampling_example

Standard libraries

In [1]:
import datetime
import itertools
import contextlib
import numpy as np
import pandas as pd
import statsmodels.api as sm
from scipy.stats import uniform
import matplotlib.pyplot as plt
import pandas.io.formats.format as pf
from IPython.display import clear_output

from dateutil.relativedelta import relativedelta

# Register pandas' datetime converters with matplotlib.
# The `explicit` keyword is only accepted by pre-1.0 pandas (where it already
# defaulted to True); calling without arguments behaves the same there and
# does not raise TypeError on later versions.
pd.plotting.register_matplotlib_converters()

Non-Standard libraries

In [2]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

User-defined display format

In [3]:
@contextlib.contextmanager
def custom_formatting():
    """Temporarily display numbers with thousands separators in pandas.

    While the ``with`` block is active:

    * ``pd.options.display.float_format`` is set to ``'{:0,.2f}'.format``
      (comma grouping, two decimals);
    * ``pf.IntArrayFormatter`` is replaced by a subclass of
      ``pf.GenericArrayFormatter`` that formats each value with
      ``'{:,d}'.format`` unless the formatter instance carries its own
      ``formatter``.

    Both settings are restored when the block exits, including when the
    body of the ``with`` statement raises.
    """
    orig_float_format=pd.options.display.float_format
    orig_int_format=pf.IntArrayFormatter

    # Build the replacement before touching any global state, so a failure
    # here cannot leave the display options half-modified.
    class IntArrayFormatter(pf.GenericArrayFormatter):
        def _format_strings(self):
            formatter=self.formatter or '{:,d}'.format
            fmt_values=[formatter(x) for x in self.values]
            return fmt_values

    pd.options.display.float_format='{:0,.2f}'.format
    pf.IntArrayFormatter=IntArrayFormatter
    try:
        yield
    finally:
        # Always undo the global changes, even if the with-body raised.
        pd.options.display.float_format=orig_float_format
        pf.IntArrayFormatter=orig_int_format

Create the example data (synthetic: 180 rows across the four `i_fem` × `i_full` strata)

In [4]:
# Synthetic population: each (i_fem, i_full) pair below is repeated by the
# matching entry of `strata_sizes`, giving 180 rows in the listed order.
strata=np.array([[0, 1], [0, 0], [1, 1], [1, 0]])
strata_sizes=[90, 18, 9, 63]
df=pd.DataFrame(np.repeat(strata, strata_sizes, axis=0),
                columns=['i_fem', 'i_full'])

# Fixed seed so that Restart & Run All reproduces the same `random` column.
df['random']=uniform.rvs(size=df.shape[0], random_state=0)

display(df.head())
df.info()

Unnamed: 0,i_fem,i_full,random
0,0,1,0.118005
1,0,1,0.176627
2,0,1,0.90305
3,0,1,0.455695
4,0,1,0.549844


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180 entries, 0 to 179
Data columns (total 3 columns):
i_fem     180 non-null int32
i_full    180 non-null int32
random    180 non-null float64
dtypes: float64(1), int32(2)
memory usage: 2.9 KB


In [5]:
# Population share of each (i_fem, i_full) stratum: rows in the group divided
# by the total number of rows. `size()` keeps the two grouping keys as the
# index, so no extra index level has to be dropped afterwards.
p=(
    df
    .groupby(by=['i_fem', 'i_full'])
    .size()
    .div(df.shape[0])
    .to_frame(name='p')
)

with custom_formatting():
    display(p)
p.info()

Unnamed: 0_level_0,Unnamed: 1_level_0,p
i_fem,i_full,Unnamed: 2_level_1
0,0,0.1
0,1,0.5
1,0,0.35
1,1,0.05


<class 'pandas.core.frame.DataFrame'>
MultiIndex: 4 entries, (0, 0) to (1, 1)
Data columns (total 1 columns):
p    4 non-null float64
dtypes: float64(1)
memory usage: 181.0 bytes


In [6]:
# Proportionate stratified sample: draw the same fraction from every
# (i_fem, i_full) stratum so the sample keeps the population shares.
sample_size=40
sample_frac=sample_size/df.shape[0]
# One seeded generator shared by all groups: the draw is reproducible, and
# each stratum consumes fresh random state instead of re-using one seed.
sample_rng=np.random.RandomState(0)

sample_idx=(
    df
    .groupby(by=['i_fem', 'i_full'], group_keys=False)
    .apply(lambda g: g.sample(frac=sample_frac, random_state=sample_rng))
    .index
)

display(df.loc[sample_idx])
df.loc[sample_idx].info()

Unnamed: 0,i_fem,i_full,random
91,0,0,0.203349
103,0,0,0.895375
93,0,0,0.248033
105,0,0,0.573469
67,0,1,0.202088
65,0,1,0.872615
21,0,1,0.601022
35,0,1,0.79589
59,0,1,0.255455
49,0,1,0.527008


<class 'pandas.core.frame.DataFrame'>
Int64Index: 40 entries, 91 to 110
Data columns (total 3 columns):
i_fem     40 non-null int32
i_full    40 non-null int32
random    40 non-null float64
dtypes: float64(1), int32(2)
memory usage: 960.0 bytes


In [7]:
# Share of each (i_fem, i_full) stratum within the drawn sample. The group
# sizes sum to the number of sampled rows, so dividing by that sum gives the
# proportions without re-selecting the sample for every group.
sample_p=(
    df.loc[sample_idx]
    .groupby(by=['i_fem', 'i_full'])
    .size()
    .pipe(lambda counts: counts/counts.sum())
    .to_frame(name='p')
)

with custom_formatting():
    display(sample_p)
sample_p.info()

Unnamed: 0_level_0,Unnamed: 1_level_0,p
i_fem,i_full,Unnamed: 2_level_1
0,0,0.1
0,1,0.5
1,0,0.35
1,1,0.05


<class 'pandas.core.frame.DataFrame'>
MultiIndex: 4 entries, (0, 0) to (1, 1)
Data columns (total 1 columns):
p    4 non-null float64
dtypes: float64(1)
memory usage: 181.0 bytes
