Importar librerías

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import psycopg2
import warnings
warnings.filterwarnings("ignore")

Fuente de las funciones para la obtención de muestras: https://www.kaggle.com/flaviobossolan/stratified-sampling-python 

In [3]:
def excluir_comercio(df,columna,limite_mayor, limite_menor):
    '''
    Keeps the rows of a dataframe whose value in one column lies inside a
    closed range, or is exactly zero.

    A row is kept when:
        limite_menor <= df[columna] <= limite_mayor   or   df[columna] == 0

    The filtered dataframe is bound to the module-level name ``df1`` (so code
    that reads ``df1`` after the call keeps working), its shape is printed,
    and it is also returned.

    Parameters
    ----------
    :df: pandas dataframe to filter.
    :columna: name of the column the limits are applied to.
    :limite_mayor: upper limit (inclusive).
    :limite_menor: lower limit (inclusive).

    Returns
    -------
    The filtered pandas dataframe (the same object bound to ``df1``).
    '''
    global df1
    # Build the mask from the dataframe that was passed in. Reading the column
    # from an unrelated global frame would either fail or filter ``df`` with a
    # mask that belongs to different rows.
    valores = df[columna]
    dentro_del_rango = (valores <= limite_mayor) & (valores >= limite_menor)
    df1 = df[dentro_del_rango | (valores == 0)]
    print('por {}: {}'.format(columna, df1.shape))
    return df1

def stratified_sample(df, strata, size=None, seed=None, keep_index= True):
    '''
    Samples rows from a pandas dataframe using proportionate stratification:
    n1 = (N1/N) * n
    where:
        - n1 is the sample size of stratum 1
        - N1 is the population size of stratum 1
        - N is the total population size
        - n is the sampling size
    Each n1 is rounded to the nearest integer, so the total number of rows
    returned can differ slightly from n.

    Parameters
    ----------
    :df: pandas dataframe from which data will be sampled.
    :strata: list containing columns that will be used in the stratified sampling.
    :size: sampling size. A value in [0, 1) is read as a proportion of the
        population, a value >= 1 as an absolute number of rows. If not
        informed, a sampling size will be calculated using Cochran adjusted
        sampling formula:
        cochran_n = (Z**2 * p * q) / e**2
        where:
            - Z is the z-value. In this case we use 1.96 representing 95%
            - p is the estimated proportion of the population which has an
                attribute. In this case we use 0.5
            - q is 1-p
            - e is the margin of error. In this case we use 0.02
        This formula is adjusted as follows:
        adjusted_cochran = cochran_n / (1 + ((cochran_n - 1) / N))
        where:
            - cochran_n = result of the previous formula
            - N is the population size
    :seed: sampling seed. The same seed is used for every stratum.
    :keep_index: if True, it keeps a column with the original population index indicator

    Returns
    -------
    A sampled pandas dataframe based in a set of strata, with a fresh
    0..k-1 index. Strata appear in sorted key order.

    Raises
    ------
    ValueError if size is negative, or if a stratum is asked for more rows
    than it contains (size larger than the population).

    Examples
    --------
    >> df.head()
        id  sex age city
    0   123 M   20  XYZ
    1   456 M   25  XYZ
    2   789 M   21  YZX
    3   987 F   40  ZXY
    4   654 M   45  ZXY
    ...
    # This returns a sample stratified by sex and city containing 30% of the size of
    # the original data
    >> stratified = stratified_sample(df=df, strata=['sex', 'city'], size=0.3)

    Requirements
    ------------
    - pandas
    - numpy
    '''
    population = len(df)
    if population == 0:
        # Nothing to sample from; returning early also avoids dividing by a
        # zero population below.
        return df.reset_index(drop=(not keep_index))

    size = __smpl_size(population, size)
    fraction = size / population

    # Iterating the groupby hands over the rows of each stratum directly, so
    # no query string has to be assembled from column names and values (that
    # breaks on names that are not identifiers, on quotes inside values and on
    # values such as timestamps). Rows keep their original relative order
    # inside each group, so a given seed selects the same rows as filtering
    # the dataframe stratum by stratum.
    samples = []
    for _, stratum_rows in df.groupby(strata):
        # Plain int: DataFrame.sample rejects float sample sizes.
        n = int(round(fraction * len(stratum_rows)))
        if n == 0:
            continue
        samples.append(stratum_rows.sample(n=n, random_state=seed))

    if not samples:
        # Every stratum rounded down to zero rows: return an empty frame with
        # the same columns instead of failing.
        return df.iloc[0:0].reset_index(drop=(not keep_index))

    # pd.concat replaces DataFrame.append, which no longer exists in pandas 2.
    stratified_df = pd.concat(samples)
    return stratified_df.reset_index(drop=(not keep_index))



def stratified_sample_report(df, strata, size=None):
    '''
    Generates a dataframe reporting the counts in each stratum and the counts
    for the final sampled dataframe.

    Parameters
    ----------
    :df: pandas dataframe from which data will be sampled.
    :strata: list containing columns that will be used in the stratified sampling.
    :size: sampling size. A value in [0, 1) is read as a proportion of the
        population, a value >= 1 as an absolute number of rows. If not
        informed, a sampling size will be calculated using Cochran adjusted
        sampling formula:
        cochran_n = (Z**2 * p * q) / e**2
        where:
            - Z is the z-value. In this case we use 1.96 representing 95%
            - p is the estimated proportion of the population which has an
                attribute. In this case we use 0.5
            - q is 1-p
            - e is the margin of error. In this case we use 0.02
        This formula is adjusted as follows:
        adjusted_cochran = cochran_n / (1 + ((cochran_n - 1) / N))
        where:
            - cochran_n = result of the previous formula
            - N is the population size

    Returns
    -------
    A dataframe with one row per stratum holding the strata columns, 'size'
    (rows of the population in that stratum) and 'samp_size' (rows allocated
    to that stratum, rounded to the nearest integer).
    '''
    population = len(df)
    # An empty population has nothing to allocate; skipping the computation
    # also avoids a division by zero.
    size = __smpl_size(population, size) if population else 0
    fraction = size / population if population else 0.0

    # Count rows per stratum directly instead of writing a helper column onto
    # a column subset of the caller's dataframe.
    tmp_grpd = df.groupby(strata).size().reset_index(name='size')
    tmp_grpd['samp_size'] = (fraction * tmp_grpd['size']).round().astype(int)
    return tmp_grpd


def __smpl_size(population, size, *, z=1.96, p=0.5, e=0.02):
    '''
    A function to compute the sample size. If not informed, a sampling
    size will be calculated using Cochran adjusted sampling formula:
        cochran_n = (Z**2 * p * q) / e**2
        where:
            - Z is the z-value (default 1.96, representing 95%)
            - p is the estimated proportion of the population which has an
                attribute (default 0.5)
            - q is 1-p
            - e is the margin of error (default 0.02)
        This formula is adjusted as follows:
        adjusted_cochran = cochran_n / (1 + ((cochran_n - 1) / N))
        where:
            - cochran_n = result of the previous formula
            - N is the population size

    Parameters
    ----------
        :population: population size
        :size: sample size. None computes it with the formula above; a value
            in [0, 1) is a proportion of the population; a value >= 1 is
            returned unchanged.
        :z: z-value used by the Cochran formula (keyword only)
        :p: estimated proportion used by the Cochran formula (keyword only)
        :e: margin of error used by the Cochran formula (keyword only)

    Returns
    -------
    Calculated sample size to be used in the functions:
        - stratified_sample
        - stratified_sample_report

    Raises
    ------
    ValueError if size is negative or cannot be ordered against 0 and 1
    (for example NaN).
    '''
    if size is None:
        if population == 0:
            # The finite-population correction divides by the population;
            # an empty population simply needs no sample.
            return 0
        cochran_n = round((z**2 * p * (1 - p)) / e**2)
        return round(cochran_n / (1 + ((cochran_n - 1) / population)))

    if size < 0:
        raise ValueError('Parameter "size" must be an integer or a proportion between 0 and 0.99.')
    if size < 1:
        return round(population * size)
    if size >= 1:
        return size

    # Reached only when every comparison above is False (e.g. NaN). Fail with
    # a clear message instead of returning an unbound local.
    raise ValueError('Parameter "size" must be an integer or a proportion between 0 and 0.99.')

## Muestreo
---

Cálculo de tamaño de muestra

Deben estar grabados en la memoria del kernel los siguientes dataframes:
- qr_mrch (población): todos los individuos de la población con las variables que se usarán para estratificar la muestra.
- mrch_m (población muestreable): en caso de que se quiera retirar determinados individuos de la población antes de hacer el muestreo.

In [398]:
# Row counts of the full population and of the frame that can be sampled.
poblacion = len(qr_mrch)
base_muestreable = len(mrch_m)

# Share of the sampleable base to draw, and the share for each of two groups.
proporcion_n = 0.05
proporcion_grupo = proporcion_n / 2

# Total sample size, rounded down to an even number so that it splits into
# two groups of equal size.
mi = round(base_muestreable * proporcion_n)
n = mi - (mi % 2)
g = n // 2

print('poblacion: {}'.format(poblacion))
print('base_muestreable: {}'.format(base_muestreable))
print('proporcion para muestra: {}'.format(proporcion_n))
print('proporcion para grupo: {}'.format(proporcion_grupo))
print('n: {}'.format(n))
print('Tamaño de cada grupo: {}'.format(g))

poblacion: 4932
base_muestreable: 4932
proporcion para muestra: 0.05
proporcion para grupo: 0.025
n: 246
Tamaño de cada grupo: 123


Reporte de estratos

In [None]:
# Report, for each combination of the three stratification variables, the
# population count and the sample size allocated to it for a sample of n.
reporte_estratos_sampledf = stratified_sample_report(
    df=mrch_m,
    strata=['variable_1', 'variable_2', 'variable_3'],
    size=n,
)
print(reporte_estratos_sampledf.shape)
reporte_estratos_sampledf.head(1)

sample_df

In [None]:
# Sample from an independent copy of the sampleable base.
df = pd.DataFrame(data=mrch_m, copy=True)
sample_df = stratified_sample(
    df=df,
    strata=['variable_1', 'variable_2', 'variable_3'],
    size=n,
    seed=123,
    keep_index=False,
)
# gc = number of samples in the control group (half of the rows drawn)
gc = round(len(sample_df) / 2)
print('n = {}'.format(n))
print(sample_df.shape)
sample_df.head(1)

sample grupo de control

In [None]:
# Draw the control group as a stratified sub-sample of an independent copy
# of sample_df, using the same strata and seed.
df = pd.DataFrame(data=sample_df, copy=True)
sample_g_cont = stratified_sample(
    df=df,
    strata=['variable_1', 'variable_2', 'variable_3'],
    size=gc,
    seed=123,
    keep_index=False,
)
# ge = number of samples in the experimental group
# NOTE(review): the value is the row count of the control-group sample;
# confirm this is the intended size for the experimental group.
ge = len(sample_g_cont)
print('gc = {}'.format(gc))
print(sample_g_cont.shape)
sample_g_cont.head(1)