# Amostragem

## Carregamento da base de dados

In [67]:
import random

import numpy as np
import pandas as pd

In [68]:
ds_census = pd.read_csv('datasets/census.csv')

## Amostragem aleatória simples 

In [69]:
def amostragem_aleatoria_simples(dataset, amostras, seed=1):
    return dataset.sample(n=amostras, random_state=seed)

In [70]:
df_amostra_aleatoria_simples = amostragem_aleatoria_simples(ds_census, 100)

In [71]:
df_amostra_aleatoria_simples.shape

(100, 15)

In [72]:
df_amostra_aleatoria_simples

Unnamed: 0,age,workclass,final-weight,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loos,hour-per-week,native-country,income
9646,62,Self-emp-not-inc,26911,7th-8th,4,Widowed,Other-service,Not-in-family,White,Female,0,0,66,United-States,<=50K
709,18,Private,208103,11th,7,Never-married,Other-service,Other-relative,White,Male,0,0,25,United-States,<=50K
7385,25,Private,102476,Bachelors,13,Never-married,Farming-fishing,Own-child,White,Male,27828,0,50,United-States,>50K
16671,33,Private,511517,HS-grad,9,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,40,United-States,<=50K
21932,36,Private,292570,11th,7,Never-married,Machine-op-inspct,Unmarried,White,Female,0,0,40,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27578,64,?,200017,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,0,20,United-States,<=50K
2544,19,?,192773,Some-college,10,Never-married,?,Own-child,White,Female,0,0,35,United-States,<=50K
2486,75,?,164849,9th,5,Married-civ-spouse,?,Husband,Black,Male,1409,0,5,United-States,<=50K
13143,28,Private,154863,Bachelors,13,Never-married,Adm-clerical,Own-child,Black,Male,0,0,35,United-States,<=50K


## Amostragem sistemática

In [73]:
def amostragem_sistematica(dataset, amostras, seed=1):
    intervalo = len(dataset) // amostras
    random.seed(seed)
    inicio = random.randint(0, intervalo)
    indices = np.arange(inicio, len(dataset), step=intervalo)
    amostra_sistematica = dataset.iloc[indices]

    return amostra_sistematica

In [74]:
df_amostra_sistematica = amostragem_sistematica(ds_census, 100)

In [75]:
df_amostra_sistematica.shape

(100, 15)

In [76]:
df_amostra_sistematica

Unnamed: 0,age,workclass,final-weight,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loos,hour-per-week,native-country,income
68,49,Self-emp-inc,191681,Some-college,10,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,50,United-States,>50K
393,34,State-gov,98101,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,7688,0,45,?,>50K
718,22,Private,214399,Some-college,10,Never-married,Sales,Own-child,White,Female,0,0,15,United-States,<=50K
1043,44,Private,167005,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,7688,0,60,United-States,>50K
1368,52,Private,152234,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,Asian-Pac-Islander,Male,99999,0,40,Japan,>50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30943,33,Private,48010,Some-college,10,Divorced,Exec-managerial,Unmarried,White,Female,0,0,40,United-States,<=50K
31268,43,Private,306440,Prof-school,15,Never-married,Prof-specialty,Not-in-family,White,Male,0,0,66,France,<=50K
31593,37,Private,171968,Assoc-voc,11,Married-civ-spouse,Adm-clerical,Husband,White,Male,0,0,40,United-States,<=50K
31918,26,Private,154571,Some-college,10,Never-married,Machine-op-inspct,Not-in-family,Asian-Pac-Islander,Male,0,0,45,United-States,>50K


## Amostragem por grupos

In [77]:
def amostragem_agrupamento(dataset, numero_grupos, seed=1):
    intervalo = len(dataset) // numero_grupos

    grupos = []
    id_grupo = 0
    contagem = 0

    for _ in dataset.iterrows():
        grupos.append(id_grupo)
        contagem += 1

        if contagem > intervalo:
            contagem = 0
            id_grupo += 1

    dataset['grupo'] = grupos
    random.seed(seed)
    grupo_selecionado = random.randint(0, numero_grupos)

    return dataset[dataset['grupo'] == grupo_selecionado]

In [78]:
df_amostra_agrupamento = amostragem_agrupamento(ds_census, 100)

In [79]:
df_amostra_agrupamento.shape

(326, 16)

In [80]:
df_amostra_agrupamento['grupo'].value_counts()

grupo
17    326
Name: count, dtype: int64

In [81]:
df_amostra_agrupamento

Unnamed: 0,age,workclass,final-weight,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loos,hour-per-week,native-country,income,grupo
5542,40,Self-emp-inc,169878,Assoc-acdm,12,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,>50K,17
5543,44,Private,296728,Masters,14,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,40,United-States,>50K,17
5544,33,Local-gov,342458,Assoc-acdm,12,Divorced,Protective-serv,Not-in-family,White,Male,0,0,56,United-States,<=50K,17
5545,21,Local-gov,38771,Some-college,10,Never-married,Adm-clerical,Own-child,White,Male,0,0,40,United-States,<=50K,17
5546,35,Self-emp-not-inc,269300,Bachelors,13,Never-married,Other-service,Not-in-family,Black,Female,0,0,60,United-States,<=50K,17
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5863,64,Private,256019,Masters,14,Never-married,Prof-specialty,Not-in-family,White,Female,0,0,35,United-States,<=50K,17
5864,48,Private,348144,Some-college,10,Divorced,Transport-moving,Not-in-family,White,Male,3325,0,53,United-States,<=50K,17
5865,24,Private,190293,Bachelors,13,Never-married,Prof-specialty,Not-in-family,White,Female,0,0,40,United-States,<=50K,17
5866,51,Self-emp-not-inc,25932,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,60,United-States,<=50K,17


In [115]:

random.seed(1)
def amostragem_reservatorio(dataset, amostras):
  tamanho = len(dataset)
  stream = np.arange(tamanho)

 
  
  reservatorio = np.zeros(amostras, dtype=np.int32)

 

  for i in range(amostras):
    reservatorio[i] = stream[i]

 
    
  while i < tamanho:
    j = random.randrange(i + 1)
    if j < amostras:
      reservatorio[j] = stream[i]
    i += 1

  return dataset.iloc[reservatorio]
  

In [116]:
df_amostragem_reservatorio = amostragem_reservatorio(ds_census, 100)
df_amostragem_reservatorio.shape

(100, 16)

In [117]:
df_amostragem_reservatorio.head()

Unnamed: 0,age,workclass,final-weight,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loos,hour-per-week,native-country,income,grupo
29608,41,Self-emp-inc,114580,Prof-school,15,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,2415,55,United-States,>50K,90
21696,37,Federal-gov,329088,HS-grad,9,Married-civ-spouse,Adm-clerical,Husband,Black,Male,0,0,40,United-States,<=50K,66
30676,42,Private,355728,Assoc-voc,11,Never-married,Craft-repair,Not-in-family,White,Male,0,0,44,United-States,<=50K,94
28550,43,Private,110970,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,1902,40,United-States,>50K,87
8768,52,Federal-gov,221532,Bachelors,13,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,45,United-States,>50K,26


In [111]:
df_amostragem_reservatorio['age'].mean()


37.59

In [112]:
df_amostra_aleatoria_simples['age'].mean()

39.41

In [113]:
df_amostra_sistematica['age'].mean()

37.57

In [114]:
df_amostra_agrupamento['age'].mean()

39.23312883435583