# AMOSTRAGEM

In [32]:
import pandas as pd
import random
import numpy as np

from sklearn.model_selection import StratifiedShuffleSplit

In [7]:
# Load the census dataset from a path relative to the notebook and preview the first rows.
df = pd.read_csv('Bases/census.csv')
df.head()

Unnamed: 0,age,workclass,final-weight,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loos,hour-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


## Amostragem Aleatória Simples

In [8]:
# Simple random sample: draw 100 rows uniformly at random, without replacement
# (the pandas default). The fixed random_state makes the draw repeatable on
# Restart & Run All; without it every run returned a different sample.
df_amostra_simples = df.sample(n = 100, random_state = 1)

In [9]:
# Sanity check: the sample should have the 100 requested rows.
df_amostra_simples.shape

(100, 15)

In [10]:
# Preview the sampled rows; the index keeps the row labels of the original df.
df_amostra_simples.head()

Unnamed: 0,age,workclass,final-weight,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loos,hour-per-week,native-country,income
32001,51,Local-gov,108435,Masters,14,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,80,United-States,>50K
8358,42,Private,147099,HS-grad,9,Divorced,Other-service,Unmarried,White,Female,0,0,30,United-States,<=50K
5374,55,Private,157079,Some-college,10,Married-civ-spouse,Protective-serv,Husband,Black,Male,0,0,40,?,>50K
3123,42,Private,54102,Assoc-voc,11,Divorced,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
8616,23,Private,240398,Bachelors,13,Never-married,Sales,Not-in-family,Black,Male,0,0,15,United-States,<=50K


## Amostragem Sistemática

In [11]:
# Size of the full dataset; the row count determines the sampling interval used below.
df.shape

(32561, 15)

In [13]:
# Selecting 100 samples: the sampling interval is the row count integer-divided
# by the number of samples.
# NOTE(review): the trailing comment says the start is between 1 and 325, but the
# next cell draws it with random.randint(0, 325) — confirm the intended range.
len(df) // 100 # a start value between 1 and 325 will be drawn, then positions are "skipped" 325 at a time

325

In [14]:
# Draw the random start offset for the systematic sample; seeding first makes the draw repeatable.
# NOTE(review): random.randint includes both endpoints, so 325 itself can be drawn —
# confirm whether the intended range is one full interval (0..324).
random.seed(10)
random.randint(0, 325)

292

In [17]:
# Row positions of a systematic sample: start at 68 and step by 325 up to len(df).
# NOTE(review): 68 and 325 are hard-coded, and 68 is not the value shown by the
# preceding seeded draw (292) — presumably it came from a different seed; confirm.
np.arange(68, len(df), step = 325)

array([   68,   393,   718,  1043,  1368,  1693,  2018,  2343,  2668,
        2993,  3318,  3643,  3968,  4293,  4618,  4943,  5268,  5593,
        5918,  6243,  6568,  6893,  7218,  7543,  7868,  8193,  8518,
        8843,  9168,  9493,  9818, 10143, 10468, 10793, 11118, 11443,
       11768, 12093, 12418, 12743, 13068, 13393, 13718, 14043, 14368,
       14693, 15018, 15343, 15668, 15993, 16318, 16643, 16968, 17293,
       17618, 17943, 18268, 18593, 18918, 19243, 19568, 19893, 20218,
       20543, 20868, 21193, 21518, 21843, 22168, 22493, 22818, 23143,
       23468, 23793, 24118, 24443, 24768, 25093, 25418, 25743, 26068,
       26393, 26718, 27043, 27368, 27693, 28018, 28343, 28668, 28993,
       29318, 29643, 29968, 30293, 30618, 30943, 31268, 31593, 31918,
       32243])

In [18]:
def amostragem_sistematica(dataset, amostras, seed = 1):
    """Return a systematic sample of ``amostras`` rows from ``dataset``.

    Rows are taken at a fixed interval of ``len(dataset) // amostras``
    positions, starting from a random offset inside the first interval.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Data to sample from; rows are selected by position (``iloc``).
    amostras : int
        Number of rows to return; must be between 1 and ``len(dataset)``.
    seed : int or None, default 1
        Value passed to ``random.seed`` before drawing the start offset. This
        reseeds Python's global ``random`` generator, as the original
        implementation did. Pass ``None`` to leave the generator untouched.

    Returns
    -------
    pandas.DataFrame
        Exactly ``amostras`` rows of ``dataset``, in their original order.

    Raises
    ------
    ValueError
        If ``amostras`` is not between 1 and ``len(dataset)``.
    """
    if amostras <= 0 or amostras > len(dataset):
        raise ValueError("amostras must be between 1 and len(dataset)")

    intervalo = len(dataset) // amostras

    if seed is not None:
        random.seed(seed)

    # The start must fall inside the first interval, i.e. 0 .. intervalo - 1.
    # randint(0, intervalo) also allowed `intervalo` itself, which could leave
    # the sample one row short.
    inicio = random.randrange(intervalo)

    # When len(dataset) is not a multiple of amostras, a small start offset
    # yields one position too many; cap the result at the requested size.
    indices = np.arange(inicio, len(dataset), step = intervalo)[:amostras]

    # Index the dataset that was passed in, not the notebook-level `df`.
    return dataset.iloc[indices]

In [20]:
# Apply the systematic sampler to the full dataset, asking for 100 rows, and check the resulting shape.
df_amostra_sistematica = amostragem_sistematica(df, 100)
df_amostra_sistematica.shape

(100, 15)

In [21]:
# Preview the systematic sample; consecutive index labels should be one interval apart.
df_amostra_sistematica.head()

Unnamed: 0,age,workclass,final-weight,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loos,hour-per-week,native-country,income
68,49,Self-emp-inc,191681,Some-college,10,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,50,United-States,>50K
393,34,State-gov,98101,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,7688,0,45,?,>50K
718,22,Private,214399,Some-college,10,Never-married,Sales,Own-child,White,Female,0,0,15,United-States,<=50K
1043,44,Private,167005,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,7688,0,60,United-States,>50K
1368,52,Private,152234,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,Asian-Pac-Islander,Male,99999,0,40,Japan,>50K


## Amostragem por Grupos

In [22]:
# Rows per group if the dataset were split into 10 groups (true division, so the result may be fractional).
len(df) / 10 # 10 groups

3256.1

In [24]:
# Cluster sampling, step 1: label every row with the id of a contiguous block
# of rows. The block size is len(df) / n_grupos rounded up, so the last block
# absorbs the shortfall and there are never more than n_grupos blocks.
#
# This replaces a row-by-row df.iterrows() counter that hard-coded the block
# size (3256) for this particular dataset; for 32561 rows and 10 groups the
# labels produced are the same (blocks of 3257 rows, last block smaller).
n_grupos = 10
tamanho_grupo = -(-len(df) // n_grupos)  # ceiling division without importing math

# Position // block size gives the block id; tolist() keeps `grupos` a plain
# Python list, as before.
grupos = (np.arange(len(df)) // tamanho_grupo).tolist()

In [26]:
# Distinct group ids that were assigned.
np.unique(grupos)

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [27]:
# Group ids together with the number of rows assigned to each one.
np.unique(grupos, return_counts = True)

(array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
 array([3257, 3257, 3257, 3257, 3257, 3257, 3257, 3257, 3257, 3248]))

In [28]:
# Check there is exactly one group label per row before attaching the labels to df.
np.shape(grupos), df.shape

((32561,), (32561, 15))

In [29]:
# Attach the group label to each row.
# NOTE: this adds a column to df in place, so every later cell that uses df
# (including the stratified split) sees the extra 'grupos' column.
df['grupos'] = grupos

In [30]:
# Pick one of the group ids 0..9 at random (both endpoints included).
# NOTE(review): no seed is set in this cell, so the result depends on the state
# earlier cells left in the global random generator — confirm this is intended.
random.randint(0, 9)

9

In [31]:
# Cluster sample: keep only the rows that belong to the selected group.
# NOTE(review): the group id 9 is hard-coded to match the value shown by the
# previous draw instead of being read from a variable.
df_agrupamento = df[df['grupos'] == 9]
df_agrupamento

Unnamed: 0,age,workclass,final-weight,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loos,hour-per-week,native-country,income,grupos
29313,45,Private,285060,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,50,United-States,>50K,9
29314,28,State-gov,189765,Some-college,10,Separated,Adm-clerical,Unmarried,White,Female,0,0,50,United-States,<=50K,9
29315,23,Private,130905,Bachelors,13,Never-married,Sales,Own-child,White,Female,0,0,40,United-States,<=50K,9
29316,50,Private,146325,HS-grad,9,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,44,United-States,>50K,9
29317,33,Private,102821,12th,8,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,<=50K,9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K,9
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K,9
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K,9
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K,9


## Amostragem Estratificada

In [33]:
# Class distribution of 'income', the column used for stratification below.
df['income'].value_counts()

 <=50K    24720
 >50K      7841
Name: income, dtype: int64

In [36]:
# Share of each income class in the full dataset.
# NOTE(review): 7841 and 24720 are copied by hand from the value_counts() output
# above, so they go stale if the data changes.
7841 / len(df), 24720 / len(df)

(0.2408095574460244, 0.7591904425539756)

In [38]:
# Stratified sampling: partition the rows into 90% / 10% while preserving the
# class proportions of 'income' in both parts.
#
# n_splits = 1: only one partition is needed. The default generates 10 splits,
# and the original loop overwrote df_x / df_y on each pass, keeping only the last.
# random_state: fixes the shuffle so the partition is the same on every run.
split = StratifiedShuffleSplit(n_splits = 1, test_size = 0.1, random_state = 1)

# split() yields (larger-part positions, smaller-part positions); take the single split.
indices_x, indices_y = next(split.split(df, df['income']))
df_x = df.iloc[indices_x]
df_y = df.iloc[indices_y]
    
    

In [41]:
# Sizes of the two partitions; with test_size = 0.1 the second holds about 10% of the rows.
df_x.shape, df_y.shape

((29304, 16), (3257, 16))