## Amostragem de Dados

In [1]:
# Importando os pacotes Pandas e Seaborn
import pandas as pd
import seaborn as sns

In [2]:
# Carregando um DataSet para exemplo
iris = sns.load_dataset('iris')

In [3]:
iris.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   species       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


### Amostragem aleatoria simples

In [4]:
#Especificando amostra a partir de quantidade
df_amostra_simples = iris.sample(n = 10)

In [5]:
df_amostra_simples.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10 entries, 24 to 44
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  10 non-null     float64
 1   sepal_width   10 non-null     float64
 2   petal_length  10 non-null     float64
 3   petal_width   10 non-null     float64
 4   species       10 non-null     object 
dtypes: float64(4), object(1)
memory usage: 480.0+ bytes


In [6]:
df_amostra_simples['species'].value_counts()

setosa        6
virginica     3
versicolor    1
Name: species, dtype: int64

In [None]:
# Espeficicando amostra através do percentual

In [7]:
df_amostra_simples = iris.sample(frac = 0.10)

In [8]:
df_amostra_simples.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15 entries, 114 to 118
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  15 non-null     float64
 1   sepal_width   15 non-null     float64
 2   petal_length  15 non-null     float64
 3   petal_width   15 non-null     float64
 4   species       15 non-null     object 
dtypes: float64(4), object(1)
memory usage: 720.0+ bytes


In [9]:
df_amostra_simples['species'].value_counts()

virginica     6
versicolor    6
setosa        3
Name: species, dtype: int64

### Amostragem sistematica

In [19]:
import numpy as np

In [28]:
semente = np.random.choice(10, 1)

In [29]:
semente

array([7])

In [13]:
#Gerando indice a partir da semente
indices = np.arange(0,100,semente)
indices

array([ 0,  6, 12, 18, 24, 30, 36, 42, 48, 54, 60, 66, 72, 78, 84, 90, 96])

In [14]:
#gerando amostra a partir do indice
amostra = iris.loc[indices,:]
amostra

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
6,4.6,3.4,1.4,0.3,setosa
12,4.8,3.0,1.4,0.1,setosa
18,5.7,3.8,1.7,0.3,setosa
24,4.8,3.4,1.9,0.2,setosa
30,4.8,3.1,1.6,0.2,setosa
36,5.5,3.5,1.3,0.2,setosa
42,4.4,3.2,1.3,0.2,setosa
48,5.3,3.7,1.5,0.2,setosa
54,6.5,2.8,4.6,1.5,versicolor


In [30]:
# Verificando a separação da amostra
amostra['species'].value_counts()

setosa        9
versicolor    8
Name: species, dtype: int64

## Amostragem Estratificada

In [31]:
from sklearn.model_selection import StratifiedShuffleSplit

In [32]:
iris['species'].value_counts()

setosa        50
versicolor    50
virginica     50
Name: species, dtype: int64

In [33]:
split = StratifiedShuffleSplit(test_size = 0.2)

# A função split.split irá colocar os 80% no X e os 20% no Y
for x, y in split.split(iris, iris['species']):
    df_x = iris.iloc[x]
    df_y = iris.iloc[y]
    

In [34]:
#Para visualizar a amostra basta utilizarmos o df_y
df_y['species'].value_counts()

versicolor    10
virginica     10
setosa        10
Name: species, dtype: int64

In [35]:
df_y

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
80,5.5,2.4,3.8,1.1,versicolor
53,5.5,2.3,4.0,1.3,versicolor
130,7.4,2.8,6.1,1.9,virginica
141,6.9,3.1,5.1,2.3,virginica
110,6.5,3.2,5.1,2.0,virginica
13,4.3,3.0,1.1,0.1,setosa
51,6.4,3.2,4.5,1.5,versicolor
76,6.8,2.8,4.8,1.4,versicolor
14,5.8,4.0,1.2,0.2,setosa
21,5.1,3.7,1.5,0.4,setosa
