# The basics of sampling in statistics

## 1.0 - Libraries

In [2]:
import pandas as pd
import numpy as np
from random import randint

## 2.0 - Data Extraction

In [None]:
df = pd.read_csv('../dataset/census.csv')

In [77]:
print(f'Columns and Rows: {df.shape}')
describing = (
    pd.concat([
        df.isna().mean(),
        df.nunique(),
        df.describe(include='all').round().T.iloc[:,2:],
        df.sample(3).T
    ], axis=1)
)
describing.columns = ['%_null', 'unique']+describing.columns[2:].to_list()
describing

Columns and Rows: (32561, 15)


Unnamed: 0,%_null,unique,top,freq,mean,std,min,25%,50%,75%,max,20767,18822,12585
age,0.0,73,,,39.0,14.0,17.0,28.0,37.0,48.0,90.0,29,38,20
workclass,0.0,9,Private,22696.0,,,,,,,,Private,Private,Private
final-weight,0.0,21648,,,189778.0,105550.0,12285.0,117827.0,178356.0,237051.0,1484705.0,203797,52963,296618
education,0.0,16,HS-grad,10501.0,,,,,,,,Some-college,Bachelors,HS-grad
education-num,0.0,16,,,10.0,3.0,1.0,9.0,10.0,12.0,16.0,10,13,9
marital-status,0.0,7,Married-civ-spouse,14976.0,,,,,,,,Married-civ-spouse,Never-married,Never-married
occupation,0.0,15,Prof-specialty,4140.0,,,,,,,,Exec-managerial,Adm-clerical,Sales
relationship,0.0,6,Husband,13193.0,,,,,,,,Husband,Not-in-family,Not-in-family
race,0.0,5,White,27816.0,,,,,,,,Black,White,White
sex,0.0,2,Male,21790.0,,,,,,,,Male,Female,Female


## 3.0 - Sampling

### 3.1 - Sampling Probability

#### 3.1.1 - Simple random sample

In [129]:
df.sample(100, random_state=1).head()

Unnamed: 0,age,workclass,final-weight,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loos,hour-per-week,native-country,income
9646,62,Self-emp-not-inc,26911,7th-8th,4,Widowed,Other-service,Not-in-family,White,Female,0,0,66,United-States,<=50K
709,18,Private,208103,11th,7,Never-married,Other-service,Other-relative,White,Male,0,0,25,United-States,<=50K
7385,25,Private,102476,Bachelors,13,Never-married,Farming-fishing,Own-child,White,Male,27828,0,50,United-States,>50K
16671,33,Private,511517,HS-grad,9,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,40,United-States,<=50K
21932,36,Private,292570,11th,7,Never-married,Machine-op-inspct,Unmarried,White,Female,0,0,40,United-States,<=50K


In [135]:
def simple_sample(data, sample:int):
    
    srs = data.sample(sample, random_state=1)
    return srs

In [137]:
simple_sample(df, 100).head()

Unnamed: 0,age,workclass,final-weight,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loos,hour-per-week,native-country,income
9646,62,Self-emp-not-inc,26911,7th-8th,4,Widowed,Other-service,Not-in-family,White,Female,0,0,66,United-States,<=50K
709,18,Private,208103,11th,7,Never-married,Other-service,Other-relative,White,Male,0,0,25,United-States,<=50K
7385,25,Private,102476,Bachelors,13,Never-married,Farming-fishing,Own-child,White,Male,27828,0,50,United-States,>50K
16671,33,Private,511517,HS-grad,9,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,40,United-States,<=50K
21932,36,Private,292570,11th,7,Never-married,Machine-op-inspct,Unmarried,White,Female,0,0,40,United-States,<=50K


#### 3.1.2 - Systemic sample

In [148]:
step = len(df)//10
random_a = randint(0, step)
np.arange(random_a, len(df), step)

array([ 3089,  6345,  9601, 12857, 16113, 19369, 22625, 25881, 29137,
       32393])

In [160]:
def systemic_sample(data, sample):

    step = len(data)//sample
    random_a = randint(0, step)
    sample_idx = np.arange(random_a, len(data), step)

    return data.iloc[sample_idx]

In [162]:
ss = systemic_sample(df, 100)
ss.shape

(101, 15)

#### 3.1.3 - Cluster sample

#### 3.1.4 - Stratified sample

### 3.2 - No Sampling Probability

#### 3.2.1 - Convenience sample

#### 3.2.2 - Purpose sample

#### 3.2.3 - Quota sample

#### 3.2.4 - Snowball sample