# The basics of sampling in statistics

## 1.0 - Libraries

In [14]:
import pandas as pd
import numpy as np
from random import randint
from sklearn.model_selection import StratifiedShuffleSplit

## 2.0 - Data Extraction

In [3]:
df = pd.read_csv('../dataset/census.csv')

In [3]:
print(f'Columns and Rows: {df.shape}')
describing = (
    pd.concat([
        df.isna().mean(),
        df.nunique(),
        df.describe(include='all').round().T.iloc[:,2:],
        df.sample(3).T
    ], axis=1)
)
describing.columns = ['%_null', 'unique']+describing.columns[2:].to_list()
describing

Columns and Rows: (32561, 15)


Unnamed: 0,%_null,unique,top,freq,mean,std,min,25%,50%,75%,max,22972,17613,24945
age,0.0,73,,,39.0,14.0,17.0,28.0,37.0,48.0,90.0,47,51,55
workclass,0.0,9,Private,22696.0,,,,,,,,Private,Self-emp-inc,Private
final-weight,0.0,21648,,,189778.0,105550.0,12285.0,117827.0,178356.0,237051.0,1484705.0,112791,258735,239404
education,0.0,16,HS-grad,10501.0,,,,,,,,HS-grad,HS-grad,10th
education-num,0.0,16,,,10.0,3.0,1.0,9.0,10.0,12.0,16.0,9,9,6
marital-status,0.0,7,Married-civ-spouse,14976.0,,,,,,,,Married-civ-spouse,Divorced,Married-civ-spouse
occupation,0.0,15,Prof-specialty,4140.0,,,,,,,,Protective-serv,Protective-serv,Transport-moving
relationship,0.0,6,Husband,13193.0,,,,,,,,Husband,Not-in-family,Husband
race,0.0,5,White,27816.0,,,,,,,,White,White,Black
sex,0.0,2,Male,21790.0,,,,,,,,Male,Male,Male


## 3.0 - Sampling

### 3.1 - Sampling Probability

#### 3.1.1 - Simple random sample

In [129]:
df.sample(100, random_state=1).head()

Unnamed: 0,age,workclass,final-weight,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loos,hour-per-week,native-country,income
9646,62,Self-emp-not-inc,26911,7th-8th,4,Widowed,Other-service,Not-in-family,White,Female,0,0,66,United-States,<=50K
709,18,Private,208103,11th,7,Never-married,Other-service,Other-relative,White,Male,0,0,25,United-States,<=50K
7385,25,Private,102476,Bachelors,13,Never-married,Farming-fishing,Own-child,White,Male,27828,0,50,United-States,>50K
16671,33,Private,511517,HS-grad,9,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,40,United-States,<=50K
21932,36,Private,292570,11th,7,Never-married,Machine-op-inspct,Unmarried,White,Female,0,0,40,United-States,<=50K


In [135]:
def simple_sample(data, sample:int):
    
    srs = data.sample(sample, random_state=1)
    return srs

In [137]:
simple_sample(df, 100).head()

Unnamed: 0,age,workclass,final-weight,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loos,hour-per-week,native-country,income
9646,62,Self-emp-not-inc,26911,7th-8th,4,Widowed,Other-service,Not-in-family,White,Female,0,0,66,United-States,<=50K
709,18,Private,208103,11th,7,Never-married,Other-service,Other-relative,White,Male,0,0,25,United-States,<=50K
7385,25,Private,102476,Bachelors,13,Never-married,Farming-fishing,Own-child,White,Male,27828,0,50,United-States,>50K
16671,33,Private,511517,HS-grad,9,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,40,United-States,<=50K
21932,36,Private,292570,11th,7,Never-married,Machine-op-inspct,Unmarried,White,Female,0,0,40,United-States,<=50K


#### 3.1.2 - Systemic sample

In [148]:
step = len(df)//10
random_a = randint(0, step)
np.arange(random_a, len(df), step)

array([ 3089,  6345,  9601, 12857, 16113, 19369, 22625, 25881, 29137,
       32393])

In [160]:
def systemic_sample(data, sample):

    step = len(data)//sample
    random_a = randint(0, step)
    sample_idx = np.arange(random_a, len(data), step)

    return data.iloc[sample_idx]

In [162]:
ss = systemic_sample(df, 100)
ss.shape

(101, 15)

#### 3.1.3 - Cluster sample

In [35]:
def cluster_sampling(data, num_cluster:int):
    cluster = []
    id_cluster = 1
    count = 0
    sample = len(data)//num_cluster
    for _ in data.iterrows():
        cluster.append(id_cluster)
        count += 1
        if count > sample:
            count = 0
            id_cluster += 1

    data['cluster'] = cluster

    sample_sel = data[data['cluster'] == randint(1, num_cluster)]

    return sample_sel

In [36]:
sample_sel = cluster_sampling(df, 10)
sample_sel.shape, sample_sel['cluster'].value_counts()

((3257, 16),
 3    3257
 Name: cluster, dtype: int64)

#### 3.1.4 - Stratified sample

In [9]:
sss = StratifiedShuffleSplit(test_size=0.10)
for x, y in sss.split(df, df['income']):
    df_x = df.iloc[x]
    df_y = df.iloc[y]

In [12]:
df_x.shape, df_y.shape, df.shape

((29304, 15), (3257, 15), (32561, 15))

In [14]:
df_x['income'].value_counts()/ len(df)

 <=50K    0.683241
 >50K     0.216732
Name: income, dtype: float64

In [15]:
df_y['income'].value_counts()/len(df)

 <=50K    0.075950
 >50K     0.024078
Name: income, dtype: float64

In [18]:
df['income'].value_counts()/len(df)

 <=50K    0.75919
 >50K     0.24081
Name: income, dtype: float64

In [None]:
def stratified_sample(data, test_size:float, target:str):
    sss = StratifiedShuffleSplit(test_size=0.10)
    for x, y in sss.split(df, df[target]):
        df_x = df.iloc[x]
        df_y = df.iloc[y]

#### 3.1.5 - Reserved sample

The sample is select on a stream data while the data is flowing than randomicaly will be selected

In [11]:
from random import randrange

stream = [x for x in range(100)]
reservatorio = [x for x in range(10)]

for i in range(10):
    reservatorio[i] = stream[i]
    
i = 0
while i < 3:
    j = randrange(i + 1)
    if j < 3:
        reservatorio[j] = stream[i]
    i += 1

## 4.0 - Testing sample

### 4.1 - Class and Functions

In [42]:
class SamplingData:

    def __init__(self, data, sample):
        self.data = data
        self.sample = sample
    

    def simple_sample(self):
        
        srs = self.data.sample(self.sample, random_state=1)
        return srs
        
        
    def systemic_sample(self):

        step = len(self.data)//self.sample
        random_a = randint(0, step)
        sample_idx = np.arange(random_a, len(self.data), step)

        return self.data.iloc[sample_idx]
        
        
    def cluster_sampling(self):
        cluster = []
        id_cluster = 1
        count = 0
        num_cluster = len(self.data)//self.sample
        for _ in self.data.iterrows():
            cluster.append(id_cluster)
            count += 1
            if count > self.sample:
                count = 0
                id_cluster += 1
        
        self.data['cluster'] = cluster

        sample_sel = self.data[self.data['cluster'] == randint(1, num_cluster)]

        return sample_sel
        
        
    def stratified_sample(self, test_size:float, target:str):

        sss = StratifiedShuffleSplit(test_size=test_size)

        for _, y in sss.split(self.data, self.data[target]):
            df_y = self.data.iloc[y]
        
        return df_y

In [43]:
df = pd.read_csv('../dataset/credit_data.csv')

In [44]:
sd = SamplingData(df, 200)

In [45]:
sd.simple_sample()['age'].mean()

40.80053166500293

In [46]:
sd.cluster_sampling()['age'].mean()

40.09948042672665

In [47]:
sd.systemic_sample()['age'].mean()

40.0795626997645

In [48]:
sd.stratified_sample(test_size=0.10, target='c#default')['age'].mean()

41.741845767642374

In [49]:
df['age'].mean()

40.80755937840458