# Different probabilistic sampling methods

#### By Carolina Lopes

Modified and adapted from the Udemy course 'Estatística para Ciência de Dados e Machine Learning'

Link: https://www.udemy.com/course/estatistica-para-ciencia-de-dados-machine-learning/

### Loading Python libraries

In [None]:
import pandas as pd
import numpy as np
import random
from sklearn.model_selection import StratifiedShuffleSplit

### Loading our file

In [None]:
# Read the credit dataset into a DataFrame and preview its first rows.
# NOTE(review): the path is absolute and machine-specific; adjust it when
# running the notebook elsewhere.
csv_path = '/Users/csergilo/Desktop/udemy-stats/databases/credit_data.csv'
df = pd.read_csv(csv_path)
df.head()

In [None]:
# (number of rows, number of columns) of the loaded dataset.
df.shape

## Simple Random Sampling

In [None]:
def simplerandom_sampling(dataset, n_samples):
    """Draw a simple random sample of rows from a DataFrame.

    Args:
        dataset: pandas DataFrame to sample from.
        n_samples: Number of rows to draw.

    Returns:
        DataFrame with ``n_samples`` rows chosen by ``DataFrame.sample``.
        The random state is fixed to 1, so repeated calls on the same
        data return the same rows.
    """
    sampled_rows = dataset.sample(n=n_samples, random_state=1)
    return sampled_rows

In [None]:
# Simple random sample of 1000 rows, followed by a preview.
df_sr = simplerandom_sampling(dataset=df, n_samples=1000)
df_sr.head()

## Systematic Sampling

In [None]:
def systematic_sampling(dataset, n_samples):
    """Return a systematic sample of exactly ``n_samples`` rows.

    Rows are taken at a fixed interval of ``len(dataset) // n_samples``
    positions, starting from a random offset inside the first interval.

    Args:
        dataset: pandas DataFrame to sample from.
        n_samples: Number of rows to return; must be between 1 and
            ``len(dataset)`` (inclusive).

    Returns:
        DataFrame with ``n_samples`` rows of ``dataset``, in their
        original order.

    Raises:
        ValueError: If ``n_samples`` is not between 1 and ``len(dataset)``.
    """
    n_rows = len(dataset)
    if not 1 <= n_samples <= n_rows:
        raise ValueError(
            f"n_samples must be between 1 and {n_rows}, got {n_samples}"
        )

    # Re-seed the module-level generator so the same offset is drawn on
    # every call.
    random.seed(1)
    interval = n_rows // n_samples

    # randrange excludes ``interval`` itself, so the starting position
    # always lies inside the first interval (randint would include it).
    beginning = random.randrange(interval)

    # When n_rows is not a multiple of n_samples the arange yields extra
    # positions; keep only the first n_samples of them.
    indices = np.arange(beginning, n_rows, step=interval)[:n_samples]

    return dataset.iloc[indices]

In [None]:
# Systematic sample of 1000 rows, followed by a preview.
df_sys = systematic_sampling(dataset=df, n_samples=1000)
df_sys.head()

## Cluster Sampling

In [None]:
def cluster_sampling(dataset, n_groups):
    """Split the rows into contiguous groups and return one random group.

    The rows are divided, by position, into ``n_groups`` consecutive
    clusters of (nearly) equal size; one cluster is chosen at random and
    returned. The input DataFrame is left untouched.

    Args:
        dataset: pandas DataFrame to sample from.
        n_groups: Number of clusters to split the rows into (>= 1).

    Returns:
        A new DataFrame holding the rows of the selected cluster, with an
        extra integer ``'group'`` column containing the cluster number.

    Raises:
        ValueError: If ``n_groups`` is smaller than 1.
    """
    if n_groups < 1:
        raise ValueError(f"n_groups must be at least 1, got {n_groups}")

    n_rows = len(dataset)

    # Work on a copy so the caller's DataFrame does not gain a 'group' column.
    clustered = dataset.copy()

    # Use row positions rather than index labels so that any index works.
    positions = np.arange(n_rows)
    if n_rows:
        # Integer form of
        #   i * (n_rows / n_groups) <= position < (i + 1) * (n_rows / n_groups)
        clustered['group'] = positions * n_groups // n_rows
    else:
        clustered['group'] = positions

    selected = random.randint(0, n_groups - 1)  # randint includes both end points

    print('Selected group is: ', selected)

    return clustered[clustered['group'] == selected]

In [None]:
# Cluster sample: split the data into 2 groups and keep one of them.
df_clus = cluster_sampling(dataset=df, n_groups=2)
df_clus.head()

## Stratified Sampling

In [None]:
def stratified_sampling(dataset, percentage, field):
    """Return a stratified sample of ``dataset`` based on ``field``.

    Args:
        dataset: pandas DataFrame to sample from.
        percentage: Passed to ``StratifiedShuffleSplit`` as ``test_size``;
            a float is the fraction of rows to return.
        field: Name of the column whose values define the strata.

    Returns:
        DataFrame with the rows selected as the "test" part of a single
        stratified shuffle split (random state fixed to 1).
    """
    # n_splits=1: only one split is used, so there is no need to generate
    # the default ten and discard all but one.
    split = StratifiedShuffleSplit(
        n_splits=1, test_size=percentage, random_state=1
    )

    # split() yields (train_positions, test_positions); keep the test part.
    _, sample_positions = next(split.split(dataset, dataset[field]))

    return dataset.iloc[sample_positions]

In [None]:
# Stratified sample of half the rows, stratified on the 'c#default' column.
df_strat = stratified_sampling(dataset=df, percentage=0.5, field='c#default')
df_strat.head()

## Reservoir Sampling

In [None]:
def reservoir_sampling(dataset, n_samples):
    """Draw a uniform random sample of rows with reservoir sampling.

    The row positions are processed as a stream: the reservoir starts
    with the first ``n_samples`` positions, and every later position
    ``i`` replaces a random reservoir slot with probability
    ``n_samples / (i + 1)``.

    Args:
        dataset: pandas DataFrame to sample from.
        n_samples: Number of rows to draw. If it exceeds ``len(dataset)``
            every row is returned; if it is not positive the result is
            empty.

    Returns:
        DataFrame with ``min(n_samples, len(dataset))`` distinct rows of
        ``dataset``.
    """
    size = len(dataset)
    capacity = max(0, min(n_samples, size))

    # Fill the reservoir with the first positions of the stream; without
    # this step unfilled slots would all point at row 0.
    reservoir = list(range(capacity))

    for i in range(capacity, size):
        # Pick a slot in [0, i]; only slots inside the reservoir are replaced.
        j = random.randrange(i + 1)
        if j < capacity:
            reservoir[j] = i

    return dataset.iloc[reservoir]
  

In [None]:
# Reservoir sample of 1000 rows, followed by a preview.
df_res = reservoir_sampling(dataset=df, n_samples=1000)
df_res.head()

## Comparison table

In [None]:
# Compare the samples: mean of each numeric column (rounded to 2 decimals)
# for the entire dataset and for every sample drawn from it.
samples = {
    'Entire Dataset': df,
    'Simple Random': df_sr,
    'Systematic': df_sys,
    'Cluster': df_clus,
    'Stratified': df_strat,
    'Reservoir': df_res,
}
columns = ['age', 'income', 'loan']

# One table column per sample, one row per entry of ``columns``.
my_dict = {
    label: [round(frame[column].mean(), 2) for column in columns]
    for label, frame in samples.items()
}

df_comparison = pd.DataFrame(my_dict)

In [None]:
# Replace the default 0..2 row labels with descriptive names, then display
# the comparison table.
new_indices = pd.Series(['age mean', 'income mean', 'loan mean'])
df_comparison.set_index(keys=new_indices, inplace=True)
df_comparison.head()