# Different probabilistic sampling methods

#### By Carolina Lopes

Modified and adapted from Udemy course 'Estatística para Ciência de Dados e Machine Learning'

Link: https://www.udemy.com/course/estatistica-para-ciencia-de-dados-machine-learning/

### Loading python libraries

In [1]:
import pandas as pd
import numpy as np
import random
from sklearn.model_selection import StratifiedShuffleSplit

### Loading our file

In [2]:
data = pd.read_csv('/Users/csergilo/Desktop/udemy-stats/databases/credit_data.csv')
data.head()

Unnamed: 0,i#clientid,income,age,loan,c#default
0,1,66155.925095,59.017015,8106.532131,0
1,2,34415.153966,48.117153,6564.745018,0
2,3,57317.170063,63.108049,8020.953296,0
3,4,42709.534201,45.751972,6103.64226,0
4,5,66952.688845,18.584336,8770.099235,1


In [3]:
data.shape

(2000, 5)

## Simple Random Sampling

In [4]:
def simplerandom_sampling(dataset, n_samples):
  return dataset.sample(n = n_samples, random_state=1)

In [5]:
df_sr = simplerandom_sampling(data, 1000)
df_sr.head()

Unnamed: 0,i#clientid,income,age,loan,c#default
674,675,34158.633968,29.421142,2911.408067,0
1699,1700,25789.742025,45.316211,4442.33178,0
1282,1283,59589.064289,20.609764,4191.715856,0
1315,1316,49908.291867,29.55094,2903.036128,0
1210,1211,69132.462579,33.471182,7621.410219,0


## Systematic Sampling

In [6]:
def systematic_sampling(dataset, n_samples):
    
    random.seed(1)
    interval = len(dataset) // n_samples
    beginning = random.randint(0, interval)
    indices = np.arange(beginning, len(dataset), step = interval)
    
    return dataset.iloc[indices]

In [7]:
df_sys = systematic_sampling(data, 1000)
df_sys.head()

Unnamed: 0,i#clientid,income,age,loan,c#default
0,1,66155.925095,59.017015,8106.532131,0
2,3,57317.170063,63.108049,8020.953296,0
4,5,66952.688845,18.584336,8770.099235,1
6,7,48430.359613,26.809132,5722.581981,0
8,9,40654.892537,55.496853,4755.82528,0


## Cluster Sampling

In [8]:
def cluster_sampling(dataset, n_groups):
    
    dataset['group'] = -999

    step = len(dataset)/n_groups

    for i in range(0,n_groups):

        dataset['group'] = np.where((i*step <= dataset.index) & (dataset.index < (i+1)*step), int(i), dataset['group'])
        
    selected = random.randint(0, n_groups-1) # this function includes the end points

    print('Selected group is: ', selected)
    
    return dataset[dataset['group']==selected]

In [9]:
df_clus = cluster_sampling(data, 2)
df_clus.head()

Selected group is:  0


Unnamed: 0,i#clientid,income,age,loan,c#default,group
0,1,66155.925095,59.017015,8106.532131,0,0
1,2,34415.153966,48.117153,6564.745018,0,0
2,3,57317.170063,63.108049,8020.953296,0,0
3,4,42709.534201,45.751972,6103.64226,0,0
4,5,66952.688845,18.584336,8770.099235,1,0


## Stratified Sampling

In [10]:
def stratified_sampling(dataset, percentage):
    
    split = StratifiedShuffleSplit(test_size=percentage, random_state=1)
    
    for _, y in split.split(dataset, dataset['c#default']):
        df_y = dataset.iloc[y]
        
    return df_y

In [11]:
df_strat = stratified_sampling(data, 0.5)
df_strat.head()

Unnamed: 0,i#clientid,income,age,loan,c#default,group
1374,1375,35916.704154,53.540443,6401.189486,0,1
253,254,25259.401631,39.739766,4341.008082,0,0
1149,1150,56317.08282,24.653482,8045.440953,1,1
1453,1454,28630.009508,27.29153,4406.995056,1,1
1929,1930,27514.088473,36.278684,192.144611,0,1


## Reservoir Sampling

In [12]:
def reservoir_sampling(dataset, n_samples):
    
    stream = []
    stream = list(range(0,len(data)))

    i = 0
    size = len(data)

    reservoir = [0] * n_samples

    # for i in range(n_samples):

    #     reservoir[i] = stream[i]

    while i < size:

        j = random.randrange(i + 1)

        if j < n_samples:

            reservoir[j] = stream[i]

        i += 1

    return(data.iloc[reservoir])
  

In [13]:
df_res = reservoir_sampling(data, 1000)
df_res.head()

Unnamed: 0,i#clientid,income,age,loan,c#default,group
677,678,42236.456093,24.686733,4749.068675,0,0
1582,1583,66393.711155,58.612272,9540.416626,0,1
251,252,34796.00356,58.4879,443.666538,0,0
713,714,29736.310504,35.298398,657.048409,0,0
626,627,36892.716219,54.448425,6463.647751,0,0


## Comparison table

In [18]:
my_dict = {'Simple Random': [round(df_sr['age'].mean(),2), round(df_sr['income'].mean(),2), round(df_sr['loan'].mean(),2)],
           'Systematic': [round(df_sys['age'].mean(),2), round(df_sys['income'].mean(),2), round(df_sys['loan'].mean(),2)],
           'Cluster': [round(df_clus['age'].mean(),2), round(df_clus['income'].mean(),2), round(df_clus['loan'].mean(),2)],
           'Stratified': [round(df_strat['age'].mean(),2), round(df_strat['income'].mean(),2), round(df_strat['loan'].mean(),2)],
           'Reservoir': [round(df_res['age'].mean(),2), round(df_res['income'].mean(),2), round(df_res['loan'].mean(),2)]
          }
    
df_comparison = pd.DataFrame(my_dict) 
    

In [24]:
new_indices = pd.Series(['age mean', 'income mean', 'loan mean'])

df_comparison.set_index(new_indices, inplace=True)

df_comparison.head()