# Data sampling

In [13]:
import pandas as pd
import numpy as np


# Loading dataset 

In [14]:
# Read the CSV into a DataFrame; the last expression shows its
# (rows, columns) shape.
dataset = pd.read_csv('Dataset.csv')
dataset.shape


(1460, 3)

In [15]:
# Summary statistics (count, mean, std, min, quartiles, max) per numeric column.
dataset.describe()


Unnamed: 0,Id,LotArea,SalePrice
count,1460.0,1460.0,1460.0
mean,729.5,10516.828082,180921.19589
std,421.610009,9981.264932,79442.502883
min,0.0,1300.0,34900.0
25%,364.75,7553.5,129975.0
50%,729.5,9478.5,163000.0
75%,1094.25,11601.5,214000.0
max,1459.0,215245.0,755000.0


In [16]:
# Preview the first five rows.
dataset.head(5)

Unnamed: 0,Id,LotArea,SalePrice
0,0,8450,208500
1,1,9600,181500
2,2,11250,223500
3,3,9550,140000
4,4,14260,250000


# Simple random sampling

In [None]:
# Simple random sampling draws units at random from the population,
# so every unit has the same chance of being selected.

In [17]:
# Draw 5 rows at random from the 1460-row dataset and report the mean
# sale price of that sample.
# random_state pins the draw so that Restart Kernel -> Run All reproduces
# the same five rows (and therefore the same mean) every time.

simpleRandomSample = dataset.sample(n=5, random_state=42).sort_values(by='Id')
mean_simpleRandomSample = round(simpleRandomSample['SalePrice'].mean(), 3)
print("Mean of SimpleRandomSample:", mean_simpleRandomSample)


Mean of SimpleRandomSample: 198720.0


In [18]:
# Show the sampled rows (sorted by Id in the previous cell).
simpleRandomSample

Unnamed: 0,Id,LotArea,SalePrice
173,173,10197,163000
231,231,15138,403000
835,835,9600,128000
995,995,4712,121600
1198,1198,9100,178000


# Systematic Sampling

In [None]:
# Systematic sampling selects every k-th unit, i.e. units at a fixed sampling interval k.

In [20]:
# Build the row positions for a systematic sample:
# start at position 0 and take every 3rd row (sampling interval = 3).


index = np.arange(0,len(dataset),step=3)
index

array([   0,    3,    6,    9,   12,   15,   18,   21,   24,   27,   30,
         33,   36,   39,   42,   45,   48,   51,   54,   57,   60,   63,
         66,   69,   72,   75,   78,   81,   84,   87,   90,   93,   96,
         99,  102,  105,  108,  111,  114,  117,  120,  123,  126,  129,
        132,  135,  138,  141,  144,  147,  150,  153,  156,  159,  162,
        165,  168,  171,  174,  177,  180,  183,  186,  189,  192,  195,
        198,  201,  204,  207,  210,  213,  216,  219,  222,  225,  228,
        231,  234,  237,  240,  243,  246,  249,  252,  255,  258,  261,
        264,  267,  270,  273,  276,  279,  282,  285,  288,  291,  294,
        297,  300,  303,  306,  309,  312,  315,  318,  321,  324,  327,
        330,  333,  336,  339,  342,  345,  348,  351,  354,  357,  360,
        363,  366,  369,  372,  375,  378,  381,  384,  387,  390,  393,
        396,  399,  402,  405,  408,  411,  414,  417,  420,  423,  426,
        429,  432,  435,  438,  441,  444,  447,  4

In [21]:
# Select the rows at the systematic positions computed above
# (positional lookup via iloc) and display them.
sys_sample = dataset.iloc[index]
sys_sample

Unnamed: 0,Id,LotArea,SalePrice
0,0,8450,208500
3,3,9550,140000
6,6,10084,307000
9,9,7420,118000
12,12,12968,144000
...,...,...,...
1446,1446,26142,157900
1449,1449,1533,92000
1452,1452,3675,145000
1455,1455,7917,175000


In [23]:
# Mean sale price of the systematic sample, rounded to 3 decimals.
mean_syssample = round(sys_sample['SalePrice'].mean(),3)
mean_syssample

178464.862

# Cluster Sampling

In [26]:
# Cluster sampling divides the population into n clusters of equal size
# and then selects whole clusters according to a fixed rule.
# Example: keep only the even-numbered (or only the odd-numbered) clusters.

In [30]:
# number of clusters
n = 5

# Assign each row to one of n contiguous clusters (ids 1..n) by row position.
# Integer division keeps the cluster size an int (np.repeat with the float
# len(dataset)/n is rejected by current numpy), and np.minimum folds any
# remainder rows into the last cluster so the column always has exactly
# len(dataset) entries, even when the length is not divisible by n.
cluster_size = max(len(dataset) // n, 1)
row_positions = np.arange(len(dataset))
dataset['cluster_id'] = np.minimum(row_positions // cluster_size, n - 1) + 1

# Keep only the even-numbered clusters (2, 4, ...). A vectorised boolean mask
# replaces the row-by-row Python loop; `index` is still a list of row positions.
in_even_cluster = (dataset['cluster_id'] % 2 == 0).to_numpy()
index = np.flatnonzero(in_even_cluster).tolist()

cluster_sample = dataset.iloc[index]
mean_clustersample = round(cluster_sample['SalePrice'].mean(), 3)
print("Mean of clustersample :", mean_clustersample)


Mean of clustersample : 178023.509


In [31]:
# Show the rows belonging to the selected (even-numbered) clusters.
cluster_sample

Unnamed: 0,Id,LotArea,SalePrice,cluster_id
292,292,11409,131000,2
293,293,16659,235000,2
294,294,9600,167000,2
295,295,7937,142500,2
296,296,13710,152000,2
...,...,...,...,...
1163,1163,12900,108959,4
1164,1164,16157,194000,4
1165,1165,9541,233170,4
1166,1166,10475,245350,4


# Stratified random sampling - Adding subgroup which is also known as strata


In [33]:
# Stratified sampling divides the population into groups (strata)
# and then draws a random sample from within each group.

In [35]:
# Load the data into a separate frame `d` and label every row with a stratum.
d = pd.read_csv('Dataset.csv')

# First half of the rows -> stratum 1, remaining rows -> stratum 2.
# Comparing row positions against an integer midpoint avoids passing a float
# repeat count to np.repeat and also works when len(d) is odd.
half = len(d) // 2
d['strata'] = np.where(np.arange(len(d)) < half, 1, 2)

# NOTE(review): no sampling happens in this cell - every row position of `d`
# is selected, so the result is the full labelled frame. The positions are
# derived from len(d) (the frame being indexed), not len(dataset).
index = list(range(len(d)))
stratifiedrandomsample = d.iloc[index]
stratifiedrandomsample

Unnamed: 0,Id,LotArea,SalePrice,strata
0,0,8450,208500,1
1,1,9600,181500,1
2,2,11250,223500,1
3,3,9550,140000,1
4,4,14260,250000,1
...,...,...,...,...
1455,1455,7917,175000,2
1456,1456,13175,210000,2
1457,1457,9042,266500,2
1458,1458,9717,142125,2


# Stratified Random sampling 

In [38]:
from sklearn.model_selection import StratifiedShuffleSplit as sss

# One stratified shuffle split whose "test" side holds 8 rows in total;
# stratifying on d['strata'] keeps the strata proportions in those 8 rows.
# random_state pins the shuffle so re-running the notebook yields the same rows.
split = sss(n_splits=1, test_size=8, random_state=42)

# n_splits=1 yields exactly one (train, test) pair of row positions, so take
# it directly instead of looping; only the test positions form the sample.
train_positions, sample_positions = next(split.split(d, d['strata']))
stratifiedRandomsample = d.iloc[sample_positions].sort_values(by='SalePrice')

stratifiedRandomsample

Unnamed: 0,Id,LotArea,SalePrice,strata
1131,1131,10712,93500,2
709,709,7162,109900,1
1030,1030,7082,160000,2
893,893,13284,165000,2
248,248,11317,180000,1
334,334,9042,192000,1
793,793,9158,225000,2
159,159,19378,320000,1
