In [1]:
import csv
import pandas as pd
from random import sample 

df = pd.read_csv('train.csv')
df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [2]:
#simple random sampling : random picks
no_of_elements = 10 
random_index = sample(range(len(df)), no_of_elements) #df.shape[0]=len(df)
print(random_index)
print(df.iloc[random_index])

[123, 361, 90, 12, 43, 609, 548, 418, 457, 64]
      Loan_ID  Gender Married Dependents Education Self_Employed  \
123  LP001432    Male     Yes          2  Graduate            No   
361  LP002170    Male     Yes          2  Graduate            No   
90   LP001316    Male     Yes          0  Graduate            No   
12   LP001028    Male     Yes          2  Graduate            No   
43   LP001131    Male     Yes          0  Graduate            No   
609  LP002978  Female      No          0  Graduate            No   
548  LP002776  Female      No          0  Graduate            No   
418  LP002345    Male     Yes          0  Graduate            No   
457  LP002467    Male     Yes          0  Graduate            No   
64   LP001222  Female      No          0  Graduate            No   

     ApplicantIncome  CoapplicantIncome  LoanAmount  Loan_Amount_Term  \
123             2957                0.0        81.0             360.0   
361             5000             3667.0       236.0       

In [3]:
#systematic sampling : every kth element
Kth = 10
index = [i for i in range(df.shape[0]) if i%Kth==0]
print(df.iloc[index])

      Loan_ID  Gender Married Dependents     Education Self_Employed  \
0    LP001002    Male      No          0      Graduate            No   
10   LP001024    Male     Yes          2      Graduate            No   
20   LP001043    Male     Yes          0  Not Graduate            No   
30   LP001091    Male     Yes          1      Graduate           NaN   
40   LP001119    Male      No          0      Graduate            No   
..        ...     ...     ...        ...           ...           ...   
570  LP002842    Male     Yes          1      Graduate            No   
580  LP002892    Male     Yes          2      Graduate            No   
590  LP002928    Male     Yes          0      Graduate            No   
600  LP002949  Female      No         3+      Graduate           NaN   
610  LP002979    Male     Yes         3+      Graduate            No   

     ApplicantIncome  CoapplicantIncome  LoanAmount  Loan_Amount_Term  \
0               5849                0.0         NaN           

In [4]:
#stratified sampling : equal no.of samples from each stratum
#stratas formed based on Education
no_of_elements = 4        #number of elements in each strata
unique = list(set(df['Education']))
print('stratas :', unique, '\n')

index_set = [sample(list(df.index[df['Education']==i]), no_of_elements) for i in unique]
print("index set :",index_set)

index = [j for i in index_set for j in i]
print("indexs :",index)
print(df.iloc[index])

stratas : ['Not Graduate', 'Graduate'] 

index set : [[442, 190, 197, 139], [319, 383, 562, 72]]
indexs : [442, 190, 197, 139, 319, 383, 562, 72]
      Loan_ID  Gender Married Dependents     Education Self_Employed  \
442  LP002418    Male      No         3+  Not Graduate            No   
190  LP001653    Male      No          0  Not Graduate            No   
197  LP001669  Female      No          0  Not Graduate            No   
139  LP001493    Male     Yes          2  Not Graduate            No   
319  LP002050    Male     Yes          1      Graduate           Yes   
383  LP002234    Male      No          0      Graduate           Yes   
562  LP002820    Male     Yes          0      Graduate            No   
72   LP001248    Male      No          0      Graduate            No   

     ApplicantIncome  CoapplicantIncome  LoanAmount  Loan_Amount_Term  \
442             4707             1993.0       148.0             360.0   
190             4885                0.0        48.0        

In [5]:
#cluster sampling : find clusters and sample whole clusters
#clusters formed based on number of Dependents
no_of_clusters = 3
unique = list(set(df['Dependents']))
print("actual clusters :",unique)
smp = sample(unique, no_of_clusters)
print("sampled clusters :",smp)
print("clusters :", smp, "selected out of :", unique, '\n')
index_set = [list(df.index[df['Dependents']==i]) for i in smp]
print("index set :",index_set)
index = [j for i in index_set for j in i]
print("indexs :",index)
print(df.iloc[index])

actual clusters : ['1', '0', nan, '3+', '2']
sampled clusters : ['1', '2', '0']
clusters : ['1', '2', '0'] selected out of : ['1', '0', nan, '3+', '2'] 

index set : [[1, 9, 16, 21, 24, 30, 32, 46, 54, 58, 63, 67, 77, 81, 84, 105, 113, 115, 117, 128, 147, 156, 157, 162, 178, 180, 183, 186, 195, 200, 203, 212, 230, 234, 235, 238, 239, 241, 247, 248, 253, 260, 262, 269, 275, 280, 296, 297, 299, 303, 315, 318, 319, 325, 330, 334, 336, 342, 357, 360, 368, 373, 385, 408, 410, 416, 423, 426, 430, 443, 445, 449, 463, 470, 471, 478, 485, 487, 491, 495, 498, 503, 509, 511, 527, 528, 533, 540, 542, 543, 551, 552, 555, 561, 570, 578, 583, 584, 585, 604, 606, 611], [5, 8, 10, 11, 12, 14, 23, 27, 29, 53, 55, 59, 71, 82, 86, 91, 92, 100, 106, 108, 123, 137, 139, 140, 144, 146, 153, 163, 169, 176, 201, 204, 218, 219, 221, 227, 240, 243, 246, 251, 266, 274, 282, 291, 292, 309, 313, 316, 322, 337, 344, 347, 356, 361, 371, 372, 379, 384, 393, 394, 395, 400, 417, 427, 446, 448, 450, 451, 455, 458, 468, 4