In [16]:
import pandas as pd
import numpy as np

In [3]:
df = pd.read_csv('../datasets/income.csv')
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,40,Self-emp-not-inc,223881,Prof-school,15,Married-civ-spouse,Prof-specialty,Husband,White,Male,99999,0,70,United-States,>50K
1,30,Private,149118,HS-grad,9,Divorced,Craft-repair,Not-in-family,White,Female,0,0,40,United-States,<=50K
2,46,Private,109209,Some-college,10,Married-civ-spouse,Adm-clerical,Husband,White,Male,0,0,40,United-States,>50K
3,32,Private,229566,Assoc-voc,11,Married-civ-spouse,Other-service,Husband,White,Male,0,0,60,United-States,>50K
4,54,?,148657,Preschool,1,Married-civ-spouse,?,Wife,White,Female,0,0,40,Mexico,<=50K


In [4]:
original_row_count = len(df)
original_row_count

25000

In [5]:
#Simple sampling

df_sample_simple = df.sample(frac=0.1, random_state=42)
simple_sample_row_count = len(df_sample_simple)
print(simple_sample_row_count)

print(df_sample_simple.head())

2500
       age workclass  fnlwgt     education  education.num      marital.status  \
6868    19   Private  101549  Some-college             10       Never-married   
24016   43   Private  226902       HS-grad              9  Married-civ-spouse   
9668    18   Private  304169          11th              7       Never-married   
13640   32   Private  295589       Masters             14  Married-civ-spouse   
14018   52   Private   68982     Bachelors             13  Married-civ-spouse   

            occupation relationship   race   sex  capital.gain  capital.loss  \
6868     Other-service    Own-child  White  Male             0             0   
24016            Sales      Husband  White  Male             0             0   
9668             Sales    Own-child  White  Male             0             0   
13640  Exec-managerial      Husband  Black  Male             0          1977   
14018  Exec-managerial      Husband  White  Male             0             0   

       hours.per.week nativ

In [6]:
#Systematic Sampling 

systematic_df = df.iloc[::10, :]
systematic_row_count = len(systematic_df)
systematic_row_count

2500

In [7]:
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,40,Self-emp-not-inc,223881,Prof-school,15,Married-civ-spouse,Prof-specialty,Husband,White,Male,99999,0,70,United-States,>50K
1,30,Private,149118,HS-grad,9,Divorced,Craft-repair,Not-in-family,White,Female,0,0,40,United-States,<=50K
2,46,Private,109209,Some-college,10,Married-civ-spouse,Adm-clerical,Husband,White,Male,0,0,40,United-States,>50K
3,32,Private,229566,Assoc-voc,11,Married-civ-spouse,Other-service,Husband,White,Male,0,0,60,United-States,>50K
4,54,?,148657,Preschool,1,Married-civ-spouse,?,Wife,White,Female,0,0,40,Mexico,<=50K


Stratified sampling

In [None]:
#Stratified sampling

print(df['race'].unique())

total_white = df[df['race'] == 'White']['race'].count()
print(total_white)

total_black = df[df['race'] == 'Black']['race'].count()
print(total_black)

total_asian = df[df['race'] == 'Asian-Pac-Islander']['race'].count()
print(total_asian)

total_other = df[df['race'] == 'Other']['race'].count()
print(total_other)

total_amer = df[df['race'] == 'Amer-Indian-Eskimo']['race'].count()
print(total_amer)

['White' 'Black' 'Asian-Pac-Islander' 'Other' 'Amer-Indian-Eskimo']
21328
2412
799
209
252
2497


In [None]:
sample_white = int(total_white * 0.1)
sample_black = int(total_black * 0.1)
sample_asian = int(total_asian * 0.1)
sample_amer = int(total_amer * 0.1)
sample_other = int(total_other * 0.1)

print(sample_amer + sample_asian + sample_black + sample_other + sample_white)

In [11]:
##all records of other and amer will be considered as they are less than the ideall number of records to be taken from a stata, 
# and remaining leftover records will get distributed among the other stratas

remaining_rec = 2500 - (total_other + total_amer)
print(remaining_rec)
records_to_be_strata = remaining_rec // 3
print(records_to_be_strata)

2039
679


In [12]:
df_other_sample = df[df['race'] == 'Other'].sample(n = total_other, random_state=42)
df_amer_sample = df[df['race'] == 'Amer-Indian-Eskimo'].sample(n = total_amer, random_state=42)
df_aisan_sample = df[df['race'] == 'Asian-Pac-Islander'].sample(n = records_to_be_strata, random_state=42)
df_black_sample = df[df['race'] == 'Black'].sample(n = records_to_be_strata, random_state=42)
df_white_sample = df[df['race'] == 'White'].sample(n = records_to_be_strata, random_state=42)

strata_df = pd.concat([df_white_sample, df_black_sample,df_aisan_sample,df_amer_sample, df_other_sample])
print(len(strata_df))


2498


# Cluster Sampling

In [15]:
unique_marital_status = df['marital.status'].unique()
num_marital_status = len(unique_marital_status)
print(unique_marital_status)
print(num_marital_status)

['Married-civ-spouse' 'Divorced' 'Widowed' 'Never-married' 'Separated'
 'Married-spouse-absent' 'Married-AF-spouse']
7


In [17]:
np.random.seed(42)
selected_marital = np.random.choice(unique_marital_status, size = num_marital_status//2, replace=False)
print(selected_marital)

['Married-civ-spouse' 'Divorced' 'Married-spouse-absent']


In [19]:
cluster_df = df[df['marital.status'].isin(selected_marital)]
cluster_row_count = len(cluster_df)
print(cluster_df)
print(cluster_row_count)

       age         workclass  fnlwgt     education  education.num  \
0       40  Self-emp-not-inc  223881   Prof-school             15   
1       30           Private  149118       HS-grad              9   
2       46           Private  109209  Some-college             10   
3       32           Private  229566     Assoc-voc             11   
4       54                 ?  148657     Preschool              1   
...    ...               ...     ...           ...            ...   
24988   60           Private   83861       HS-grad              9   
24990   40           Private   77357   Prof-school             15   
24994   70           Private  278139       HS-grad              9   
24997   38       Federal-gov  190895     Bachelors             13   
24999   60           Private   88055          10th              6   

           marital.status         occupation   relationship   race     sex  \
0      Married-civ-spouse     Prof-specialty        Husband  White    Male   
1              