# 표본추출

In [1]:
import pandas as pd 
import numpy as np

X_value = np.arange(40).reshape(20,2)
y_value = np.arange(20)

sample_df = pd.DataFrame(np.column_stack((X_value, y_value)), columns=['X_1','X_2','result'])

print(sample_df.shape)
sample_df.head()

(20, 3)


Unnamed: 0,X_1,X_2,result
0,0,1,0
1,2,3,1
2,4,5,2
3,6,7,3
4,8,9,4


In [2]:
sample_df.sample(n=9, random_state=1001) # n =  추출할 표본개수, random_state - 아무값이나 적어도 상관없다

Unnamed: 0,X_1,X_2,result
1,2,3,1
15,30,31,15
0,0,1,0
2,4,5,2
18,36,37,18
7,14,15,7
10,20,21,10
6,12,13,6
19,38,39,19


In [3]:
sample_df.sample(frac=0.5, random_state=1001) # 추출할 표본 비율

Unnamed: 0,X_1,X_2,result
1,2,3,1
15,30,31,15
0,0,1,0
2,4,5,2
18,36,37,18
7,14,15,7
10,20,21,10
6,12,13,6
19,38,39,19
4,8,9,4


In [4]:
rep_df = sample_df.sample(frac=0.2, random_state=1001)
rep_df

Unnamed: 0,X_1,X_2,result
1,2,3,1
15,30,31,15
0,0,1,0
2,4,5,2


In [5]:
# 4개 샘플을 복원추출로 10개 만들기
rep_df.sample(n=10,replace=True,random_state=1001)

Unnamed: 0,X_1,X_2,result
15,30,31,15
15,30,31,15
15,30,31,15
0,0,1,0
0,0,1,0
2,4,5,2
1,2,3,1
1,2,3,1
0,0,1,0
2,4,5,2


In [6]:
# weights : 가중치 반영할 필드값
# result 컬럼 기준으로 높은 쪽으로
sample_df.sample(n=5, weights='result')

Unnamed: 0,X_1,X_2,result
12,24,25,12
18,36,37,18
19,38,39,19
16,32,33,16
13,26,27,13


### 계통표본추출

In [7]:
# 샘플 수를 입력받아 간격을 구하여 샘플 추출
def sysmetic_sampling(data,n):
    count = len(data) #모집단 수
    sample_count = count // n
    index = data[:sample_count].sample(1).index
    intoin = index - 0 # 샘플 간 간격 --> 왜 필요한거지?

    
    sys_df = pd.DataFrame()
    while len(sys_df) < n:
        sys_df = sys_df.append(data.loc[index,:])
        index += sample_count
    return (sys_df)

In [8]:
sysmetic_sampling(sample_df,5)

  sys_df = sys_df.append(data.loc[index,:])
  sys_df = sys_df.append(data.loc[index,:])
  sys_df = sys_df.append(data.loc[index,:])
  sys_df = sys_df.append(data.loc[index,:])
  sys_df = sys_df.append(data.loc[index,:])


Unnamed: 0,X_1,X_2,result
0,0,1,0
4,8,9,4
8,16,17,8
12,24,25,12
16,32,33,16


### 층화확률표본추출

In [13]:
# 모집단을 먼저 서로 겹치지 않는 여러개의 층으로 분할, 각 층별로 단순확률표본추출법을 적용시켜 표분을 추출하는 방법

from sklearn.model_selection import StratifiedShuffleSplit

# n_split : 분할 반복횟수,  test_size : 테스트셋 샘플 비율
splitfi = StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=1001)

In [14]:
sample_df.head(3)

Unnamed: 0,X_1,X_2,result,group
0,0,1,0,0
1,2,3,1,0
2,4,5,2,0


In [15]:
group = [0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1]
sample_df['group'] = group
sample_df.head()

Unnamed: 0,X_1,X_2,result,group
0,0,1,0,0
1,2,3,1,0
2,4,5,2,0
3,6,7,3,0
4,8,9,4,0


In [16]:
sample_df['group'].value_counts()

0    10
1    10
Name: group, dtype: int64

In [17]:
splitfi.split(sample_df,sample_df['group'])

<generator object BaseShuffleSplit.split at 0x000001D7E2651D60>

In [18]:
for train_idx, test_idx in splitfi.split(sample_df,sample_df['group']):
    print(type(train_idx))
    print('Train :', train_idx, 'Test :',test_idx)
    df_strat_train = sample_df.loc[train_idx]
    df_strat_test = sample_df.loc[test_idx]

<class 'numpy.ndarray'>
Train : [13  7  1 14 16 12  0 11 10 18  2  8  5  6] Test : [17 19  3 15  4  9]


In [19]:
print("Train data 수 확인")
print(df_strat_train.shape)
print("Test data 수 확인")
print(df_strat_test.shape)


Train data 수 확인
(14, 4)
Test data 수 확인
(6, 4)


In [20]:
# 모집단과 동일 비율로 Group 속성을 기준으로 데이터 분리 확인
print("전체비율")
print(sample_df['group'].value_counts() / len(sample_df))
print('Train data 비율')
print(df_strat_train['group'].value_counts() / len(df_strat_train))
print('Test data 비율')
print(df_strat_test['group'].value_counts() / len(df_strat_test))

전체비율
0    0.5
1    0.5
Name: group, dtype: float64
Train data 비율
1    0.5
0    0.5
Name: group, dtype: float64
Test data 비율
1    0.5
0    0.5
Name: group, dtype: float64
