In [1]:
import pandas as pd
import numpy as np

In [2]:
# https://www.kaggle.com/datasets/uciml/adult-census-income
df = pd.read_csv('adult.csv')
df

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,22,Private,310152,Some-college,10,Never-married,Protective-serv,Not-in-family,White,Male,0,0,40,United-States,<=50K
32557,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32558,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32559,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K


# 결측치 탐색
결측치가 어떤식으로 표현되어 있을지 모르기 때문에, 전체적으로 값을 확인
결측치 예시) Nan, Null, NA, None, np.nan, '?' ...

In [4]:
col_list = list(df.columns)
col_list

['age',
 'workclass',
 'fnlwgt',
 'education',
 'education.num',
 'marital.status',
 'occupation',
 'relationship',
 'race',
 'sex',
 'capital.gain',
 'capital.loss',
 'hours.per.week',
 'native.country',
 'income']

In [5]:
for col in col_list:
    print("\n===== "+col+" =====")
    print(df[col].value_counts())


===== age =====
36    898
31    888
34    886
23    877
35    876
     ... 
83      6
88      3
85      3
86      1
87      1
Name: age, Length: 73, dtype: int64

===== workclass =====
Private             22696
Self-emp-not-inc     2541
Local-gov            2093
?                    1836
State-gov            1298
Self-emp-inc         1116
Federal-gov           960
Without-pay            14
Never-worked            7
Name: workclass, dtype: int64

===== fnlwgt =====
164190    13
203488    13
123011    13
113364    12
121124    12
          ..
183522     1
44419      1
442612     1
374833     1
257302     1
Name: fnlwgt, Length: 21648, dtype: int64

===== education =====
HS-grad         10501
Some-college     7291
Bachelors        5355
Masters          1723
Assoc-voc        1382
11th             1175
Assoc-acdm       1067
10th              933
7th-8th           646
Prof-school       576
9th               514
12th              433
Doctorate         413
5th-6th           333
1st-4th       

# 결측치 처리
결측치로 처리하기 위한 값을 간단하게 처리하기 위해, null 값으로 교체  
ex) '?'를 결측치로 처리하기로 했으면 해당 값을 null로 교체

In [7]:
df.isnull().sum()

age               0
workclass         0
fnlwgt            0
education         0
education.num     0
marital.status    0
occupation        0
relationship      0
race              0
sex               0
capital.gain      0
capital.loss      0
hours.per.week    0
native.country    0
income            0
dtype: int64

In [9]:
# 연습을 위해 숫자 10도 결측치로 처리

df_include_null = df.copy()
df_include_null = df_include_null.replace('?', np.NaN)
df_include_null = df_include_null.replace(10, np.NaN)
df_include_null

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,,77053,HS-grad,9.0,Widowed,,Not-in-family,White,Female,0,4356,40.0,United-States,<=50K
1,82,Private,132870,HS-grad,9.0,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18.0,United-States,<=50K
2,66,,186061,Some-college,,Widowed,,Unmarried,Black,Female,0,4356,40.0,United-States,<=50K
3,54,Private,140359,7th-8th,4.0,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40.0,United-States,<=50K
4,41,Private,264663,Some-college,,Separated,Prof-specialty,Own-child,White,Female,0,3900,40.0,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,22,Private,310152,Some-college,,Never-married,Protective-serv,Not-in-family,White,Male,0,0,40.0,United-States,<=50K
32557,27,Private,257302,Assoc-acdm,12.0,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38.0,United-States,<=50K
32558,40,Private,154374,HS-grad,9.0,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40.0,United-States,>50K
32559,58,Private,151910,HS-grad,9.0,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40.0,United-States,<=50K


In [10]:
df_include_null.isnull().sum()

age                  0
workclass         1836
fnlwgt               0
education            0
education.num     7291
marital.status       0
occupation        1843
relationship         0
race                 0
sex                  0
capital.gain         0
capital.loss         0
hours.per.week     278
native.country     583
income               0
dtype: int64

In [11]:
tmp = df_include_null.isnull().sum()
col_names_include_missing_values = list()
for i in range(len(tmp)):
    if tmp.iloc[i] != 0:
        col_names_include_missing_values.append(tmp.index[i])
col_names_include_missing_values

['workclass',
 'education.num',
 'occupation',
 'hours.per.week',
 'native.country']

# 결측치 삭제
### null 결측치가 있는 행(row) 삭제

In [12]:
df_remove_null_row = df_include_null.dropna(axis=0)
df_remove_null_row

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
1,82,Private,132870,HS-grad,9.0,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18.0,United-States,<=50K
3,54,Private,140359,7th-8th,4.0,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40.0,United-States,<=50K
5,34,Private,216864,HS-grad,9.0,Divorced,Other-service,Unmarried,White,Female,0,3770,45.0,United-States,<=50K
6,38,Private,150601,10th,6.0,Separated,Adm-clerical,Unmarried,White,Male,0,3770,40.0,United-States,<=50K
7,74,State-gov,88638,Doctorate,16.0,Never-married,Prof-specialty,Other-relative,White,Female,0,3683,20.0,United-States,>50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32555,53,Private,321865,Masters,14.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,40.0,United-States,>50K
32557,27,Private,257302,Assoc-acdm,12.0,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38.0,United-States,<=50K
32558,40,Private,154374,HS-grad,9.0,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40.0,United-States,>50K
32559,58,Private,151910,HS-grad,9.0,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40.0,United-States,<=50K


In [13]:
df_remove_null_row.isnull().sum()

age               0
workclass         0
fnlwgt            0
education         0
education.num     0
marital.status    0
occupation        0
relationship      0
race              0
sex               0
capital.gain      0
capital.loss      0
hours.per.week    0
native.country    0
income            0
dtype: int64

### null 결측치가 있는 열(column) 삭제

In [14]:
df_remove_null_col = df_include_null.dropna(axis=1)
df_remove_null_col

Unnamed: 0,age,fnlwgt,education,marital.status,relationship,race,sex,capital.gain,capital.loss,income
0,90,77053,HS-grad,Widowed,Not-in-family,White,Female,0,4356,<=50K
1,82,132870,HS-grad,Widowed,Not-in-family,White,Female,0,4356,<=50K
2,66,186061,Some-college,Widowed,Unmarried,Black,Female,0,4356,<=50K
3,54,140359,7th-8th,Divorced,Unmarried,White,Female,0,3900,<=50K
4,41,264663,Some-college,Separated,Own-child,White,Female,0,3900,<=50K
...,...,...,...,...,...,...,...,...,...,...
32556,22,310152,Some-college,Never-married,Not-in-family,White,Male,0,0,<=50K
32557,27,257302,Assoc-acdm,Married-civ-spouse,Wife,White,Female,0,0,<=50K
32558,40,154374,HS-grad,Married-civ-spouse,Husband,White,Male,0,0,>50K
32559,58,151910,HS-grad,Widowed,Unmarried,White,Female,0,0,<=50K


In [15]:
df_remove_null_col.isnull().sum()

age               0
fnlwgt            0
education         0
marital.status    0
relationship      0
race              0
sex               0
capital.gain      0
capital.loss      0
income            0
dtype: int64

### null 결측치 대체하기
여기서는 가장 먼저 나온 값으로 대체하였으며, 대체 방법에 대해서는 아래 내용 참고

In [16]:
value_to_replace = {
    'workclass': 'Private',
    'occupation': 'Exec-managerial',
    'native.country': 'United-States',
    'education.num': 9,
    'hours.per.week': 40
}

df_replace_null = df_include_null.fillna(value = value_to_replace)
df_replace_null

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,Private,77053,HS-grad,9.0,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,40.0,United-States,<=50K
1,82,Private,132870,HS-grad,9.0,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18.0,United-States,<=50K
2,66,Private,186061,Some-college,9.0,Widowed,Exec-managerial,Unmarried,Black,Female,0,4356,40.0,United-States,<=50K
3,54,Private,140359,7th-8th,4.0,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40.0,United-States,<=50K
4,41,Private,264663,Some-college,9.0,Separated,Prof-specialty,Own-child,White,Female,0,3900,40.0,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,22,Private,310152,Some-college,9.0,Never-married,Protective-serv,Not-in-family,White,Male,0,0,40.0,United-States,<=50K
32557,27,Private,257302,Assoc-acdm,12.0,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38.0,United-States,<=50K
32558,40,Private,154374,HS-grad,9.0,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40.0,United-States,>50K
32559,58,Private,151910,HS-grad,9.0,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40.0,United-States,<=50K


In [17]:
df_replace_null.isnull().sum()

age               0
workclass         0
fnlwgt            0
education         0
education.num     0
marital.status    0
occupation        0
relationship      0
race              0
sex               0
capital.gain      0
capital.loss      0
hours.per.week    0
native.country    0
income            0
dtype: int64

# 결측치 대체
### 최빈값으로 대체
분포를 변형 시킬 수 있으며, 결측치가 많을 수록 최빈값이 더 많아지게 됨

In [19]:
df_replace_frequent_val = df_include_null.copy()

for col_name in col_names_include_missing_values:
    df_replace_frequent_val[col_name] = df_replace_frequent_val[col_name].fillna(df_include_null[col_name].value_counts(ascending=False).index[0])

df_replace_frequent_val

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,Private,77053,HS-grad,9.0,Widowed,Prof-specialty,Not-in-family,White,Female,0,4356,40.0,United-States,<=50K
1,82,Private,132870,HS-grad,9.0,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18.0,United-States,<=50K
2,66,Private,186061,Some-college,9.0,Widowed,Prof-specialty,Unmarried,Black,Female,0,4356,40.0,United-States,<=50K
3,54,Private,140359,7th-8th,4.0,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40.0,United-States,<=50K
4,41,Private,264663,Some-college,9.0,Separated,Prof-specialty,Own-child,White,Female,0,3900,40.0,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,22,Private,310152,Some-college,9.0,Never-married,Protective-serv,Not-in-family,White,Male,0,0,40.0,United-States,<=50K
32557,27,Private,257302,Assoc-acdm,12.0,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38.0,United-States,<=50K
32558,40,Private,154374,HS-grad,9.0,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40.0,United-States,>50K
32559,58,Private,151910,HS-grad,9.0,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40.0,United-States,<=50K


In [20]:
df_replace_frequent_val.isnull().sum()

age               0
workclass         0
fnlwgt            0
education         0
education.num     0
marital.status    0
occupation        0
relationship      0
race              0
sex               0
capital.gain      0
capital.loss      0
hours.per.week    0
native.country    0
income            0
dtype: int64

### 평균값으로 대체
(문자형 적용 불가)  
예시로 'education.num'에만 적용

In [21]:
df_replace_mean_val = df_include_null.copy()

df_replace_mean_val['education.num'] = df_replace_mean_val['education.num'].fillna(df_replace_mean_val['education.num'].mean())
df_replace_mean_val

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,,77053,HS-grad,9.000000,Widowed,,Not-in-family,White,Female,0,4356,40.0,United-States,<=50K
1,82,Private,132870,HS-grad,9.000000,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18.0,United-States,<=50K
2,66,,186061,Some-college,10.103957,Widowed,,Unmarried,Black,Female,0,4356,40.0,United-States,<=50K
3,54,Private,140359,7th-8th,4.000000,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40.0,United-States,<=50K
4,41,Private,264663,Some-college,10.103957,Separated,Prof-specialty,Own-child,White,Female,0,3900,40.0,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,22,Private,310152,Some-college,10.103957,Never-married,Protective-serv,Not-in-family,White,Male,0,0,40.0,United-States,<=50K
32557,27,Private,257302,Assoc-acdm,12.000000,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38.0,United-States,<=50K
32558,40,Private,154374,HS-grad,9.000000,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40.0,United-States,>50K
32559,58,Private,151910,HS-grad,9.000000,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40.0,United-States,<=50K


In [22]:
df_replace_mean_val.isnull().sum()

age                  0
workclass         1836
fnlwgt               0
education            0
education.num        0
marital.status       0
occupation        1843
relationship         0
race                 0
sex                  0
capital.gain         0
capital.loss         0
hours.per.week     278
native.country     583
income               0
dtype: int64

### 중앙값으로 대체
(문자형 적용 불가)  
예시로 'education.num'에만 적용 

In [23]:
df_replace_median_val = df_include_null.copy()

df_replace_median_val['education.num'] = df_replace_median_val['education.num'].fillna(df_replace_median_val['education.num'].median())
df_replace_median_val

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,,77053,HS-grad,9.0,Widowed,,Not-in-family,White,Female,0,4356,40.0,United-States,<=50K
1,82,Private,132870,HS-grad,9.0,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18.0,United-States,<=50K
2,66,,186061,Some-college,9.0,Widowed,,Unmarried,Black,Female,0,4356,40.0,United-States,<=50K
3,54,Private,140359,7th-8th,4.0,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40.0,United-States,<=50K
4,41,Private,264663,Some-college,9.0,Separated,Prof-specialty,Own-child,White,Female,0,3900,40.0,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,22,Private,310152,Some-college,9.0,Never-married,Protective-serv,Not-in-family,White,Male,0,0,40.0,United-States,<=50K
32557,27,Private,257302,Assoc-acdm,12.0,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38.0,United-States,<=50K
32558,40,Private,154374,HS-grad,9.0,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40.0,United-States,>50K
32559,58,Private,151910,HS-grad,9.0,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40.0,United-States,<=50K


In [24]:
df_replace_median_val.isnull().sum()

age                  0
workclass         1836
fnlwgt               0
education            0
education.num        0
marital.status       0
occupation        1843
relationship         0
race                 0
sex                  0
capital.gain         0
capital.loss         0
hours.per.week     278
native.country     583
income               0
dtype: int64

### 랜덤으로 추출하여 대체

In [26]:
df_replace_random_val = df_include_null.copy()

for col_name in col_names_include_missing_values:
    extracted_samples = df_replace_random_val[col_name].dropna().sample(df_replace_random_val[col_name].isnull().sum())
    extracted_samples.index = df_replace_random_val[lambda x: x[col_name].isnull()].index
    df_replace_random_val.loc[df_replace_random_val[col_name].isnull(), col_name] = extracted_samples
    
df_replace_random_val

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,Self-emp-not-inc,77053,HS-grad,9.0,Widowed,Machine-op-inspct,Not-in-family,White,Female,0,4356,40.0,United-States,<=50K
1,82,Private,132870,HS-grad,9.0,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18.0,United-States,<=50K
2,66,Private,186061,Some-college,7.0,Widowed,Tech-support,Unmarried,Black,Female,0,4356,40.0,United-States,<=50K
3,54,Private,140359,7th-8th,4.0,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40.0,United-States,<=50K
4,41,Private,264663,Some-college,9.0,Separated,Prof-specialty,Own-child,White,Female,0,3900,40.0,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,22,Private,310152,Some-college,8.0,Never-married,Protective-serv,Not-in-family,White,Male,0,0,40.0,United-States,<=50K
32557,27,Private,257302,Assoc-acdm,12.0,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38.0,United-States,<=50K
32558,40,Private,154374,HS-grad,9.0,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40.0,United-States,>50K
32559,58,Private,151910,HS-grad,9.0,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40.0,United-States,<=50K


In [27]:
df_replace_random_val.isnull().sum()

age               0
workclass         0
fnlwgt            0
education         0
education.num     0
marital.status    0
occupation        0
relationship      0
race              0
sex               0
capital.gain      0
capital.loss      0
hours.per.week    0
native.country    0
income            0
dtype: int64

### ML활용 - KNN
(문자형 적용 불가)  
예시로 'education.num'에만 적용

In [28]:
from sklearn.impute import KNNImputer

In [30]:
df_replace_knn_val = df_include_null.copy()

col_name = 'education.num'
imputer = KNNImputer(n_neighbors=5)
df_replace_knn_val[col_name] = imputer.fit_transform(pd.DataFrame(df_replace_knn_val[col_name]))
df_replace_knn_val

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,,77053,HS-grad,9.000000,Widowed,,Not-in-family,White,Female,0,4356,40.0,United-States,<=50K
1,82,Private,132870,HS-grad,9.000000,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18.0,United-States,<=50K
2,66,,186061,Some-college,10.103957,Widowed,,Unmarried,Black,Female,0,4356,40.0,United-States,<=50K
3,54,Private,140359,7th-8th,4.000000,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40.0,United-States,<=50K
4,41,Private,264663,Some-college,10.103957,Separated,Prof-specialty,Own-child,White,Female,0,3900,40.0,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,22,Private,310152,Some-college,10.103957,Never-married,Protective-serv,Not-in-family,White,Male,0,0,40.0,United-States,<=50K
32557,27,Private,257302,Assoc-acdm,12.000000,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38.0,United-States,<=50K
32558,40,Private,154374,HS-grad,9.000000,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40.0,United-States,>50K
32559,58,Private,151910,HS-grad,9.000000,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40.0,United-States,<=50K


In [31]:
df_replace_knn_val.isnull().sum()

age                  0
workclass         1836
fnlwgt               0
education            0
education.num        0
marital.status       0
occupation        1843
relationship         0
race                 0
sex                  0
capital.gain         0
capital.loss         0
hours.per.week     278
native.country     583
income               0
dtype: int64

# sklearn 활용
sklearn 라이브러리를 통해 위와 같은 처리를 할 수 있음  
: (최빈값 / 평균값 / 중앙값 사용)

In [33]:
from sklearn.impute import SimpleImputer

In [36]:
# 최빈값
df_replace_1 = df_include_null.copy()

col_name = 'education.num'
imputer = SimpleImputer(strategy = 'most_frequent')
df_replace_1[col_name] = imputer.fit_transform(pd.DataFrame(df_replace_1[col_name]))
df_replace_1

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,,77053,HS-grad,9.0,Widowed,,Not-in-family,White,Female,0,4356,40.0,United-States,<=50K
1,82,Private,132870,HS-grad,9.0,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18.0,United-States,<=50K
2,66,,186061,Some-college,9.0,Widowed,,Unmarried,Black,Female,0,4356,40.0,United-States,<=50K
3,54,Private,140359,7th-8th,4.0,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40.0,United-States,<=50K
4,41,Private,264663,Some-college,9.0,Separated,Prof-specialty,Own-child,White,Female,0,3900,40.0,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,22,Private,310152,Some-college,9.0,Never-married,Protective-serv,Not-in-family,White,Male,0,0,40.0,United-States,<=50K
32557,27,Private,257302,Assoc-acdm,12.0,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38.0,United-States,<=50K
32558,40,Private,154374,HS-grad,9.0,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40.0,United-States,>50K
32559,58,Private,151910,HS-grad,9.0,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40.0,United-States,<=50K


In [37]:
# 평균값
df_replace_2 = df_include_null.copy()

col_name = 'education.num'
imputer = SimpleImputer(strategy = 'mean')
df_replace_2[col_name] = imputer.fit_transform(pd.DataFrame(df_replace_2[col_name]))
df_replace_2

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,,77053,HS-grad,9.000000,Widowed,,Not-in-family,White,Female,0,4356,40.0,United-States,<=50K
1,82,Private,132870,HS-grad,9.000000,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18.0,United-States,<=50K
2,66,,186061,Some-college,10.103957,Widowed,,Unmarried,Black,Female,0,4356,40.0,United-States,<=50K
3,54,Private,140359,7th-8th,4.000000,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40.0,United-States,<=50K
4,41,Private,264663,Some-college,10.103957,Separated,Prof-specialty,Own-child,White,Female,0,3900,40.0,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,22,Private,310152,Some-college,10.103957,Never-married,Protective-serv,Not-in-family,White,Male,0,0,40.0,United-States,<=50K
32557,27,Private,257302,Assoc-acdm,12.000000,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38.0,United-States,<=50K
32558,40,Private,154374,HS-grad,9.000000,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40.0,United-States,>50K
32559,58,Private,151910,HS-grad,9.000000,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40.0,United-States,<=50K


In [38]:
# 중앙값
df_replace_3 = df_include_null.copy()

col_name = 'education.num'
imputer = SimpleImputer(strategy = 'median')
df_replace_3[col_name] = imputer.fit_transform(pd.DataFrame(df_replace_3[col_name]))
df_replace_3

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,,77053,HS-grad,9.0,Widowed,,Not-in-family,White,Female,0,4356,40.0,United-States,<=50K
1,82,Private,132870,HS-grad,9.0,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18.0,United-States,<=50K
2,66,,186061,Some-college,9.0,Widowed,,Unmarried,Black,Female,0,4356,40.0,United-States,<=50K
3,54,Private,140359,7th-8th,4.0,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40.0,United-States,<=50K
4,41,Private,264663,Some-college,9.0,Separated,Prof-specialty,Own-child,White,Female,0,3900,40.0,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,22,Private,310152,Some-college,9.0,Never-married,Protective-serv,Not-in-family,White,Male,0,0,40.0,United-States,<=50K
32557,27,Private,257302,Assoc-acdm,12.0,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38.0,United-States,<=50K
32558,40,Private,154374,HS-grad,9.0,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40.0,United-States,>50K
32559,58,Private,151910,HS-grad,9.0,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40.0,United-States,<=50K
