In [1]:
# 시험환경 세팅 (코드 변경 X)
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

def exam_data_load(df, target, id_name="", null_name=""):
    if id_name == "":
        df = df.reset_index().rename(columns={"index": "id"})
        id_name = 'id'
    else:
        id_name = id_name
    
    if null_name != "":
        df[df == null_name] = np.nan
    
    X_train, X_test = train_test_split(df, test_size=0.2, random_state=2021)
    
    y_train = X_train[[id_name, target]]
    X_train = X_train.drop(columns=[target])

    
    y_test = X_test[[id_name, target]]
    X_test = X_test.drop(columns=[target])
    return X_train, X_test, y_train, y_test 
    
df = pd.read_csv("../data/adult-census-income/adult.csv")
X_train, X_test, y_train, y_test = exam_data_load(df, target='income', null_name='?')

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((26048, 15), (6513, 15), (26048, 2), (6513, 2))

In [2]:
X_train.head()

Unnamed: 0,id,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country
21851,21851,36,Private,241998,Bachelors,13,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,50,United-States
7632,7632,53,Private,103950,Masters,14,Divorced,Prof-specialty,Not-in-family,White,Female,0,0,40,United-States
27878,27878,19,Private,203061,Some-college,10,Never-married,Tech-support,Not-in-family,White,Female,0,0,25,United-States
14121,14121,20,Private,102607,HS-grad,9,Never-married,Handlers-cleaners,Own-child,White,Male,0,0,30,United-States
32345,32345,54,State-gov,138852,HS-grad,9,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,40,United-States


In [5]:
# 결측치, 이상값 확인

print(X_train.isnull().sum())
print()
print(X_test.isnull().sum())


id                   0
age                  0
workclass         1456
fnlwgt               0
education            0
education.num        0
marital.status       0
occupation        1463
relationship         0
race                 0
sex                  0
capital.gain         0
capital.loss         0
hours.per.week       0
native.country     461
dtype: int64

id                  0
age                 0
workclass         380
fnlwgt              0
education           0
education.num       0
marital.status      0
occupation        380
relationship        0
race                0
sex                 0
capital.gain        0
capital.loss        0
hours.per.week      0
native.country    122
dtype: int64


In [7]:
# workclass, occupation, native.country
print(X_train['workclass'].value_counts())
print()
print(X_train['occupation'].value_counts())
print()
print(X_train['native.country'].value_counts())

Private             18160
Self-emp-not-inc     2049
Local-gov            1648
State-gov            1037
Self-emp-inc          909
Federal-gov           770
Without-pay            12
Never-worked            7
Name: workclass, dtype: int64

Exec-managerial      3323
Prof-specialty       3306
Craft-repair         3296
Adm-clerical         3037
Sales                2898
Other-service        2624
Machine-op-inspct    1584
Transport-moving     1257
Handlers-cleaners    1080
Farming-fishing       786
Tech-support          746
Protective-serv       521
Priv-house-serv       119
Armed-Forces            8
Name: occupation, dtype: int64

United-States                 23381
Mexico                          516
Philippines                     158
Germany                         108
Canada                           88
Puerto-Rico                      87
El-Salvador                      76
India                            73
Cuba                             73
England                          69
Italy

In [12]:
# 결측치가 모두 범주형 변수이므로
# 최빈값으로 대체

X_train['workclass'] = X_train['workclass'].fillna(X_train['workclass'].mode()[0])
X_train['occupation'] = X_train['occupation'].fillna(X_train['occupation'].mode()[0])
X_train['native.country'] = X_train['native.country'].fillna(X_train['native.country'].mode()[0])

X_test['workclass'] = X_test['workclass'].fillna(X_test['workclass'].mode()[0])
X_test['occupation'] = X_test['occupation'].fillna(X_test['occupation'].mode()[0])
X_test['native.country'] = X_test['native.country'].fillna(X_test['native.country'].mode()[0])
print(X_train.isnull().sum())


id                0
age               0
workclass         0
fnlwgt            0
education         0
education.num     0
marital.status    0
occupation        0
relationship      0
race              0
sex               0
capital.gain      0
capital.loss      0
hours.per.week    0
native.country    0
dtype: int64


In [13]:
X_train.head()

Unnamed: 0,id,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country
21851,21851,36,Private,241998,Bachelors,13,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,50,United-States
7632,7632,53,Private,103950,Masters,14,Divorced,Prof-specialty,Not-in-family,White,Female,0,0,40,United-States
27878,27878,19,Private,203061,Some-college,10,Never-married,Tech-support,Not-in-family,White,Female,0,0,25,United-States
14121,14121,20,Private,102607,HS-grad,9,Never-married,Handlers-cleaners,Own-child,White,Male,0,0,30,United-States
32345,32345,54,State-gov,138852,HS-grad,9,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,40,United-States


In [14]:
##### 범주형 변수 인코딩
from sklearn.preprocessing import LabelEncoder

# df.assign(): 새로운 컬럼 할당
all_df = pd.concat([X_train.assign(ind="train"), X_test.assign(ind="test")])
all_df.head()

Unnamed: 0,id,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,ind
21851,21851,36,Private,241998,Bachelors,13,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,50,United-States,train
7632,7632,53,Private,103950,Masters,14,Divorced,Prof-specialty,Not-in-family,White,Female,0,0,40,United-States,train
27878,27878,19,Private,203061,Some-college,10,Never-married,Tech-support,Not-in-family,White,Female,0,0,25,United-States,train
14121,14121,20,Private,102607,HS-grad,9,Never-married,Handlers-cleaners,Own-child,White,Male,0,0,30,United-States,train
32345,32345,54,State-gov,138852,HS-grad,9,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,40,United-States,train


In [16]:
categorical = ['workclass', 'education', 'marital.status', 'occupation', 'relationship', 'race', 'sex', 'native.country']

encoder = LabelEncoder()

all_df[categorical] = all_df[categorical].apply(encoder.fit_transform)
all_df.head()

Unnamed: 0,id,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,ind
21851,21851,36,3,241998,9,13,2,2,0,4,1,0,0,50,38,train
7632,7632,53,3,103950,12,14,0,9,1,4,0,0,0,40,38,train
27878,27878,19,3,203061,15,10,4,12,1,4,0,0,0,25,38,train
14121,14121,20,3,102607,11,9,4,5,3,4,1,0,0,30,38,train
32345,32345,54,6,138852,11,9,2,9,0,4,1,0,0,40,38,train


In [20]:
X_train = all_df[all_df['ind']=='train']
print(X_train.shape)
X_train = X_train.drop('ind', axis=1)
print(X_train.shape)

(26048, 16)
(26048, 15)


In [21]:
X_test = all_df[all_df['ind']=='test']
print(X_test.shape)
X_test = X_test.drop('ind', axis=1)
print(X_test.shape)

(6513, 16)
(6513, 15)


In [22]:
y_train.head()

Unnamed: 0,id,income
21851,21851,>50K
7632,7632,<=50K
27878,27878,<=50K
14121,14121,<=50K
32345,32345,<=50K


In [29]:
# train/test split

from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train['income'], test_size=0.2)
print(X_train.shape, X_val.shape, y_train.shape, y_val.shape)

(20838, 15) (5210, 15) (20838,) (5210,)


In [36]:
# 분류
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(max_depth=3)

rf.fit(X_train, y_train)
pred = rf.predict(X_test)

print('훈련 정확도:', rf.score(X_train, y_train))

y_test['income'] = y_test['income'].replace('>50K', 1)
y_test['income'] = y_test['income'].replace('<=50K', 0)

from sklearn.metrics import accuracy_score
print('테스트 정확도:', accuracy_score(y_test['income'], pred))

훈련 정확도: 0.8246952682599098
테스트 정확도: 0.8248119146322739


In [39]:
result = pd.DataFrame({'id':X_test['id'], 'pred':pred})
result.head()

Unnamed: 0,id,pred
20901,20901,0
14170,14170,0
1776,1776,1
30428,30428,0
8602,8602,0


In [None]:
#result.to_csv('성인인구소득예측.csv', index=False)