## **T2-3. Adult Census Income Tutorial**

성인 인구조사 소득 예측
- age: 나이
- workclass: 고용 형태
- fnlwgt: 사람의 대표성을 나타내는 가중치(final weight)
- education: 교육 수준
- education.num: 교육 수준 수치
- marital.status: 결혼 상태
- occupation: 업종
- relationship: 가족 관계
- race: 인종
- sex: 성별
- capital.gain: 양도 소득
- capital.loss: 양도 손실
- hours.per.week: 주당 근무 시간
- native.country: 국적
- income: 수익 (예측해야 하는 값)

In [287]:
# 시험환경 세팅 (코드 변경 X)
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

def exam_data_load(df, target, id_name="", null_name=""):
    if id_name == "":
        df = df.reset_index().rename(columns={"index": "id"})
        id_name = 'id'
    else:
        id_name = id_name

    if null_name != "":
        df[df == null_name] = np.nan

    X_train, X_test = train_test_split(df, test_size=0.2, random_state=2021)

    y_train = X_train[[id_name, target]]
    X_train = X_train.drop(columns=[target])


    y_test = X_test[[id_name, target]]
    X_test = X_test.drop(columns=[target])
    return X_train, X_test, y_train, y_test

df = pd.read_csv('data/adult.csv')
X_train, X_test, y_train, y_test = exam_data_load(df, target='income', null_name='?')

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((26048, 15), (6513, 15), (26048, 2), (6513, 2))

In [288]:
# EDA

print(X_train)
print('----------------------------------------------------------')
# print(X_test)
# print('----------------------------------------------------------')
print(y_train)

print('----------------------------------------------------------')
print(X_train.isna().sum())
print('----------------------------------------------------------')
print(X_test.isna().sum())

          id  age         workclass  fnlwgt     education  education.num  \
21851  21851   36           Private  241998     Bachelors             13   
7632    7632   53           Private  103950       Masters             14   
27878  27878   19           Private  203061  Some-college             10   
14121  14121   20           Private  102607       HS-grad              9   
32345  32345   54         State-gov  138852       HS-grad              9   
...      ...  ...               ...     ...           ...            ...   
2669    2669   45           Private  187370       Masters             14   
17536  17536   36           Private  174308          11th              7   
6201    6201   47           Private  275361    Assoc-acdm             12   
27989  27989   50  Self-emp-not-inc  196504     Doctorate             16   
25716  25716   19           Private  410543       HS-grad              9   

           marital.status         occupation   relationship   race     sex  \
21851  Ma

In [289]:
# feature 구분
numeric_features = ['age', 'fnlwgt', 'education.num', 'capital.gain','capital.loss', 'hours.per.week']
cat_features = ['workclass', 'education', 'marital.status', 'occupation', 'relationship', 'race', 'sex', 'native.country']

In [290]:
# 결측치는 최빈값과 차이가 크면 최빈값으로, 값이 비슷하면 별도의 값으로 대체

X_train['workclass'] = X_train['workclass'].fillna(X_train['workclass'].mode()[0])
X_train['occupation'] = X_train['occupation'].fillna('null')
X_train['native.country'] = X_train['native.country'].fillna(X_train['native.country'].mode()[0])

X_test['workclass'] = X_test['workclass'].fillna(X_test['workclass'].mode()[0])
X_test['occupation'] = X_test['occupation'].fillna('null')
X_test['native.country'] = X_test['native.country'].fillna(X_test['native.country'].mode()[0])

print(X_train.isnull().sum())
print('----------------------------------------------------------')
print(X_test.isnull().sum())

id                0
age               0
workclass         0
fnlwgt            0
education         0
education.num     0
marital.status    0
occupation        0
relationship      0
race              0
sex               0
capital.gain      0
capital.loss      0
hours.per.week    0
native.country    0
dtype: int64
----------------------------------------------------------
id                0
age               0
workclass         0
fnlwgt            0
education         0
education.num     0
marital.status    0
occupation        0
relationship      0
race              0
sex               0
capital.gain      0
capital.loss      0
hours.per.week    0
native.country    0
dtype: int64


In [291]:
# 라벨인코딩
from sklearn.preprocessing import LabelEncoder

all_df = pd.concat([X_train.assign(ind='train'), X_test.assign(ind='test')])
le = LabelEncoder()
all_df[cat_features] = all_df[cat_features].apply(le.fit_transform)

X_train = all_df[all_df['ind'] == 'train']
X_train = X_train.drop('ind', axis=1)

X_test = all_df[all_df['ind'] == 'test']
X_test = X_test.drop('ind', axis=1)

print(X_train)

          id  age  workclass  fnlwgt  education  education.num  \
21851  21851   36          3  241998          9             13   
7632    7632   53          3  103950         12             14   
27878  27878   19          3  203061         15             10   
14121  14121   20          3  102607         11              9   
32345  32345   54          6  138852         11              9   
...      ...  ...        ...     ...        ...            ...   
2669    2669   45          3  187370         12             14   
17536  17536   36          3  174308          1              7   
6201    6201   47          3  275361          7             12   
27989  27989   50          5  196504         10             16   
25716  25716   19          3  410543         11              9   

       marital.status  occupation  relationship  race  sex  capital.gain  \
21851               2           2             0     4    1             0   
7632                0           9             1     4  

In [292]:
# 스케일링
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
X_train[numeric_features] = scaler.fit_transform(X_train[numeric_features])
X_test[numeric_features] = scaler.fit_transform(X_test[numeric_features])

print(X_train)

          id       age  workclass    fnlwgt  education  education.num  \
21851  21851  0.260274          3  0.156011          9       0.800000   
7632    7632  0.493151          3  0.062255         12       0.866667   
27878  27878  0.027397          3  0.129566         15       0.600000   
14121  14121  0.041096          3  0.061343         11       0.533333   
32345  32345  0.506849          6  0.085958         11       0.533333   
...      ...       ...        ...       ...        ...            ...   
2669    2669  0.383562          3  0.118910         12       0.866667   
17536  17536  0.260274          3  0.110039          1       0.400000   
6201    6201  0.410959          3  0.178669          7       0.733333   
27989  27989  0.452055          5  0.125113         10       1.000000   
25716  25716  0.027397          3  0.270479         11       0.533333   

       marital.status  occupation  relationship  race  sex  capital.gain  \
21851               2           2             0

In [293]:
# target 값 변경
y = (y_train['income'] != '<=50K').astype(int)
print(y.head())

21851    1
7632     0
27878    0
14121    0
32345    0
Name: income, dtype: int32


In [294]:
# 검증용 데이터 분리
from sklearn.model_selection import train_test_split

X_tr, X_val, y_tr, y_val = train_test_split(X_train, y, test_size=0.15, random_state=2021)
print(X_tr.shape, X_val.shape, y_tr.shape, y_val.shape)

(22140, 15) (3908, 15) (22140,) (3908,)


In [295]:
# id 삭제
X_tr = X_tr.drop('id', axis=1)
X_val = X_val.drop('id', axis=1)
print(X_tr.head())

            age  workclass    fnlwgt  education  education.num  \
1437   0.191781          3  0.216501          9       0.800000   
7151   0.287671          5  0.127591         11       0.533333   
30296  0.424658          3  0.217452          9       0.800000   
15372  0.452055          3  0.142442         11       0.533333   
13800  0.178082          3  0.187243         15       0.600000   

       marital.status  occupation  relationship  race  sex  capital.gain  \
1437                4           3             1     4    0           0.0   
7151                2           2             0     4    1           0.0   
30296               2           9             0     4    1           0.0   
15372               2           2             0     4    1           0.0   
13800               4           5             1     4    1           0.0   

       capital.loss  hours.per.week  native.country  
1437       0.323232        0.397959              38  
7151       0.000000        0.602041   

In [296]:
# 랜덤포레스트
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

model = RandomForestClassifier(random_state=10)
model.fit(X_tr, y_tr)
pred = model.predict(X_val)
print(accuracy_score(y_val, pred))

0.851842374616172


In [297]:
# test 데이터 예측
X_test_id = X_test.pop('id')
pred = model.predict(X_test)

In [298]:
# csv 생성
output = pd.DataFrame({'id' : X_test_id, 'income' : pred})
output.to_csv('csv/T2-3.csv', index=False)
print(output.head())

          id  income
20901  20901       1
14170  14170       0
1776    1776       1
30428  30428       0
8602    8602       0


## **T2-5 Insurance_Starter (Tutorial)**

보험료(charges) 예측

In [299]:
# 시험환경 세팅 (코드 변경 X)
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

def exam_data_load(df, target, id_name="", null_name=""):
    if id_name == "":
        df = df.reset_index().rename(columns={"index": "id"})
        id_name = 'id'
    else:
        id_name = id_name

    if null_name != "":
        df[df == null_name] = np.nan

    X_train, X_test = train_test_split(df, test_size=0.2, random_state=2021)

    y_train = X_train[[id_name, target]]
    X_train = X_train.drop(columns=[target])


    y_test = X_test[[id_name, target]]
    X_test = X_test.drop(columns=[target])
    return X_train, X_test, y_train, y_test

df = pd.read_csv("data/insurance.csv")
X_train, X_test, y_train, y_test = exam_data_load(df, target='charges', null_name='NA')

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((1070, 7), (268, 7), (1070, 2), (268, 2))

In [300]:
# EDA

print(X_train)
print('--------------------------------------------')
print(y_train)
print('--------------------------------------------')
print(X_train.isna().sum())
print('--------------------------------------------')
print(X_test)

        id  age     sex     bmi  children smoker     region
209    209   40    male  41.230         1     no  northeast
540    540   34  female  38.000         3     no  southwest
747    747   19    male  21.755         0     no  northwest
39      39   60    male  39.900         0    yes  southwest
640    640   33    male  42.400         5     no  southwest
...    ...  ...     ...     ...       ...    ...        ...
44      44   38    male  37.050         1     no  northeast
621    621   37    male  34.100         4    yes  southwest
1152  1152   43  female  32.560         3    yes  southeast
57      57   18    male  31.680         2    yes  southeast
1140  1140   50    male  37.070         1     no  southeast

[1070 rows x 7 columns]
--------------------------------------------
        id      charges
209    209   6610.10970
540    540   6196.44800
747    747   1627.28245
39      39  48173.36100
640    640   6666.24300
...    ...          ...
44      44   6079.67150
621    621  40182.

In [301]:
# 필요없는 칼럼 drop
X_train = X_train.drop(columns=['id'])
X_test_id = X_test.pop('id')

print(X_train.head())
print(X_test.head())

     age     sex     bmi  children smoker     region
209   40    male  41.230         1     no  northeast
540   34  female  38.000         3     no  southwest
747   19    male  21.755         0     no  northwest
39    60    male  39.900         0    yes  southwest
640   33    male  42.400         5     no  southwest
      age     sex     bmi  children smoker     region
1088   52    male  47.740         1     no  southeast
1157   23  female  23.180         2     no  northwest
1267   24    male  31.065         0    yes  northeast
506    22    male  31.350         1     no  northwest
659    57  female  28.785         4     no  northeast


In [302]:
# 라벨인코딩
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
cols = ['sex', 'smoker', 'region']

for col in cols :
    X_train[col] = le.fit_transform(X_train[col])
    X_test[col] = le.fit_transform(X_test[col])

print(X_train.head())
print(X_test.head())

     age  sex     bmi  children  smoker  region
209   40    1  41.230         1       0       0
540   34    0  38.000         3       0       3
747   19    1  21.755         0       0       1
39    60    1  39.900         0       1       3
640   33    1  42.400         5       0       3
      age  sex     bmi  children  smoker  region
1088   52    1  47.740         1       0       2
1157   23    0  23.180         2       0       1
1267   24    1  31.065         0       1       0
506    22    1  31.350         1       0       1
659    57    0  28.785         4       0       0


In [303]:
# 정규화
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
cols = ['age', 'bmi']
X_train[cols] = scaler.fit_transform(X_train[cols])
X_test[cols] = scaler.fit_transform(X_test[cols])

print(X_train.head())
print(X_test.head())

          age  sex       bmi  children  smoker  region
209  0.478261    1  0.679849         1       0       0
540  0.347826    0  0.592951         3       0       3
747  0.021739    1  0.155905         0       0       1
39   0.913043    1  0.644068         0       1       3
640  0.326087    1  0.711326         5       0       3
           age  sex       bmi  children  smoker  region
1088  0.739130    1  0.958451         1       0       2
1157  0.108696    0  0.185395         2       0       1
1267  0.130435    1  0.433585         0       1       0
506   0.086957    1  0.442556         1       0       1
659   0.847826    0  0.361819         4       0       0


In [304]:
# 랜덤포레스트
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor(random_state=10)
model.fit(X_train, y_train['charges'])
print(model.score(X_train, y_train['charges']))

0.9779650798219194


In [305]:
# test 데이터 예측
pred = model.predict(X_test)
pred

array([13099.5769918 ,  6336.0618588 , 34211.290217  ,  2660.9254707 ,
       15678.3878102 ,  4185.9405983 ,  9638.264228  , 15452.2503695 ,
        2367.4574785 , 14801.2738086 , 13602.7756521 , 13291.5651333 ,
       11919.2062834 , 10409.5225731 ,  2645.4505312 , 15686.6100677 ,
       13914.704691  ,  3249.1697145 ,  2570.17918619, 12879.6906823 ,
       10796.9083023 ,  6106.7541333 ,  2996.4313885 , 21764.7527468 ,
        9165.5432279 , 11498.2771033 ,  4501.2424184 ,  6726.5651268 ,
       15995.2294069 ,  4649.632936  ,  5586.7009147 ,  5885.498577  ,
       13508.3360814 , 17361.5087328 ,  7360.3800088 ,  5196.0390818 ,
        4246.1228614 , 43634.4066698 ,  8739.3103044 ,  8361.5702714 ,
       11117.4952509 ,  1612.548028  ,  2388.803905  , 11685.5384656 ,
       11432.0950576 ,  2807.1087616 , 15412.5394297 , 14909.7381508 ,
       44989.6272507 ,  3069.3568085 ,  5894.2494548 ,  4509.9925155 ,
        9639.0823725 , 40215.5647679 ,  1589.888742  , 15006.7937303 ,
      

In [306]:
# csv 출력
output = pd.DataFrame({'id' : X_test_id, 'income' : pred})
output.to_csv('csv/T2-5.csv', index=False)
print(output.head())

        id        income
1088  1088  13099.576992
1157  1157   6336.061859
1267  1267  34211.290217
506    506   2660.925471
659    659  15678.387810
