## Fixed Random Seed

In [1]:
import numpy as np
import random
import os

def seed_everything(seed):
    """Seed the random number generators this notebook relies on.

    Seeds Python's ``random`` module and NumPy's legacy global generator,
    and exports ``PYTHONHASHSEED`` with the same value.

    Note: Python reads ``PYTHONHASHSEED`` when the interpreter starts, so
    setting it here is picked up by processes launched afterwards rather
    than by the kernel that is already running.

    Args:
        seed: Integer seed applied to every generator.
    """
    random.seed(seed)
    os.environ.update(PYTHONHASHSEED=str(seed))
    np.random.seed(seed)


seed_everything(42)  # fix the seed for the whole notebook

## 데이터 불러오기 및 확인

In [2]:
import pandas as pd

# Load the training and test splits from the working directory.
train = pd.read_csv('./train.csv')
test = pd.read_csv('./test.csv')

# Preview the first three rows of each split, one table after the other.
display(train.head(3), test.head(3))

Unnamed: 0,ID,Age,Gender,Education_Status,Employment_Status,Working_Week (Yearly),Industry_Status,Occupation_Status,Race,Hispanic_Origin,...,Citizenship,Birth_Country,Birth_Country (Father),Birth_Country (Mother),Tax_Status,Gains,Losses,Dividends,Income_Status,Income
0,TRAIN_00000,63,M,Middle (7-8),Full-Time,4,Social Services,Services,White,All other,...,Native,US,US,US,Nonfiler,0,0,0,Unknown,425
1,TRAIN_00001,37,M,Associates degree (Vocational),Full-Time,52,Entertainment,Services,White,All other,...,Native,US,US,US,Single,0,0,0,Under Median,0
2,TRAIN_00002,58,F,High graduate,Full-Time,52,Manufacturing (Non-durable),Admin Support (include Clerical),Black,All other,...,Native,US,US,US,Married Filling Jointly both under 65 (MFJ),3411,0,0,Under Median,860


Unnamed: 0,ID,Age,Gender,Education_Status,Employment_Status,Working_Week (Yearly),Industry_Status,Occupation_Status,Race,Hispanic_Origin,...,Household_Summary,Citizenship,Birth_Country,Birth_Country (Father),Birth_Country (Mother),Tax_Status,Gains,Losses,Dividends,Income_Status
0,TEST_0000,79,M,High Junior,Children or Armed Forces,0,Not in universe or children,Unknown,White,All other,...,Householder,Native,US,Unknown,Unknown,Single,0,0,0,Under Median
1,TEST_0001,47,M,Elementary (5-6),Children or Armed Forces,0,Not in universe or children,Unknown,White,Other Spanish,...,Child 18 or older,Native,US,US,US,Nonfiler,0,0,0,Under Median
2,TEST_0002,18,F,High Junior,Children or Armed Forces,52,Retail,Services,White,All other,...,Child 18 or older,Native,US,US,US,Single,0,0,0,Under Median


In [4]:
# (rows, columns) of each split, shown side by side.
train.shape, test.shape

((20000, 23), (10000, 22))

In [5]:
# Per-column non-null counts and dtypes of the training frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 23 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   ID                      20000 non-null  object
 1   Age                     20000 non-null  int64 
 2   Gender                  20000 non-null  object
 3   Education_Status        20000 non-null  object
 4   Employment_Status       20000 non-null  object
 5   Working_Week (Yearly)   20000 non-null  int64 
 6   Industry_Status         20000 non-null  object
 7   Occupation_Status       20000 non-null  object
 8   Race                    20000 non-null  object
 9   Hispanic_Origin         20000 non-null  object
 10  Martial_Status          20000 non-null  object
 11  Household_Status        20000 non-null  object
 12  Household_Summary       20000 non-null  object
 13  Citizenship             20000 non-null  object
 14  Birth_Country           20000 non-null  object
 15  Bi

In [9]:
# Summary statistics for every column; include='all' adds the non-numeric
# columns (unique / top / freq) alongside the numeric ones.
train.describe(include='all')

Unnamed: 0,ID,Age,Gender,Education_Status,Employment_Status,Working_Week (Yearly),Industry_Status,Occupation_Status,Race,Hispanic_Origin,...,Citizenship,Birth_Country,Birth_Country (Father),Birth_Country (Mother),Tax_Status,Gains,Losses,Dividends,Income_Status,Income
count,20000,20000.0,20000,20000,20000,20000.0,20000,20000,20000,20000,...,20000,20000,20000,20000,20000,20000.0,20000.0,20000.0,20000,20000.0
unique,20000,,2,17,8,,24,15,5,10,...,5,43,43,43,6,,,,3,
top,TRAIN_00000,,F,High graduate,Children or Armed Forces,,Not in universe or children,Unknown,White,All other,...,Native,US,US,US,Married Filling Jointly both under 65 (MFJ),,,,Under Median,
freq,1,,10472,6494,11142,,4688,4688,16845,17769,...,17825,17825,16563,16594,8588,,,,13237,
mean,,35.6325,,,,34.94305,,,,,...,,,,,,383.1295,40.20215,123.45145,,554.56525
std,,17.994414,,,,22.254592,,,,,...,,,,,,4144.247487,279.182677,1206.949429,,701.553155
min,,0.0,,,,0.0,,,,,...,,,,,,0.0,0.0,0.0,,0.0
25%,,23.0,,,,7.0,,,,,...,,,,,,0.0,0.0,0.0,,0.0
50%,,34.0,,,,52.0,,,,,...,,,,,,0.0,0.0,0.0,,500.0
75%,,47.0,,,,52.0,,,,,...,,,,,,0.0,0.0,0.0,,875.0


## 데이터 전처리 1 : 학습 및 추론 데이터 설정

In [6]:
# Split the training frame into features and the regression target.
# 'ID' is removed from both splits; 'Income' is the value to predict.
train_x = train.drop(columns=['ID', 'Income'])
train_y = train['Income']

test_x = test.drop(columns=['ID'])

# The scaler and the model downstream consume these frames as plain arrays,
# so features are matched between train and test purely by position.
# Fail fast here if the two splits do not share the same column layout.
assert list(train_x.columns) == list(test_x.columns), (
    "train and test feature columns differ in name or order"
)

## 데이터 전처리 2 : 범주형 변수 수치화

In [7]:
from sklearn.preprocessing import LabelEncoder

# Names of the feature columns stored as object dtype; these are the ones
# that get converted to integer codes below.
encoding_target = train_x.columns[train_x.dtypes == "object"].tolist()

for column in encoding_target:
    encoder = LabelEncoder()

    # Cast the column to str in both splits so all values are compared as text.
    train_x[column] = train_x[column].astype(str)
    test_x[column] = test_x[column].astype(str)

    # The label -> integer mapping is learned from the training split only.
    train_x[column] = encoder.fit(train_x[column]).transform(train_x[column])

    # Labels that appear only in the test split are appended to classes_
    # so that transform() below accepts them.
    unseen = [
        label
        for label in np.unique(test_x[column])
        if label not in encoder.classes_
    ]
    if unseen:
        encoder.classes_ = np.append(encoder.classes_, unseen)

    test_x[column] = encoder.transform(test_x[column])

In [8]:
# Inspect the training features after label encoding.
train_x.head()

Unnamed: 0,Age,Gender,Education_Status,Employment_Status,Working_Week (Yearly),Industry_Status,Occupation_Status,Race,Hispanic_Origin,Martial_Status,...,Household_Summary,Citizenship,Birth_Country,Birth_Country (Father),Birth_Country (Mother),Tax_Status,Gains,Losses,Dividends,Income_Status
0,63,1,15,2,4,20,11,4,0,1,...,4,2,39,39,39,4,0,0,0,2
1,37,1,1,2,52,6,11,4,0,4,...,4,2,39,39,39,5,0,0,0,1
2,58,0,12,2,52,11,0,1,0,1,...,4,2,39,39,39,2,3411,0,0,1
3,44,1,12,2,52,19,12,4,0,0,...,4,2,39,39,39,5,0,0,0,1
4,37,0,12,2,52,19,10,4,0,0,...,4,2,39,39,39,0,0,0,0,2


In [11]:
# First few values of the target column.
train_y.head()

0    425
1      0
2    860
3    850
4    570
Name: Income, dtype: int64

In [9]:
# Inspect the test features after label encoding.
test_x.head()

Unnamed: 0,Age,Gender,Education_Status,Employment_Status,Working_Week (Yearly),Industry_Status,Occupation_Status,Race,Hispanic_Origin,Martial_Status,...,Household_Summary,Citizenship,Birth_Country,Birth_Country (Father),Birth_Country (Mother),Tax_Status,Gains,Losses,Dividends,Income_Status
0,79,1,9,0,0,14,14,4,0,5,...,4,2,39,40,40,5,0,0,0,1
1,47,1,7,0,0,14,14,4,8,5,...,0,2,39,39,39,4,0,0,0,1
2,18,0,9,0,52,19,11,4,0,5,...,0,2,39,39,39,5,0,0,0,1
3,39,0,1,2,30,12,11,4,0,1,...,7,2,39,39,39,2,0,0,0,2
4,6,1,3,0,0,14,14,4,6,5,...,2,2,39,39,39,4,0,0,0,2


## Scaling

In [12]:
from sklearn.preprocessing import StandardScaler
# Fit the scaler on the training features only, then apply that same fitted
# transformation to both splits.
scaler = StandardScaler().fit(train_x)
train_x = scaler.transform(train_x)
test_x = scaler.transform(test_x)

## 모델 선정 및 학습

In [13]:
from sklearn.tree import DecisionTreeRegressor

# Fit a single decision tree on the prepared training data.
# random_state is pinned explicitly so the fitted tree does not depend on
# the state of the global NumPy generator, which changes whenever cells are
# re-run or executed out of order. 42 matches the seed used at the top of
# the notebook.
model = DecisionTreeRegressor(random_state=42)
model.fit(train_x, train_y)

## 예측 수행

In [14]:
# Predict the target for every row of the prepared test features.
preds = model.predict(test_x)

## 제출양식에 예측결과 입력

In [15]:
# Load the submission template and fill its 'Income' column with the
# predictions; values are placed by row position.
submission = pd.read_csv('./sample_submission.csv').assign(Income=preds)
submission

Unnamed: 0,ID,Income
0,TEST_0000,0.0
1,TEST_0001,0.0
2,TEST_0002,500.0
3,TEST_0003,475.0
4,TEST_0004,0.0
...,...,...
9995,TEST_9995,1300.0
9996,TEST_9996,700.0
9997,TEST_9997,425.0
9998,TEST_9998,0.0


## 예측결과 저장

In [8]:
# Write the submission to disk; index=False leaves the DataFrame index out
# of the file.
submission.to_csv('./baseline_submission1.csv', index=False)