In [25]:
#import library
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [26]:
# Load the raw training data from the working directory.
train_csv_path = "train.csv"
df = pd.read_csv(train_csv_path)

#### 데이터 전처리

In [27]:
# Lookup that coarsens a birth country into a broad region.
# Countries are listed per target region and then inverted into the flat
# {country: region} mapping used by the rest of the notebook.
# Notes:
#   - 'US' is kept as its own region, and 'Unknown' is also mapped to 'US'.
#   - Spellings such as 'Columbia', 'Trinadad&Tobago' and 'Holand-Netherlands'
#     are kept verbatim; presumably they match the raw values in train.csv
#     (TODO confirm against the data).
_countries_by_region = {
    'US': [
        'US', 'Unknown', 'Outlying-U S (Guam USVI etc)',
    ],
    'North America': [
        'Cuba', 'Mexico', 'Puerto-Rico', 'Dominican-Republic', 'Nicaragua',
        'Canada', 'Jamaica', 'Haiti', 'Guatemala', 'El-Salvador', 'Panama',
        'Honduras',
    ],
    'South America': [
        'Columbia', 'Trinadad&Tobago', 'Ecuador', 'Peru',
    ],
    'Europe': [
        'Portugal', 'Germany', 'Poland', 'Italy', 'England', 'France',
        'Scotland', 'Yugoslavia', 'Ireland', 'Greece', 'Hungary',
        'Holand-Netherlands',
    ],
    'Asia': [
        'Japan', 'Philippines', 'South Korea', 'Iran', 'India', 'China',
        'Cambodia', 'Vietnam', 'Hong Kong', 'Thailand', 'Laos', 'Taiwan',
    ],
}

country_continent_dict = {
    country: region
    for region, countries in _countries_by_region.items()
    for country in countries
}

# Columns to coarsen: the respondent's own birth country plus the mother's
# and the father's birth country.
columns_to_update = ['Birth_Country', 'Birth_Country (Mother)', 'Birth_Country (Father)']

# Translate every country value to its region, column by column.
# Series.map leaves NaN for any value that is not a key of the lookup.
df[columns_to_update] = df[columns_to_update].apply(
    lambda country_series: country_series.map(country_continent_dict)
)

# --- Row filtering -----------------------------------------------------------
# Keep respondents aged 17..75 (inclusive) whose employment status is neither
# 'Not Working' nor 'Seeking Full-Time'.
age_in_range = df['Age'].between(17, 75)
excluded_employment = df['Employment_Status'].isin(['Not Working', 'Seeking Full-Time'])

# .copy() detaches the filtered rows from the original frame; without it the
# column assignments below operate on a slice and pandas emits
# SettingWithCopyWarning.
df = df[age_in_range & ~excluded_employment].copy()

# --- Column clean-up ---------------------------------------------------------
# Drop columns that are not used as features (non-inplace so the cell reads as
# a plain sequence of re-bindings of `df`).
df = df.drop(columns=['Gains', 'Losses', 'Dividends', 'Household_Status', 'Income_Status'])

# Encode gender numerically: 'M' -> 0, 'F' -> 1 (any other value becomes NaN).
df['Gender'] = df['Gender'].map({'M': 0, 'F': 1})

# Collapse fine-grained education levels into broader buckets; values that are
# not listed here are left unchanged by Series.replace.
education_map = {
    'High graduate': 'High', 'High Senior': 'High',
    'High Junior': 'High', 'High Sophomore': 'High',
    'Elementary (5-6)': 'Elementary(1-6)', 'Elementary (1-4)': 'Elementary(1-6)',
    'Kindergarten': 'Baby', 'Children': 'Baby'
}
df['Education_Status'] = df['Education_Status'].replace(education_map)

# --- One-hot encoding --------------------------------------------------------
# Categorical columns to expand into 0/1 indicator columns. This list is reused
# later to encode the test data in the same way.
columns = [
    'Education_Status',
    'Employment_Status',
    'Industry_Status',
    'Occupation_Status',
    'Race',
    'Hispanic_Origin',
    'Martial_Status',
    'Household_Summary',
    'Citizenship',
    'Birth_Country',
    'Birth_Country (Father)',
    'Birth_Country (Mother)',
    'Tax_Status'
]

# Feature matrix: encoded frame without the row identifier ('ID') and without
# the prediction target ('Income'). `df` itself keeps 'Income' for later use.
df_encoded = pd.get_dummies(df, columns=columns, dtype=int).drop(columns=['ID', 'Income'])

# Persist the feature matrix for inspection / reuse.
df_encoded.to_csv("final.csv", index=False)

#### 모델 학습


In [28]:
# 필요한 라이브러리 임포트
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor


# Features are the one-hot encoded frame; the target is the 'Income' column of
# the cleaned (un-encoded) frame, which shares the same rows.
ready_X, ready_y = df_encoded, df['Income']

# Hold out 20% of the rows for validation (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(
    ready_X,
    ready_y,
    test_size=0.2,
    random_state=42,
)

# Build and fit a 100-tree random forest regressor (seeded).
rf = RandomForestRegressor(n_estimators=100, random_state=50).fit(X_train, y_train)
rf


#### 모델 성능 평가 (Training Data에서 분리한 20% 검증 세트 이용)

In [29]:
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score, GridSearchCV

# Evaluate the forest fitted in the training cell on the held-out split.
# The model is intentionally NOT re-fitted here: the training cell already
# fitted `rf` on (X_train, y_train) with a fixed random_state, so fitting again
# would rebuild the same forest and only cost time.
y_pred = rf.predict(X_test)

# Regression metrics on the held-out rows.
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5  # root of the MSE, in the same units as 'Income'
r2 = r2_score(y_test, y_pred)

print(f"RMSE: {rmse}")
print(f"R²: {r2}")



RMSE: 720.8605932437393
R²: 0.05157198443552957


#### Test Data 예측 (정답 레이블이 없으므로 성능 평가가 아닌 제출용 예측)


In [30]:
# Load the test set and preview its first five rows.
test_csv_path = "Editedtest.csv"
test_df = pd.read_csv(test_csv_path)
test_df.head(5)

Unnamed: 0,Age,Gender,Education_Status,Employment_Status,Working_Week (Yearly),Industry_Status,Occupation_Status,Race,Hispanic_Origin,Martial_Status,Household_Summary,Citizenship,Birth_Country,Birth_Country (Father),Birth_Country (Mother),Tax_Status
0,79,0,High,Children or Armed Forces,0,Not in universe or children,Unknown,White,All other,Single,Householder,Native,US,US,US,Single
1,47,0,Elementary(1-6),Children or Armed Forces,0,Not in universe or children,Unknown,White,Other Spanish,Single,Child 18 or older,Native,US,US,US,Nonfiler
2,18,1,High,Children or Armed Forces,52,Retail,Services,White,All other,Single,Child 18 or older,Native,US,US,US,Single
3,39,1,Associates degree (Vocational),Full-Time,30,Medical (except Hospitals),Services,White,All other,Married,Spouse of householder,Native,US,US,US,Married Filling Jointly both under 65 (MFJ)
4,6,0,Baby,Children or Armed Forces,0,Not in universe or children,Unknown,White,Mexican-American,Single,Child under 18 never married,Native,US,US,US,Nonfiler


In [31]:
# Index labels of test rows that fall outside the population kept for
# training: age below 17 or above 75, or an employment status of
# 'Not Working' / 'Seeking Full-Time'.
test_age = test_df["Age"]
outside_age_range = test_age.lt(17) | test_age.gt(75)
excluded_status = test_df["Employment_Status"].isin(["Not Working", "Seeking Full-Time"])

row_to_zero = test_df.loc[outside_age_range | excluded_status].index

In [32]:
# Align the one-hot encoded test features with the training feature matrix.
#
# `df_encoded` is the already-encoded training frame; its columns define the
# exact feature set and column order used to fit the model.
train_cols = df_encoded.columns

# One-hot encode the test frame on the same categorical columns as training.
test_df_encoded = pd.get_dummies(test_df, columns=columns, dtype=int)

# Diagnostics only: which training columns are absent from the test encoding,
# and which test columns the training data never produced.
missing_cols = set(train_cols) - set(test_df_encoded.columns)
extra_cols = set(test_df_encoded.columns) - set(train_cols)

# A single reindex does all three alignment steps at once:
#   - adds every training-only column, filled with 0,
#   - drops every test-only column,
#   - puts the columns in the training order.
# This also avoids inserting columns one at a time in a Python loop.
test_df_encoded = test_df_encoded.reindex(columns=train_cols, fill_value=0)


In [33]:
# Display the column-aligned, one-hot encoded test features for a visual check.
test_df_encoded

Unnamed: 0,Age,Gender,Working_Week (Yearly),Education_Status_Associates degree (Academic),Education_Status_Associates degree (Vocational),Education_Status_Baby,Education_Status_Bachelors degree,Education_Status_College,Education_Status_Doctorate degree,Education_Status_Elementary(1-6),...,Birth_Country (Mother)_Europe,Birth_Country (Mother)_North America,Birth_Country (Mother)_South America,Birth_Country (Mother)_US,Tax_Status_Head of Household (HOH),Tax_Status_Married Filling Jointly both over 65 (MFJ),Tax_Status_Married Filling Jointly both under 65 (MFJ),Tax_Status_Married Filling Jointly one over 65 & one under 65 (MFJ),Tax_Status_Nonfiler,Tax_Status_Single
0,79,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
1,47,0,0,0,0,0,0,0,0,1,...,0,0,0,1,0,0,0,0,1,0
2,18,1,52,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
3,39,1,30,0,1,0,0,0,0,0,...,0,0,0,1,0,0,1,0,0,0
4,6,0,0,0,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,31,0,52,0,0,0,1,0,0,0,...,0,0,0,1,0,0,0,0,0,1
9996,27,0,52,0,0,0,0,1,0,0,...,0,0,0,1,0,0,1,0,0,0
9997,18,0,7,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
9998,9,0,0,0,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0


In [34]:
# Predict income for every test row.
y_pred = rf.predict(test_df_encoded)

# Force the prediction to 0 for rows flagged in `row_to_zero`.
# `row_to_zero` holds index *labels* of `test_df`, while `y_pred` is a plain
# positional array. Building the mask from `test_df.index` keeps the two
# aligned row by row, instead of assuming labels equal positions 0..n-1.
zero_mask = test_df.index.isin(row_to_zero)
y_pred[zero_mask] = 0
    


## Test.csv 파일 예측 완료


In [35]:
# Build submission IDs 'TEST_0000', 'TEST_0001', ... in prediction order
# (zero-padded to at least four digits).
ids = [f'TEST_{i:04d}' for i in range(len(y_pred))]

# Assemble the submission table under its own name. Re-using `df` here would
# overwrite the cleaned training frame that the training cell reads
# (`df['Income']`), so re-running earlier cells afterwards would break.
submission_df = pd.DataFrame({
    'ID': ids,
    'Income': y_pred
})

# Write the submission file (no index column).
submission_df.to_csv('ourSubmissionFile.csv', index=False)