In [114]:
#import library
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [115]:
#read data
df = pd.read_csv("train.csv")

#### 데이터 전처리

In [116]:
# 국가를 대륙으로 매핑하는 딕셔너리
country_continent_dict = {
 'US': 'US',
 'Cuba': 'North America',
 'Portugal': 'Europe',
 'Mexico': 'North America',
 'Unknown': 'US',
 'Puerto-Rico': 'North America',
 'Germany': 'Europe',
 'Japan': 'Asia',
 'Poland': 'Europe',
 'Columbia': 'South America',
 'Philippines': 'Asia',
 'Italy': 'Europe',
 'Trinadad&Tobago': 'South America',
 'England': 'Europe',
 'South Korea': 'Asia',
 'Iran': 'Asia',
 'France': 'Europe',
 'India': 'Asia',
 'China': 'Asia',
 'Dominican-Republic': 'North America',
 'Scotland': 'Europe',
 'Ecuador': 'South America',
 'Nicaragua': 'North America',
 'Peru': 'South America',
 'Cambodia': 'Asia',
 'Canada': 'North America',
 'Jamaica': 'North America',
 'Vietnam': 'Asia',
 'Hong Kong': 'Asia',
 'Thailand': 'Asia',
 'Haiti': 'North America',
 'Guatemala': 'North America',
 'Laos': 'Asia',
 'Yugoslavia': 'Europe',
 'Ireland': 'Europe',
 'El-Salvador': 'North America',
 'Panama': 'North America',
 'Honduras': 'North America',
 'Greece': 'Europe',
 'Outlying-U S (Guam USVI etc)': 'US',
 'Hungary': 'Europe',
 'Taiwan': 'Asia',
 'Holand-Netherlands': 'Europe'
}

# 업데이트할 열 목록: 본인 출신국가, 엄마 출신국가, 아빠 출신국가
columns_to_update = ['Birth_Country', 'Birth_Country (Mother)', 'Birth_Country (Father)']

# 각 열에 대해 국가를 대륙으로 매핑
for column in columns_to_update:
    df[column] = df[column].map(country_continent_dict)

# Remove rows where 'Age' is below 17 or above 75
df = df[df['Age'].between(17, 75)]

# Remove rows with 'Employment Status' as 'Not Working' or 'Seeking Full-Time'
df = df[~df['Employment_Status'].isin(['Not Working', 'Seeking Full-Time'])]

# Drop the 'Gains', 'Losses', 'Dividends', 'Household_Status', 'Income_Status' columns
df.drop(['Gains', 'Losses', 'Dividends', 'Household_Status', 'Income_Status'], axis=1, inplace=True)

# Map 'Gender' values from 'M' and 'F' to 0 and 1, respectively
df['Gender'] = df['Gender'].map({'M': 0, 'F': 1})

# Consolidate education levels and rename as specified
education_map = {
    'High graduate': 'High', 'High Senior': 'High', 
    'High Junior': 'High', 'High Sophomore': 'High',
    'Elementary (5-6)': 'Elementary(1-6)', 'Elementary (1-4)': 'Elementary(1-6)',
    'Kindergarten': 'Baby', 'Children': 'Baby'
}
df['Education_Status'] = df['Education_Status'].replace(education_map)
columns=[
    'Education_Status',
    'Employment_Status',
    'Industry_Status',
    'Occupation_Status',
    'Race',
    'Hispanic_Origin',
    'Martial_Status',
    'Household_Summary',
    'Citizenship',
    'Birth_Country',
    'Birth_Country (Father)',
    'Birth_Country (Mother)',
    'Tax_Status'
]
df_encoded=pd.get_dummies(df,columns=columns,dtype=int)
#제거코드(ID)
df_encoded = df_encoded.drop(columns=['ID'])
df_encoded.to_csv("final.csv", index=False)

#### 모델 학습


In [117]:
# 필요한 라이브러리 임포트
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

ready_X = df_encoded
ready_y = df['Income']

# 데이터 분할
X_train, X_test, y_train, y_test = train_test_split(ready_X, ready_y, test_size=0.2, random_state=42)

# 모델 초기화 및 학습
rf = RandomForestRegressor(n_estimators=100, random_state=50)
rf.fit(X_train, y_train)


#### 모델 성능 평가 (Training Data 이용)

In [118]:
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score, GridSearchCV

# 모델 학습
rf.fit(X_train, y_train)

# 테스트 데이터에 대한 예측
y_pred = rf.predict(X_test)

# 성능 평가
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5
r2 = r2_score(y_test, y_pred)

print(f"RMSE: {rmse}")
print(f"R²: {r2}")

# # 하이퍼파라미터 튜닝 (예시)
# param_grid = {
#     'n_estimators': [100, 200, 300],
#     'max_depth': [None, 10, 20, 30],
# }
# grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error')
# grid_search.fit(X_train, y_train)

# # 최적의 파라미터와 그 때의 성능
# print(f"Best parameters: {grid_search.best_params_}")
# print(f"Best score (RMSE): {(-grid_search.best_score_) ** 0.5}")


RMSE: 8.718831006966209
R²: 0.9998612546859453


#### 모델 성능 평가 (Test Data 이용)


[2024.04.05]
Test Data만 돌려서 제출만 하면 되는데
Test Data를 모델에 넣기 위한 포멧을 짜주어야 함

In [119]:
test_df = pd.read_csv("Editedtest.csv")
test_df.head()

Unnamed: 0,Age,Gender,Working_Week (Yearly),Education_Status_Associates degree (Academic),Education_Status_Associates degree (Vocational),Education_Status_Baby,Education_Status_Bachelors degree,Education_Status_College,Education_Status_Doctorate degree,Education_Status_Elementary(1-6),...,Birth_Country (Mother)_Europe,Birth_Country (Mother)_North America,Birth_Country (Mother)_South America,Birth_Country (Mother)_US,Tax_Status_Head of Household (HOH),Tax_Status_Married Filling Jointly both over 65 (MFJ),Tax_Status_Married Filling Jointly both under 65 (MFJ),Tax_Status_Married Filling Jointly one over 65 & one under 65 (MFJ),Tax_Status_Nonfiler,Tax_Status_Single
0,47,0,0,0,0,0,0,0,0,1,...,0,0,0,1,0,0,0,0,1,0
1,18,1,52,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
2,39,1,30,0,1,0,0,0,0,0,...,0,0,0,1,0,0,1,0,0,0
3,31,0,24,0,0,0,0,0,0,0,...,0,0,0,1,0,0,1,0,0,0
4,17,0,51,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1


In [123]:
# 훈련 데이터와 테스트 데이터의 열 순서를 맞추기 위해 train 데이터의 열 순서를 가져옵니다.
train_cols = df_encoded.columns


##인코딩 결과


# 훈련 데이터와 테스트 데이터의 원핫 인코딩 결과를 비교하여 훈련 데이터에만 있는 열을 찾습니다.
missing_cols = set(train_cols) - set(test_df.columns)
print(missing_cols)
# 훈련 데이터에만 있는 열을 테스트 데이터에 추가하고 해당 열의 값을 0으로 설정합니다.
for col in missing_cols:
    test_df[col] = 0

# 테스트 데이터의 열 순서를 train 데이터의 열 순서와 동일하게 재정렬합니다.
test_df = test_df[train_cols]

set()


In [121]:
y_pred = rf.predict(test_df) 
for i in range(100):
    print(y_pred[i],end=" ")

0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 

디버깅 시작

In [46]:
# # column_names에 해당하는 열만 남기고 나머지는 삭제
# columns_to_encode = [col for col in df.columns if col in column_names]
# df_to_encode = df[columns_to_encode]

# # 원핫인코딩 수행
# encoded_df = pd.get_dummies(df_to_encode)

In [63]:
# # train 데이터와 test 데이터 불러오기
# train_data =df_encoded
# test_data = pd.read_csv('Editedtest.csv')

# # 범주형 변수의 열 이름 리스트
# categorical_columns = train_data.columns

# # train 데이터와 test 데이터 각각의 범주의 종류 확인
# for col in categorical_columns:
#     train_categories = set(train_data[col].unique())
#     test_categories = set(test_data[col].unique())
    
#     # train 데이터에만 있는 범주를 test 데이터에 추가
#     new_categories = test_categories - train_categories
#     if new_categories:
#         print(f"New categories found in {col} of test data:", new_categories)
#         # 새로운 범주를 train 데이터에 있는 범주 중 하나로 대체
#         test_data[col] = test_data[col].replace(list(new_categories), train_data[col].iloc[0])

# # 이후에 모델을 학습하고 예측을 수행합니다.

KeyError: 'Income'

In [64]:
# train_data와 test_data에 모두 존재하는 열 이름만 필터링
common_columns = set(train_data.columns) & set(test_data.columns)

# 범주형 열 이름을 명시적으로 지정하거나, 데이터 타입을 기반으로 필터링하여 common_columns를 조정
# 예시: categorical_columns = [col for col in common_columns if train_data[col].dtype == 'object']

# 공통 열에 대해서만 순회
for col in common_columns:
    train_categories = set(train_data[col].unique())
    test_categories = set(test_data[col].unique())
    
    new_categories = test_categories - train_categories
    if new_categories:
        print(f"테스트 데이터의 {col} 열에서 새로운 범주 발견:", new_categories)
        # 새로운 범주를 대체하는 로직 (예: '기타'로 대체)
        test_data[col] = test_data[col].replace(list(new_categories), '기타')
