## 집 값 예측
- 예측할 변수 ['SalePrice']
- 평가: rmse, r2

    - rmse는 낮을 수록 좋은 성능
    - r2는 높을 수록 좋은 성능
   

In [52]:
# 시험환경 세팅 (코드 변경 X)
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

def exam_data_load(df, target, id_name="", null_name=""):
    if id_name == "":
        df = df.reset_index().rename(columns={"index": "id"})
        id_name = 'id'
    else:
        id_name = id_name
    
    if null_name != "":
        df[df == null_name] = np.nan
    
    X_train, X_test = train_test_split(df, test_size=0.2, shuffle=True, random_state=2021)
    y_train = X_train[[id_name, target]]
    X_train = X_train.drop(columns=[id_name, target])
    y_test = X_test[[id_name, target]]
    X_test = X_test.drop(columns=[id_name, target])
    return X_train, X_test, y_train, y_test 
    
df = pd.read_csv("../input/house-prices-advanced-regression-techniques/train.csv")
X_train, X_test, y_train, y_test = exam_data_load(df, target='SalePrice', id_name='Id')

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((1168, 79), (292, 79), (1168, 2), (292, 2))

# Data Load & Simple EDA

In [53]:
# 라이브러리 및 데이터 불러오기

import pandas as pd
import numpy as np

In [54]:
X_train.shape, X_test.shape, y_train.shape

((1168, 79), (292, 79), (1168, 2))

In [55]:
y_train.head()

Unnamed: 0,Id,SalePrice
81,82,153500
1418,1419,124000
1212,1213,113000
588,589,143000
251,252,235000


In [56]:
# 데이터 확인
X_train.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
81,120,RM,32.0,4500,Pave,,Reg,Lvl,AllPub,FR2,...,0,0,,,,0,3,2006,WD,Normal
1418,20,RL,71.0,9204,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,8,2008,COD,Normal
1212,30,RL,50.0,9340,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,8,2009,WD,Normal
588,20,RL,65.0,25095,Pave,,IR1,Low,AllPub,Inside,...,60,0,,,,0,6,2009,WD,Partial
251,120,RM,44.0,4750,Pave,,IR1,HLS,AllPub,Inside,...,153,0,,,,0,12,2007,WD,Family


In [57]:
X_test.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
1380,30,RL,45.0,8212,Pave,Grvl,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,6,2010,WD,Normal
520,190,RL,60.0,10800,Pave,Grvl,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,8,2008,WD,Normal
1175,50,RL,85.0,10678,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,4,2007,WD,Normal
351,120,RL,,5271,Pave,,IR1,Low,AllPub,Inside,...,184,0,,,,0,12,2006,WD,Abnorml
1335,20,RL,80.0,9650,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,4,2009,WD,Normal


In [58]:
X_test_id = X_test.index
X_test_id

Int64Index([1380,  520, 1175,  351, 1335,  280,  977,  469, 1051,  291,
            ...
             954, 1102,  380, 1235, 1388, 1283, 1039,   61, 1395,  906],
           dtype='int64', length=292)

In [59]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1168 entries, 81 to 1140
Data columns (total 79 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MSSubClass     1168 non-null   int64  
 1   MSZoning       1168 non-null   object 
 2   LotFrontage    956 non-null    float64
 3   LotArea        1168 non-null   int64  
 4   Street         1168 non-null   object 
 5   Alley          70 non-null     object 
 6   LotShape       1168 non-null   object 
 7   LandContour    1168 non-null   object 
 8   Utilities      1168 non-null   object 
 9   LotConfig      1168 non-null   object 
 10  LandSlope      1168 non-null   object 
 11  Neighborhood   1168 non-null   object 
 12  Condition1     1168 non-null   object 
 13  Condition2     1168 non-null   object 
 14  BldgType       1168 non-null   object 
 15  HouseStyle     1168 non-null   object 
 16  OverallQual    1168 non-null   int64  
 17  OverallCond    1168 non-null   int64  
 18  YearBui

In [60]:
# 결측치 확인
X_train.isnull().sum().sort_values(ascending=False)[:20]

PoolQC          1163
MiscFeature     1124
Alley           1098
Fence            937
FireplaceQu      553
LotFrontage      212
GarageType        61
GarageYrBlt       61
GarageQual        61
GarageCond        61
GarageFinish      61
BsmtExposure      30
BsmtCond          29
BsmtFinType2      29
BsmtFinType1      29
BsmtQual          29
MasVnrArea         6
MasVnrType         6
Electrical         1
BedroomAbvGr       0
dtype: int64

In [61]:
print(X_test.isnull().sum().sort_values(ascending=False)[:20])

PoolQC          290
MiscFeature     282
Alley           271
Fence           242
FireplaceQu     137
LotFrontage      47
GarageType       20
GarageYrBlt      20
GarageQual       20
GarageCond       20
GarageFinish     20
BsmtFinType2      9
BsmtCond          8
BsmtFinType1      8
BsmtQual          8
BsmtExposure      8
MasVnrArea        2
MasVnrType        2
BedroomAbvGr      0
BsmtFullBath      0
dtype: int64


In [62]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1168 entries, 81 to 1140
Data columns (total 79 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MSSubClass     1168 non-null   int64  
 1   MSZoning       1168 non-null   object 
 2   LotFrontage    956 non-null    float64
 3   LotArea        1168 non-null   int64  
 4   Street         1168 non-null   object 
 5   Alley          70 non-null     object 
 6   LotShape       1168 non-null   object 
 7   LandContour    1168 non-null   object 
 8   Utilities      1168 non-null   object 
 9   LotConfig      1168 non-null   object 
 10  LandSlope      1168 non-null   object 
 11  Neighborhood   1168 non-null   object 
 12  Condition1     1168 non-null   object 
 13  Condition2     1168 non-null   object 
 14  BldgType       1168 non-null   object 
 15  HouseStyle     1168 non-null   object 
 16  OverallQual    1168 non-null   int64  
 17  OverallCond    1168 non-null   int64  
 18  YearBui

# Preprocessing

In [63]:
# 전처리 

X_train = X_train.select_dtypes(exclude = ['object'])
X_test = X_test.select_dtypes(exclude = ['object'])
y = y_train['SalePrice']

In [64]:
print(X_train.head(3))

      MSSubClass  LotFrontage  LotArea  OverallQual  OverallCond  YearBuilt  \
81           120         32.0     4500            6            5       1998   
1418          20         71.0     9204            5            5       1963   
1212          30         50.0     9340            4            6       1941   

      YearRemodAdd  MasVnrArea  BsmtFinSF1  BsmtFinSF2  ...  GarageArea  \
81            1998       443.0        1201           0  ...         405   
1418          1963         0.0          25         872  ...         336   
1212          1950         0.0         344           0  ...         234   

      WoodDeckSF  OpenPorchSF  EnclosedPorch  3SsnPorch  ScreenPorch  \
81             0          199              0          0            0   
1418           0           88              0          0            0   
1212           0          113              0          0            0   

      PoolArea  MiscVal  MoSold  YrSold  
81           0        0       3    2006  
1418     

In [65]:
# 결측치 처리

from sklearn.impute import SimpleImputer

imp = SimpleImputer()
X_train = imp.fit_transform(X_train)
X_test = imp.transform(X_test)

In [66]:
X_train

array([[1.20000000e+02, 3.20000000e+01, 4.50000000e+03, ...,
        0.00000000e+00, 3.00000000e+00, 2.00600000e+03],
       [2.00000000e+01, 7.10000000e+01, 9.20400000e+03, ...,
        0.00000000e+00, 8.00000000e+00, 2.00800000e+03],
       [3.00000000e+01, 5.00000000e+01, 9.34000000e+03, ...,
        0.00000000e+00, 8.00000000e+00, 2.00900000e+03],
       ...,
       [6.00000000e+01, 8.90000000e+01, 1.16450000e+04, ...,
        0.00000000e+00, 8.00000000e+00, 2.00600000e+03],
       [6.00000000e+01, 7.00700837e+01, 7.50000000e+03, ...,
        0.00000000e+00, 1.00000000e+00, 2.01000000e+03],
       [2.00000000e+01, 6.00000000e+01, 7.35000000e+03, ...,
        0.00000000e+00, 6.00000000e+00, 2.00800000e+03]])

In [67]:
# train, validation 데이터 분리하기 

from sklearn.model_selection import train_test_split

X_tr, X_val, y_tr, y_val = train_test_split(X_train, y, test_size = 0.15, random_state = 2023)
X_tr.shape, X_val.shape, y_tr.shape, y_val.shape

((992, 36), (176, 36), (992,), (176,))

# Model

In [68]:
# 평가지수 (r2; 1에 가까울 수록 (높을수록)좋음, rmse: 낮을수록 좋음)

from sklearn.metrics import mean_squared_error, r2_score

def rmse(y, y_pred):
    return np.sqrt(mean_squared_error(y, y_pred))

In [69]:
# model 불러오기


from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor 

In [70]:
# XGBRegressor

XGB = XGBRegressor()
XGB.fit(X_tr, y_tr, verbose = False)
pred = XGB.predict(X_val)

print("R2: ",r2_score(y_val, pred))
print("RMSE: ",rmse(y_val, pred))

R2:  0.7884189024748108
RMSE:  34979.4070902125


In [71]:
# RandomForestRegressor

rfr = RandomForestRegressor()
rfr.fit(X_tr, y_tr)
pred = rfr.predict(X_val)

print("R2: ",r2_score(y_val, pred))
print("RMSE: ",rmse(y_val,pred))

R2:  0.8414930764851496
RMSE:  30275.974402964508


In [72]:
final_model = RandomForestRegressor()
final_model.fit(X_train, y)

prediction = final_model.predict(X_test)

## Prediction & to CSV

In [73]:
submission = pd.DataFrame(data={
    'Id': X_test_id,
    'income' : prediction
})

In [74]:
submission.head()

Unnamed: 0,Id,income
0,1380,90053.05
1,520,103546.04
2,1175,344719.27
3,351,208414.35
4,1335,168657.1


In [75]:
submission.to_csv("12345.csv", index=False)

In [77]:
check = pd.read_csv('12345.csv')
print(check.head())

     Id     income
0  1380   90053.05
1   520  103546.04
2  1175  344719.27
3   351  208414.35
4  1335  168657.10


# 결과 체점

In [78]:
pred = final_model.predict(X_test)
print("RMSE : " + str(rmse(y_test['SalePrice'], prediction)))
print("R2 : " + str(r2_score(y_test['SalePrice'], prediction)))

RMSE : 26110.303140685344
R2 : 0.8831723614279051
