## 집 값 예측
- 예측할 변수 ['SalePrice']
- 평가: rmse, r2

    - rmse는 낮을 수록 좋은 성능
    - r2는 높을 수록 좋은 성능
   

In [1]:
# 시험환경 세팅 (코드 변경 X)
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

def exam_data_load(df, target, id_name="", null_name=""):
    if id_name == "":
        df = df.reset_index().rename(columns={"index": "id"})
        id_name = 'id'
    else:
        id_name = id_name
    
    if null_name != "":
        df[df == null_name] = np.nan
    
    X_train, X_test = train_test_split(df, test_size=0.2, shuffle=True, random_state=2021)
    y_train = X_train[[id_name, target]]
    X_train = X_train.drop(columns=[id_name, target])
    y_test = X_test[[id_name, target]]
    X_test = X_test.drop(columns=[id_name, target])
    return X_train, X_test, y_train, y_test 
    
df = pd.read_csv("../input/house-prices-advanced-regression-techniques/train.csv")
X_train, X_test, y_train, y_test = exam_data_load(df, target='SalePrice', id_name='Id')

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((1168, 79), (292, 79), (1168, 2), (292, 2))

In [2]:
X_train.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
81,120,RM,32.0,4500,Pave,,Reg,Lvl,AllPub,FR2,...,0,0,,,,0,3,2006,WD,Normal
1418,20,RL,71.0,9204,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,8,2008,COD,Normal
1212,30,RL,50.0,9340,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,8,2009,WD,Normal
588,20,RL,65.0,25095,Pave,,IR1,Low,AllPub,Inside,...,60,0,,,,0,6,2009,WD,Partial
251,120,RM,44.0,4750,Pave,,IR1,HLS,AllPub,Inside,...,153,0,,,,0,12,2007,WD,Family


In [3]:
y_train['SalePrice'].value_counts()

135000    16
140000    14
145000    12
155000    11
110000    11
          ..
171900     1
475000     1
369900     1
136000     1
216000     1
Name: SalePrice, Length: 568, dtype: int64

In [4]:
# 범주형 변수 확인
categorical = []
col = X_train.columns

for c in col:
    val = X_train[c][~X_train[c].isnull()].iloc[0]
    #print(val)
    if str(val).isalpha():
        categorical.append(c)
print(len(categorical))

# 범주형 변수가 너무 많아...

41


In [5]:
# 범주형 변수는 제외
X_train = X_train.select_dtypes(exclude=['object'])
X_test = X_test.select_dtypes(exclude=['object'])

X_train.head()

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
81,120,32.0,4500,6,5,1998,1998,443.0,1201,0,...,405,0,199,0,0,0,0,0,3,2006
1418,20,71.0,9204,5,5,1963,1963,0.0,25,872,...,336,0,88,0,0,0,0,0,8,2008
1212,30,50.0,9340,4,6,1941,1950,0.0,344,0,...,234,0,113,0,0,0,0,0,8,2009
588,20,65.0,25095,5,8,1968,2003,0.0,1324,0,...,452,0,48,0,0,60,0,0,6,2009
251,120,44.0,4750,8,5,2006,2007,481.0,1573,0,...,538,123,0,0,0,153,0,0,12,2007


In [6]:
# 결측값 확인
print(X_train.isnull().sum())
print()
print(X_test.isnull().sum())

MSSubClass         0
LotFrontage      212
LotArea            0
OverallQual        0
OverallCond        0
YearBuilt          0
YearRemodAdd       0
MasVnrArea         6
BsmtFinSF1         0
BsmtFinSF2         0
BsmtUnfSF          0
TotalBsmtSF        0
1stFlrSF           0
2ndFlrSF           0
LowQualFinSF       0
GrLivArea          0
BsmtFullBath       0
BsmtHalfBath       0
FullBath           0
HalfBath           0
BedroomAbvGr       0
KitchenAbvGr       0
TotRmsAbvGrd       0
Fireplaces         0
GarageYrBlt       61
GarageCars         0
GarageArea         0
WoodDeckSF         0
OpenPorchSF        0
EnclosedPorch      0
3SsnPorch          0
ScreenPorch        0
PoolArea           0
MiscVal            0
MoSold             0
YrSold             0
dtype: int64

MSSubClass        0
LotFrontage      47
LotArea           0
OverallQual       0
OverallCond       0
YearBuilt         0
YearRemodAdd      0
MasVnrArea        2
BsmtFinSF1        0
BsmtFinSF2        0
BsmtUnfSF         0
TotalBsmtS

In [16]:
# 결측값 평균으로 대체

cols = ['MasVnrArea', 'LotFrontage', 'GarageYrBlt']

X_train[cols] = X_train[cols].fillna(X_train[cols].mean())

X_test[cols] = X_test[cols].fillna(X_test[cols].mean())

In [17]:
# train/test split

from sklearn.model_selection import train_test_split

X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train['SalePrice'], test_size=0.2)
print(X_tr.shape)
print(y_tr.isnull().sum())

(934, 36)
0


In [25]:
from sklearn.ensemble import RandomForestRegressor

rf_r = RandomForestRegressor(max_depth=4)

rf_r.fit(X_tr, y_tr)
pred = rf_r.predict(X_test)

print("훈련 정확도:", rf_r.score(X_tr, y_tr))
print("테스트 정확도:", rf_r.score(X_val, y_val))

훈련 정확도: 0.8757354668862997
테스트 정확도: 0.7486981975916354


In [32]:
print(y_test['SalePrice'])

1380     58500
520     106250
1175    285000
351     190000
1335    167900
         ...  
1283    139000
1039     80000
61      101000
1395    281213
906     255000
Name: SalePrice, Length: 292, dtype: int64


In [33]:
print(pred)

[108120.9873549  114719.04778589 354315.13724145 202703.12156313
 159600.03940809 217970.85648701 188261.68743823 167140.98057455
 183396.1181207  152675.83085089 120149.29874645 138417.22883104
 155228.23166635 185587.35657489 116776.59813329 303715.87029154
 188955.92093032 166681.6051135  156111.04044147 197477.32124333
 119854.76010063 127042.05915744 237457.59653724 112266.36393716
 339579.3218214  113217.17947459 134166.69846705 163129.49901299
 230012.16886652 139147.14462608 131531.64568736 123934.51918951
 230280.40867535 144738.52929695 164883.74496594 117405.97031014
 185530.14711108 123611.90459615 162300.28538896 204317.96631403
 194580.44422658 117468.62886594 143655.41782583 180777.90423359
 224661.72324895 190293.04862255 136479.90121384 258890.70582814
 187887.84282059 207202.21798826 171592.27273243 323005.43792253
 155740.00828275 258041.90960471 122516.65702679 124629.22446431
 136782.62327359 174330.15529339 213954.50775509 120791.46158965
 178776.84257059 178765.3

In [40]:
from sklearn.metrics import mean_squared_error, r2_score

print(np.sqrt(mean_squared_error(y_test['SalePrice'], pred)))
print(r2_score(y_test['SalePrice'], pred))

31728.322770122042
0.8274893147596327


In [37]:
import sklearn
dir(sklearn.metrics)

['ConfusionMatrixDisplay',
 'PrecisionRecallDisplay',
 'RocCurveDisplay',
 'SCORERS',
 '__all__',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__path__',
 '__spec__',
 '_base',
 '_classification',
 '_pairwise_fast',
 '_plot',
 '_ranking',
 '_regression',
 '_scorer',
 'accuracy_score',
 'adjusted_mutual_info_score',
 'adjusted_rand_score',
 'auc',
 'average_precision_score',
 'balanced_accuracy_score',
 'brier_score_loss',
 'calinski_harabasz_score',
 'check_scoring',
 'classification_report',
 'cluster',
 'cohen_kappa_score',
 'completeness_score',
 'confusion_matrix',
 'consensus_score',
 'coverage_error',
 'davies_bouldin_score',
 'dcg_score',
 'euclidean_distances',
 'explained_variance_score',
 'f1_score',
 'fbeta_score',
 'fowlkes_mallows_score',
 'get_scorer',
 'hamming_loss',
 'hinge_loss',
 'homogeneity_completeness_v_measure',
 'homogeneity_score',
 'jaccard_score',
 'label_ranking_average_precision_score',
 'label_rank

In [42]:
result = pd.DataFrame({'id':y_test['Id'], 'income': pred})
result.head()

Unnamed: 0,id,income
1380,1381,108120.987355
520,521,114719.047786
1175,1176,354315.137241
351,352,202703.121563
1335,1336,159600.039408


In [None]:
#result.to_csv("12345.csv", index=False)