In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

In [4]:
df = pd.read_csv('./house_price.csv')
df.head(3)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500


In [5]:
df.drop('Id', axis=1, inplace=True)

In [6]:
df.isna().sum()

MSSubClass         0
MSZoning           0
LotFrontage      259
LotArea            0
Street             0
                ... 
MoSold             0
YrSold             0
SaleType           0
SaleCondition      0
SalePrice          0
Length: 80, dtype: int64

In [12]:
df_t = df.isna().sum().sort_values(ascending=False).head(5).index

In [13]:
df.drop(df_t, axis=1, inplace=True)

In [15]:
df.isna().sum()

MSSubClass         0
MSZoning           0
LotFrontage      259
LotArea            0
Street             0
                ... 
MoSold             0
YrSold             0
SaleType           0
SaleCondition      0
SalePrice          0
Length: 75, dtype: int64

In [17]:
df.fillna(df.mean(), inplace=True)

In [20]:
df.isna().sum().sort_values(ascending=False)

GarageType      81
GarageCond      81
GarageFinish    81
GarageQual      81
BsmtFinType2    38
                ..
BsmtFinSF2       0
BsmtUnfSF        0
TotalBsmtSF      0
MSZoning         0
SalePrice        0
Length: 75, dtype: int64

In [25]:
df_one = pd.get_dummies(df)
df_one.head(3)

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,60,65.0,8450,7,5,2003,2003,196.0,706,0,...,0,0,0,1,0,0,0,0,1,0
1,20,80.0,9600,6,8,1976,1976,0.0,978,0,...,0,0,0,1,0,0,0,0,1,0
2,60,68.0,11250,7,5,2001,2002,162.0,486,0,...,0,0,0,1,0,0,0,0,1,0


In [26]:
y = df_one.SalePrice
x = df_one.drop('SalePrice', axis=1)

In [30]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2, random_state=100)

In [31]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [32]:
lr = LinearRegression()
rid = Ridge()
las = Lasso()

In [33]:
models = [lr, rid, las]

In [34]:
for m in models:
    m.fit(x_train, y_train)
    pred = m.predict(x_test)
    acc = m.score(x_test, y_test)
    rmse = np.sqrt(mean_squared_error(y_test, pred))
    
    print(m.__class__.__name__)
    print('acc : ', acc)
    print('rmse : ', rmse)
    print('='*100)

LinearRegression
acc :  0.804177791991873
rmse :  35481.80707038126
Ridge
acc :  0.8827116942444783
rmse :  27460.10003318563
Lasso
acc :  0.8063809012451696
rmse :  35281.64742371554


In [35]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [36]:
mim = MinMaxScaler()
std = StandardScaler()

In [37]:
mim.fit(x_train)

x_train_m = mim.transform(x_train)
x_test_m = mim.transform(x_test)

In [38]:
std.fit(x_train)

x_train_std = std.transform(x_train)
x_test_std = std.transform(x_test)

In [39]:
models = [lr, rid, las]

for m in models:
    m.fit(x_train_m, y_train)
    pred = m.predict(x_test_m)
    acc = m.score(x_test_m, y_test)
    rmse = np.sqrt(mean_squared_error(y_test, pred))
    
    print(m.__class__.__name__)
    print('acc : ', acc)
    print('rmse : ', rmse)
    print('='*100)

LinearRegression
acc :  -2.7691912436497524e+20
rmse :  1334293079092986.2
Ridge
acc :  0.8850412391324003
rmse :  27186.030230399694
Lasso
acc :  0.8068272458417818
rmse :  35240.95707088464


In [40]:
models = [lr, rid, las]

for m in models:
    m.fit(x_train_std, y_train)
    pred = m.predict(x_test_std)
    acc = m.score(x_test_std, y_test)
    rmse = np.sqrt(mean_squared_error(y_test, pred))
    
    print(m.__class__.__name__)
    print('acc : ', acc)
    print('rmse : ', rmse)
    print('='*100)

LinearRegression
acc :  -2.271834727573522e+24
rmse :  1.2085458336211141e+17
Ridge
acc :  0.8052338076604678
rmse :  35386.005892488545
Lasso
acc :  0.8052537524156911
rmse :  35384.19401927143


# 질문

In [41]:
y_log = np.log1p(y)

y_train_log = np.log1p(y_train)
y_test_log = np.log1p(y_test)

In [43]:
models = [lr, rid, las]

for m in models:
    m.fit(x_train_std, y_train_log)
    pred = m.predict(x_test_std)
    preds = np.expm1(pred)
#     acc = m.score(x_test_std, y_test)
    rmse = np.sqrt(mean_squared_error(y_test_log, preds))
    
    print(m.__class__.__name__)
#     print('acc : ', acc)
    print('rmse : ', rmse)
    print('='*100)

LinearRegression
rmse :  191854.16497554356
Ridge
rmse :  195691.05097092965
Lasso
rmse :  166596.44500021692


In [44]:
from sklearn.model_selection import GridSearchCV

In [45]:
rid_params = {'alpha': [0.005, 0.1, 1, 5, 8, 10, 12, 15, 20, 30, 50, 700]}
las_params = {'alpha': [0.001, 0.005, 0.008, 0.05, 0.03, 0.1, 0.5, 1.5, 10]}

In [46]:
rid_g = Ridge()
las_g = Lasso()

In [47]:
gb = GridSearchCV(rid_g, param_grid=rid_params, refit=True, verbose=False)
gb.fit(x_train_std, y_train)

GridSearchCV(estimator=Ridge(),
             param_grid={'alpha': [0.005, 0.1, 1, 5, 8, 10, 12, 15, 20, 30, 50,
                                   700]},
             verbose=False)

In [48]:
gb.best_params_

{'alpha': 700}

In [49]:
gb.best_score_

0.8318239277255011

In [51]:
gb = GridSearchCV(las_g, param_grid=las_params, refit=True, verbose=False)
gb.fit(x_train_std, y_train)

GridSearchCV(estimator=Lasso(),
             param_grid={'alpha': [0.001, 0.005, 0.008, 0.05, 0.03, 0.1, 0.5,
                                   1.5, 10]},
             verbose=False)

In [52]:
gb.best_params_

{'alpha': 10}

In [53]:
gb.best_score_

0.7844012183079799

In [55]:
rid_g = Ridge(alpha=700)
las_g = Lasso(alpha=10)

rid_g.fit(x_train_std, y_train)
las_g.fit(x_train_std, y_train)

print(rid_g.score(x_test_std, y_test))
print(las_g.score(x_test_std, y_test))

0.8723836518937673
0.8063054226906495


In [None]:
# feature importance

In [56]:
df_1 = pd.read_csv('./house_price.csv')
df_1.head(3)

df_1.drop('Id', axis=1, inplace=True)
df_t = df.isna().sum().sort_values(ascending=False).head(5).index

df_1.drop(df_t, axis=1, inplace=True)
df_1.fillna(df.mean(), inplace=True)

In [57]:
y = df_1.SalePrice
x = df_1.drop('SalePrice', axis=1)

In [58]:
x_ohe = pd.get_dummies(x)

In [59]:
scaler = MinMaxScaler()
scaler.fit_transform(x_ohe)

array([[0.23529412, 0.15068493, 0.0334198 , ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.20205479, 0.03879502, ..., 0.        , 1.        ,
        0.        ],
       [0.23529412, 0.1609589 , 0.04650728, ..., 0.        , 1.        ,
        0.        ],
       ...,
       [0.29411765, 0.15410959, 0.03618687, ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.1609589 , 0.03934189, ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.18493151, 0.04037019, ..., 0.        , 1.        ,
        0.        ]])

In [61]:
x_ohe_scaled = pd.DataFrame(x_ohe, columns = x_ohe.columns)
x_ohe_scaled.head()

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,60,65.0,8450,7,5,2003,2003,196.0,706,0,...,0,0,0,1,0,0,0,0,1,0
1,20,80.0,9600,6,8,1976,1976,0.0,978,0,...,0,0,0,1,0,0,0,0,1,0
2,60,68.0,11250,7,5,2001,2002,162.0,486,0,...,0,0,0,1,0,0,0,0,1,0
3,70,60.0,9550,7,5,1915,1970,0.0,216,0,...,0,0,0,1,1,0,0,0,0,0
4,60,84.0,14260,8,5,2000,2000,350.0,655,0,...,0,0,0,1,0,0,0,0,1,0


In [62]:
x_train, x_test, y_train, y_test = train_test_split(x_ohe_scaled, y, random_state=111)

In [63]:
lr = LinearRegression()
rid = Ridge()
las = Lasso()

models = [lr, rid, las]
trained_models = []

for m in models:
    m.fit(x_train, y_train)
    trained_models.append(m)
    preds = m.predict(x_test)
    accuracy = m.score(x_test, y_test)
    rmse = np.sqrt(mean_squared_error(y_test, preds))
    
    print('--------')
    print(m.__class__.__name__)
    print('accuracy :', accuracy)
    print('rmse :', rmse)

--------
LinearRegression
accuracy : 0.8693518439386663
rmse : 31774.817678926014
--------
Ridge
accuracy : 0.865957519933437
rmse : 32184.936180360095
--------
Lasso
accuracy : 0.8792942031191979
rmse : 30541.86047133667
