In [122]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score



data = pd.read_csv('kc_house_data.csv')
print(data.shape)
data.head(3)


(21613, 21)


Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3,1.0,1180,5650,1.0,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,20141209T000000,538000.0,3,2.25,2570,7242,2.0,0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,5631500400,20150225T000000,180000.0,2,1.0,770,10000,1.0,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062


In [123]:
data = data.drop(["date"], axis=1)

In [124]:
from sklearn.utils import shuffle

data = shuffle(data)

In [125]:
data = data.sample(frac=1, random_state=42)
data.head()

Unnamed: 0,id,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
5270,205000010,620000.0,4,2.5,2450,55387,2.0,0,0,3,9,2450,0,1994,0,98053,47.6323,-121.985,2730,38827
14507,4136930360,359800.0,4,2.5,2390,6426,2.0,0,0,3,9,2390,0,1999,0,98092,47.2586,-122.221,2520,6700
8237,293000145,250000.0,4,1.0,1440,7404,1.0,0,0,3,6,1080,360,1918,0,98126,47.5328,-122.379,1620,7436
14772,1240100065,807500.0,4,2.5,3190,24170,2.0,0,0,3,10,3190,0,2002,0,98074,47.6209,-122.052,2110,26321
6874,1277000020,915000.0,4,2.5,3210,8532,2.0,0,0,3,10,3210,0,1998,0,98007,47.625,-122.144,2950,6753


In [126]:
data.info()


<class 'pandas.core.frame.DataFrame'>
Index: 21613 entries, 5270 to 10287
Data columns (total 20 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             21613 non-null  int64  
 1   price          21613 non-null  float64
 2   bedrooms       21613 non-null  int64  
 3   bathrooms      21613 non-null  float64
 4   sqft_living    21613 non-null  int64  
 5   sqft_lot       21613 non-null  int64  
 6   floors         21613 non-null  float64
 7   waterfront     21613 non-null  int64  
 8   view           21613 non-null  int64  
 9   condition      21613 non-null  int64  
 10  grade          21613 non-null  int64  
 11  sqft_above     21613 non-null  int64  
 12  sqft_basement  21613 non-null  int64  
 13  yr_built       21613 non-null  int64  
 14  yr_renovated   21613 non-null  int64  
 15  zipcode        21613 non-null  int64  
 16  lat            21613 non-null  float64
 17  long           21613 non-null  float64
 18  sqft_liv

In [127]:
#상관관계 따지기
data.corr()['price'].sort_values(ascending=False)

price            1.000000
sqft_living      0.702035
grade            0.667434
sqft_above       0.605567
sqft_living15    0.585379
bathrooms        0.525138
view             0.397293
sqft_basement    0.323816
bedrooms         0.308350
lat              0.307003
waterfront       0.266369
floors           0.256794
yr_renovated     0.126434
sqft_lot         0.089661
sqft_lot15       0.082447
yr_built         0.054012
condition        0.036362
long             0.021626
id              -0.016762
zipcode         -0.053203
Name: price, dtype: float64

## 특성추가

In [128]:
new_data = data.copy()

In [129]:
# 관계가 높은 특성을 이용하여 새로운 특성 만들기
new_data['sqft_living*grade'] = new_data['sqft_living'] * new_data['grade']
new_data['sqft_living*sqft_above'] = new_data['sqft_living'] * new_data['sqft_above']
new_data['sqft_living*sqft_living15'] = new_data['sqft_living'] * new_data['sqft_living15']
new_data['sqft_living*sqft_bathrooms'] = new_data['sqft_living'] * new_data['bathrooms']
new_data['sqft_living*sqft_view'] = new_data['sqft_living'] * new_data['view']

new_data['sqft_living_sum_grade'] = new_data['sqft_living'] + new_data['grade']
new_data['sqft_living_sum_sqft_above'] = new_data['sqft_living'] + new_data['sqft_above']
new_data['sqft_living_sum_sqft_living15'] = new_data['sqft_living'] + new_data['sqft_living15']
new_data['sqft_living_sum_sqft_bathrooms'] = new_data['sqft_living'] + new_data['bathrooms']
new_data['sqft_living_sum_sqft_view'] = new_data['sqft_living'] + new_data['view']

new_data['sqft_living - grade'] = new_data['sqft_living'] - new_data['grade']
new_data['sqft_living - sqft_above'] = new_data['sqft_living'] - new_data['sqft_above']
new_data['sqft_living - sqft_living15'] = new_data['sqft_living'] - new_data['sqft_living15']
new_data['sqft_living - bathrooms'] = new_data['sqft_living'] - new_data['bathrooms']
new_data['sqft_living - view'] = new_data['sqft_living'] - new_data['view']


new_data['sqft_living_sum_grade / 2'] = (new_data['sqft_living'] + new_data['grade'])/2
new_data['sqft_living_sum_sqft_above / 2'] = (new_data['sqft_living'] + new_data['sqft_above'])/2
new_data['sqft_living_sum_sqft_living15 / 2'] = (new_data['sqft_living'] + new_data['sqft_living15'])/2
new_data['sqft_living_sum_sqft_bathrooms / 2'] = (new_data['sqft_living'] + new_data['bathrooms'])/2
new_data['sqft_living_sum_sqft_view / 2'] = (new_data['sqft_living'] + new_data['view'])/2


In [130]:
new_data.corr()['price'].sort_values(ascending=False)

price                                 1.000000
sqft_living*grade                     0.756279
sqft_living*sqft_living15             0.713117
sqft_living*sqft_bathrooms            0.707134
sqft_living_sum_grade / 2             0.702204
sqft_living_sum_grade                 0.702204
sqft_living_sum_sqft_view             0.702200
sqft_living_sum_sqft_view / 2         0.702200
sqft_living - bathrooms               0.702039
sqft_living                           0.702035
sqft_living_sum_sqft_bathrooms        0.702031
sqft_living_sum_sqft_bathrooms / 2    0.702031
sqft_living - view                    0.701870
sqft_living - grade                   0.701866
sqft_living*sqft_above                0.698087
sqft_living_sum_sqft_living15         0.694920
sqft_living_sum_sqft_living15 / 2     0.694920
sqft_living_sum_sqft_above            0.677472
sqft_living_sum_sqft_above / 2        0.677472
grade                                 0.667434
sqft_above                            0.605567
sqft_living15

## 이상치 대체 하기

In [131]:
new_data.columns


Index(['id', 'price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot',
       'floors', 'waterfront', 'view', 'condition', 'grade', 'sqft_above',
       'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode', 'lat', 'long',
       'sqft_living15', 'sqft_lot15', 'sqft_living*grade',
       'sqft_living*sqft_above', 'sqft_living*sqft_living15',
       'sqft_living*sqft_bathrooms', 'sqft_living*sqft_view',
       'sqft_living_sum_grade', 'sqft_living_sum_sqft_above',
       'sqft_living_sum_sqft_living15', 'sqft_living_sum_sqft_bathrooms',
       'sqft_living_sum_sqft_view', 'sqft_living - grade',
       'sqft_living - sqft_above', 'sqft_living - sqft_living15',
       'sqft_living - bathrooms', 'sqft_living - view',
       'sqft_living_sum_grade / 2', 'sqft_living_sum_sqft_above / 2',
       'sqft_living_sum_sqft_living15 / 2',
       'sqft_living_sum_sqft_bathrooms / 2', 'sqft_living_sum_sqft_view / 2'],
      dtype='object')

In [132]:
# import numpy as np

# def replace_outliers_with_mean(df):
#     '''replaces outliers with the mean value for specific columns'''
#     variables = list(new_data.columns.values)
#     for variable in variables:
#         mean_value = new_data[variable].mean()
#         std_value = new_data[variable].std()
#         outliers = (new_data[variable] - mean_value).abs() > 3 * std_value
#         new_data.loc[outliers, variable] = mean_value
        
#     return new_data

# 이상치 대신 중앙값으로 처리하는 함수
def replace_outliers_with_median(df):
    variables = list(df.columns.values)
    for variable in variables:
        median_value = df[variable].median()
        std_value = df[variable].std()
        outliers = (df[variable] - median_value).abs() > 3 * std_value
        df.loc[outliers, variable] = median_value
    return df

In [133]:
new_data = replace_outliers_with_median(new_data)

# 데이터 분할

In [134]:
housing_prepared = new_data.drop(["price"], axis=1)
housing_labels = new_data["price"].copy()

In [135]:
def display_scores(scores):
    print('Scores:', scores)
    print('Mean:', scores.mean())
    print('Standard deviation:', scores.std())

In [136]:
from lightgbm import LGBMRegressor

lgbm = LGBMRegressor(    max_depth=10,  
    min_data_in_leaf=29,
    feature_fraction=1.0,
    n_estimators=150,
    bagging_fraction=0.1,
    min_gain_to_split=1,
    learning_rate=0.1,
    num_leaves=84, 
    random_state=50,
    force_col_wise=True
)
lgbm_scores = cross_val_score(lgbm, housing_prepared, housing_labels, scoring="neg_mean_squared_error", cv = 10)
lgbm_rmse_scores = np.sqrt(-lgbm_scores)

[LightGBM] [Info] Total Bins 7385
[LightGBM] [Info] Number of data points in the train set: 19451, number of used features: 37
[LightGBM] [Info] Start training from score 501500.302195


[LightGBM] [Info] Total Bins 7380
[LightGBM] [Info] Number of data points in the train set: 19451, number of used features: 37
[LightGBM] [Info] Start training from score 502108.861035
[LightGBM] [Info] Total Bins 7381
[LightGBM] [Info] Number of data points in the train set: 19451, number of used features: 37
[LightGBM] [Info] Start training from score 501727.213357
[LightGBM] [Info] Total Bins 7385
[LightGBM] [Info] Number of data points in the train set: 19452, number of used features: 37
[LightGBM] [Info] Start training from score 500978.620296
[LightGBM] [Info] Total Bins 7386
[LightGBM] [Info] Number of data points in the train set: 19452, number of used features: 37
[LightGBM] [Info] Start training from score 502117.406128
[LightGBM] [Info] Total Bins 7386
[LightGBM] [Info] Number of data points in the train set: 19452, number of used features: 37
[LightGBM] [Info] Start training from score 501728.497327
[LightGBM] [Info] Total Bins 7385
[LightGBM] [Info] Number of data points i

In [137]:
display_scores(lgbm_rmse_scores)

Scores: [119134.4505949  120026.15955154 120514.47980284 119106.99206716
 114166.94395843 115291.63677458 120662.58305653 117672.83746075
 115711.42097531 127002.29769652]
Mean: 118928.98019385539
Standard deviation: 3461.4042005827678


### 10-CV Result
# Scroe : 118928.98019385539

In [138]:
housing_prepared.shape

(21613, 39)