In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score



# Load the housing dataset; the path is relative to the notebook's working directory.
data = pd.read_csv('kc_house_data.csv')
# Quick sanity check of the frame size as (rows, columns).
print(data.shape)
# Preview the first three rows.
data.head(3)


(21613, 21)


Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3,1.0,1180,5650,1.0,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,20141209T000000,538000.0,3,2.25,2570,7242,2.0,0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,5631500400,20150225T000000,180000.0,2,1.0,770,10000,1.0,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062


In [2]:
# Discard the sale-date column; it is not used in the rest of the analysis.
data = data.drop(columns=["date"])

In [3]:
from sklearn.utils import shuffle

# Shuffle the rows with a fixed seed so the row order is identical on every
# run. An unseeded shuffle here makes every later step that depends on row
# order non-reproducible after a kernel restart.
data = shuffle(data, random_state=42)

In [4]:
# Reorder all rows (frac=1 keeps every row) using a fixed seed.
# NOTE(review): the seed fixes which positions are drawn, so the resulting
# order is reproducible only if the order of `data` entering this cell is
# itself deterministic.
data = data.sample(frac=1, random_state=42)
data.head()

Unnamed: 0,id,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
4472,7504050090,720000.0,3,2.5,2820,14250,2.0,0,0,3,11,2820,0,1991,0,98074,47.6396,-122.054,2820,12600
15621,2019200480,220000.0,3,2.25,1470,7518,1.0,0,0,3,7,1160,310,1985,0,98003,47.2725,-122.3,1720,8300
14439,3904980360,495000.0,3,2.5,1800,7318,2.0,0,0,3,8,1800,0,1989,0,98029,47.5747,-122.008,1800,5414
12922,1938400410,275000.0,3,1.75,1650,7700,1.0,0,0,4,8,1650,0,1977,0,98023,47.3155,-122.365,2020,7700
2300,5152100160,357000.0,3,1.75,2400,14012,1.0,0,0,3,9,2400,0,1971,0,98003,47.3371,-122.325,2800,13988


In [5]:
# Summarize column dtypes and non-null counts to check for missing values.
data.info()


<class 'pandas.core.frame.DataFrame'>
Index: 21613 entries, 4472 to 3452
Data columns (total 20 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             21613 non-null  int64  
 1   price          21613 non-null  float64
 2   bedrooms       21613 non-null  int64  
 3   bathrooms      21613 non-null  float64
 4   sqft_living    21613 non-null  int64  
 5   sqft_lot       21613 non-null  int64  
 6   floors         21613 non-null  float64
 7   waterfront     21613 non-null  int64  
 8   view           21613 non-null  int64  
 9   condition      21613 non-null  int64  
 10  grade          21613 non-null  int64  
 11  sqft_above     21613 non-null  int64  
 12  sqft_basement  21613 non-null  int64  
 13  yr_built       21613 non-null  int64  
 14  yr_renovated   21613 non-null  int64  
 15  zipcode        21613 non-null  int64  
 16  lat            21613 non-null  float64
 17  long           21613 non-null  float64
 18  sqft_livi

In [6]:
# Examine correlations: rank every column by its correlation with price,
# strongest positive first.
data.corr()['price'].sort_values(ascending=False)

price            1.000000
sqft_living      0.702035
grade            0.667434
sqft_above       0.605567
sqft_living15    0.585379
bathrooms        0.525138
view             0.397293
sqft_basement    0.323816
bedrooms         0.308350
lat              0.307003
waterfront       0.266369
floors           0.256794
yr_renovated     0.126434
sqft_lot         0.089661
sqft_lot15       0.082447
yr_built         0.054012
condition        0.036362
long             0.021626
id              -0.016762
zipcode         -0.053203
Name: price, dtype: float64

## 특성추가

In [7]:
# Work on a copy so the feature engineering below leaves `data` untouched.
new_data = data.copy()

In [8]:
# Build new features by combining sqft_living with the other columns that
# correlate strongly with price.
base = new_data['sqft_living']

# (source column, label used in the product / sum column names).
# The labels for bathrooms and view carry a historical "sqft_" prefix; they are
# kept as-is so the generated column names do not change.
partners = [
    ('grade', 'grade'),
    ('sqft_above', 'sqft_above'),
    ('sqft_living15', 'sqft_living15'),
    ('bathrooms', 'sqft_bathrooms'),
    ('view', 'sqft_view'),
]

# Each family is added in its own pass so the resulting column order is:
# all products, then all sums, then all differences, then all half-sums.
for col, label in partners:
    new_data[f'sqft_living*{label}'] = base * new_data[col]

for col, label in partners:
    new_data[f'sqft_living_sum_{label}'] = base + new_data[col]

# Difference columns are named after the source column itself.
for col, _ in partners:
    new_data[f'sqft_living - {col}'] = base - new_data[col]

for col, label in partners:
    new_data[f'sqft_living_sum_{label} / 2'] = (base + new_data[col]) / 2



In [9]:
new_data.corr()['price'].sort_values(ascending=False)

price                                 1.000000
sqft_living*grade                     0.756279
sqft_living*sqft_living15             0.713117
sqft_living*sqft_bathrooms            0.707134
sqft_living_sum_grade / 2             0.702204
sqft_living_sum_grade                 0.702204
sqft_living_sum_sqft_view             0.702200
sqft_living_sum_sqft_view / 2         0.702200
sqft_living - bathrooms               0.702039
sqft_living                           0.702035
sqft_living_sum_sqft_bathrooms        0.702031
sqft_living_sum_sqft_bathrooms / 2    0.702031
sqft_living - view                    0.701870
sqft_living - grade                   0.701866
sqft_living*sqft_above                0.698087
sqft_living_sum_sqft_living15         0.694920
sqft_living_sum_sqft_living15 / 2     0.694920
sqft_living_sum_sqft_above            0.677472
sqft_living_sum_sqft_above / 2        0.677472
grade                                 0.667434
sqft_above                            0.605567
sqft_living15

## 이상치 대체 하기

In [10]:
new_data.columns


Index(['id', 'price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot',
       'floors', 'waterfront', 'view', 'condition', 'grade', 'sqft_above',
       'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode', 'lat', 'long',
       'sqft_living15', 'sqft_lot15', 'sqft_living*grade',
       'sqft_living*sqft_above', 'sqft_living*sqft_living15',
       'sqft_living*sqft_bathrooms', 'sqft_living*sqft_view',
       'sqft_living_sum_grade', 'sqft_living_sum_sqft_above',
       'sqft_living_sum_sqft_living15', 'sqft_living_sum_sqft_bathrooms',
       'sqft_living_sum_sqft_view', 'sqft_living - grade',
       'sqft_living - sqft_above', 'sqft_living - sqft_living15',
       'sqft_living - bathrooms', 'sqft_living - view',
       'sqft_living_sum_grade / 2', 'sqft_living_sum_sqft_above / 2',
       'sqft_living_sum_sqft_living15 / 2',
       'sqft_living_sum_sqft_bathrooms / 2', 'sqft_living_sum_sqft_view / 2'],
      dtype='object')

In [11]:
# import numpy as np

# def replace_outliers_with_mean(df):
#     '''replaces outliers with the mean value for specific columns'''
#     variables = list(new_data.columns.values)
#     for variable in variables:
#         mean_value = new_data[variable].mean()
#         std_value = new_data[variable].std()
#         outliers = (new_data[variable] - mean_value).abs() > 3 * std_value
#         new_data.loc[outliers, variable] = mean_value
        
#     return new_data

# Replace outliers with the column median.
def replace_outliers_with_median(df, columns=None, n_std=3):
    """Return a copy of ``df`` with outliers replaced by the column median.

    A value is treated as an outlier when its absolute distance from the
    column median is greater than ``n_std`` times the column's standard
    deviation.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified; a copy is returned.
    columns : iterable of str, optional
        Columns to process. When omitted, every numeric column is processed
        and non-numeric columns are passed through unchanged.
    n_std : float, default 3
        Width of the accepted band around the median, in standard deviations.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns and index as ``df``.
    """
    cleaned = df.copy()
    if columns is None:
        # A median/std is only defined for numeric data, so skip everything else.
        columns = cleaned.select_dtypes(include='number').columns

    for column in columns:
        values = cleaned[column]
        median_value = values.median()
        outliers = (values - median_value).abs() > n_std * values.std()
        if not outliers.any():
            continue

        if values.dtype.kind in 'iu':
            # median() returns a float even for integer columns. Keep the
            # integer dtype when the median is a whole number, otherwise
            # upcast explicitly instead of relying on an implicit cast.
            if float(median_value).is_integer():
                median_value = int(median_value)
            else:
                cleaned[column] = values.astype('float64')

        cleaned.loc[outliers, column] = median_value
    return cleaned

In [12]:
# Replace values lying more than three standard deviations from the median, column by column.
# NOTE(review): new_data still contains the target `price` as well as
# flag/identifier-like columns (`waterfront`, `id`, `zipcode`), so those are
# rewritten too, and the statistics are computed on the full dataset before
# cross-validation. Confirm this is intended.
new_data = replace_outliers_with_median(new_data)

# 데이터 분할

In [13]:
# Separate the target (price) from the feature matrix.
housing_labels = new_data["price"].copy()
housing_prepared = new_data.drop(columns=["price"])

In [14]:
def display_scores(scores):
    """Print the raw scores followed by their mean and standard deviation.

    ``scores`` must provide ``mean()`` and ``std()`` methods (for example a
    NumPy array of cross-validation scores).
    """
    print('Scores:', scores)
    # Each summary statistic is looked up by method name and printed in turn.
    for label, stat_name in (('Mean:', 'mean'), ('Standard deviation:', 'std')):
        print(label, getattr(scores, stat_name)())

In [15]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

# Gradient-boosted trees with a Huber loss. max_features='sqrt' makes every
# split consider a random subset of the features, so random_state is fixed to
# make the cross-validation scores reproducible from run to run.
gdb_reg = GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05, max_depth=5, max_features='sqrt',
                                   min_samples_leaf=15, min_samples_split=10, loss='huber',
                                   random_state=42)
# 10-fold cross-validation. scikit-learn reports negated MSE, so the sign is
# flipped before taking the square root to obtain RMSE per fold.
# Despite the "lgbm" prefix these are GradientBoostingRegressor scores; the
# names are kept because a later cell reads lgbm_rmse_scores.
lgbm_scores = cross_val_score(gdb_reg, housing_prepared, housing_labels, scoring="neg_mean_squared_error", cv = 10)
lgbm_rmse_scores = np.sqrt(-lgbm_scores)

In [16]:
# Report the per-fold RMSE values together with their mean and spread.
display_scores(lgbm_rmse_scores)

Scores: [123877.32156657 115703.77290665 127884.52663363 120345.1959468
 122572.18454891 109910.33454166 122063.04068957 120068.22783432
 122254.54253385 118642.14520941]
Mean: 120332.12924113785
Standard deviation: 4629.017367936317


### 10-CV Result
# Score : 118928.98019385539

In [17]:
# Check the final size of the feature matrix as (rows, feature columns).
housing_prepared.shape

(21613, 39)