In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import gc
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import lightgbm as lgb
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
%matplotlib inline

!pip install xgboost
import xgboost as xgb



In [2]:
df = pd.read_csv('.csv')
df_test = pd.read_csv('test.csv')

print(df.shape)

(15031, 37)


In [3]:
# 가격과 가장 상관관계가 높은 순서대로 스피어만 순위 상관관계 나타냄(히트맵)

import scipy as sp

#abs는 절댓값 함수! 이는 반비례 관계(마이너스)도 고려하기 위함. 
cor_abs = abs(df.corr(method='spearman'))

# price와 상관관계가 높은 column 10개 뽑기
# nlargest : 컬렉션 내부에서 가장 큰 n개의 아이템 찾으려 할 때
# .index : index 값 추출
cor_cols = cor_abs.nlargest(n=10, columns='price').index

#spearman coefficient matrix
#.values : 딕셔너리 value 값을 보여줌
cor = np.array(sp.stats.spearmanr(df[cor_cols].values))[0] #상관계수 추출

#상관계수 히트맵 그리기
plt.figure( figsize = (10,10))
sns.set( font_scale=1.25)
#annot = True : 숫자 표시, annot_kws : 숫자 크기
sns.heatmap(cor, fmt='.2f', annot=True, annot_kws={'size':10}, 
           xticklabels=cor_cols.values, yticklabels=cor_cols.values)

ValueError: Cannot apply_along_axis when any iteration dimensions are 0

In [4]:
df_test.head()

Unnamed: 0,id,date,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,...,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15,totla_rooms,is_renovated,living_ratio(lot)
0,15035,20141209,3,2.25,2570,7242,2.0,0,0,3,...,1951,1991,98125,47.721,-122.319,1690,7639,5.25,1,0.354874
1,15036,20141209,4,3.0,1960,5000,1.0,0,0,5,...,1965,1965,98136,47.5208,-122.393,1360,5000,7.0,0,0.392
2,15037,20140512,4,4.5,5420,101930,1.0,0,0,3,...,2001,2001,98053,47.6561,-122.005,4760,101930,8.5,0,0.053174
3,15038,20150415,3,1.0,1780,7470,1.0,0,0,3,...,1960,1960,98146,47.5123,-122.337,1780,8113,4.0,0,0.238286
4,15039,20150312,3,2.5,1890,6560,2.0,0,0,3,...,2003,2003,98038,47.3684,-122.031,2390,7570,5.5,0,0.28811


# How to use lat / long in house price

In [5]:
import warnings
warnings.filterwarnings('ignore')

from sklearn.preprocessing import LabelEncoder, StandardScaler, RobustScaler
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

import xgboost as xgb
import lightgbm as lgb

# plot 의 defaults 를 지정해준다. 
plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = (12,8) #그림 사이즈
plt.rcParams['font.size'] = 12 #글씨 크기

#pandas가 어디까지 보여줄 것인가?
pd.options.display.max_rows = 10000
pd.options.display.max_columns = 10000
pd.options.display.max_colwidth = 1000


In [6]:
#난수 생성 (using Numpy) : 난수 시작 숫자(seed)를 정해주면 컴퓨터는 정해진 알고리즘에 따라
#난수처럼 보이는 수열을 생성한다. (seed를 지정하면 항상 같은 숫자가 나타난다)

random_seed = 42 

np.random.seed(random_seed)

## Baseline Model :   LightGBM 
(5-Fold out of fold prediction)   
**categorical data : One Hot incoding 처리**

In [7]:
# Baseline model 의 CV 스코어

# df 데이터를 train/ test 데이터로 나누기
from sklearn.model_selection import train_test_split

train_data = df.drop(['price'], axis=1)
target_data = df['price']
x_train, x_valid, y_train, y_valid = train_test_split(train_data, target_data, random_state=1, test_size=0.2)
x_test = df_test


print(train_data.shape, x_test.shape)

(15031, 24) (6468, 23)


In [10]:
df_test.head()

Unnamed: 0,id,date,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15,totla_rooms,is_renovated,living_ratio(lot)
0,15035,20141209,3,2.25,2570,7242,2.0,0,0,3,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639,5.25,1,0.354874
1,15036,20141209,4,3.0,1960,5000,1.0,0,0,5,7,1050,910,1965,1965,98136,47.5208,-122.393,1360,5000,7.0,0,0.392
2,15037,20140512,4,4.5,5420,101930,1.0,0,0,3,11,3890,1530,2001,2001,98053,47.6561,-122.005,4760,101930,8.5,0,0.053174
3,15038,20150415,3,1.0,1780,7470,1.0,0,0,3,7,1050,730,1960,1960,98146,47.5123,-122.337,1780,8113,4.0,0,0.238286
4,15039,20150312,3,2.5,1890,6560,2.0,0,0,3,7,1890,0,2003,2003,98038,47.3684,-122.031,2390,7570,5.5,0,0.28811


In [8]:
# lgb_param 입력 (하이퍼파라미터 튜닝)

import lightgbm as lgb

train_ds = lgb.Dataset(x_train, label = y_train)
test_ds = lgb.Dataset(x_valid, label = y_valid)

params = {
    'objective': 'regression', #method
    'learning_rate': 0.05, #학습률
    'num_leaves': 15,  #하나의 트리가 가질 수 있는 최대 리프 개수
    'bagging_fraction': 0.7, #배깅 사용
    'bagging_freq': 1,
    'feature_fraction': 0.7, #트리를 학습할 때마다 선택하는 feature의 비율
    'seed': random_seed, 
    'metric': ['rmse'], #평가 지표
}

model = lgb.train(params, train_ds, 1000, test_ds, early_stopping_rounds=100)

[1]	valid_0's rmse: 0.519277
Training until validation scores don't improve for 100 rounds
[2]	valid_0's rmse: 0.500567
[3]	valid_0's rmse: 0.486476
[4]	valid_0's rmse: 0.472733
[5]	valid_0's rmse: 0.460336
[6]	valid_0's rmse: 0.445143
[7]	valid_0's rmse: 0.430836
[8]	valid_0's rmse: 0.417236
[9]	valid_0's rmse: 0.40427
[10]	valid_0's rmse: 0.392076
[11]	valid_0's rmse: 0.380773
[12]	valid_0's rmse: 0.369949
[13]	valid_0's rmse: 0.359715
[14]	valid_0's rmse: 0.350283
[15]	valid_0's rmse: 0.341474
[16]	valid_0's rmse: 0.333067
[17]	valid_0's rmse: 0.325173
[18]	valid_0's rmse: 0.318898
[19]	valid_0's rmse: 0.31293
[20]	valid_0's rmse: 0.306462
[21]	valid_0's rmse: 0.299658
[22]	valid_0's rmse: 0.293439
[23]	valid_0's rmse: 0.287572
[24]	valid_0's rmse: 0.283115
[25]	valid_0's rmse: 0.278764
[26]	valid_0's rmse: 0.273896
[27]	valid_0's rmse: 0.269085
[28]	valid_0's rmse: 0.26478
[29]	valid_0's rmse: 0.260513
[30]	valid_0's rmse: 0.256843
[31]	valid_0's rmse: 0.252895
[32]	valid_0's rmse:

[289]	valid_0's rmse: 0.1624
[290]	valid_0's rmse: 0.162235
[291]	valid_0's rmse: 0.162144
[292]	valid_0's rmse: 0.162139
[293]	valid_0's rmse: 0.162135
[294]	valid_0's rmse: 0.162116
[295]	valid_0's rmse: 0.162017
[296]	valid_0's rmse: 0.162008
[297]	valid_0's rmse: 0.161919
[298]	valid_0's rmse: 0.161782
[299]	valid_0's rmse: 0.161631
[300]	valid_0's rmse: 0.1616
[301]	valid_0's rmse: 0.161483
[302]	valid_0's rmse: 0.161445
[303]	valid_0's rmse: 0.161389
[304]	valid_0's rmse: 0.161374
[305]	valid_0's rmse: 0.161339
[306]	valid_0's rmse: 0.161309
[307]	valid_0's rmse: 0.16126
[308]	valid_0's rmse: 0.161237
[309]	valid_0's rmse: 0.161208
[310]	valid_0's rmse: 0.161138
[311]	valid_0's rmse: 0.161168
[312]	valid_0's rmse: 0.161139
[313]	valid_0's rmse: 0.16116
[314]	valid_0's rmse: 0.161149
[315]	valid_0's rmse: 0.161019
[316]	valid_0's rmse: 0.160978
[317]	valid_0's rmse: 0.160938
[318]	valid_0's rmse: 0.160894
[319]	valid_0's rmse: 0.160863
[320]	valid_0's rmse: 0.160824
[321]	valid_0'

[580]	valid_0's rmse: 0.150958
[581]	valid_0's rmse: 0.150892
[582]	valid_0's rmse: 0.150818
[583]	valid_0's rmse: 0.150797
[584]	valid_0's rmse: 0.150791
[585]	valid_0's rmse: 0.150729
[586]	valid_0's rmse: 0.150722
[587]	valid_0's rmse: 0.15065
[588]	valid_0's rmse: 0.150578
[589]	valid_0's rmse: 0.150541
[590]	valid_0's rmse: 0.150534
[591]	valid_0's rmse: 0.150531
[592]	valid_0's rmse: 0.150533
[593]	valid_0's rmse: 0.150434
[594]	valid_0's rmse: 0.15039
[595]	valid_0's rmse: 0.150379
[596]	valid_0's rmse: 0.150381
[597]	valid_0's rmse: 0.15036
[598]	valid_0's rmse: 0.150333
[599]	valid_0's rmse: 0.15034
[600]	valid_0's rmse: 0.150342
[601]	valid_0's rmse: 0.150347
[602]	valid_0's rmse: 0.150345
[603]	valid_0's rmse: 0.150319
[604]	valid_0's rmse: 0.150275
[605]	valid_0's rmse: 0.15024
[606]	valid_0's rmse: 0.150242
[607]	valid_0's rmse: 0.150235
[608]	valid_0's rmse: 0.15022
[609]	valid_0's rmse: 0.150224
[610]	valid_0's rmse: 0.150149
[611]	valid_0's rmse: 0.150102
[612]	valid_0'

[846]	valid_0's rmse: 0.144535
[847]	valid_0's rmse: 0.144531
[848]	valid_0's rmse: 0.144522
[849]	valid_0's rmse: 0.144497
[850]	valid_0's rmse: 0.144471
[851]	valid_0's rmse: 0.144363
[852]	valid_0's rmse: 0.144251
[853]	valid_0's rmse: 0.144262
[854]	valid_0's rmse: 0.144229
[855]	valid_0's rmse: 0.144215
[856]	valid_0's rmse: 0.144192
[857]	valid_0's rmse: 0.1442
[858]	valid_0's rmse: 0.144181
[859]	valid_0's rmse: 0.144191
[860]	valid_0's rmse: 0.144166
[861]	valid_0's rmse: 0.144144
[862]	valid_0's rmse: 0.144125
[863]	valid_0's rmse: 0.144092
[864]	valid_0's rmse: 0.144055
[865]	valid_0's rmse: 0.144064
[866]	valid_0's rmse: 0.143993
[867]	valid_0's rmse: 0.143981
[868]	valid_0's rmse: 0.14394
[869]	valid_0's rmse: 0.143944
[870]	valid_0's rmse: 0.143933
[871]	valid_0's rmse: 0.143859
[872]	valid_0's rmse: 0.143856
[873]	valid_0's rmse: 0.14386
[874]	valid_0's rmse: 0.143845
[875]	valid_0's rmse: 0.143747
[876]	valid_0's rmse: 0.143721
[877]	valid_0's rmse: 0.143719
[878]	valid_

In [9]:
prediction = model.predict(x_test)
prediction

LightGBMError: The number of features in data (23) is not the same as it was in training data (24).

In [None]:
#변수 중요도 파악하기

fig, ax = plt.subplots(figsize=(10,6))
lgb.plot_importance(model, ax=ax)

In [None]:
# zipcode와 price의 관계

data = pd.concat( [ df['price'], df['zipcode']], axis=1)

f, ax = plt.subplots(figsize=(50,10))
fig = sns.boxplot(data=data, x='zipcode', y='price')

# 98004, 98112 는 집값이 비싼 지역, + 98039, 98040
# 98023, 98108 는 집값이 낮은 지역 + 98106, 98032, 98168


In [None]:
# zipcode를 쪼개 새로운 feature 만들기

df['zipcode'] = df['zipcode'].astype('str')

df['zipcode-3'] = 'z_' + df['zipcode'].str[2:3] #3번째 글자
df['zipcode-4'] = 'z_' + df['zipcode'].str[3:4] #4번째 글자
df['zipcode-5'] = 'z_' + df['zipcode'].str[4:5] #5번째 글자
df['zipcode-34'] = 'z_' + df['zipcode'].str[2:4] #3~4번째 글자
df['zipcode-45'] = 'z_' + df['zipcode'].str[3:5] #4~5번째 글자
df['zipcode-35'] = 'z_' + df['zipcode'].str[2:3] + df['zipcode'].str[4:5] #3번째, 5번째 글자

df['zipcode'] = 'z_' + df['zipcode']

df

In [None]:
#zipcode를 위도, 경도로 산점도 나타내기

fig = plt.figure(figsize = (16, 12))

ax1 = plt.subplot(221)
sns.scatterplot(x='long', y='lat', hue='zipcode-3', hue_order=np.sort(df['zipcode-3'].unique()), 
                data=df, ax=ax1)

ax2 = plt.subplot(222)
sns.scatterplot(x='long', y='lat', hue='zipcode-4', hue_order=np.sort(df['zipcode-4'].unique()), 
                data=df, ax=ax2)

ax3 = plt.subplot(223)
sns.scatterplot(x='long', y='lat', hue='zipcode-5', hue_order=np.sort(df['zipcode-5'].unique()), 
                data=df, ax=ax3)

ax4 = plt.subplot(224)
sns.scatterplot(x='long', y='lat', hue='zipcode-34', hue_order=np.sort(df['zipcode-34'].unique()), 
                data=df, ax=ax4)




In [None]:
fig = plt.figure(figsize = (16, 12))

ax1 = plt.subplot(221)
sns.scatterplot(x='long', y='lat', hue='zipcode-45', hue_order=np.sort(df['zipcode-45'].unique()), 
                data=df, ax=ax1)

ax2 = plt.subplot(222)
sns.scatterplot(x='long', y='lat', hue='zipcode-35', hue_order=np.sort(df['zipcode-35'].unique()), 
                data=df, ax=ax2)

In [None]:
data = pd.concat( [ df['price'], df['zipcode-35']], axis=1)

f, ax = plt.subplots(figsize=(30,10))
fig = sns.boxplot(data=data, x='zipcode-35', y='price')

#18이 집값이 싼 지역 (3, 5번째 글자)

In [None]:
data = pd.concat( [ df['price'], df['zipcode-5']], axis=1)

f, ax = plt.subplots(figsize=(30,10))
fig = sns.boxplot(data=data, x='zipcode-5', y='price')

#8이 집값이 싼 지역 (5번째 글자)

### PCA Transformation : 차원축소
차원축소를 하지 않고 2차원 그대로 PCA transformation 하기

In [None]:
coord = df[['lat', 'long']]
pca = PCA(n_components = 2)
pca.fit(coord)

coord_pca = pca.transform(coord)

df['coord_pca1'] = coord_pca[:, 0]
df['coord_pca2'] = coord_pca[:, 1] #원본데이터가 변형되어 새로운 feature로 만들어짐. 

sns.scatterplot(x='coord_pca2', y='coord_pca1', hue='price', data=df)

In [None]:
#test 데이터도 똑같이

coord = df_test[['lat', 'long']]
pca = PCA(n_components = 2)
pca.fit(coord)

coord_pca = pca.transform(coord)

df_test['coord_pca1'] = coord_pca[:, 0]
df_test['coord_pca2'] = coord_pca[:, 1] #원본데이터가 변형되어 새로운 feature로 만들어짐.

df_test['total_rooms'] = df_test['bathrooms'] + df_test['bedrooms']


In [None]:
df.head()

In [None]:
# lgb_param 다시 입력 

import lightgbm as lgb

train_ds = lgb.Dataset(x_train, label = y_train)
test_ds = lgb.Dataset(x_test, label = y_test)

params = {
    'objective': 'regression', #method
    'learning_rate': 0.05, #학습률
    'num_leaves': 15,  #하나의 트리가 가질 수 있는 최대 리프 개수
    'bagging_fraction': 0.7, #배깅 사용
    'bagging_freq': 1,
    'feature_fraction': 0.7, #트리를 학습할 때마다 선택하는 feature의 비율
    'seed': random_seed, 
    'metric': ['rmse'], #평가 지표
}

model = lgb.train(params, train_ds, 1000, test_ds, early_stopping_rounds=100)


In [None]:
fig, ax = plt.subplots(figsize=(10,6))
lgb.plot_importance(model, ax=ax)

### Haversine Distance 를 이용한 feature 생성

In [None]:
def haversine_array(lat1, lng1, lat2, lng2): 
    lat1, lng1, lat2, lng2 = map(np.radians, (lat1, lng1, lat2, lng2)) 
    AVG_EARTH_RADIUS = 6371 # in km 
    lat = lat2 - lat1 
    lng = lng2 - lng1 
    d = np.sin(lat * 0.5) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(lng * 0.5) ** 2 
    h = 2 * AVG_EARTH_RADIUS * np.arcsin(np.sqrt(d)) 
    return h


In [None]:
print(df['lat'].min(), df['lat'].max(), df['long'].min(), df['long'].max())

haversine_dist = haversine_array(df['lat'].min(), df['long'].min(), df['lat'].max(), df['long'].max())
print(f'max distance: {haversine_dist:.2f}km')

In [None]:
neighbor_df = pd.DataFrame()
lat2 = df['lat'].values
long2 = df['long'].values

lat1 = df.loc[0, 'lat'] # id = 0 house lat
long1 = df.loc[0, 'long'] # id = 0 house long
dist_arr = haversine_array(lat1, long1, lat2, long2)
neighbor_df = pd.DataFrame({
    'id': np.tile(np.array([df.loc[0, 'id']]), df.shape[0]),
    'neighbor_id': df['id'],
    'neighbor_lat': lat2,
    'neighbor_long': long2,
    'distance': dist_arr,
})

print(neighbor_df.shape)
neighbor_df.head()

In [None]:
df_fin = pd.merge(df, neighbor_df, on='id')

In [None]:
df_fin.to_csv('df_fin.csv', index=False)
df_fin.head()

In [None]:
df_fin.shape #피처 총 37개

In [None]:
df['zipcode-3']

In [None]:
train_columns = [c for c in df.columns if c not in ['id','price','per_price', 
                                                   'zipcode', 'zipcode-3', 'zipcode-4', 'zipcode-5', 'zipcode-34', 'zipcode-45', 'zipcode-35']]


df_test[train_columns].shape

## XGBoost 

In [None]:
import xgboost as xgb

In [None]:
model_name = "xgboost"

In [4]:
train_columns = [c for c in df.columns if c not in ['id','price','per_price', 
                                                   'zipcode', 'zipcode-3', 'zipcode-4', 'zipcode-5', 'zipcode-34', 'zipcode-45', 'zipcode-35']]


xgb_params = {
    'eta': 0.01,
    'max_depth': 6,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'objective': 'reg:linear',
    'eval_metric': 'rmse',
    'silent': 1
}

log_target_data = np.log1p(target_data)
print('Transform DMatrix...')

dtrain = xgb.DMatrix(df[train_columns], target_data)
dtest = xgb.DMatrix(df_test[train_columns])

print('Start Cross Validation...')

cv_output = xgb.cv(xgb_params, dtrain, num_boost_round=5000, early_stopping_rounds=50,verbose_eval=500, show_stdv=False)
print('best num_boost_rounds = ', len(cv_output))

rounds = len(cv_output)

model = xgb.train(xgb_params, dtrain, num_boost_round = rounds)

NameError: name 'target_data' is not defined

In [None]:
prediction = np.exp(model.predict(dtest))
submission = pd.DataFrame({
    'Id' : x_test['id'],
    'Price' : prediction
})
submission.to_csv("xgboost.csv",index=False)

In [None]:
dff = pd.read_csv('xgboost.csv')
dff.head()

In [None]:
df