In [1]:
import numpy as np

In [2]:
# Empirically estimate the share of distinct observations in one bootstrap
# sample: draw N indices with replacement from range(N) and count the uniques.
# The expected share is 1 - (1 - 1/N)**N, which tends to 1 - 1/e (about 0.632).
N = 1000000
rng = np.random.default_rng(42)  # seeded so the cell reproduces on Restart & Run All
bootstrap = rng.choice(N, size=N, replace=True)
# np.unique counts distinct indices without building a Python set of 1M scalars.
unique_ratio = np.unique(bootstrap).size / N
np.round(unique_ratio, 3)

0.632

# Bagging

In [8]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [9]:
# Read the house-price CSV from the local data directory and preview the
# first five rows.
data = pd.read_csv(filepath_or_buffer='./data/kc_house_data.csv')
data.head(n=5)

Unnamed: 0,id,date,price,bedrooms,bathrooms,floors,waterfront,condition,grade,yr_built,yr_renovated,zipcode,lat,long
0,7129300520,20141013T000000,221900.0,3,1.0,1.0,0,3,7,1955,0,98178,47.5112,-122.257
1,6414100192,20141209T000000,538000.0,3,2.25,2.0,0,3,7,1951,1991,98125,47.721,-122.319
2,5631500400,20150225T000000,180000.0,2,1.0,1.0,0,3,6,1933,0,98028,47.7379,-122.233
3,2487200875,20141209T000000,604000.0,4,3.0,1.0,0,5,7,1965,0,98136,47.5208,-122.393
4,1954400510,20150218T000000,510000.0,3,2.0,1.0,0,3,8,1987,0,98074,47.6168,-122.045


* id : 집 고유 아이디
* date : 집이 팔린 날짜
* price : 집 가격
* bedrooms : 가구당 침실 개수
* bathrooms : 가구당 화장실 개수
* floors : 전체 층 수
* waterfront : 해변이 시야에 들어옴(0, 1)
* condition : 집 청소 상태(1~5)
* grade : 평점
* yr_built : 집이 지어진 년도
* yr_renovated : 집이 리모델링 된 년도
* zipcode : 우편 번호
* lat : 위도
* long : 경도

In [10]:
# (rows, columns) of the table as loaded from the CSV.
data.shape

(21613, 14)

In [11]:
# Remove the columns that are not used as predictors (row identifier, sale
# date and location fields), then preview what is left.
data = data.drop(columns=['id', 'date', 'zipcode', 'lat', 'long'])
data.head(n=5)

Unnamed: 0,price,bedrooms,bathrooms,floors,waterfront,condition,grade,yr_built,yr_renovated
0,221900.0,3,1.0,1.0,0,3,7,1955,0
1,538000.0,3,2.25,2.0,0,3,7,1951,1991
2,180000.0,2,1.0,1.0,0,3,6,1933,0
3,604000.0,4,3.0,1.0,0,5,7,1965,0
4,510000.0,3,2.0,1.0,0,3,8,1987,0


In [14]:
# Every column except the target ('price') is a feature; the cell output
# shows the resulting names in alphabetical order.
feature_columns = data.columns.difference(['price']).tolist()
feature_columns

['bathrooms',
 'bedrooms',
 'condition',
 'floors',
 'grade',
 'waterfront',
 'yr_built',
 'yr_renovated']

In [18]:
# Separate the feature matrix from the target vector and show both shapes.
X = data.loc[:, feature_columns]
y = data.loc[:, 'price']
print(X.shape, y.shape)

(21613, 8) (21613,)


In [21]:
# Split into training and evaluation sets: 30% of the rows are held out and
# the fixed seed keeps the partition identical across runs.
train_X, test_X, train_y, test_y = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)
print(*(part.shape for part in (train_X, test_X, train_y, test_y)))

(15129, 8) (6484, 8) (15129,) (6484,)


In [22]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from math import sqrt

In [25]:
# Baseline: a plain linear regression fitted on the training split and scored
# by RMSE on the held-out split.
regression_model = LinearRegression()
linear_model1 = regression_model.fit(train_X, train_y)  # fit() hands back the estimator
predict1 = linear_model1.predict(test_X)
linear_rmse = sqrt(mean_squared_error(test_y, predict1))
print('RMSE: {}'.format(linear_rmse))

RMSE: 239804.2967085815


In [26]:
from sklearn.ensemble import BaggingRegressor

In [27]:
# Bagging over linear regressors: 100 copies of the base model, each fitted on
# a bootstrap resample of the training split, with predictions averaged.
# random_state is fixed (matching the seeded train/test split) so the reported
# RMSE is reproducible instead of changing on every run.
# NOTE(review): `base_estimator` was renamed to `estimator` in scikit-learn 1.2;
# switch the keyword once the environment is upgraded.
bagging_model = BaggingRegressor(
    base_estimator=regression_model,
    n_estimators=100,
    random_state=42,
)
linear_model2 = bagging_model.fit(train_X, train_y)
predict2 = linear_model2.predict(test_X)
print('RMSE: {}'.format(sqrt(mean_squared_error(test_y, predict2))))

RMSE: 239825.19836167851


In [29]:
from sklearn.tree import DecisionTreeRegressor

# Single regression tree with default hyper-parameters, used as the baseline
# that bagging is compared against.
# random_state is fixed because the tree shuffles candidate features while
# searching for splits, so ties between equally good splits would otherwise be
# broken differently from run to run.
decision_tree_model = DecisionTreeRegressor(random_state=42)
tree_model1 = decision_tree_model.fit(train_X, train_y)
predict1 = tree_model1.predict(test_X)
print('RMSE: {}'.format(sqrt(mean_squared_error(test_y, predict1))))

RMSE: 297402.8731512593


In [35]:
# Bagging over decision trees: 65 trees, each fitted on a bootstrap resample,
# trained in parallel on all available cores (n_jobs=-1) with progress logging.
# random_state is fixed so the ensemble, and therefore the printed RMSE, is
# reproducible; the seed is applied regardless of the number of workers.
# NOTE(review): `base_estimator` was renamed to `estimator` in scikit-learn 1.2;
# switch the keyword once the environment is upgraded.
bagging_decision_tree_model1 = BaggingRegressor(
    base_estimator=decision_tree_model,
    n_estimators=65,
    verbose=1,
    n_jobs=-1,
    random_state=42,
)
tree_model2 = bagging_decision_tree_model1.fit(train_X, train_y)
predict2 = tree_model2.predict(test_X)
print('RMSE: {}'.format(sqrt(mean_squared_error(test_y, predict2))))

[Parallel(n_jobs=12)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   2 out of  12 | elapsed:    0.4s remaining:    2.0s
[Parallel(n_jobs=12)]: Done  12 out of  12 | elapsed:    0.6s finished
[Parallel(n_jobs=12)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   2 out of  12 | elapsed:    0.1s remaining:    0.3s


RMSE: 232337.6022464481


[Parallel(n_jobs=12)]: Done  12 out of  12 | elapsed:    0.2s finished


# Random Forest

In [36]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [37]:
# Read the Otto training CSV from the local data directory and preview the
# first five rows.
data = pd.read_csv(filepath_or_buffer='./data/otto_train.csv')
data.head(n=5)

Unnamed: 0,id,feat_1,feat_2,feat_3,feat_4,feat_5,feat_6,feat_7,feat_8,feat_9,...,feat_85,feat_86,feat_87,feat_88,feat_89,feat_90,feat_91,feat_92,feat_93,target
0,1,1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,Class_1
1,2,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,Class_1
2,3,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,Class_1
3,4,1,0,0,1,6,1,5,0,0,...,0,1,2,0,0,0,0,0,0,Class_1
4,5,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,Class_1


* id : 고유 아이디
* feat_1 ~ feat_93 : 설명변수
* target : 타겟변수(Class_1 ~ Class_9)

In [38]:
# (rows, columns) of the Otto table as loaded from the CSV.
data.shape

(61878, 95)

In [39]:
# The row identifier carries no predictive signal, so remove it before
# building the feature matrix.
data = data.drop(columns=['id'])
data.head(n=5)

Unnamed: 0,feat_1,feat_2,feat_3,feat_4,feat_5,feat_6,feat_7,feat_8,feat_9,feat_10,...,feat_85,feat_86,feat_87,feat_88,feat_89,feat_90,feat_91,feat_92,feat_93,target
0,1,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,Class_1
1,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,Class_1
2,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,Class_1
3,1,0,0,1,6,1,5,0,0,1,...,0,1,2,0,0,0,0,0,0,Class_1
4,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,Class_1


In [40]:
# Preview the last five rows of the table.
data.tail()

Unnamed: 0,feat_1,feat_2,feat_3,feat_4,feat_5,feat_6,feat_7,feat_8,feat_9,feat_10,...,feat_85,feat_86,feat_87,feat_88,feat_89,feat_90,feat_91,feat_92,feat_93,target
61873,1,0,0,1,1,0,0,0,0,0,...,1,0,0,0,0,0,0,2,0,Class_9
61874,4,0,0,0,0,0,0,0,0,0,...,0,2,0,0,2,0,0,1,0,Class_9
61875,0,0,0,0,0,0,0,3,1,0,...,0,3,1,0,0,0,0,0,0,Class_9
61876,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,3,10,0,Class_9
61877,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2,0,Class_9


In [42]:
# Lookup table from the string labels 'Class_1' .. 'Class_9' to the integers
# 1 .. 9.
mapping_dict = {'Class_{}'.format(number): number for number in range(1, 10)}

In [43]:
# Translate every string label in the 'target' column to its integer code.
# Indexing the dict directly means an unknown label raises KeyError.
after_mapping_target = data['target'].map(lambda label: mapping_dict[label])
after_mapping_target

0        1
1        1
2        1
3        1
4        1
        ..
61873    9
61874    9
61875    9
61876    9
61877    9
Name: target, Length: 61878, dtype: int64

In [49]:
# Use every column except 'target' as a feature, take the integer-coded labels
# as the target, and hold out 20% of the rows with a fixed seed.
feature_columns = data.columns.difference(['target']).tolist()
X = data.loc[:, feature_columns]
y = after_mapping_target
train_X, test_X, train_y, test_y = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
print(*(part.shape for part in (train_X, test_X, train_y, test_y)))

(49502, 93) (12376, 93) (49502,) (12376,)


In [51]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [53]:
# Random forest, variant 1: 20 trees limited to depth 5.
random_forset_model1 = RandomForestClassifier(
    n_estimators=20,
    max_depth=5,
    random_state=42,
)
model1 = random_forset_model1.fit(train_X, train_y)
predict1 = model1.predict(test_X)
accuracy_pct = accuracy_score(test_y, predict1) * 100
print('Accuracy: %.2f' % accuracy_pct, '%')

Accuracy: 60.16 %


In [59]:
# Random forest, variant 2: same depth limit (5) but 100 trees.
random_forset_model2 = RandomForestClassifier(
    n_estimators=100,
    max_depth=5,
    random_state=42,
)
model2 = random_forset_model2.fit(train_X, train_y)
predict2 = model2.predict(test_X)
accuracy_pct = accuracy_score(test_y, predict2) * 100
print('Accuracy: %.2f' % accuracy_pct, '%')

Accuracy: 62.44 %


In [62]:
# Random forest, variant 3: 100 trees with the depth limit raised to 15.
random_forset_model3 = RandomForestClassifier(
    n_estimators=100,
    max_depth=15,
    random_state=42,
)
model3 = random_forset_model3.fit(train_X, train_y)
predict3 = model3.predict(test_X)
accuracy_pct = accuracy_score(test_y, predict3) * 100
print('Accuracy: %.2f' % accuracy_pct, '%')

Accuracy: 75.44 %


# AdaBoost

In [63]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [66]:
# AdaBoost, variant 1: 20 boosting rounds over depth-5 decision trees.
tree_model = DecisionTreeClassifier(max_depth=5)
Adaboost_model1 = AdaBoostClassifier(
    base_estimator=tree_model,
    n_estimators=20,
    random_state=42,
)
model1 = Adaboost_model1.fit(train_X, train_y)
predict1 = model1.predict(test_X)
accuracy_pct = accuracy_score(test_y, predict1) * 100
print('Accuracy: %.2f' % accuracy_pct, '%')

Accuracy: 63.23 %


In [67]:
# AdaBoost, variant 2: 300 boosting rounds over depth-10 decision trees.
# The names from the previous AdaBoost cell are deliberately reused.
tree_model = DecisionTreeClassifier(max_depth=10)
Adaboost_model1 = AdaBoostClassifier(
    base_estimator=tree_model,
    n_estimators=300,
    random_state=42,
)
model1 = Adaboost_model1.fit(train_X, train_y)
predict1 = model1.predict(test_X)
accuracy_pct = accuracy_score(test_y, predict1) * 100
print('Accuracy: %.2f' % accuracy_pct, '%')

Accuracy: 74.43 %


In [74]:
# !pip install xgboost



In [76]:
import xgboost as xgb
import time

XGBoostError: 
XGBoost Library (libxgboost.dylib) could not be loaded.
Likely causes:
  * OpenMP runtime is not installed
    - vcomp140.dll or libgomp-1.dll for Windows
    - libomp.dylib for Mac OSX
    - libgomp.so for Linux and other UNIX-like OSes
    Mac OSX users: Run `brew install libomp` to install OpenMP runtime.

  * You are running 32-bit Python on a 64-bit OS

Error message(s): ["dlopen(/Users/jaehyeong/opt/anaconda3/lib/python3.8/site-packages/xgboost/lib/libxgboost.dylib, 0x0006): Library not loaded: /usr/local/opt/libomp/lib/libomp.dylib\n  Referenced from: /Users/jaehyeong/opt/anaconda3/lib/python3.8/site-packages/xgboost/lib/libxgboost.dylib\n  Reason: tried: '/usr/local/opt/libomp/lib/libomp.dylib' (no such file), '/usr/local/lib/libomp.dylib' (no such file), '/usr/lib/libomp.dylib' (no such file)"]


In [77]:
# Train a multi-class XGBoost model through the native API, then report the
# test accuracy and the elapsed wall-clock time.
start = time.time()
xgb_dtrain = xgb.DMatrix(data=train_X, label=train_y)
xgb_dtest = xgb.DMatrix(data=test_X)
xgb_param = {
    'max_depth': 10,
    'learning_rate': 0.01,
    'objective': 'multi:softmax',
    # The labels are the integers produced by mapping_dict (1..9) and XGBoost
    # requires labels in [0, num_class), so one extra slot is reserved for the
    # unused class 0.
    'num_class': len(set(train_y)) + 1,
}
# Two fixes relative to the original call:
#   * the keyword is `params` (it was misspelled, which raises TypeError);
#   * the number of boosting rounds is the `num_boost_round` argument of
#     xgb.train; an 'n_estimators' entry in the params dict is ignored by the
#     native API and training would silently stop after the default 10 rounds.
xgb_model = xgb.train(params=xgb_param, dtrain=xgb_dtrain, num_boost_round=100)
# With 'multi:softmax' the booster returns the predicted class label directly.
xgb_model_predict = xgb_model.predict(xgb_dtest)
print('Accuracy: %.2f'%(accuracy_score(test_y, xgb_model_predict)* 100), '%')
print('Time: %.2f'%(time.time()-start), 'seconds')

NameError: name 'time' is not defined

In [80]:
# !pip install lightgbm

In [82]:
import lightgbm as lgb

OSError: dlopen(/Users/jaehyeong/opt/anaconda3/lib/python3.8/site-packages/lightgbm/lib_lightgbm.so, 0x0006): Library not loaded: /usr/local/opt/libomp/lib/libomp.dylib
  Referenced from: /Users/jaehyeong/opt/anaconda3/lib/python3.8/site-packages/lightgbm/lib_lightgbm.so
  Reason: tried: '/usr/local/opt/libomp/lib/libomp.dylib' (no such file), '/usr/local/lib/libomp.dylib' (no such file), '/usr/lib/libomp.dylib' (no such file)

In [83]:
# Train a multi-class LightGBM model through the native API, then report the
# test accuracy and the elapsed wall-clock time.
start = time.time()
lgb_dtrain = lgb.Dataset(data=train_X, label=train_y)
lgb_params = {'max_depth': 10,
             'learning_rate': 0.01,
             'n_estimators': 100,
             'objective': 'multiclass',
             # The labels are the integers produced by mapping_dict (1..9);
             # one extra slot is reserved so that class index == label value.
             'num_class': len(set(train_y))+1
            }
# Fix: the keyword is `params` (it was misspelled, which raises TypeError).
lgb_model = lgb.train(params=lgb_params, train_set=lgb_dtrain)
# predict() yields one probability column per class; because of the extra
# class-0 slot, the arg-max column index is already the 1-based label.
lgb_model_predict = np.argmax(lgb_model.predict(test_X), axis=1)

print('Accuracy: %.2f'%(accuracy_score(test_y, lgb_model_predict)* 100), '%')
print('Time: %.2f'%(time.time()-start), 'seconds')

NameError: name 'time' is not defined

In [85]:
# !pip install catboost

In [87]:
import catboost as cb
import time

In [88]:
# Train a multi-class CatBoost model through the functional API, then report
# the test accuracy and the elapsed wall-clock time.
start = time.time()
cb_dtrain = cb.Pool(data=train_X, label=train_y)
cb_params = dict(
    max_depth=10,
    learning_rate=0.01,
    n_estimators=100,
    eval_metric='Accuracy',
    loss_function='MultiClass',
)
cb_model = cb.train(pool=cb_dtrain, params=cb_params)
cb_class_scores = cb_model.predict(test_X)
# Arg-max gives a 0-based column index; adding 1 turns it back into a label.
# NOTE(review): presumably the score columns follow the sorted labels 1..9 - confirm.
cb_model_predict = np.argmax(cb_class_scores, axis=1) + 1

print('Accuracy: %.2f' % (accuracy_score(test_y, cb_model_predict) * 100), '%')
print('Time: %.2f' % (time.time() - start), 'seconds')

0:	learn: 0.5907034	total: 387ms	remaining: 38.4s
1:	learn: 0.6356107	total: 727ms	remaining: 35.6s
2:	learn: 0.6411256	total: 1.02s	remaining: 32.9s
3:	learn: 0.6480344	total: 1.32s	remaining: 31.7s
4:	learn: 0.6508222	total: 1.58s	remaining: 30.1s
5:	learn: 0.6499939	total: 1.87s	remaining: 29.3s
6:	learn: 0.6507818	total: 2.2s	remaining: 29.2s
7:	learn: 0.6548422	total: 2.53s	remaining: 29.1s
8:	learn: 0.6559533	total: 2.82s	remaining: 28.6s
9:	learn: 0.6560947	total: 3.08s	remaining: 27.7s
10:	learn: 0.6568421	total: 3.33s	remaining: 26.9s
11:	learn: 0.6588219	total: 3.72s	remaining: 27.3s
12:	learn: 0.6592259	total: 4.07s	remaining: 27.2s
13:	learn: 0.6611248	total: 4.41s	remaining: 27.1s
14:	learn: 0.6625591	total: 4.74s	remaining: 26.9s
15:	learn: 0.6631853	total: 5.08s	remaining: 26.7s
16:	learn: 0.6639328	total: 5.36s	remaining: 26.2s
17:	learn: 0.6668821	total: 5.63s	remaining: 25.6s
18:	learn: 0.6669630	total: 5.88s	remaining: 25.1s
19:	learn: 0.6675286	total: 6.14s	remainin

In [90]:
# Switch back to the house-price regression task: reload the CSV, drop the
# unused columns, and rebuild the seeded 70/30 train/test split.
data = pd.read_csv(filepath_or_buffer='./data/kc_house_data.csv')
data = data.drop(columns=['id', 'date', 'zipcode', 'lat', 'long'])
feature_columns = data.columns.difference(['price']).tolist()
X = data.loc[:, feature_columns]
y = data.loc[:, 'price']
train_X, test_X, train_y, test_y = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [94]:
import random

# Hand-rolled bagging with LightGBM regressors: ten models, each trained on a
# bootstrap resample (drawn with replacement) of the training split. For every
# model the cell prints the number of distinct training rows in its resample
# and its RMSE on the held-out split, and stores its test predictions in
# bagging_predict_result.
rng = np.random.default_rng(42)  # seeded so the resamples are reproducible
n_train = train_X.shape[0]
data_index = np.arange(n_train)  # candidate row positions; same for every round
bagging_predict_result = []
for _ in range(10):
    random_data_index = rng.choice(data_index, size=n_train, replace=True)
    print(np.unique(random_data_index).size)
    lgb_dtrain = lgb.Dataset(
        data=train_X.iloc[random_data_index],
        label=train_y.iloc[random_data_index],
    )
    # Built fresh on every round so that no state can carry over between
    # training runs through the params dict.
    lgb_param = {'max_depth': 14,
                 'learning_rate': 0.01,
                 'n_estimators': 500,
                 'objective': 'regression'
                }
    lgb_model = lgb.train(params=lgb_param, train_set=lgb_dtrain)
    # Predict once and reuse the result for both the stored output and the RMSE.
    predict1 = lgb_model.predict(test_X)
    bagging_predict_result.append(predict1)
    print(sqrt(mean_squared_error(test_y, predict1)))

9607


NameError: name 'lgb' is not defined