<a href="https://colab.research.google.com/github/chaeyeon2367/dataAnalysis-python-addata/blob/main/Ensemble%20Learning/Bagging.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Import dataset

In [4]:
data = pd.read_csv("/content/drive/MyDrive/Ensemble learning/kc_house_data.csv")
data.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,floors,waterfront,condition,grade,yr_built,yr_renovated,zipcode,lat,long
0,7129300520,20141013T000000,221900.0,3,1.0,1.0,0,3,7,1955,0,98178,47.5112,-122.257
1,6414100192,20141209T000000,538000.0,3,2.25,2.0,0,3,7,1951,1991,98125,47.721,-122.319
2,5631500400,20150225T000000,180000.0,2,1.0,1.0,0,3,6,1933,0,98028,47.7379,-122.233
3,2487200875,20141209T000000,604000.0,4,3.0,1.0,0,5,7,1965,0,98136,47.5208,-122.393
4,1954400510,20150218T000000,510000.0,3,2.0,1.0,0,3,8,1987,0,98074,47.6168,-122.045


In [3]:
'''
id: Unique ID of the house
date: Date when the house was sold
price: House price (target variable)
bedrooms: Number of bedrooms per house
bathrooms: Number of bathrooms per house
floors: Total number of floors
waterfront: Whether the house has a waterfront view (0, 1)
condition: Condition of the house (1-5)
grade: Rating based on the King County grading system (1-13)
yr_built: Year the house was built
yr_renovated: Year the house was renovated
zipcode: Zip code
lat: Latitude
long: Longitude
'''

'\nid: Unique ID of the house\ndate: Date when the house was sold\nprice: House price (target variable)\nbedrooms: Number of bedrooms per house\nbathrooms: Number of bathrooms per house\nfloors: Total number of floors\nwaterfront: Whether the house has a waterfront view (0, 1)\ncondition: Condition of the house (1-5)\ngrade: Rating based on the King County grading system (1-13)\nyr_built: Year the house was built\nyr_renovated: Year the house was renovated\nzipcode: Zip code\nlat: Latitude\nlong: Longitude\n'

In [6]:
nCar = data.shape[0] # number of data
nVar = data.shape[1] # number of variables
print('nCar: %d' % nCar, 'nVar: %d' % nVar )

nCar: 21613 nVar: 14


## Pre-processing of data

### Remove variables deemed *meaningless*

In [7]:
data = data.drop(['id', 'date', 'zipcode', 'lat', 'long'], axis = 1) # delete colums id, date, zipcode, lat, long

## Convert categorical variables to binary variables
- The categorical variable is only the waterfront column, and since it is a binary classification, it is expressed as 0, 1.
- Omit the process as it is expressed as 0, 1 in the data

### Separation of explanatory variables and target variables, and separation of training data and evaluation data

In [8]:
feature_columns = list(data.columns.difference(['price'])) # All rows except Price
X = data[feature_columns]
y = data['price'] #target variable
train_x, test_x, train_y, test_y = train_test_split(X, y, test_size = 0.3, random_state = 42) #Split the ratio of learning data to evaluation data by 7:3
print(train_x.shape, test_x.shape, train_y.shape, test_y.shape) # Confirm total of data numbers

(15129, 8) (6484, 8) (15129,) (6484,)


## Validates training data as evaluation data after fitting to a linear regression model (Stats_Models)

In [9]:
import statsmodels.api as sm
from sklearn.metrics import mean_squared_error, r2_score
from math import sqrt

sm_train_x = sm.add_constant(train_x, has_constant = 'add') # add bias
sm_model = sm.OLS(train_y, sm_train_x) # fit the linear regression model
fitted_sm_model = sm_model.fit() # training
fitted_sm_model.summary() # confirm the leraning model

0,1,2,3
Dep. Variable:,price,R-squared:,0.595
Model:,OLS,Adj. R-squared:,0.595
Method:,Least Squares,F-statistic:,2776.0
Date:,"Sun, 19 Nov 2023",Prob (F-statistic):,0.0
Time:,16:28:39,Log-Likelihood:,-208260.0
No. Observations:,15129,AIC:,416500.0
Df Residuals:,15120,BIC:,416600.0
Df Model:,8,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,7.186e+06,1.73e+05,41.548,0.000,6.85e+06,7.52e+06
bathrooms,1.303e+05,3960.833,32.889,0.000,1.23e+05,1.38e+05
bedrooms,-2224.7910,2382.356,-0.934,0.350,-6894.497,2444.915
condition,1.641e+04,3169.013,5.178,0.000,1.02e+04,2.26e+04
floors,1946.3052,4336.838,0.449,0.654,-6554.422,1.04e+04
grade,1.956e+05,2199.540,88.924,0.000,1.91e+05,2e+05
waterfront,7.555e+05,2.26e+04,33.479,0.000,7.11e+05,8e+05
yr_built,-4300.7865,88.073,-48.832,0.000,-4473.420,-4128.153
yr_renovated,12.7325,5.043,2.525,0.012,2.847,22.618

0,1,2,3
Omnibus:,13447.374,Durbin-Watson:,1.994
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1684794.827
Skew:,3.763,Prob(JB):,0.0
Kurtosis:,54.147,Cond. No.,182000.0


In [10]:
# Verify the result
sm_test_x = sm.add_constant(test_x, has_constant = 'add') # Adding Bias to Test Data
sm_model_predict = fitted_sm_model.predict(sm_test_x) # prediction the test data
print("RMSE: {}".format(sqrt(mean_squared_error(sm_model_predict, test_y)))) # RMSE
print(fitted_sm_model.params) # regression coefficient

RMSE: 239804.2967085816
const           7.185671e+06
bathrooms       1.302689e+05
bedrooms       -2.224791e+03
condition       1.641020e+04
floors          1.946305e+03
grade           1.955909e+05
waterfront      7.555423e+05
yr_built       -4.300787e+03
yr_renovated    1.273246e+01
dtype: float64


### Verify that Bagging Results are better than Normal Results

In [37]:
import random
bagging_predict_result = [] # Create bin list
for _ in range(10):
    data_index = [data_index for data_index in range(train_x.shape[0])] # Convert the index of the learning data to a list
    random_data_index = np.random.choice(data_index, train_x.shape[0]) # Restoration extraction \Random sampling as much as 1/10th the size of the data, // to ignore the decimal point
    print(len(set(random_data_index)))
    sm_train_x = train_x.iloc[random_data_index, ] # Explainatory variables among learning data corresponding to a random index
    sm_train_y = train_y.iloc[random_data_index, ] # Dependent variable among learning data corresponding to random index
    sm_train_x = sm.add_constant(sm_train_x, has_constant = 'add') #add bias
    sm_model = sm.OLS(sm_train_y, sm_train_x) # fit the model
    fitted_sm_model = sm_model.fit() # learning the model

    sm_test_x = sm.add_constant(test_x, has_constant = 'add') # Adding Bias to Test Data
    sm_model_predict = fitted_sm_model.predict(sm_test_x) # prediction the test data
    bagging_predict_result.append(sm_model_predict) # Save the result value to an empty list before the iteration runs
    print(sqrt(mean_squared_error(sm_model_predict,test_y)))

9521
240613.34065594533
9529
239893.74238554967
9560
240020.9760084165
9594
239678.05185247786
9538
241069.9416641829
9576
239880.3096474074
9574
240668.16795974388
9546
239939.04867334204
9571
239776.070449879
9544
240264.28109600116


In [38]:
bagging_predict_result[4] # 0 to 9 and 10 predictions, resulting in 10 results

735      5.628320e+05
2830     7.123726e+05
4106     1.114787e+06
16218    1.474328e+06
19964    6.991678e+05
             ...     
12606    6.034291e+05
14393    6.816749e+05
6899     3.266181e+05
85       9.075020e+05
21363    4.340011e+05
Length: 6484, dtype: float64

In [39]:
# Calculate the average for the predicted result based on Bagging
bagging_predict = []
for lst2_index in range(test_x.shape[0]): # Repeat as many test data
    temp_predict = [] # Create a temporary bin list (save results in repeat statements)
    for lst_index in range(len(bagging_predict_result)): # Repeating Bagging Results List
        temp_predict.append(bagging_predict_result[lst_index].values[lst2_index]) # Store the same index in the list among the predicted values of each Bagging result
    bagging_predict.append(np.mean(temp_predict)) # Add an average of 30 results for that index to the final list

In [40]:
bagging_predict

[559330.7603081479,
 708803.0728635538,
 1119864.320025115,
 1484408.5342532438,
 699798.2885683896,
 385828.4532655605,
 784770.8005895612,
 482724.1364818318,
 499120.9514035477,
 539323.2917500127,
 640513.0979089418,
 409527.56828904117,
 266441.6725911447,
 278077.0912128737,
 335874.3855755191,
 1270054.9161524076,
 315905.96918888204,
 1040837.1245741158,
 256004.3801641006,
 602296.6125815366,
 390853.29580338404,
 1310787.9622040726,
 825673.1337719606,
 583713.7541143022,
 599786.141297697,
 571074.2102832368,
 259094.402706795,
 44203.74521767944,
 564575.9121008574,
 641921.3126050681,
 568714.9073357085,
 458089.93202506815,
 551644.569584308,
 688790.7432944749,
 408117.57359907485,
 873631.2847312469,
 945729.365364388,
 639121.2192062396,
 392996.20236595144,
 1088129.7245855355,
 457324.52629917505,
 145214.03059264988,
 486571.30723163905,
 217646.97305338812,
 58417.06554195359,
 -49940.48971736683,
 246518.1165871077,
 283091.7000974735,
 363837.3191586408,
 720575.

In [41]:
'''Calculate the average of the predicted results and evaluate the performance
against the target variables of the actual test data'''

print("RMSE: {}".format(sqrt(mean_squared_error(bagging_predict, test_y)))) # RMSE

RMSE: 239788.7308872504


## Fits the learning data to the linear regression model and then validates it as the evaluation data (Scikit-Learn)

In [72]:
from sklearn.linear_model import LinearRegression
regression_model = LinearRegression() # linear regression model
linear_model1 = regression_model.fit(train_x, train_y) # Fit training data to linear regression model
predict1 = linear_model1.predict(test_x) # Predict the evaluation data with the learned linear regression model
print("RMSE: {}".format(sqrt(mean_squared_error(predict1, test_y)))) # RMSE result

RMSE: 239804.2967085815


## Evaluation after fitting to a linear regression model using Bagging (Sampling n.10)

In [73]:
from sklearn.ensemble import BaggingRegressor
bagging_model = BaggingRegressor(base_estimator = regression_model,
                                 n_estimators = 10, # N.10 sampling
                                 verbose = 1)
linear_model2 = bagging_model.fit(train_x, train_y) # training the model
predict2 = linear_model2.predict(test_x) # Predict the evaluation data with the learned Bagging linear regression model
print("RMSE: {}".format(sqrt(mean_squared_error(predict2, test_y)))) # RMSE result

RMSE: 239898.32412305573




In [74]:
bagging_model2 = BaggingRegressor(base_estimator = regression_model, #linear regression model
                                  n_estimators = 30, # n.30 sampling
                                  verbose = 1)
linear_model3 = bagging_model2.fit(train_x, train_y) # fit the model
predict3 = linear_model3.predict(test_x)
print("RMSE: {}".format(sqrt(mean_squared_error(predict3, test_y)))) # RMSE result



RMSE: 239774.13759681504


## Verifying the learning data as evaluation data after fitting to the decision tree model

In [75]:
from sklearn.tree import DecisionTreeRegressor
decision_tree_model = DecisionTreeRegressor() # decision tree model
tree_model1 = decision_tree_model.fit(train_x, train_y)
predict1 = tree_model1.predict(test_x) # Predict the evaluation data from the learned decision tree model
print("RMSE: {}".format(sqrt(mean_squared_error(predict1, test_y)))) # RMSE result

RMSE: 296974.4938780359


In [76]:
import random
bagging_predict_result = []
for _ in range(10):
    data_index = [data_index for data_index in range(train_x.shape[0])]
    random_data_index = np.random.choice(data_index, train_x.shape[0])
    print(len(set(random_data_index)))
    sm_train_x = train_x.iloc[random_data_index, ]
    sm_train_y = train_y.iloc[random_data_index, ]
    decision_tree_model = DecisionTreeRegressor()
    tree_model1 = decision_tree_model.fit(sm_train_x, sm_train_y)

    predict_tree = tree_model1.predict(test_x)
    bagging_predict_result.append(predict_tree)
    print(sqrt(mean_squared_error(predict_tree,test_y)))

9536
285116.2105678009
9591
285465.83600159944
9590
282424.82924057514
9587
278713.70716124953
9537
283496.68474209734
9585
308282.5470779919
9559
281884.8887602026
9621
288933.9411749616
9611
323461.07214712404
9546
296309.18493364664


In [77]:
# Calculate the average for the predicted result based on Bagging
bagging_predict = []
for lst2_index in range(test_x.shape[0]): # Repeat as many test data
    temp_predict = [] # Create a temporary bin list (save results in repeat statements)
    for lst_index in range(len(bagging_predict_result)): # Repeating Bagging Results List
        temp_predict.append(bagging_predict_result[lst_index][lst2_index]) # Store the same index in the list among the predicted values of each Bagging result
    bagging_predict.append(np.mean(temp_predict)) # Add an average of 30 results for that index to the final list

In [78]:
# Calculate the average of the predicted results and evaluate the performance against the target variables of the actual test data

print("RMSE: {}".format(sqrt(mean_squared_error(bagging_predict, test_y)))) # RMSE

RMSE: 234083.5853289731


## Evaluation after conforming to the decision tree model using Bagging (Sampling No. 10)

In [83]:
bagging_decision_tree_model1 = BaggingRegressor(base_estimator = decision_tree_model,
                                                n_estimators = 10,
                                                verbose = 1)
tree_model2 = bagging_decision_tree_model1.fit(train_x, train_y)
predict2 = tree_model2.predict(test_x) # Predict the evaluation data with the learned Bagging decision tree model
print("RMSE: {}".format(sqrt(mean_squared_error(predict2, test_y)))) # RMSE result



RMSE: 241816.95891760115


In [87]:
bagging_decision_tree_model2 = BaggingRegressor(base_estimator = decision_tree_model,
                                                n_estimators = 30,
                                                verbose = 1)
tree_model3 = bagging_decision_tree_model2.fit(train_x, train_y)
predict3 = tree_model3.predict(test_x)
print("RMSE: {}".format(sqrt(mean_squared_error(predict3, test_y))))



RMSE: 233863.19394764808
