In [2]:
# Import libraries
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

from utils import *

In [3]:
# Load the cleaned King County house-sales dataset and preview its first rows.
df = pd.read_csv('Data/cleaned_house_sales.csv')
df.head(5)

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,...,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15,year_sold,month_sold,day_sold
0,221900.0,3,1.0,1180,5650,1.0,0,0,3,7,...,1955,0,98178,47.5112,-122.257,1340,5650,2014,10,13
1,538000.0,3,2.25,2570,7242,2.0,0,0,3,7,...,1951,1991,98125,47.721,-122.319,1690,7639,2014,12,9
2,180000.0,2,1.0,770,10000,1.0,0,0,3,6,...,1933,0,98028,47.7379,-122.233,2720,8062,2015,2,25
3,604000.0,4,3.0,1960,5000,1.0,0,0,5,7,...,1965,0,98136,47.5208,-122.393,1360,5000,2014,12,9
4,510000.0,3,2.0,1680,8080,1.0,0,0,3,8,...,1987,0,98074,47.6168,-122.045,1800,7503,2015,2,18


In [4]:
# Load the baseline metrics and drop the rows labelled 2 and 3
# (presumably a model that is not re-run in this notebook -- TODO confirm),
# leaving the LinearRegression, RandomForestRegressor and XGBRegressor rows.
base_metrics = pd.read_csv('Metrics/baseline_metrics.csv').drop(index=[2, 3])
base_metrics

Unnamed: 0,Model,Split,R2,Adjusted_R2,MAE,RMSE,MAPE,Comments
0,LinearRegression,train,0.6977,0.6974,125948.1118,202864.5703,0.2561,Baseline model
1,LinearRegression,test,0.7162,0.7148,125985.6747,191531.3335,0.2596,Baseline model
4,RandomForestRegressor,train,0.982,0.9819,25948.4444,49547.1197,0.0486,"Baseline, no normalization, random_state 13, d..."
5,RandomForestRegressor,test,0.8959,0.8954,67269.877,115994.2051,0.1271,"Baseline, no normalization, random_state 13, d..."
6,XGBRegressor,train,0.978,0.978,39126.7558,54676.7727,0.0872,"Baseline, no normalization, default values."
7,XGBRegressor,test,0.9015,0.901,65712.7448,112860.4995,0.1246,"Baseline, no normalization, default values."


In the OLS summary, the p-values of some columns are high. We would like to test whether the models perform better, or change at all, without those columns. The columns are:  
- sqft_lot
- floors   


The Ridge and Lasso coefficients were also low for some columns, and we would like to run the same analysis on:
- month_sold
- day_sold
- yr_renovated


## Drop 'sqft_lot'

Defining target and features

In [None]:
# Target is the sale price; features are every other column except 'sqft_lot'.
y = df['price']
X = df.drop(columns=['price', 'sqft_lot'])


Split the data

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=13,
)

### Linear Regression

In [None]:
# Fit a linear regression on the training split.
# LinearRegression comes from the notebook's top import cell.
lm = LinearRegression()
lm.fit(X_train, y_train)

Evaluation

In [None]:
# Start a fresh metrics table and log the linear model's training-split scores.
metrics_df = create_metrics_df()
metrics_df = add_new_metrics(metrics_df, lm, X_train, y_train, "train", "Whithout sqft_lot")

metrics_df

In [None]:
# Log the linear model's scores on the held-out test split.
metrics_df = add_new_metrics(metrics_df, lm, X_test, y_test, "test", "Whithout sqft_lot")

metrics_df

### Random Forest

In [None]:
# Fit a random forest on the training split; the fixed seed keeps runs reproducible.
# RandomForestRegressor comes from the notebook's top import cell.
rf_regressor = RandomForestRegressor(random_state=13)
rf_regressor.fit(X_train, y_train)

Evaluation

In [None]:
# Log the random forest's training-split scores (table is shown in the next cell).
metrics_df = add_new_metrics(
    metrics_df, rf_regressor, X_train, y_train, "train", "Whithout sqft_lot"
)

In [None]:
# Log the random forest's scores on the held-out test split.
metrics_df = add_new_metrics(
    metrics_df, rf_regressor, X_test, y_test, "test", "Whithout sqft_lot"
)
metrics_df

### XGBoost

In [None]:
# Fit an XGBoost regressor with its default hyperparameters on the training split.
# The xgb alias comes from the notebook's top import cell.
xgb_reg = xgb.XGBRegressor()
xgb_reg.fit(X_train, y_train)


Evaluation

In [None]:
# Log the XGBoost model's training-split scores.
metrics_df = add_new_metrics(metrics_df, xgb_reg, X_train, y_train, "train", "Whithout sqft_lot")
metrics_df

In [None]:
# Log the XGBoost model's scores on the held-out test split.
metrics_df = add_new_metrics(metrics_df, xgb_reg, X_test, y_test, "test", "Whithout sqft_lot")
metrics_df

### Conclusion

In [None]:
# Show the baseline metrics for comparison with the 'sqft_lot' experiment above.
base_metrics

Without this feature the models don't change significantly, so we decided to drop it.

## Drop 'floors'

Defining target and features

In [None]:
# Target is the sale price; features are every other column except 'floors'.
y = df['price']
X = df.drop(columns=['price', 'floors'])
X.head()

Split the data

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=13,
)

### Linear Regression

In [None]:
# Fit a linear regression on the training split.
# LinearRegression comes from the notebook's top import cell.
lm = LinearRegression()
lm.fit(X_train, y_train)

Evaluation

In [None]:
# Start a fresh metrics table and log the linear model's training-split scores.
metrics_df = create_metrics_df()
metrics_df = add_new_metrics(metrics_df, lm, X_train, y_train, "train", "Whithout floors")

metrics_df

In [None]:
# Log the linear model's scores on the held-out test split.
metrics_df = add_new_metrics(metrics_df, lm, X_test, y_test, "test", "Whithout floors")

metrics_df

### Random Forest

In [None]:
# Fit a random forest on the training split; the fixed seed keeps runs reproducible.
# RandomForestRegressor comes from the notebook's top import cell.
rf_regressor = RandomForestRegressor(random_state=13)
rf_regressor.fit(X_train, y_train)

Evaluation

In [None]:
# Log the random forest's training-split scores.
metrics_df = add_new_metrics(
    metrics_df, rf_regressor, X_train, y_train, "train", "Whithout floors"
)
metrics_df

In [None]:
# Log the random forest's scores on the held-out test split.
metrics_df = add_new_metrics(
    metrics_df, rf_regressor, X_test, y_test, "test", "Whithout floors"
)
metrics_df

### XGBoost

In [None]:
# Fit an XGBoost regressor with its default hyperparameters on the training split.
# The xgb alias comes from the notebook's top import cell.
xgb_reg = xgb.XGBRegressor()
xgb_reg.fit(X_train, y_train)


Evaluation

In [None]:
# Log the XGBoost model's training-split scores.
metrics_df = add_new_metrics(metrics_df, xgb_reg, X_train, y_train, "train", "Whithout floors")
metrics_df

In [None]:
# Log the XGBoost model's scores on the held-out test split.
metrics_df = add_new_metrics(metrics_df, xgb_reg, X_test, y_test, "test", "Whithout floors")
metrics_df

### Conclusion

In [None]:
# Write the metrics table of the 'floors' experiment to a CSV file under Metrics/.
metrics_df.to_csv('Metrics/drop_floors.csv')

In [None]:
# Show the baseline metrics for comparison with the 'floors' experiment above.
base_metrics

Without this feature the models don't change significantly, so we decided to drop it.

## Drop 'month_sold'

Defining target and features

In [None]:
# Target is the sale price; features are every other column except 'month_sold'.
y = df['price']
X = df.drop(columns=['price', 'month_sold'])
X.head()

Split the data

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=13,
)

### Linear Regression

In [None]:
# Fit a linear regression on the training split.
# LinearRegression comes from the notebook's top import cell.
lm = LinearRegression()
lm.fit(X_train, y_train)

Evaluation

In [None]:
# Start a fresh metrics table and log the linear model's training-split scores.
metrics_df = create_metrics_df()
metrics_df = add_new_metrics(metrics_df, lm, X_train, y_train, "train", "Whithout month_sold")

metrics_df

In [None]:
# Log the linear model's scores on the held-out test split.
metrics_df = add_new_metrics(metrics_df, lm, X_test, y_test, "test", "Whithout month_sold")

metrics_df

### Random Forest

In [None]:
# Fit a random forest on the training split; the fixed seed keeps runs reproducible.
# RandomForestRegressor comes from the notebook's top import cell.
rf_regressor = RandomForestRegressor(random_state=13)
rf_regressor.fit(X_train, y_train)

Evaluation

In [None]:
# Log the random forest's training-split scores.
metrics_df = add_new_metrics(
    metrics_df, rf_regressor, X_train, y_train, "train", "Whithout month_sold"
)
metrics_df

In [None]:
# Log the random forest's scores on the held-out test split.
metrics_df = add_new_metrics(
    metrics_df, rf_regressor, X_test, y_test, "test", "Whithout month_sold"
)
metrics_df

### XGBoost

In [None]:
# Fit an XGBoost regressor with its default hyperparameters on the training split.
# The xgb alias comes from the notebook's top import cell.
xgb_reg = xgb.XGBRegressor()
xgb_reg.fit(X_train, y_train)


Evaluation

In [None]:
# Log the XGBoost model's training-split scores.
metrics_df = add_new_metrics(
    metrics_df, xgb_reg, X_train, y_train, "train", "Whithout month_sold"
)
metrics_df

In [None]:
# Log the XGBoost model's scores on the held-out test split.
metrics_df = add_new_metrics(
    metrics_df, xgb_reg, X_test, y_test, "test", "Whithout month_sold"
)
metrics_df

### Conclusion

In [None]:
# Write the metrics table of the 'month_sold' experiment to a CSV file under Metrics/.
metrics_df.to_csv('Metrics/drop_month_sold.csv')

In [None]:
# Show the baseline metrics for comparison with the 'month_sold' experiment above.
base_metrics

Without this feature the models don't change significantly, so we decided to drop it.

## Drop 'day_sold'

Defining target and features

In [128]:
# Target is the sale price; features are every other column except 'day_sold'.
y = df['price']
X = df.drop(columns=['price', 'day_sold'])
X.head()

Unnamed: 0,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15,year_sold,month_sold
0,3,1.0,1180,5650,1.0,0,0,3,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650,2014,10
1,3,2.25,2570,7242,2.0,0,0,3,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639,2014,12
2,2,1.0,770,10000,1.0,0,0,3,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062,2015,2
3,4,3.0,1960,5000,1.0,0,0,5,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000,2014,12
4,3,2.0,1680,8080,1.0,0,0,3,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503,2015,2


Split the data

In [129]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=13,
)

### Linear Regression

In [130]:
# Fit a linear regression on the training split.
# LinearRegression comes from the notebook's top import cell.
lm = LinearRegression()
lm.fit(X_train, y_train)

Evaluation

In [131]:
# Start a fresh metrics table and log the linear model's training-split scores.
metrics_df = create_metrics_df()
metrics_df = add_new_metrics(metrics_df, lm, X_train, y_train, "train", "Whithout day_sold")

metrics_df

  updated_df = pd.concat([metrics_df, new_row_df], ignore_index=True)


Unnamed: 0,Model,Split,R2,Adjusted_R2,MAE,RMSE,MAPE,Comments
0,LinearRegression,train,0.6977,0.6973,125937.4648,202884.7569,0.256,Whithout day_sold


In [132]:
# Log the linear model's scores on the held-out test split.
metrics_df = add_new_metrics(metrics_df, lm, X_test, y_test, "test", "Whithout day_sold")

metrics_df

Unnamed: 0,Model,Split,R2,Adjusted_R2,MAE,RMSE,MAPE,Comments
0,LinearRegression,train,0.6977,0.6973,125937.4648,202884.7569,0.256,Whithout day_sold
1,LinearRegression,test,0.7161,0.7148,125973.0357,191559.9896,0.2595,Whithout day_sold


### Random Forest

In [133]:
# Fit a random forest on the training split; the fixed seed keeps runs reproducible.
# RandomForestRegressor comes from the notebook's top import cell.
rf_regressor = RandomForestRegressor(random_state=13)
rf_regressor.fit(X_train, y_train)

Evaluation

In [134]:
# Log the random forest's training-split scores.
metrics_df = add_new_metrics(
    metrics_df, rf_regressor, X_train, y_train, "train", "Whithout day_sold"
)
metrics_df

Unnamed: 0,Model,Split,R2,Adjusted_R2,MAE,RMSE,MAPE,Comments
0,LinearRegression,train,0.6977,0.6973,125937.4648,202884.7569,0.256,Whithout day_sold
1,LinearRegression,test,0.7161,0.7148,125973.0357,191559.9896,0.2595,Whithout day_sold
2,RandomForestRegressor,train,0.9824,0.9824,25800.6256,48907.9896,0.0483,Whithout day_sold


In [135]:
# Log the random forest's scores on the held-out test split.
metrics_df = add_new_metrics(
    metrics_df, rf_regressor, X_test, y_test, "test", "Whithout day_sold"
)
metrics_df

Unnamed: 0,Model,Split,R2,Adjusted_R2,MAE,RMSE,MAPE,Comments
0,LinearRegression,train,0.6977,0.6973,125937.4648,202884.7569,0.256,Whithout day_sold
1,LinearRegression,test,0.7161,0.7148,125973.0357,191559.9896,0.2595,Whithout day_sold
2,RandomForestRegressor,train,0.9824,0.9824,25800.6256,48907.9896,0.0483,Whithout day_sold
3,RandomForestRegressor,test,0.8959,0.8954,67286.1944,115992.8695,0.1269,Whithout day_sold


### XGBoost

In [136]:
# Fit an XGBoost regressor with its default hyperparameters on the training split.
# The xgb alias comes from the notebook's top import cell.
xgb_reg = xgb.XGBRegressor()
xgb_reg.fit(X_train, y_train)


Evaluation

In [137]:
# Log the XGBoost model's training-split scores.
metrics_df = add_new_metrics(metrics_df, xgb_reg, X_train, y_train, "train", "Whithout day_sold")
metrics_df

Unnamed: 0,Model,Split,R2,Adjusted_R2,MAE,RMSE,MAPE,Comments
0,LinearRegression,train,0.6977,0.6973,125937.4648,202884.7569,0.256,Whithout day_sold
1,LinearRegression,test,0.7161,0.7148,125973.0357,191559.9896,0.2595,Whithout day_sold
2,RandomForestRegressor,train,0.9824,0.9824,25800.6256,48907.9896,0.0483,Whithout day_sold
3,RandomForestRegressor,test,0.8959,0.8954,67286.1944,115992.8695,0.1269,Whithout day_sold
4,XGBRegressor,train,0.9783,0.9782,39015.3232,54398.0531,0.0871,Whithout day_sold


In [138]:
# Log the XGBoost model's scores on the held-out test split.
metrics_df = add_new_metrics(metrics_df, xgb_reg, X_test, y_test, "test", "Whithout day_sold")
metrics_df

Unnamed: 0,Model,Split,R2,Adjusted_R2,MAE,RMSE,MAPE,Comments
0,LinearRegression,train,0.6977,0.6973,125937.4648,202884.7569,0.256,Whithout day_sold
1,LinearRegression,test,0.7161,0.7148,125973.0357,191559.9896,0.2595,Whithout day_sold
2,RandomForestRegressor,train,0.9824,0.9824,25800.6256,48907.9896,0.0483,Whithout day_sold
3,RandomForestRegressor,test,0.8959,0.8954,67286.1944,115992.8695,0.1269,Whithout day_sold
4,XGBRegressor,train,0.9783,0.9782,39015.3232,54398.0531,0.0871,Whithout day_sold
5,XGBRegressor,test,0.906,0.9056,64958.8204,110209.5275,0.1238,Whithout day_sold


### Conclusion

In [139]:
# Write the metrics table of the 'day_sold' experiment to a CSV file under Metrics/.
metrics_df.to_csv('Metrics/drop_day_sold.csv')

In [144]:
# Show the baseline metrics for comparison with the 'day_sold' experiment above.
base_metrics

Unnamed: 0,Model,Split,R2,Adjusted_R2,MAE,RMSE,MAPE,Comments
0,LinearRegression,train,0.6977,0.6974,125948.1118,202864.5703,0.2561,Baseline model
1,LinearRegression,test,0.7162,0.7148,125985.6747,191531.3335,0.2596,Baseline model
4,RandomForestRegressor,train,0.982,0.9819,25948.4444,49547.1197,0.0486,"Baseline, no normalization, random_state 13, d..."
5,RandomForestRegressor,test,0.8959,0.8954,67269.877,115994.2051,0.1271,"Baseline, no normalization, random_state 13, d..."
6,XGBRegressor,train,0.978,0.978,39126.7558,54676.7727,0.0872,"Baseline, no normalization, default values."
7,XGBRegressor,test,0.9015,0.901,65712.7448,112860.4995,0.1246,"Baseline, no normalization, default values."


## Drop 'yr_renovated'

Defining target and features

In [145]:
# Target is the sale price; features are every other column except 'yr_renovated'.
y = df['price']
X = df.drop(columns=['price', 'yr_renovated'])
X.head()

Unnamed: 0,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,zipcode,lat,long,sqft_living15,sqft_lot15,year_sold,month_sold,day_sold
0,3,1.0,1180,5650,1.0,0,0,3,7,1180,0,1955,98178,47.5112,-122.257,1340,5650,2014,10,13
1,3,2.25,2570,7242,2.0,0,0,3,7,2170,400,1951,98125,47.721,-122.319,1690,7639,2014,12,9
2,2,1.0,770,10000,1.0,0,0,3,6,770,0,1933,98028,47.7379,-122.233,2720,8062,2015,2,25
3,4,3.0,1960,5000,1.0,0,0,5,7,1050,910,1965,98136,47.5208,-122.393,1360,5000,2014,12,9
4,3,2.0,1680,8080,1.0,0,0,3,8,1680,0,1987,98074,47.6168,-122.045,1800,7503,2015,2,18


Split the data

In [146]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=13,
)

### Linear Regression

In [147]:
# Fit a linear regression on the training split.
# LinearRegression comes from the notebook's top import cell.
lm = LinearRegression()
lm.fit(X_train, y_train)

Evaluation

In [148]:
# Start a fresh metrics table and log the linear model's training-split scores.
metrics_df = create_metrics_df()
metrics_df = add_new_metrics(metrics_df, lm, X_train, y_train, "train", "Whithout yr_renovated")

metrics_df

  updated_df = pd.concat([metrics_df, new_row_df], ignore_index=True)


Unnamed: 0,Model,Split,R2,Adjusted_R2,MAE,RMSE,MAPE,Comments
0,LinearRegression,train,0.6973,0.6969,126097.584,203019.8769,0.2563,Whithout yr_renovated


In [149]:
# Log the linear model's scores on the held-out test split.
metrics_df = add_new_metrics(metrics_df, lm, X_test, y_test, "test", "Whithout yr_renovated")

metrics_df

Unnamed: 0,Model,Split,R2,Adjusted_R2,MAE,RMSE,MAPE,Comments
0,LinearRegression,train,0.6973,0.6969,126097.584,203019.8769,0.2563,Whithout yr_renovated
1,LinearRegression,test,0.7158,0.7144,126123.658,191673.0385,0.26,Whithout yr_renovated


### Random Forest

In [150]:
# Fit a random forest on the training split; the fixed seed keeps runs reproducible.
# RandomForestRegressor comes from the notebook's top import cell.
rf_regressor = RandomForestRegressor(random_state=13)
rf_regressor.fit(X_train, y_train)

Evaluation

In [151]:
# Log the random forest's training-split scores.
metrics_df = add_new_metrics(
    metrics_df, rf_regressor, X_train, y_train, "train", "Whithout yr_renovated"
)
metrics_df

Unnamed: 0,Model,Split,R2,Adjusted_R2,MAE,RMSE,MAPE,Comments
0,LinearRegression,train,0.6973,0.6969,126097.584,203019.8769,0.2563,Whithout yr_renovated
1,LinearRegression,test,0.7158,0.7144,126123.658,191673.0385,0.26,Whithout yr_renovated
2,RandomForestRegressor,train,0.9819,0.9819,25962.452,49633.6552,0.0487,Whithout yr_renovated


In [152]:
# Log the random forest's scores on the held-out test split.
metrics_df = add_new_metrics(
    metrics_df, rf_regressor, X_test, y_test, "test", "Whithout yr_renovated"
)
metrics_df

Unnamed: 0,Model,Split,R2,Adjusted_R2,MAE,RMSE,MAPE,Comments
0,LinearRegression,train,0.6973,0.6969,126097.584,203019.8769,0.2563,Whithout yr_renovated
1,LinearRegression,test,0.7158,0.7144,126123.658,191673.0385,0.26,Whithout yr_renovated
2,RandomForestRegressor,train,0.9819,0.9819,25962.452,49633.6552,0.0487,Whithout yr_renovated
3,RandomForestRegressor,test,0.8958,0.8953,67255.3388,116066.5787,0.1272,Whithout yr_renovated


### XGBoost

In [153]:
# Fit an XGBoost regressor with its default hyperparameters on the training split.
# The xgb alias comes from the notebook's top import cell.
xgb_reg = xgb.XGBRegressor()
xgb_reg.fit(X_train, y_train)


Evaluation

In [154]:
# Log the XGBoost model's training-split scores.
metrics_df = add_new_metrics(
    metrics_df, xgb_reg, X_train, y_train, "train", "Whithout yr_renovated"
)
metrics_df

Unnamed: 0,Model,Split,R2,Adjusted_R2,MAE,RMSE,MAPE,Comments
0,LinearRegression,train,0.6973,0.6969,126097.584,203019.8769,0.2563,Whithout yr_renovated
1,LinearRegression,test,0.7158,0.7144,126123.658,191673.0385,0.26,Whithout yr_renovated
2,RandomForestRegressor,train,0.9819,0.9819,25962.452,49633.6552,0.0487,Whithout yr_renovated
3,RandomForestRegressor,test,0.8958,0.8953,67255.3388,116066.5787,0.1272,Whithout yr_renovated
4,XGBRegressor,train,0.9774,0.9774,39451.3132,55430.4617,0.0869,Whithout yr_renovated


In [155]:
# Log the XGBoost model's scores on the held-out test split.
metrics_df = add_new_metrics(
    metrics_df, xgb_reg, X_test, y_test, "test", "Whithout yr_renovated"
)
metrics_df

Unnamed: 0,Model,Split,R2,Adjusted_R2,MAE,RMSE,MAPE,Comments
0,LinearRegression,train,0.6973,0.6969,126097.584,203019.8769,0.2563,Whithout yr_renovated
1,LinearRegression,test,0.7158,0.7144,126123.658,191673.0385,0.26,Whithout yr_renovated
2,RandomForestRegressor,train,0.9819,0.9819,25962.452,49633.6552,0.0487,Whithout yr_renovated
3,RandomForestRegressor,test,0.8958,0.8953,67255.3388,116066.5787,0.1272,Whithout yr_renovated
4,XGBRegressor,train,0.9774,0.9774,39451.3132,55430.4617,0.0869,Whithout yr_renovated
5,XGBRegressor,test,0.8991,0.8986,66615.4646,114219.577,0.1262,Whithout yr_renovated


### Conclusion

In [None]:
# Write the metrics table of the 'yr_renovated' experiment to its own CSV file.
# The previous target, 'Metrics/drop_day_sold.csv', was a copy-paste leftover that
# overwrote the 'day_sold' results with this section's metrics.
metrics_df.to_csv('Metrics/drop_yr_renovated.csv')

In [None]:
# Show the baseline metrics for comparison with the 'yr_renovated' experiment above.
base_metrics

Unnamed: 0,Model,Split,R2,Adjusted_R2,MAE,RMSE,MAPE,Comments
0,LinearRegression,train,0.6977,0.6974,125948.1118,202864.5703,0.2561,Baseline model
1,LinearRegression,test,0.7162,0.7148,125985.6747,191531.3335,0.2596,Baseline model
4,RandomForestRegressor,train,0.982,0.9819,25948.4444,49547.1197,0.0486,"Baseline, no normalization, random_state 13, d..."
5,RandomForestRegressor,test,0.8959,0.8954,67269.877,115994.2051,0.1271,"Baseline, no normalization, random_state 13, d..."
6,XGBRegressor,train,0.978,0.978,39126.7558,54676.7727,0.0872,"Baseline, no normalization, default values."
7,XGBRegressor,test,0.9015,0.901,65712.7448,112860.4995,0.1246,"Baseline, no normalization, default values."


Dropping 'yr_renovated' makes the scores of all three models slightly worse.

## Drop 'bedrooms'

Defining target and features

In [5]:
# Target is the sale price; features are every other column except 'bedrooms'.
y = df['price']
X = df.drop(columns=['price', 'bedrooms'])
X.head()

Unnamed: 0,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15,year_sold,month_sold,day_sold
0,1.0,1180,5650,1.0,0,0,3,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650,2014,10,13
1,2.25,2570,7242,2.0,0,0,3,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639,2014,12,9
2,1.0,770,10000,1.0,0,0,3,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062,2015,2,25
3,3.0,1960,5000,1.0,0,0,5,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000,2014,12,9
4,2.0,1680,8080,1.0,0,0,3,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503,2015,2,18


Split the data

In [6]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=13,
)

### Random Forest

In [8]:
# Fit a random forest on the training split; the fixed seed keeps runs reproducible.
# RandomForestRegressor comes from the notebook's top import cell.
rf_regressor = RandomForestRegressor(random_state=13)
rf_regressor.fit(X_train, y_train)

0,1,2
,"n_estimators  n_estimators: int, default=100 The number of trees in the forest. .. versionchanged:: 0.22  The default value of ``n_estimators`` changed from 10 to 100  in 0.22.",100
,"criterion  criterion: {""squared_error"", ""absolute_error"", ""friedman_mse"", ""poisson""}, default=""squared_error"" The function to measure the quality of a split. Supported criteria are ""squared_error"" for the mean squared error, which is equal to variance reduction as feature selection criterion and minimizes the L2 loss using the mean of each terminal node, ""friedman_mse"", which uses mean squared error with Friedman's improvement score for potential splits, ""absolute_error"" for the mean absolute error, which minimizes the L1 loss using the median of each terminal node, and ""poisson"" which uses reduction in Poisson deviance to find splits. Training using ""absolute_error"" is significantly slower than when using ""squared_error"". .. versionadded:: 0.18  Mean Absolute Error (MAE) criterion. .. versionadded:: 1.0  Poisson criterion.",'squared_error'
,"max_depth  max_depth: int, default=None The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples.",
,"min_samples_split  min_samples_split: int or float, default=2 The minimum number of samples required to split an internal node: - If int, then consider `min_samples_split` as the minimum number. - If float, then `min_samples_split` is a fraction and  `ceil(min_samples_split * n_samples)` are the minimum  number of samples for each split. .. versionchanged:: 0.18  Added float values for fractions.",2
,"min_samples_leaf  min_samples_leaf: int or float, default=1 The minimum number of samples required to be at a leaf node. A split point at any depth will only be considered if it leaves at least ``min_samples_leaf`` training samples in each of the left and right branches. This may have the effect of smoothing the model, especially in regression. - If int, then consider `min_samples_leaf` as the minimum number. - If float, then `min_samples_leaf` is a fraction and  `ceil(min_samples_leaf * n_samples)` are the minimum  number of samples for each node. .. versionchanged:: 0.18  Added float values for fractions.",1
,"min_weight_fraction_leaf  min_weight_fraction_leaf: float, default=0.0 The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node. Samples have equal weight when sample_weight is not provided.",0.0
,"max_features  max_features: {""sqrt"", ""log2"", None}, int or float, default=1.0 The number of features to consider when looking for the best split: - If int, then consider `max_features` features at each split. - If float, then `max_features` is a fraction and  `max(1, int(max_features * n_features_in_))` features are considered at each  split. - If ""sqrt"", then `max_features=sqrt(n_features)`. - If ""log2"", then `max_features=log2(n_features)`. - If None or 1.0, then `max_features=n_features`. .. note::  The default of 1.0 is equivalent to bagged trees and more  randomness can be achieved by setting smaller values, e.g. 0.3. .. versionchanged:: 1.1  The default of `max_features` changed from `""auto""` to 1.0. Note: the search for a split does not stop until at least one valid partition of the node samples is found, even if it requires to effectively inspect more than ``max_features`` features.",1.0
,"max_leaf_nodes  max_leaf_nodes: int, default=None Grow trees with ``max_leaf_nodes`` in best-first fashion. Best nodes are defined as relative reduction in impurity. If None then unlimited number of leaf nodes.",
,"min_impurity_decrease  min_impurity_decrease: float, default=0.0 A node will be split if this split induces a decrease of the impurity greater than or equal to this value. The weighted impurity decrease equation is the following::  N_t / N * (impurity - N_t_R / N_t * right_impurity  - N_t_L / N_t * left_impurity) where ``N`` is the total number of samples, ``N_t`` is the number of samples at the current node, ``N_t_L`` is the number of samples in the left child, and ``N_t_R`` is the number of samples in the right child. ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum, if ``sample_weight`` is passed. .. versionadded:: 0.19",0.0
,"bootstrap  bootstrap: bool, default=True Whether bootstrap samples are used when building trees. If False, the whole dataset is used to build each tree.",True


Evaluation

In [9]:
# Start a fresh metrics table and log the random forest's training-split scores.
metrics_df = create_metrics_df()
metrics_df = add_new_metrics(
    metrics_df, rf_regressor, X_train, y_train, "train", "Whithout bedrooms"
)
metrics_df

  updated_df = pd.concat([metrics_df, new_row_df], ignore_index=True)


Unnamed: 0,Model,Split,R2,Adjusted_R2,MAE,RMSE,MAPE,Comments
0,RandomForestRegressor,train,0.9823,0.9822,25888.0036,49132.8562,0.0486,Whithout bedrooms


In [10]:
# Log the random forest's scores on the held-out test split.
# The comment label now names the column actually dropped in this section
# ('bedrooms'); it previously read "Whithout yr_renovated" (copy-paste leftover).
metrics_df = add_new_metrics(
    metrics_df,
    rf_regressor,
    X_test,
    y_test,
    "test",
    "Whithout bedrooms"
)
metrics_df

Unnamed: 0,Model,Split,R2,Adjusted_R2,MAE,RMSE,MAPE,Comments
0,RandomForestRegressor,train,0.9823,0.9822,25888.0036,49132.8562,0.0486,Whithout bedrooms
1,RandomForestRegressor,test,0.8969,0.8964,67042.6859,115463.7677,0.127,Whithout yr_renovated


### XGBoost

In [11]:
# Fit an XGBoost regressor with its default hyperparameters on the training split.
# The xgb alias comes from the notebook's top import cell.
xgb_reg = xgb.XGBRegressor()
xgb_reg.fit(X_train, y_train)


0,1,2
,"objective  objective: typing.Union[str, xgboost.sklearn._SklObjWProto, typing.Callable[[typing.Any, typing.Any], typing.Tuple[numpy.ndarray, numpy.ndarray]], NoneType] Specify the learning task and the corresponding learning objective or a custom objective function to be used. For custom objective, see :doc:`/tutorials/custom_metric_obj` and :ref:`custom-obj-metric` for more information, along with the end note for function signatures.",'reg:squarederror'
,"base_score  base_score: typing.Union[float, typing.List[float], NoneType] The initial prediction score of all instances, global bias.",
,booster,
,"callbacks  callbacks: typing.Optional[typing.List[xgboost.callback.TrainingCallback]] List of callback functions that are applied at end of each iteration. It is possible to use predefined callbacks by using :ref:`Callback API `. .. note::  States in callback are not preserved during training, which means callback  objects can not be reused for multiple training sessions without  reinitialization or deepcopy. .. code-block:: python  for params in parameters_grid:  # be sure to (re)initialize the callbacks before each run  callbacks = [xgb.callback.LearningRateScheduler(custom_rates)]  reg = xgboost.XGBRegressor(**params, callbacks=callbacks)  reg.fit(X, y)",
,colsample_bylevel  colsample_bylevel: typing.Optional[float] Subsample ratio of columns for each level.,
,colsample_bynode  colsample_bynode: typing.Optional[float] Subsample ratio of columns for each split.,
,colsample_bytree  colsample_bytree: typing.Optional[float] Subsample ratio of columns when constructing each tree.,
,"device  device: typing.Optional[str] .. versionadded:: 2.0.0 Device ordinal, available options are `cpu`, `cuda`, and `gpu`.",
,"early_stopping_rounds  early_stopping_rounds: typing.Optional[int] .. versionadded:: 1.6.0 - Activates early stopping. Validation metric needs to improve at least once in  every **early_stopping_rounds** round(s) to continue training. Requires at  least one item in **eval_set** in :py:meth:`fit`. - If early stopping occurs, the model will have two additional attributes:  :py:attr:`best_score` and :py:attr:`best_iteration`. These are used by the  :py:meth:`predict` and :py:meth:`apply` methods to determine the optimal  number of trees during inference. If users want to access the full model  (including trees built after early stopping), they can specify the  `iteration_range` in these inference methods. In addition, other utilities  like model plotting can also use the entire model. - If you prefer to discard the trees after `best_iteration`, consider using the  callback function :py:class:`xgboost.callback.EarlyStopping`. - If there's more than one item in **eval_set**, the last entry will be used for  early stopping. If there's more than one metric in **eval_metric**, the last  metric will be used for early stopping.",
,enable_categorical  enable_categorical: bool See the same parameter of :py:class:`DMatrix` for details.,False


Evaluation

In [12]:
# Log the XGBoost model's training-split scores.
# The comment label now names the column actually dropped in this section
# ('bedrooms'); it previously read "Whithout yr_renovated" (copy-paste leftover).
metrics_df = add_new_metrics(
    metrics_df,
    xgb_reg,
    X_train,
    y_train,
    "train",
    "Whithout bedrooms"
)
metrics_df

Unnamed: 0,Model,Split,R2,Adjusted_R2,MAE,RMSE,MAPE,Comments
0,RandomForestRegressor,train,0.9823,0.9822,25888.0036,49132.8562,0.0486,Whithout bedrooms
1,RandomForestRegressor,test,0.8969,0.8964,67042.6859,115463.7677,0.127,Whithout yr_renovated
2,XGBRegressor,train,0.9786,0.9786,38678.5863,54008.8395,0.086,Whithout yr_renovated


In [13]:
# Log the XGBoost model's scores on the held-out test split.
# The comment label now names the column actually dropped in this section
# ('bedrooms'); it previously read "Whithout yr_renovated" (copy-paste leftover).
metrics_df = add_new_metrics(
    metrics_df,
    xgb_reg,
    X_test,
    y_test,
    "test",
    "Whithout bedrooms"
)
metrics_df

Unnamed: 0,Model,Split,R2,Adjusted_R2,MAE,RMSE,MAPE,Comments
0,RandomForestRegressor,train,0.9823,0.9822,25888.0036,49132.8562,0.0486,Whithout bedrooms
1,RandomForestRegressor,test,0.8969,0.8964,67042.6859,115463.7677,0.127,Whithout yr_renovated
2,XGBRegressor,train,0.9786,0.9786,38678.5863,54008.8395,0.086,Whithout yr_renovated
3,XGBRegressor,test,0.9003,0.8999,65793.1286,113502.9973,0.1241,Whithout yr_renovated


### Conclusion

In [156]:
# Write the metrics table of the 'bedrooms' experiment to its own CSV file.
# The previous target, 'Metrics/drop_yr_renovated.csv', belongs to the
# 'yr_renovated' experiment and would have been overwritten with this section's metrics.
metrics_df.to_csv('Metrics/drop_bedrooms.csv')

In [None]:
# Show the baseline metrics for comparison with the 'bedrooms' experiment above.
base_metrics

Unnamed: 0,Model,Split,R2,Adjusted_R2,MAE,RMSE,MAPE,Comments
0,LinearRegression,train,0.6977,0.6974,125948.1118,202864.5703,0.2561,Baseline model
1,LinearRegression,test,0.7162,0.7148,125985.6747,191531.3335,0.2596,Baseline model
4,RandomForestRegressor,train,0.982,0.9819,25948.4444,49547.1197,0.0486,"Baseline, no normalization, random_state 13, d..."
5,RandomForestRegressor,test,0.8959,0.8954,67269.877,115994.2051,0.1271,"Baseline, no normalization, random_state 13, d..."
6,XGBRegressor,train,0.978,0.978,39126.7558,54676.7727,0.0872,"Baseline, no normalization, default values."
7,XGBRegressor,test,0.9015,0.901,65712.7448,112860.4995,0.1246,"Baseline, no normalization, default values."


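Dropping 'bedrooms' changes the scores only marginally: Random Forest improves slightly on the test split (R2 0.8959 → 0.8969), while XGBoost gets slightly worse (R2 0.9015 → 0.9003).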

## Drop 'sqft_lot', 'floors', 'month_sold' and 'day_sold' together

Defining target and features

In [157]:
# Target is the sale price; features exclude the four candidate columns at once.
y = df['price']
X = df.drop(columns=['price', 'sqft_lot', 'floors', 'month_sold', 'day_sold'])
X.head()

Unnamed: 0,bedrooms,bathrooms,sqft_living,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15,year_sold
0,3,1.0,1180,0,0,3,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650,2014
1,3,2.25,2570,0,0,3,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639,2014
2,2,1.0,770,0,0,3,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062,2015
3,4,3.0,1960,0,0,5,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000,2014
4,3,2.0,1680,0,0,3,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503,2015


Split the data

In [158]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=13,
)

### Linear Regression

In [159]:
# Fit a linear regression on the training split.
# LinearRegression comes from the notebook's top import cell.
lm = LinearRegression()
lm.fit(X_train, y_train)

Evaluation

In [160]:
# Start a fresh metrics table and log the linear model's training-split scores.
metrics_df = create_metrics_df()
metrics_df = add_new_metrics(metrics_df, lm, X_train, y_train, "train", "Whithout 4 columns")

metrics_df

  updated_df = pd.concat([metrics_df, new_row_df], ignore_index=True)


Unnamed: 0,Model,Split,R2,Adjusted_R2,MAE,RMSE,MAPE,Comments
0,LinearRegression,train,0.6976,0.6973,126045.8997,202924.743,0.2561,Whithout 4 columns


In [161]:
# Log the linear model's scores on the held-out test split.
metrics_df = add_new_metrics(metrics_df, lm, X_test, y_test, "test", "Whithout 4 columns")

metrics_df

Unnamed: 0,Model,Split,R2,Adjusted_R2,MAE,RMSE,MAPE,Comments
0,LinearRegression,train,0.6976,0.6973,126045.8997,202924.743,0.2561,Whithout 4 columns
1,LinearRegression,test,0.7157,0.7145,126103.8682,191706.7474,0.2597,Whithout 4 columns


### Random Forest

In [162]:
# Fit a random forest on the training split; the fixed seed keeps runs reproducible.
# RandomForestRegressor comes from the notebook's top import cell.
rf_regressor = RandomForestRegressor(random_state=13)
rf_regressor.fit(X_train, y_train)

Evaluation

In [163]:
# Log the random forest's training-split scores.
metrics_df = add_new_metrics(
    metrics_df, rf_regressor, X_train, y_train, "train", "Whithout 4 columns"
)
metrics_df

Unnamed: 0,Model,Split,R2,Adjusted_R2,MAE,RMSE,MAPE,Comments
0,LinearRegression,train,0.6976,0.6973,126045.8997,202924.743,0.2561,Whithout 4 columns
1,LinearRegression,test,0.7157,0.7145,126103.8682,191706.7474,0.2597,Whithout 4 columns
2,RandomForestRegressor,train,0.9823,0.9823,25887.6388,49105.0748,0.0486,Whithout 4 columns


In [164]:
# Log the random forest's scores on the held-out test split.
metrics_df = add_new_metrics(
    metrics_df, rf_regressor, X_test, y_test, "test", "Whithout 4 columns"
)
metrics_df

Unnamed: 0,Model,Split,R2,Adjusted_R2,MAE,RMSE,MAPE,Comments
0,LinearRegression,train,0.6976,0.6973,126045.8997,202924.743,0.2561,Whithout 4 columns
1,LinearRegression,test,0.7157,0.7145,126103.8682,191706.7474,0.2597,Whithout 4 columns
2,RandomForestRegressor,train,0.9823,0.9823,25887.6388,49105.0748,0.0486,Whithout 4 columns
3,RandomForestRegressor,test,0.8975,0.8971,67388.4881,115083.047,0.1276,Whithout 4 columns


### XGBoost

In [165]:
# Fit an XGBoost regressor with its default hyperparameters on the training split.
# The xgb alias comes from the notebook's top import cell.
xgb_reg = xgb.XGBRegressor()
xgb_reg.fit(X_train, y_train)


Evaluation

In [166]:
# Log the XGBoost model's training-split scores.
metrics_df = add_new_metrics(
    metrics_df, xgb_reg, X_train, y_train, "train", "Whithout 4 columns"
)
metrics_df

Unnamed: 0,Model,Split,R2,Adjusted_R2,MAE,RMSE,MAPE,Comments
0,LinearRegression,train,0.6976,0.6973,126045.8997,202924.743,0.2561,Whithout 4 columns
1,LinearRegression,test,0.7157,0.7145,126103.8682,191706.7474,0.2597,Whithout 4 columns
2,RandomForestRegressor,train,0.9823,0.9823,25887.6388,49105.0748,0.0486,Whithout 4 columns
3,RandomForestRegressor,test,0.8975,0.8971,67388.4881,115083.047,0.1276,Whithout 4 columns
4,XGBRegressor,train,0.9756,0.9756,40967.3963,57584.1626,0.0907,Whithout 4 columns


In [167]:
# Log the XGBoost model's scores on the held-out test split.
metrics_df = add_new_metrics(
    metrics_df, xgb_reg, X_test, y_test, "test", "Whithout 4 columns"
)
metrics_df

Unnamed: 0,Model,Split,R2,Adjusted_R2,MAE,RMSE,MAPE,Comments
0,LinearRegression,train,0.6976,0.6973,126045.8997,202924.743,0.2561,Whithout 4 columns
1,LinearRegression,test,0.7157,0.7145,126103.8682,191706.7474,0.2597,Whithout 4 columns
2,RandomForestRegressor,train,0.9823,0.9823,25887.6388,49105.0748,0.0486,Whithout 4 columns
3,RandomForestRegressor,test,0.8975,0.8971,67388.4881,115083.047,0.1276,Whithout 4 columns
4,XGBRegressor,train,0.9756,0.9756,40967.3963,57584.1626,0.0907,Whithout 4 columns
5,XGBRegressor,test,0.9008,0.9004,67442.865,113232.2624,0.1288,Whithout 4 columns


### Conclusion

In [168]:
# Write the metrics table of the four-column experiment to a CSV file under Metrics/.
metrics_df.to_csv('Metrics/drop_4_columns.csv')

In [169]:
# Show the baseline metrics for comparison with the four-column experiment above.
base_metrics

Unnamed: 0,Model,Split,R2,Adjusted_R2,MAE,RMSE,MAPE,Comments
0,LinearRegression,train,0.6977,0.6974,125948.1118,202864.5703,0.2561,Baseline model
1,LinearRegression,test,0.7162,0.7148,125985.6747,191531.3335,0.2596,Baseline model
4,RandomForestRegressor,train,0.982,0.9819,25948.4444,49547.1197,0.0486,"Baseline, no normalization, random_state 13, d..."
5,RandomForestRegressor,test,0.8959,0.8954,67269.877,115994.2051,0.1271,"Baseline, no normalization, random_state 13, d..."
6,XGBRegressor,train,0.978,0.978,39126.7558,54676.7727,0.0872,"Baseline, no normalization, default values."
7,XGBRegressor,test,0.9015,0.901,65712.7448,112860.4995,0.1246,"Baseline, no normalization, default values."


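Dropping these columns together slightly improves the Random Forest test scores (R2 0.8959 → 0.8975), but makes Linear Regression (R2 0.7162 → 0.7157) and XGBoost (R2 0.9015 → 0.9008) slightly worse on the test split.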