In [62]:
import pandas as pd
import numpy as np

raw_df = pd.read_excel("./Real estate valuation data set.xlsx")
del raw_df['No']
del raw_df['X1 transaction date']

X=raw_df.iloc[:,:-1].values
y=raw_df.iloc[:,-1].values

In [63]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=0)

In [64]:
from sklearn.preprocessing import StandardScaler
std_sc=StandardScaler()

std_X_train = std_sc.fit_transform(X_train)
std_X_test = std_sc.transform(X_test)

In [65]:
from sklearn.decomposition import PCA

std_pca = PCA(n_components=2)
pca_X_train = std_pca.fit_transform(std_X_train)
pca_X_test = std_pca.transform(std_X_test)
print('Variance with 2 columns and Standard Scaler:', sum(std_pca.explained_variance_ratio_ * 100))

Variance with 2 columns and Standard Scaler: 73.49063042239908


In [66]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

rf = RandomForestRegressor(n_estimators=30, random_state=6, max_depth=20)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
rf_mae = mean_absolute_error(y_test, y_pred)
rf_mse = mean_squared_error(y_test, y_pred)
rf_r2 = r2_score(y_test, y_pred)
print(rf_r2)
print(rf_mae)
print(rf_mse)

0.713716718427643
5.004758940524002
49.727123424372515


In [67]:
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.svm import SVR

alphas = [0.1, 0.5, 1, 5, 10]
linear_models = {
    'linear_regression' : {
        'model': LinearRegression(),
        'params': {},
        'random_state': [0]
    }, 
    'ridge': {
        'model': Ridge(),
        'params': {
            'alpha': alphas,
            'random_state': [0]
        }
    }, 
    'lasso': {
        'model': Lasso(),
        'params': {
            'alpha': alphas,
            'random_state': [0]
        }
    },
    'elasticnet': {
        'model': ElasticNet(),
        'params': {
            'alpha': alphas,
            'random_state': [0]
        }
    }
}

In [68]:
from sklearn.model_selection import GridSearchCV

standard_scores = []
pca_scores = []
for model, options in linear_models.items():
  gs_scaled = GridSearchCV(options['model'], options['params'], cv=5)
  gs_scaled.fit(std_X_train, y_train)
  standard_scores.append({
      'model': model,
      'best_score': gs_scaled.best_score_,
      'best_params': gs_scaled.best_params_
  })
  gs_pca = GridSearchCV(options['model'], options['params'], cv=5)
  gs_pca.fit(pca_X_train, y_train)
  pca_scores.append({
      'model': model,
      'best_score': gs_pca.best_score_,
      'best_params': gs_pca.best_params_
  })

In [78]:
print(gs_scaled.best_estimator_)
y_pred_scale = gs_scaled.predict(std_X_test)
print(r2_score(y_test, y_pred_scale))
print(mean_absolute_error(y_test, y_pred_scale))
print(mean_squared_error(y_test, y_pred_scale))

ElasticNet(alpha=0.1, random_state=0)
0.637839655928244
5.7889050050091
62.906894283722494


In [79]:
print(gs_scaled.best_estimator_)
y_pred_scale = gs_pca.predict(pca_X_test)
print(r2_score(y_test, y_pred_scale))
print(mean_absolute_error(y_test, y_pred_scale))
print(mean_squared_error(y_test, y_pred_scale))

ElasticNet(alpha=0.1, random_state=0)
0.5984322965842925
6.356587986277489
69.75191370351696


In [73]:
gs_svr_std = GridSearchCV(estimator=SVR(), 
             param_grid = 
             {
                'C': [1, 10, 20],
                'kernel': ['linear', 'rbf', 'poly'],
                'gamma': ['scale', 'auto']
            }, cv=5)

gs_svr_std.fit(std_X_train, y_train)

gs_svr_pca = GridSearchCV(estimator=SVR(), 
             param_grid = 
             {
                'C': [1, 10, 20],
                'kernel': ['linear', 'rbf', 'poly'],
                'gamma': ['scale', 'auto']
            }, cv=5)

gs_svr_pca.fit(pca_X_train, y_train)

In [74]:
print(gs_svr_std.best_score_)
print(gs_svr_pca.best_score_)

0.6630174527850374
0.5870099798299374


In [80]:
print(gs_svr_std.best_estimator_)
y_pred_scale = gs_svr_std.predict(std_X_test)
print(r2_score(y_test, y_pred_scale))
print(mean_absolute_error(y_test, y_pred_scale))
print(mean_squared_error(y_test, y_pred_scale))

SVR(C=20, gamma='auto')
0.713099337024311
5.1021199953086915
49.83436196472589


In [81]:
print(gs_svr_pca.best_estimator_)
y_pred_scale = gs_svr_pca.predict(pca_X_test)
print(r2_score(y_test, y_pred_scale))
print(mean_absolute_error(y_test, y_pred_scale))
print(mean_squared_error(y_test, y_pred_scale))

SVR(C=10)
0.6279564233955536
5.703751462323295
64.6236020191012


In [75]:
gs_rf_std = GridSearchCV(estimator=RandomForestRegressor(), 
             param_grid = 
             {
              'n_estimators': [10, 20, 30, 40, 50, 100, 150],
              'max_depth': [20, 40, 60],
              'random_state': [0],
              'min_samples_split': [2, 5, 10],
              'min_samples_leaf': [1, 2, 4]
             }, cv=5)
gs_rf_std.fit(std_X_train, y_train)

gs_rf_pca = GridSearchCV(estimator=RandomForestRegressor(), 
             param_grid = 
             {
              'n_estimators': [10, 20, 30, 40, 50, 100, 150],
              'max_depth': [20, 40, 60],
              'random_state': [0],
              'min_samples_split': [2, 5, 10],
              'min_samples_leaf': [1, 2, 4]
             }, cv=5)
gs_rf_pca.fit(pca_X_train, y_train)

In [84]:
print(gs_rf_std.best_score_)
print(gs_rf_pca.best_score_)

0.691115704798381
0.6406131686184092


In [83]:
print(gs_rf_std.best_estimator_)
y_pred_scale = gs_rf_std.predict(std_X_test)
print(r2_score(y_test, y_pred_scale))
print(mean_absolute_error(y_test, y_pred_scale))
print(mean_squared_error(y_test, y_pred_scale))

RandomForestRegressor(max_depth=20, min_samples_leaf=4, n_estimators=150,
                      random_state=0)
0.7537158243645968
4.698451389302878
42.77931820547523


In [85]:
print(gs_rf_pca.best_estimator_)
y_pred_scale = gs_rf_pca.predict(pca_X_test)
print(r2_score(y_test, y_pred_scale))
print(mean_absolute_error(y_test, y_pred_scale))
print(mean_squared_error(y_test, y_pred_scale))

RandomForestRegressor(max_depth=20, min_samples_leaf=4, n_estimators=150,
                      random_state=0)
0.6618173792485824
5.364156922785072
58.74198741093129
