# Load Package

In [327]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Load Data จากใน Google Colab

In [328]:
PATH_LOAD_TRAIN = 'dataset/california_housing_/california_housing_train.csv'
PATH_LOAD_TEST = 'dataset/california_housing_/california_housing_test.csv'

In [329]:
df_train = pd.read_csv(PATH_LOAD_TRAIN) # define variable 
df_test = pd.read_csv(PATH_LOAD_TEST) # define variable 

In [330]:
df_train.columns # columns check

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'median_house_value'],
      dtype='object')

# Preprocess Data

- Define function to find missing values

In [331]:
def handling_missing_value(_df):
    for col in _df.columns:
        _df[col] = _df[col].fillna(_df[col].mean()) # replace missing value 
        
    return _df # output dataframe have not N/A by replace df[col]-mean all columns

- Define function to remove multicolinerity

In [332]:
def remove_multicolinearity(_df, _threshold):
    
    corr_matrix = _df.corr().abs()
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))
    to_drop = [column for column in upper.columns if any(upper[column] > _threshold)]
    
    print(f'Drop Features: {to_drop}')
    
    # Drop features # Drop highly corrlerate features
    _df = _df.drop(columns = to_drop)
    
    return _df

- Define function to remove outliers

In [333]:
# IQR method
def remove_outlier(_df, _labels):
    q3_label = _df[_labels].quantile(.75) # define label at quantiile(.75)
    q1_label = _df[_labels].quantile(.25) # define label at quantiile(.25)
    
    IQR = q3_label - q1_label # IQR poit formular
    
    thershold_outlier_max = q3_label + 1.5*IQR # upper thershold 
    thershold_outlier_min = q3_label - 1.5*IQR # lower thershold 
    
    _df_remove_outlier = _df[(_df[_labels] >= thershold_outlier_min) &\
                             (_df[_labels] <= thershold_outlier_max)]
    
    return _df_remove_outlier # output dataframe dont have outlier

- Define function to normalization and standardize Scaler

In [334]:
def norm_and_scale(_df, _labels):
    
    _norm = MinMaxScaler() # Create Objective for normalize
    _stand = StandardScaler() # Create Objective for StandardScaler
    
    cols = [col for col in _df.columns if col != _labels] # col constraints are not label columns
    
    _df[cols] = _norm.fit_transform(_df[cols])
    _df[cols] = _stand.fit_transform(_df[cols])
    
    return _df, _norm, _stand # # output dataframe normalize and standard scaler success

- Preprocessing Data sum function

In [335]:
def handling_preprocessing_data(_df, _labels, _threshold= 0.8):
    
    # Preprocessing Data
    _df = handling_missing_value(_df) # 1.Define function to find missing values
    _df = remove_multicolinearity(_df, _threshold) # 2.remove_multicolinearity
    _df = remove_outlier(_df, _labels) # 3.remove_outlier
    _df, _train_norm, _train_stand = norm_and_scale(_df, _labels) # 4.norm_and_scale
    
    return _df, _train_norm, _train_stand # output three values df_prep, df_train_norm, df_train_stand
    

# Using function handling_preprocessing_data

In [336]:
df_train.median_house_value[:5]

0    66900.0
1    80100.0
2    85700.0
3    73400.0
4    65500.0
Name: median_house_value, dtype: float64

In [337]:
df_train_prep, train_norm, train_stand = handling_preprocessing_data(df_train, 'median_house_value', _threshold=.95)
df_train_prep.shape


Drop Features: ['households']


(16013, 8)

# Save Normalize, Standard

In [338]:
import joblib

In [339]:
PATH_SAVE_SCALE = 'prep/house_price/'

joblib.dump(train_norm, PATH_SAVE_SCALE + 'norm_hp.joblib')
joblib.dump(train_stand, PATH_SAVE_SCALE + 'stand_hp.joblib')

['prep/house_price/stand_hp.joblib']

# Split - Validation

In [340]:
# import train, test data
from sklearn.model_selection import train_test_split

In [341]:
# function for train test split
def split_data(_df, _label, test_size, random_state):
    
    X = _df.drop(columns=[_label]).values
    y = _df[_label].values
    
    _X_train, _X_test, _y_train, _y_test = train_test_split(X, y, test_size=0.3, random_state=2020)
    
    return _X_train, _X_test, _y_train, _y_test

In [342]:
X_train, X_test, y_train, y_test = split_data(df_train_prep, 'median_house_value', test_size=0.3, random_state=20)
    

# Train

In [343]:
from sklearn.model_selection import GridSearchCV

# Setting Parameters ขึ้นกับ Model

In [344]:
from sklearn.metrics import *
from sklearn.linear_model import ElasticNet

In [345]:
SCORERS.keys() # find key for use

dict_keys(['explained_variance', 'r2', 'max_error', 'neg_median_absolute_error', 'neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_mean_squared_log_error', 'neg_root_mean_squared_error', 'neg_mean_poisson_deviance', 'neg_mean_gamma_deviance', 'accuracy', 'roc_auc', 'roc_auc_ovr', 'roc_auc_ovo', 'roc_auc_ovr_weighted', 'roc_auc_ovo_weighted', 'balanced_accuracy', 'average_precision', 'neg_log_loss', 'neg_brier_score', 'adjusted_rand_score', 'homogeneity_score', 'completeness_score', 'v_measure_score', 'mutual_info_score', 'adjusted_mutual_info_score', 'normalized_mutual_info_score', 'fowlkes_mallows_score', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'jaccard', 'jaccard_macro', 'jaccard_micro', 'jaccard_samples', 'jaccard_weighted'])

In [346]:
reg_enet = ElasticNet() # create object fot train model

# Assumption parameters
params = [
    {'alpha': [0, 0.2, 0.4, 0.6, 0.8, 1],
    'l1_ratio': [0, 0.2, 0.4, 0.6, 0.8, 1],
    'max_iter': [100, 1000],
    'random_state': [2020, 2018, 2077, 2222]}
]

In [347]:
# Refer https://scikit-learn.org/stable/modules/model_evaluation.html
scoring = {
    'r_square': 'r2',
    'mse': 'neg_mean_squared_error'
}

In [348]:
def train_model(_classifier, _params, _scoring, _X_train, _y_train, _cv):
    
    _grid = GridSearchCV(_classifier, param_grid= _params, refit='mse', scoring= _scoring, cv= _cv)
    _grid.fit(_X_train, _y_train)
    
    return _grid

In [349]:
%%time
grid = train_model(reg_enet, params, scoring, X_train, y_train, _cv=5)

  estimator.fit(X_train, y_train, **fit_params)
  positive)
  positive)
  estimator.fit(X_train, y_train, **fit_params)
  positive)
  positive)
  estimator.fit(X_train, y_train, **fit_params)
  positive)
  positive)
  estimator.fit(X_train, y_train, **fit_params)
  positive)
  positive)
  estimator.fit(X_train, y_train, **fit_params)
  positive)
  positive)
  estimator.fit(X_train, y_train, **fit_params)
  positive)
  positive)
  estimator.fit(X_train, y_train, **fit_params)
  positive)
  positive)
  estimator.fit(X_train, y_train, **fit_params)
  positive)
  positive)
  estimator.fit(X_train, y_train, **fit_params)
  positive)
  positive)
  estimator.fit(X_train, y_train, **fit_params)
  positive)
  positive)
  estimator.fit(X_train, y_train, **fit_params)
  positive)
  positive)
  estimator.fit(X_train, y_train, **fit_params)
  positive)
  positive)
  estimator.fit(X_train, y_train, **fit_params)
  positive)
  positive)
  estimator.fit(X_train, y_train, **fit_params)
  positive)
  po

Wall time: 23.7 s


  positive)


In [350]:
results = grid.cv_results_ # results become to Dictionary
df_result = pd.DataFrame(results).sort_values(by='rank_test_mse')

In [351]:
df_result[:1]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_alpha,param_l1_ratio,param_max_iter,param_random_state,params,split0_test_r_square,...,std_test_r_square,rank_test_r_square,split0_test_mse,split1_test_mse,split2_test_mse,split3_test_mse,split4_test_mse,mean_test_mse,std_test_mse,rank_test_mse
283,0.0092,0.001166,0.0012,0.000399,1,1,100,2222,"{'alpha': 1, 'l1_ratio': 1, 'max_iter': 100, '...",0.628244,...,0.030533,1,-3535411000.0,-3562962000.0,-3591842000.0,-3704942000.0,-4111776000.0,-3701387000.0,213139300.0,1


In [352]:
from math import sqrt

sqrt(df_result[:1]['mean_test_mse'].abs().tolist()[0])

60839.02153454268

In [353]:
df_result[-1:]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_alpha,param_l1_ratio,param_max_iter,param_random_state,params,split0_test_r_square,...,std_test_r_square,rank_test_r_square,split0_test_mse,split1_test_mse,split2_test_mse,split3_test_mse,split4_test_mse,mean_test_mse,std_test_mse,rank_test_mse
241,0.009202,0.000748,0.0008,0.0004,1,0,100,2018,"{'alpha': 1, 'l1_ratio': 0, 'max_iter': 100, '...",0.374983,...,0.014234,281,-5943924000.0,-5857289000.0,-5829226000.0,-5867324000.0,-5918609000.0,-5883275000.0,41899380.0,285


In [354]:
sqrt(df_result[-1:]['mean_test_mse'].abs().tolist()[0])

76702.50669492205

# Check Best Params

In [355]:
best_params = df_result[df_result['rank_test_mse']== 1]['params'].tolist()
best_params

[{'alpha': 1, 'l1_ratio': 1, 'max_iter': 100, 'random_state': 2222},
 {'alpha': 1, 'l1_ratio': 1, 'max_iter': 100, 'random_state': 2077},
 {'alpha': 1, 'l1_ratio': 1, 'max_iter': 100, 'random_state': 2018},
 {'alpha': 1, 'l1_ratio': 1, 'max_iter': 100, 'random_state': 2020}]

# Full Train with Best Params

In [356]:
best_model = ElasticNet(alpha=1, l1_ratio=1, max_iter=100, random_state=2222)

In [357]:
df_train_prep.columns

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'median_income', 'median_house_value'],
      dtype='object')

In [358]:
X = df_train_prep.drop(columns='median_house_value').values
y = df_train_prep['median_house_value'].values

In [359]:
%%time
best_model.fit(X, y) # train full model by the best paramiter

Wall time: 16 ms


  positive)


ElasticNet(alpha=1, copy_X=True, fit_intercept=True, l1_ratio=1, max_iter=100,
           normalize=False, positive=False, precompute=False, random_state=2222,
           selection='cyclic', tol=0.0001, warm_start=False)

# Save Best Model

In [360]:
PATH_SAVE_MODEL = 'prep/house_price/'

joblib.dump(best_model, PATH_SAVE_MODEL + 'best_enet_01.joblib')


['prep/house_price/best_enet_01.joblib']

# Test with data before full training data

In [361]:
r_square_train = best_model.score(X_train, y_train)
r_square_test = best_model.score(X_test, y_test)

print(f'r2-train: {r_square_train:.3f}, r2-test: {r_square_test:.3f}')

r2-train: 0.599, r2-test: 0.616


In [362]:
y_pred_train = best_model.predict(X_train)
y_pred_test = best_model.predict(X_test)

rmse_train = sqrt(mean_squared_error(y_train, y_pred_train))
rmse_test = sqrt(mean_squared_error(y_test, y_pred_test))

print(f'rmse_train: {rmse_train:.3f}, rmse_test: {rmse_test:.3f}')

rmse_train: 60685.047, rmse_test: 58571.085


# Load Preprocess and Testing

In [363]:
df_test[:3]

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-122.05,37.37,27.0,3885.0,661.0,1537.0,606.0,6.6085,344700.0
1,-118.3,34.26,43.0,1510.0,310.0,809.0,277.0,3.599,176500.0
2,-117.81,33.78,27.0,3589.0,507.0,1484.0,495.0,5.7934,270500.0


In [364]:
# Choose columns for df_test from df_train columns
def align_columns_train_test(_df_train, _df_test):
    
    _train_cols = _df_train.columns
    _df_test = _df_test[_train_cols]
    
    return _df_test

In [365]:
df_test = align_columns_train_test(df_train_prep, df_test)
df_test[:3]

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,median_income,median_house_value
0,-122.05,37.37,27.0,3885.0,661.0,1537.0,6.6085,344700.0
1,-118.3,34.26,43.0,1510.0,310.0,809.0,3.599,176500.0
2,-117.81,33.78,27.0,3589.0,507.0,1484.0,5.7934,270500.0


In [366]:
X_ftest = df_test.drop(columns='median_house_value').values
y_ftest = df_test['median_house_value'].values

In [367]:
# norm and standardscale
def norm_and_scale_test_data(_X_ftest, _norm, _stand):
    
    _X_ftest =  _norm.fit_transform(_X_ftest)
    _X_ftest = _stand.fit_transform(_X_ftest)
    
    return _X_ftest

In [368]:
# load data norm, standard scale

loaded_norm  = joblib.load(PATH_SAVE_SCALE + 'norm_hp.joblib')
loaded_norm  = joblib.load(PATH_SAVE_SCALE + 'stand_hp.joblib')

In [369]:
# normalize and standardscale by using norm_and_scale_test_data function
X_ftest_scale = norm_and_scale_test_data(X_ftest, train_norm, train_stand)

# Prediction

In [370]:
y_pred_ftest = best_model.predict(X_ftest_scale)
rmse_ftest = sqrt(mean_squared_error(y_ftest, y_pred_ftest))

print(f'r2-full-train: {rmse_ftest:.3f}')

r2-full-train: 72431.681


In [371]:
# check 3 values 
y_pred_ftest[:3]

array([315753.43830788, 199362.62774608, 242863.80251639])

# Svae Submission

In [372]:
df_submission = pd.DataFrame({'Id': df_test.index, 'SalePrice': y_pred_ftest})

In [373]:
df_submission[:3]

Unnamed: 0,Id,SalePrice
0,0,315753.438308
1,1,199362.627746
2,2,242863.802516


In [374]:
PATH_SAVE_SUB = 'dataset/california_housing_/submission/'

In [376]:
df_submission.to_csv(PATH_SAVE_SUB + 'submission_ver01.csv', index=False)

# Interprete Regression Model

In [None]:
beta_0 = best_model.intercept_
beta_ls = best_model.coef_

In [None]:
beta_0, beta_ls

In [None]:
df_beta = pd.DataFrame({'features': df_test.drop(columns='median_house_value').columns,
                       'beta_val': beta_ls}).sort_values(by= 'beta_val', ascending=False)

In [None]:
df_beta