In [187]:
# Auto-reload imported project modules so edits under src/ are picked up live.
%load_ext autoreload
%autoreload 2

import sys
sys.path.append("..")   # if the notebook lives in notebooks/ and src/ sits next to it

import src.data as dt
import src.models as md
import src.features as ft
# NOTE(review): `pd` here is the project's predict module, NOT pandas.
import src.predict as pd

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [99]:
# Load the training data. The loader returns a 4-part train/validation split
# followed by X and y (presumably the unsplit features/target — confirm in src.data).
train_val_split, X, y = dt.load_train_set()
X_train, X_val, y_train, y_val = train_val_split

## 2. Feature Importance

In [63]:
# Compute feature importances for every model in md.MODELS on the training
# split, then merge them into one table (one importance column per model).
importance_dfs = ft.get_feature_importances(md.MODELS, X_train, y_train)
importance_df = ft.combine_feature_importances(importance_dfs)

# Sanity check: shape and row count on separate lines, then a preview.
print(importance_df.shape, len(importance_df), sep="\n")
importance_df.head()

Values to compute: .
(66, 7)
66


Unnamed: 0,feature,importance_xgb,importance_ctb,importance_lgb,importance_svr,importance_mlp,importance_knn
33,1stFlrSF,0.050795,0.055531,0.047587,0.077864,0.126215,0.032349
34,2ndFlrSF,0.014597,0.026175,0.018163,0.062962,0.11268,0.018871
59,3SsnPorch,0.0,7e-06,0.0,0.0,0.0,0.0
41,BedroomAbvGr,0.00285,0.008655,0.001769,0.004588,0.013974,0.008968
7,BldgType,0.003606,0.003777,0.000374,0.000508,0.009924,0.004133


In [64]:
# Derive the cross-model comparison table from the combined importances
# (its layout is defined by ft.get_feature_comparison).
comparison_df = ft.get_feature_comparison(importance_df)

# Visualise the comparison twice: once by importance value, once by rank.
ft.plot_importance_comparison(comparison_df)
ft.plot_rank_comparison(comparison_df)

In [None]:
# Candidate sizes for the "top-N features" subsets to evaluate.
feature_numbers = [10, 15, 20, 25, 30, 35]

# Define the feature ranking here: it was previously only assigned in the later
# "Single models" cell, so this cell raised NameError on a fresh kernel
# (Restart & Run All).
features_by_importance = ft.get_features_by_importance(comparison_df)

# Score every model in md.MODELS for each candidate feature count.
scores = ft.evaluate_feature_number_per_models(
    md.MODELS, feature_numbers, X_train, y_train, features_by_importance
)

In [72]:
# Plot the scores computed above against the candidate feature counts.
# NOTE(review): the trailing 3 is an unexplained positional argument
# (possibly a subplot grid/column count) — confirm in src.features.
ft.plot_model_scores(scores, feature_numbers, 3)


## 3. Model optimization

### 3.1 Single models

In [None]:
# Per-model results of the feature-count search, keyed by the model's key in md.MODELS.
params, cv_scores, val_scores, studies = {}, {}, {}, {}

# Feature names used for the "top-N" slices (ordering comes from ft.get_features_by_importance).
features_by_importance = ft.get_features_by_importance(comparison_df)

for key, model_spec in md.MODELS.items():
    model_name = model_spec["name"]
    print(f'\nEvaulating: {model_name}')
    search_result = md.evaluate_best_feature_number(
        model_spec,
        features_by_importance,
        [10, 20, 30, 40, 50, 60, 66],
        X_train,
        y_train,
        X_val,
        y_val,
    )
    # The search yields four values per model: params, CV scores, validation scores, study.
    params[key], cv_scores[key], val_scores[key], studies[key] = search_result

# Persist the collected results (a later cell reloads them via md.load_models_values).
md.save_models_values(params, cv_scores, val_scores)


Evaulating: XGBoost Regressor
processsing number 10
cv_score: 0.30815722698911874, val_score: 0.39637082072552204, param: {'n_estimators': 289, 'learning_rate': 0.028290937070217625, 'max_depth': 5}
processsing number 20
cv_score: 0.2948775703787908, val_score: 0.3723840567619735, param: {'n_estimators': 277, 'learning_rate': 0.11963787101883668, 'max_depth': 2}
processsing number 30
cv_score: 0.2844780811885722, val_score: 0.3629209696781884, param: {'n_estimators': 1997, 'learning_rate': 0.04499079104570627, 'max_depth': 2}
processsing number 40
cv_score: 0.28080933956408777, val_score: 0.35595040013435597, param: {'n_estimators': 1013, 'learning_rate': 0.08530831194948968, 'max_depth': 2}
processsing number 50
cv_score: 0.2816905675923344, val_score: 0.35964622779773164, param: {'n_estimators': 1262, 'learning_rate': 0.09690087196285094, 'max_depth': 2}
processsing number 60
cv_score: 0.28194998554613837, val_score: 0.3558679049276485, param: {'n_estimators': 1641, 'learning_rate':

In [224]:
# Reload tuned parameters and scores — presumably the values persisted by
# md.save_models_values, so the slow search need not be re-run. `studies` is not restored here.
params, cv_scores, val_scores = md.load_models_values()

In [219]:
# Plot CV and validation scores for the feature counts tried in the search.
# NOTE(review): this list duplicates the one passed to md.evaluate_best_feature_number;
# keep the two in sync. The meaning of the trailing 3 is not visible here — confirm in src.models.
md.plot_model_scores(cv_scores, val_scores, [10, 20, 30, 40, 50, 60, 66], 3)

### 3.2 Stacking

In [215]:
# Number of top-ranked features each base model uses in the stack
# (model key -> feature count).
# NOTE(review): presumably chosen by hand from the score plots — confirm.
model_selection = dict(
    svr=40,
    xgb=66,
    ctb=66,
    lgb=40,
    mlp=40,
    knn=20,
)

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import Ridge


def create_estimators(model_selection):
    """Build the named base-estimator pipelines for a stacking ensemble.

    Args:
        model_selection: Mapping of model key (as used in ``md.MODELS`` and
            ``params``) to the number of top-ranked features that model uses.

    Returns:
        A list of ``(key, Pipeline)`` tuples, in the mapping's iteration order.
        Each pipeline first keeps only the first ``n`` entries of the
        notebook-level ``features_by_importance`` and then applies the model
        constructed with ``params[key][str(n)]``.
    """
    def build_pipeline(key, n):
        # Only the listed columns pass through; the rest are dropped
        # (ColumnTransformer's default remainder).
        selector = ColumnTransformer(
            [(f"sel_{n}", "passthrough", features_by_importance[:n])]
        )
        model_cls = md.MODELS[key]['model']
        # Tuned parameters are keyed by the feature count as a string.
        tuned_params = params[key][str(n)]
        return Pipeline([('select', selector), (key, model_cls(**tuned_params))])

    return [(key, build_pipeline(key, n)) for key, n in model_selection.items()]
    
def create_stack(model_selection, final_estimator):
    """Assemble an unfitted StackingRegressor from the selected base models.

    Args:
        model_selection: Mapping of model key to feature count, forwarded to
            ``create_estimators``.
        final_estimator: Estimator passed to the stack as its final estimator.

    Returns:
        A ``StackingRegressor`` configured with ``cv=5``.
    """
    return StackingRegressor(
        estimators=create_estimators(model_selection),
        final_estimator=final_estimator,
        cv=5,
    )
    
# Pass the per-model feature counts explicitly: create_stack takes
# (model_selection, final_estimator), so the previous single-argument call
# raised TypeError on a fresh run.
stack_regressor = create_stack(model_selection, Ridge(alpha=1.0))
stack_regressor

0,1,2
,estimators,"[('svr', ...), ('xgb', ...), ...]"
,final_estimator,Ridge()
,cv,5
,n_jobs,
,passthrough,False
,verbose,0

0,1,2
,transformers,"[('sel_40', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,kernel,'rbf'
,degree,3
,gamma,0.0010233951435129876
,coef0,0.0
,tol,0.001
,C,35.96095969720813
,epsilon,0.06683786583436137
,shrinking,True
,cache_size,200
,verbose,False

0,1,2
,transformers,"[('sel_66', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False

0,1,2
,transformers,"[('sel_66', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,transformers,"[('sel_40', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,boosting_type,'gbdt'
,num_leaves,17
,max_depth,3
,learning_rate,0.1679945380290924
,n_estimators,100
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001

0,1,2
,transformers,"[('sel_40', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,loss,'squared_error'
,hidden_layer_sizes,[243]
,activation,'relu'
,solver,'adam'
,alpha,0.9938031778771624
,batch_size,'auto'
,learning_rate,'constant'
,learning_rate_init,0.00015170042162798603
,power_t,0.5
,max_iter,2000

0,1,2
,transformers,"[('sel_20', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,n_neighbors,9
,weights,'distance'
,algorithm,'auto'
,leaf_size,30
,p,2
,metric,'minkowski'
,metric_params,
,n_jobs,

0,1,2
,alpha,1.0
,fit_intercept,True
,copy_X,True
,max_iter,
,tol,0.0001
,solver,'auto'
,positive,False
,random_state,


In [232]:
from sklearn.metrics import root_mean_squared_error

# Fit the stack on the training split, predict the validation split,
# and report the validation RMSE.
y_pred = stack_regressor.fit(X_train, y_train).predict(X_val)

print(root_mean_squared_error(y_val, y_pred))


X does not have valid feature names, but LGBMRegressor was fitted with feature names


X does not have valid feature names, but LGBMRegressor was fitted with feature names


X does not have valid feature names, but LGBMRegressor was fitted with feature names


X does not have valid feature names, but LGBMRegressor was fitted with feature names


X does not have valid feature names, but LGBMRegressor was fitted with feature names



0.3238390396937007



X does not have valid feature names, but LGBMRegressor was fitted with feature names



## 4. Predictions

In [None]:
# Number of top-ranked features used by the submission cell below
# (it slices features_by_importance[:n]).
n = 30

In [134]:
# Final submission: get MLP predictions for the test set using only the
# top-n features, with X/y as the fitting data passed to the predict helper.
X_test = dt.load_test_data()
top_features = features_by_importance[:n]

# `params_mlp` was never defined in this notebook; the tuned MLP parameters live
# in `params['mlp']`, keyed by the feature count as a string. Use `n` rather than
# a hard-coded 30 so the parameters always match the selected feature subset.
# NOTE: `pd` is the project's src.predict module here, not pandas.
raw_predictions = pd.get_raw_predictions(
    md.MLP,
    params['mlp'][str(n)],
    X[top_features],
    y,
    X_test[top_features],
)
dt.prepare_submission(raw_predictions)