In [1]:
%load_ext autoreload
%autoreload 2

import sys

# Make the sibling `src/` package importable when the notebook lives in `notebooks/`.
# Guarded so that re-running this cell does not pile up duplicate sys.path entries.
if ".." not in sys.path:
    sys.path.append("..")

import src.data as dt
import src.models as md
import src.features as ft
import src.predict as pd
import mlflow

  from .autonotebook import tqdm as notebook_tqdm


In [17]:
# dt.load_train_set() returns three items: a 4-element train/validation split,
# followed by X and y (presumably the full, unsplit training data — confirm).
train_val_split, X, y = dt.load_train_set()
X_train, X_val, y_train, y_val = train_val_split

In [23]:
# Point MLflow at the local tracking server and select the experiment used by
# every run in this notebook.
mlflow.set_tracking_uri('http://127.0.0.1:8080')
experiment = mlflow.set_experiment('Kaggle_Ames_Housing')

# Combine features and target into a single frame and register it as an MLflow
# dataset, so that each run can reference it through mlflow.log_input().
data = X.assign(SalePrice=y)
dataset = mlflow.data.from_pandas(data, name='1-65_features_z=3_log_target')

## 2. Feature Importance

In [19]:
# Compute feature importances for every entry in md.MODELS on the training
# split, then merge the results into a single table.
importance_dfs = ft.get_feature_importances(md.MODELS, X_train, y_train)
importance_df = ft.combine_feature_importances(importance_dfs)

# Quick sanity check of the combined table: shape first, then row count.
print(importance_df.shape, len(importance_df), sep='\n')
importance_df.head()

Values to compute: .
(65, 7)
65


Unnamed: 0,feature,importance_xgb,importance_ctb,importance_lgb,importance_svr,importance_mlp,importance_knn
33,1stFlrSF,0.036958,0.063832,0.038524,0.063769,0.029519,0.032321
34,2ndFlrSF,0.020995,0.032143,0.020722,0.062524,0.021373,0.022198
58,3SsnPorch,0.0,0.0,0.0,0.0,0.0,0.0
40,BedroomAbvGr,0.005514,0.003014,0.003541,0.00033,0.006322,0.007044
7,BldgType,0.00298,0.005433,0.001049,0.001794,0.000559,0.006424


In [20]:
# Derive the cross-model comparison table from the combined importances;
# `comparison_df` is reused later to rank features by importance.
comparison_df = ft.get_feature_comparison(importance_df)

# Visualise how the models agree or disagree, by importance value and by rank.
ft.plot_importance_comparison(comparison_df)
ft.plot_rank_comparison(comparison_df)

In [None]:
# Candidate feature counts for the (currently disabled) per-model evaluation below.
# NOTE(review): the disabled call references `features_by_importance`, which is
# first assigned in the model-optimization cell further down; re-enabling it here
# would fail on a fresh kernel run from top to bottom.
feature_numbers = [10, 15, 20, 25, 30, 35]
# scores = ft.evaluate_feature_number_per_models(md.MODELS, feature_numbers, X_train, y_train, features_by_importance)

In [None]:
# NOTE(review): disabled; depends on `scores`, whose computation is also commented out.
# ft.plot_model_scores(scores, feature_numbers, 3)


## 3. Model optimization

### 3.1 Single models

In [24]:
# Tune every base model over a grid of feature counts and log one MLflow run
# per model. Results are collected per model key in the four dicts below.
params = {}
cv_scores = {}
val_scores = {}
studies = {}

features_by_importance = ft.get_features_by_importance(comparison_df)

# Candidate numbers of top-ranked features to evaluate for each model.
# NOTE(review): the importance table has 65 rows, so 66 presumably just means
# "all features" — confirm.
feature_counts = [10, 20, 30, 40, 50, 60, 66]

for key in md.MODELS:
    model_spec = md.MODELS[key]

    # The `with` block ends the run on exit, so no explicit mlflow.end_run()
    # call is needed inside it.
    with mlflow.start_run(run_name=key):
        mlflow.set_tag('model', model_spec['name'])
        mlflow.log_input(dataset)

        print(f'\nEvaulating: {model_spec["name"]}')
        params[key], cv_scores[key], val_scores[key], studies[key] = md.evaluate_best_feature_number(
            model_spec,
            features_by_importance,
            feature_counts,
            X_train,
            y_train,
            X_val,
            y_val,
        )

        # Select the feature count with the lowest validation score and log
        # the parameters and scores that belong to it.
        best_par_num = min(val_scores[key], key=lambda num: val_scores[key][num])
        mlflow.log_params(params[key][best_par_num])
        mlflow.log_param('best_par_num', best_par_num)
        mlflow.log_metric('best_avg_cv_score', cv_scores[key][best_par_num].mean())
        mlflow.log_metric('best_std_cv_score', cv_scores[key][best_par_num].std())
        mlflow.log_metric('best_val_scores', val_scores[key][best_par_num])

# Hand the collected results to md.save_models_values (presumably persisted so
# they can be reloaded with md.load_models_values — confirm).
md.save_models_values(params, cv_scores, val_scores)


Evaulating: XGBoost Regressor
processsing number 10
cv_score: 0.342666009462209, val_score: 0.3413297573655316, param: {'n_estimators': 944, 'learning_rate': 0.018726555007116973, 'max_depth': 2}
processsing number 20
cv_score: 0.3164031133438835, val_score: 0.3395816247356788, param: {'n_estimators': 1040, 'learning_rate': 0.02639670132079428, 'max_depth': 2}
processsing number 30
cv_score: 0.3168306653633407, val_score: 0.3383845956538768, param: {'n_estimators': 612, 'learning_rate': 0.020766405705706877, 'max_depth': 2}
processsing number 40
cv_score: 0.3153239422445312, val_score: 0.3416846119998614, param: {'n_estimators': 747, 'learning_rate': 0.03556962537645393, 'max_depth': 2}
processsing number 50
cv_score: 0.31493869385235784, val_score: 0.33743893118081064, param: {'n_estimators': 452, 'learning_rate': 0.0525053301060766, 'max_depth': 2}
processsing number 60
cv_score: 0.315126430609749, val_score: 0.3364231411786321, param: {'n_estimators': 632, 'learning_rate': 0.041068

In [None]:
# Alternative entry point: rebuild the feature ranking and obtain the tuning
# results through md.load_models_values() instead of re-running the optimization.
# NOTE(review): presumably this reads what md.save_models_values() stored — confirm.
# NOTE(review): the prediction cell looks up params['svr'][str(n)] with a string
# key; confirm the key type is the same on the in-memory and the reloaded path.
features_by_importance = ft.get_features_by_importance(comparison_df)
params, cv_scores, val_scores = md.load_models_values()

In [33]:
# Plot CV and validation scores per model over the same feature-count grid that
# was passed to md.evaluate_best_feature_number during tuning.
# NOTE(review): the trailing 3 is presumably the number of plots per row — confirm.
md.plot_model_scores(cv_scores, val_scores, [10, 20, 30, 40, 50, 60, 66], 3)

### 3.2 Stacking

In [38]:
from sklearn.metrics import root_mean_squared_error
from sklearn.linear_model import Ridge

# Base models that feed the stacking step, keyed by model id. Each value is one
# of the feature counts from the tuning grid (presumably the best-performing
# count for that model — confirm). 'knn' (40) is intentionally left out.
model_selection = dict(
    xgb=60,
    ctb=40,
    lgb=30,
    svr=66,
    mlp=50,
)

In [39]:
# Build the inputs for the meta-models: one result for the training split and
# one for the validation split (the names suggest out-of-fold predictions of the
# selected base models — confirm against md.get_oof_predictions).
oof_predictions_train, oof_predictions_val = md.get_oof_predictions(
    model_selection,
    X_train,
    y_train,
    X_val,
)

In [40]:
# Tune each meta-model on the base-model predictions. One parent MLflow run
# ('all') holds one nested child run per meta-model; results are collected per
# meta-model key in the four dicts below.
st_params = {}
st_cv_scores = {}
st_val_scores = {}
st_studies = {}

with mlflow.start_run(run_name='all'):
    mlflow.set_tag('model', 'Stacking Regressor')
    mlflow.log_input(dataset)
    mlflow.log_params(model_selection)

    for key in md.META_MODELS:
        meta_spec = md.META_MODELS[key]

        # The nested `with` block ends the child run on exit. Calling
        # mlflow.end_run() by hand in here is redundant and fragile, because
        # end_run() acts on whichever run is currently active.
        with mlflow.start_run(nested=True, run_name=meta_spec['name']):
            print(f"Processing {meta_spec['name']}.")

            st_params[key], st_cv_scores[key], st_val_scores[key], st_studies[key] = (
                md.optimize_and_evaluate_model(
                    meta_spec,
                    oof_predictions_train,
                    y_train,
                    oof_predictions_val,
                    y_val,
                    model_selection,
                    features_by_importance,
                )
            )

            mlflow.log_params(st_params[key])
            mlflow.log_metric('best_avg_cv_score', st_cv_scores[key].mean())
            mlflow.log_metric('best_std_cv_score', st_cv_scores[key].std())
            mlflow.log_metric('best_val_scores', st_val_scores[key])

    # Summarise on the parent run: the meta-model with the lowest validation score.
    best_key = min(st_val_scores, key=st_val_scores.get)
    mlflow.log_param('best_final_estimator', md.META_MODELS[best_key]['name'])
    mlflow.log_metric('best_avg_cv_score', st_cv_scores[best_key].mean())
    mlflow.log_metric('best_std_cv_score', st_cv_scores[best_key].std())
    mlflow.log_metric('best_val_scores', st_val_scores[best_key])

Processing Lasso.



Calling float on a single element Series is deprecated and will raise a TypeError in the future. Use float(ser.iloc[0]) instead



🏃 View run Lasso at: http://127.0.0.1:8080/#/experiments/341249612524648412/runs/dcd2fdc3e4344c8187e9ee1772ec1a63
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/341249612524648412
Processing Ridge.



Calling float on a single element Series is deprecated and will raise a TypeError in the future. Use float(ser.iloc[0]) instead



🏃 View run Ridge at: http://127.0.0.1:8080/#/experiments/341249612524648412/runs/b49cdbd98af44f30b19924a98c78d43f
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/341249612524648412
Processing Elastic Net.



Calling float on a single element Series is deprecated and will raise a TypeError in the future. Use float(ser.iloc[0]) instead



🏃 View run Elastic Net at: http://127.0.0.1:8080/#/experiments/341249612524648412/runs/7101b45ba00f44a59f41dbb757d10bba
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/341249612524648412
Processing Support Vector Regressor.
🏃 View run Support Vector Regressor at: http://127.0.0.1:8080/#/experiments/341249612524648412/runs/9f833666b6b24b4199c86b4201c4663e
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/341249612524648412
🏃 View run all at: http://127.0.0.1:8080/#/experiments/341249612524648412/runs/b523279d6b5f4869bcde04c94f00adcb
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/341249612524648412



Calling float on a single element Series is deprecated and will raise a TypeError in the future. Use float(ser.iloc[0]) instead



In [None]:
# Plot the meta-model scores collected by the stacking cell (st_cv_scores /
# st_val_scores) rather than the base-model dicts (cv_scores / val_scores).
# NOTE(review): confirm against the inputs md.plot_stacking_scores expects.
md.plot_stacking_scores(st_cv_scores, st_val_scores, mode='mean', items_per_row=4)

## 4. Predictions

In [35]:
# Load the test-set features that the submission predictions are made for.
X_test = dt.load_test_data()

In [36]:
# Single-model submission: SVR predictions for the test set, using X / y and
# X_test restricted to the first n entries of features_by_importance.
# Note: `pd` is `src.predict` in this notebook, not pandas.
n = 66
top_features = features_by_importance[:n]
svr_params = params['svr'][str(n)]
raw_predictions = pd.get_raw_predictions(
    md.SVR,
    svr_params,
    X[top_features],
    y,
    X_test[top_features],
)
dt.prepare_submission(raw_predictions)

In [37]:
# Show the SVR hyperparameters stored for the n-feature configuration.
params['svr'][str(n)]

{'C': 13.707899835810318,
 'gamma': 0.0010323906322973826,
 'epsilon': 0.05825336140300437}

In [41]:
from sklearn.svm import SVR
from sklearn.linear_model import Lasso

# Stacked submission: build the stack from the selected base models with a Lasso
# final estimator, fit it on X / y, and predict the test set.
# NOTE(review): alpha is hard-coded; presumably it is the tuned Lasso value from
# the stacking optimization — confirm.
# NOTE(review): dt.prepare_submission is also called by the SVR cell; presumably
# this call replaces that submission — confirm.
final_estimator = Lasso(alpha=0.001579019)
stack_regressor = md.create_stack_regressor(
    model_selection,
    final_estimator,
    features_by_importance,
)
stack_regressor.fit(X, y)
raw_predictions = stack_regressor.predict(X_test)
dt.prepare_submission(raw_predictions)


X does not have valid feature names, but LGBMRegressor was fitted with feature names


X does not have valid feature names, but LGBMRegressor was fitted with feature names


X does not have valid feature names, but LGBMRegressor was fitted with feature names


X does not have valid feature names, but LGBMRegressor was fitted with feature names


X does not have valid feature names, but LGBMRegressor was fitted with feature names


X does not have valid feature names, but LGBMRegressor was fitted with feature names

