In [1]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.append("..")   # needed when the notebook lives in notebooks/ and src/ sits next to it

# Project modules, imported under short aliases used throughout the notebook.
import src.data as dt
import src.models as md
import src.features as ft
import src.predict as pd  # NOTE(review): `pd` is src.predict here, NOT pandas — the alias is easy to misread
import mlflow

  from .autonotebook import tqdm as notebook_tqdm


In [19]:
# Unpack the five objects returned by dt.load_train_data(): by name, the
# train/validation features and targets plus the full frame (full_df).
X_train, X_val, y_train, y_val, full_df = dt.load_train_data()

In [20]:
# Preview the first rows of the full frame (rendered as the cell output).
full_df.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,QualArea,CondArea,HouseAge,RemodAge,HasBasement,HasGarage,HasFireplace,HasPool,Has2ndFloor,SalePrice
0,-0.871767,0.356203,-0.067789,-0.058745,0.067698,0.215833,-0.700031,-0.009381,0.03022,-0.342755,...,-1.04377,-0.309437,0.564975,-1.052462,0.170613,0.242776,-1.064644,-0.074193,-0.870019,-0.561773
1,-0.871767,0.356203,-0.35972,0.35448,0.067698,0.215833,-0.700031,1.274867,0.03022,-0.342755,...,-1.638013,-0.528565,0.891369,-0.906663,-5.855872,0.242776,-1.064644,-0.074193,-0.870019,-1.726913
2,1.489898,-2.349035,-0.162089,-0.944902,0.067698,0.215833,0.988369,3.506864,0.03022,-0.342755,...,0.842819,0.119219,-1.197554,-1.101062,0.170613,0.242776,0.938423,-0.074193,-0.870019,2.150389
3,2.906896,-2.349035,-2.000666,-1.737684,0.067698,0.215833,-0.700031,-0.009381,0.03022,-0.342755,...,-0.566521,-1.057407,-1.164914,-1.101062,0.170613,0.242776,-1.064644,-0.074193,-0.870019,-0.460278
4,-0.871767,0.356203,0.884896,0.297002,0.067698,0.215833,0.988369,-0.009381,0.03022,-0.014918,...,-0.349867,1.018681,-0.185732,-0.955263,0.170613,0.242776,0.938423,-0.074193,-0.870019,0.661214


In [10]:
# Point MLflow at the tracking server on localhost:8080 and select the experiment.
# NOTE(review): the server must already be running at this address — confirm before Run All.
mlflow.set_tracking_uri('http://127.0.0.1:8080')
experiment = mlflow.set_experiment('Kaggle_Ames_Housing')

# Wrap full_df as an MLflow dataset; later cells pass it to mlflow.log_input().
dataset = mlflow.data.from_pandas(full_df, name='93_features_log_attr_targ')

## 2. Feature Importance

In [11]:
# Compute importances for the models in md.MODELS on the training split, then
# combine them into a single frame (one importance_<model> column per model).
importance_dfs = ft.get_feature_importances(md.MODELS, X_train, y_train)
importance_df = ft.combine_feature_importances(importance_dfs)

# Sanity check on the combined frame's size; len() repeats the first entry of .shape.
print(importance_df.shape)
print(len(importance_df))
importance_df.head()

Values to compute: xgb, ctb, lgb, svr, mlp, knn.
xgb analyzed.
ctb analyzed.
lgb analyzed.


100%|██████████| 150/150 [03:56<00:00,  1.58s/it]


svr analyzed.


100%|██████████| 150/150 [00:04<00:00, 30.38it/s]


mlp analyzed.


100%|██████████| 150/150 [00:08<00:00, 17.79it/s]

knn analyzed.
a
a
a
a
a
a
(92, 7)
92





Unnamed: 0,feature,importance_xgb,importance_ctb,importance_lgb,importance_svr,importance_mlp,importance_knn
42,1stFlrSF,0.011525,0.021607,0.012634,0.032836,0.016621,0.016313
43,2ndFlrSF,0.005384,0.012723,0.004331,0.001088,0.070467,0.006724
68,3SsnPorch,0.001234,0.000409,0.000158,0.00309,0.003583,0.006563
5,Alley,0.003834,0.000392,0.000185,0.003206,0.003025,0.006928
50,BedroomAbvGr,0.004635,0.001634,0.00163,0.000683,0.005438,0.000832


In [12]:
# Derive the comparison frame from the combined importances; it is also the
# input to ft.get_features_by_importance() in the optimization section.
comparison_df = ft.get_feature_comparison(importance_df)

# Two plots built from the same comparison frame: importances and ranks.
ft.plot_importance_comparison(comparison_df)
ft.plot_rank_comparison(comparison_df)

## 3. Model optimization

In [21]:
# Candidate feature counts to evaluate; the last entry is the full column count of X_train.
feature_numbers = [10, 20, 30, 40, 50, 60, 70, 80, len(X_train.columns)]

### 3.1 Single models

In [None]:
# Per-model results, keyed by the same keys as md.MODELS.
params = {}
cv_scores = {}
val_scores = {}
studies = {}

features_by_importance = ft.get_features_by_importance(comparison_df)

for key in md.MODELS:
    model = md.MODELS[key]

    # One MLflow run per model. The `with` block ends the run on exit,
    # so no explicit mlflow.end_run() call is needed.
    with mlflow.start_run(run_name=key):
        mlflow.set_tag('model', model['name'])
        mlflow.log_input(dataset)

        print(f'\nEvaluating: {model["name"]}')
        params[key], cv_scores[key], val_scores[key], studies[key] = md.evaluate_best_feature_number(
            model,
            features_by_importance,
            feature_numbers,
            X_train,
            y_train,
            X_val,
            y_val,
        )

        # Select the feature count with the lowest validation score and log
        # the parameters and scores recorded for that count.
        model_val_scores = val_scores[key]
        best_par_num = min(model_val_scores, key=lambda x: model_val_scores[x])
        mlflow.log_params(params[key][best_par_num])
        mlflow.log_param('best_par_num', best_par_num)
        mlflow.log_metric('best_avg_cv_score', cv_scores[key][best_par_num].mean())
        mlflow.log_metric('best_std_cv_score', cv_scores[key][best_par_num].std())
        mlflow.log_metric('best_val_scores', model_val_scores[best_par_num])

# Persist the results so they can be reloaded without re-running the optimization.
md.save_models_values(params, cv_scores, val_scores)

In [None]:
# Rebuild the ordered feature list and reload params/scores via md.load_models_values().
# NOTE(review): presumably this reads what md.save_models_values() wrote, letting the
# optimization cell above be skipped — confirm. A later cell indexes params with str(n),
# so the reloaded keys may be strings rather than ints — verify.
features_by_importance = ft.get_features_by_importance(comparison_df)
params, cv_scores, val_scores = md.load_models_values()

In [25]:
# Plot CV and validation scores across the candidate feature counts.
# NOTE(review): the trailing positional 3 is undocumented here — presumably plots per row; confirm.
md.plot_model_scores(cv_scores, val_scores, feature_numbers, 3)

### 3.2 Stacking

In [51]:
from sklearn.metrics import root_mean_squared_error
from sklearn.linear_model import Ridge

# Base models entering the stack, mapped to an integer per model.
# NOTE(review): the integers look like the feature count chosen for each model
# (they match entries of feature_numbers) — confirm against md.get_oof_predictions.
# Commented-out entries are models currently excluded from the stack.
model_selection = {
    # 'xgb': 92,
    'ctb': 50,
    'lgb': 60,
    'svr': 50,
    'mlp': 50,
    # 'knn': 10,
}

In [47]:
# Predictions of the selected base models for the train and validation splits;
# these become the inputs of the meta-models optimized in the next cell.
# NOTE(review): "oof" presumably means out-of-fold — confirm in md.get_oof_predictions.
oof_predictions_train, oof_predictions_val = md.get_oof_predictions(model_selection, X_train, y_train, X_val)

In [48]:
# Per-meta-model results, keyed by the same keys as md.META_MODELS.
st_params = {}
st_cv_scores = {}
st_val_scores = {}
st_studies = {}


# Parent run for the whole stacking experiment, with one nested child run per
# meta-model. Each `with mlflow.start_run(...)` block ends its own run on exit,
# so no explicit mlflow.end_run() call is needed; ending a run manually inside
# the nested block risks closing the parent run when the context manager exits.
with mlflow.start_run(run_name='all'):
    mlflow.set_tag('model', 'Stacking Regressor')
    mlflow.log_input(dataset)
    mlflow.log_params(model_selection)

    for key in md.META_MODELS:
        meta_model = md.META_MODELS[key]

        with mlflow.start_run(nested=True, run_name=meta_model['name']):
            print(f"Processing {meta_model['name']}.")

            st_params[key], st_cv_scores[key], st_val_scores[key], st_studies[key] = (
                md.optimize_and_evaluate_model(
                    meta_model,
                    oof_predictions_train,
                    y_train,
                    oof_predictions_val,
                    y_val,
                    model_selection,
                    features_by_importance,
                )
            )

            mlflow.log_params(st_params[key])
            mlflow.log_metric('best_avg_cv_score', st_cv_scores[key].mean())
            mlflow.log_metric('best_std_cv_score', st_cv_scores[key].std())
            mlflow.log_metric('best_val_scores', st_val_scores[key])

    # Back in the parent run: record the meta-model with the lowest validation score.
    best_key = min(st_val_scores, key=lambda x: st_val_scores[x])
    mlflow.log_param('best_final_estimator', md.META_MODELS[best_key]['name'])
    mlflow.log_metric('best_avg_cv_score', st_cv_scores[best_key].mean())
    mlflow.log_metric('best_std_cv_score', st_cv_scores[best_key].std())
    mlflow.log_metric('best_val_scores', st_val_scores[best_key])

Processing Lasso.



Calling float on a single element Series is deprecated and will raise a TypeError in the future. Use float(ser.iloc[0]) instead



🏃 View run Lasso at: http://127.0.0.1:8080/#/experiments/341249612524648412/runs/af725e07f9a443c6a202e381e214407d
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/341249612524648412
Processing Ridge.



Calling float on a single element Series is deprecated and will raise a TypeError in the future. Use float(ser.iloc[0]) instead



🏃 View run Ridge at: http://127.0.0.1:8080/#/experiments/341249612524648412/runs/42c2b360bf1147a9ab7f03b521d77343
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/341249612524648412
Processing Elastic Net.



Calling float on a single element Series is deprecated and will raise a TypeError in the future. Use float(ser.iloc[0]) instead



🏃 View run Elastic Net at: http://127.0.0.1:8080/#/experiments/341249612524648412/runs/f86575469d4445a1b4a502a669afdce2
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/341249612524648412
Processing Support Vector Regressor.
🏃 View run Support Vector Regressor at: http://127.0.0.1:8080/#/experiments/341249612524648412/runs/4bef85a2a21d4bea937bb3ea24093fb3
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/341249612524648412
🏃 View run all at: http://127.0.0.1:8080/#/experiments/341249612524648412/runs/1c28133c7d6d4a3f8e9ab9e68ee857d0
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/341249612524648412



Calling float on a single element Series is deprecated and will raise a TypeError in the future. Use float(ser.iloc[0]) instead



In [53]:
# Plot the stacking CV and validation scores in 'folds' mode, two plots per row.
md.plot_stacking_scores(st_cv_scores, st_val_scores, mode='folds', items_per_row=2)

## 4. Predictions

In [38]:
# Unpack the three objects returned by dt.load_test_data(); the prediction cells
# below fit on (X, y) and predict on X_test.
X, y, X_test = dt.load_test_data()

In [42]:
# Single-model submission: SVR restricted to the first n entries of features_by_importance.
n = 50
top_features = features_by_importance[:n]

# NOTE(review): params is indexed with str(n), so this presumably expects the
# values reloaded by md.load_models_values() (string keys) — confirm.
svr_params = params['svr'][str(n)]

raw_predictions = pd.get_raw_predictions(
    md.SVR,
    svr_params,
    X[top_features],
    y,
    X_test[top_features],
)
dt.prepare_submission(raw_predictions)

In [52]:
from sklearn.svm import SVR
from sklearn.linear_model import Lasso, Ridge

# Stacked submission: base models from model_selection with a Lasso final estimator.
# NOTE(review): alpha is hard-coded — presumably copied from the Lasso optimization
# results (st_params); confirm it is still the current best value.
final_estimator = Lasso(alpha=0.0021994031)
stack_regressor = md.create_stack_regressor(
    model_selection,
    final_estimator,
    features_by_importance,
)

# Fit on (X, y), then predict the test set and hand the result to the submission helper.
stack_regressor.fit(X, y)
raw_predictions = stack_regressor.predict(X_test)
dt.prepare_submission(raw_predictions)


X does not have valid feature names, but LGBMRegressor was fitted with feature names


X does not have valid feature names, but LGBMRegressor was fitted with feature names


X does not have valid feature names, but LGBMRegressor was fitted with feature names


X does not have valid feature names, but LGBMRegressor was fitted with feature names


X does not have valid feature names, but LGBMRegressor was fitted with feature names


X does not have valid feature names, but LGBMRegressor was fitted with feature names

