In [2]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.append("..")   # assumes this notebook lives in notebooks/ with src/ as a sibling directory

import src.data as dt
import src.models as md
import src.features as ft
import src.predict as pd  # NOTE: in this notebook `pd` is src.predict, not pandas
import src.preprocessing as pp
import mlflow

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Load the train/validation pieces together with the combined dataframe.
X_train, X_val, y_train, y_val, full_df = dt.load_train_data()

# Report the shape of every piece, one line each.
for label, part in (
    ('X_train', X_train),
    ('y_train', y_train),
    ('X_val', X_val),
    ('y_val', y_val),
    ('Full dataframe', full_df),
):
    print(f'{label} shape: ', part.shape)

# Preview the first rows of the combined dataframe as the cell output.
full_df.head()

In [11]:
# Point MLflow at the local tracking server and select the experiment.
mlflow.set_tracking_uri(uri='http://127.0.0.1:8080')
experiment = mlflow.set_experiment(experiment_name='Kaggle_Ames_Housing')

# Wrap the combined dataframe as an MLflow dataset so runs can log it as input.
dataset = mlflow.data.from_pandas(
    df=full_df,
    name='93_features_log_attr_targ',
)

## 2. Feature Importance

In [8]:
# Importances for the models in md.MODELS, then merged into a single table.
importance_dfs = ft.get_feature_importances(md.MODELS, X_train, y_train)
importance_df = ft.combine_feature_importances(importance_dfs)

# Quick sanity check: table shape and row count, one per line.
print(importance_df.shape, len(importance_df), sep='\n')
importance_df.head()

Values to compute: .
(92, 7)
92


Unnamed: 0,feature,importance_xgb,importance_ctb,importance_lgb,importance_svr,importance_mlp,importance_knn
42,1stFlrSF,0.011525,0.021607,0.012634,0.032836,0.016621,0.016313
43,2ndFlrSF,0.005384,0.012723,0.004331,0.001088,0.070467,0.006724
68,3SsnPorch,0.001234,0.000409,0.000158,0.00309,0.003583,0.006563
5,Alley,0.003834,0.000392,0.000185,0.003206,0.003025,0.006928
50,BedroomAbvGr,0.004635,0.001634,0.00163,0.000683,0.005438,0.000832


In [9]:
# Derive the comparison table from the combined importances.
comparison_df = ft.get_feature_comparison(importance_df)

# Draw both comparison plots (by importance and by rank) from the same table.
ft.plot_importance_comparison(comparison_df)
ft.plot_rank_comparison(comparison_df)

## 3. Outliers

In [92]:
# Columns inspected for outliers, using a z-score threshold of 4.
outlier_check_columns = ['LotArea', 'QualArea', 'OverallGrade', 'CondArea', 'TotalSF']
pp.hist_matrix_with_outliers(X_train[outlier_check_columns], z_thresh=4)

In [None]:
# discarded: removing these outliers did not improve performance

# outliers = pp.find_outliers_all(X_train[['LotArea', 'LotFrontage', 'QualArea', 'OverallGrade', 'CondArea', 'TotalSF']], z_thresh=4)

# X_train = X_train.drop(outliers)
# y_train = y_train.drop(outliers)

# print(f'Found outliers: {len(outliers)}')
# print(f'X_train shape: {X_train.shape}')
# print(f'y_train shape: {y_train.shape}')

Found outliers: 13
X_train shape: (1082, 92)
y_train shape: (1082,)


In [None]:
# full_df = X_train.copy()
# full_df['SalePrice'] = y_train

# dataset = mlflow.data.from_pandas(full_df, name='93_features_log_attr_targ-no_out_z=4')

## 5. Model optimization

In [21]:
# Feature ordering derived from the comparison table.
features_by_importance = ft.get_features_by_importance(comparison_df)
# Candidate feature-set sizes to try during model optimization.
feature_numbers = [20, 50, 92]

### 5.1 Single models

In [31]:
# Results of the search, indexed first by model key and then by feature number.
params = {}
cv_scores = {}
val_scores = {}
studies = {}

features_by_importance = ft.get_features_by_importance(comparison_df)

# Keys of md.MODELS to evaluate in this run (only SVR at the moment).
model_keys = ('svr',)

for key in model_keys:
    # MLflow tracking is disabled for this run. To re-enable it, wrap the body
    # of this loop in `with mlflow.start_run(run_name=key):` and restore the
    # commented-out calls below (the `with` block ends the run on its own).
    #     mlflow.set_tag('model', md.MODELS[key]['name'])
    #     mlflow.log_input(dataset)
    params[key], cv_scores[key], val_scores[key], studies[key] = md.evaluate_best_feature_number(
        md.MODELS[key],
        features_by_importance,
        feature_numbers,
        X_train,
        y_train,
        X_val,
        y_val,
    )

    # Feature number with the lowest validation score for this model; only the
    # disabled MLflow logging below consumes it.
    best_par_num = min(val_scores[key], key=val_scores[key].get)
    #     mlflow.log_params(params[key][best_par_num])
    #     mlflow.log_param('best_par_num', best_par_num)
    #     mlflow.log_metric('best_avg_cv_score', cv_scores[key][best_par_num].mean())
    #     mlflow.log_metric('best_std_cv_score', cv_scores[key][best_par_num].std())
    #     mlflow.log_metric('best_val_scores', val_scores[key][best_par_num])

# Persisting the results is disabled as well:
# md.save_models_values(params, cv_scores, val_scores)

processsing number 20
cv_score: 0.3158938174519052, val_score: 0.34229612975122364, param: {'C': 47.10268491460695, 'gamma': 0.004149431773993826, 'epsilon': 0.2352323791698288}
processsing number 50
cv_score: 0.30199697527158553, val_score: 0.32892468838935335, param: {'C': 3.7328767364637137, 'gamma': 0.0035455818568278876, 'epsilon': 0.05852217090073111}
processsing number 92
cv_score: 0.30586149998173734, val_score: 0.32832633374542264, param: {'C': 4.518296525789322, 'gamma': 0.0011599330580039158, 'epsilon': 0.07078311463770695}


In [10]:
# Load the stored optimization results named 'model_optimization_new'; this
# replaces any params / cv_scores / val_scores already present in the session.
params, cv_scores, val_scores = md.load_models_values('model_optimization_new')
# Feature ordering derived from the comparison table.
features_by_importance = ft.get_features_by_importance(comparison_df)

In [163]:
# Dump the validation scores (model key -> feature number -> score) as plain text.
print(val_scores)

{'xgb': {'10': 0.34425525979359695, '20': 0.32759764231132343, '30': 0.32277901242449925, '40': 0.3175528177055372, '50': 0.3215509232606985, '60': 0.32253902887402747, '70': 0.32074504002718374, '80': 0.33057336191246184, '92': 0.3155689712243389}, 'ctb': {'10': 0.31723443143951696, '20': 0.30665500841963517, '30': 0.30512470112702833, '40': 0.30433995168619365, '50': 0.2972380279700994, '60': 0.3109702199491518, '70': 0.3042264152451372, '80': 0.3054649596868193, '92': 0.3088723214099789}, 'lgb': {'10': 0.3363639728314142, '20': 0.32926558506784626, '30': 0.3415427778038509, '40': 0.3390883065536448, '50': 0.32662085345300507, '60': 0.32242828395601175, '70': 0.32916505983028765, '80': 0.32452100593127536, '92': 0.3257991589014514}, 'svr': {'10': 0.3363786251651471, '20': 0.325778918842096, '30': 0.32011752053729425, '40': 0.3172427161249423, '50': 0.31206525299277793, '60': 0.32180851009698636, '70': 0.3276431727763828, '80': 0.3204252131699736, '92': 0.31395451541085057}, 'mlp': {'

In [164]:
# Feature numbers to show on the plot.
plotted_feature_numbers = [10, 20, 30, 40, 50, 60, 70, 80, 92]
# NOTE(review): the trailing 3 is interpreted by md.plot_model_scores —
# presumably the number of plots per row; confirm against src.models.
md.plot_model_scores(cv_scores, val_scores, plotted_feature_numbers, 3)

### 5.2 Stacking

In [5]:
from sklearn.metrics import root_mean_squared_error
from sklearn.linear_model import Ridge

# Base models used for stacking, each mapped to an integer setting
# (presumably the number of top features it uses — confirm in src.models).
model_selection = dict(
    xgb=92,
    ctb=50,
    lgb=60,
    svr=50,
    mlp=50,
    # knn=20,
)

In [120]:
# Predictions of the selected base models for the train and validation sets;
# these are the inputs of the meta-model search.
oof_predictions_train, oof_predictions_val = md.get_oof_predictions(
    model_selection,
    X_train,
    y_train,
    X_val,
)

In [121]:
# Results of the meta-model search, one entry per key of md.META_MODELS.
st_params = {}
st_cv_scores = {}
st_val_scores = {}
st_studies = {}

# Parent run; every candidate final estimator is tracked as a nested child run.
with mlflow.start_run(run_name='div3'):
    mlflow.set_tag('model', 'Stacking Regressor')
    mlflow.log_input(dataset)
    mlflow.log_params(model_selection)

    for key in md.META_MODELS:
        meta_model = md.META_MODELS[key]

        # The `with` block ends the child run when it exits. An explicit
        # mlflow.end_run() inside it is redundant and, depending on the MLflow
        # version, the exit handler could then end the parent run instead.
        with mlflow.start_run(nested=True, run_name=meta_model['name']):
            print(f"Processing {meta_model['name']}.")

            st_params[key], st_cv_scores[key], st_val_scores[key], st_studies[key] = (
                md.optimize_and_evaluate_model(
                    meta_model,
                    oof_predictions_train,
                    y_train,
                    oof_predictions_val,
                    y_val,
                    model_selection,
                    features_by_importance,
                )
            )

            mlflow.log_params(st_params[key])
            mlflow.log_metric('best_avg_cv_score', st_cv_scores[key].mean())
            mlflow.log_metric('best_std_cv_score', st_cv_scores[key].std())
            mlflow.log_metric('best_val_scores', st_val_scores[key])

    # Record the candidate with the lowest validation score on the parent run.
    best_key = min(st_val_scores, key=st_val_scores.get)
    mlflow.log_param('best_final_estimator', md.META_MODELS[best_key]['name'])
    mlflow.log_metric('best_avg_cv_score', st_cv_scores[best_key].mean())
    mlflow.log_metric('best_std_cv_score', st_cv_scores[best_key].std())
    mlflow.log_metric('best_val_scores', st_val_scores[best_key])

Processing Lasso.



Calling float on a single element Series is deprecated and will raise a TypeError in the future. Use float(ser.iloc[0]) instead



🏃 View run Lasso at: http://127.0.0.1:8080/#/experiments/341249612524648412/runs/a78258c54d7c4607ac97211041761913
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/341249612524648412
Processing Ridge.



Calling float on a single element Series is deprecated and will raise a TypeError in the future. Use float(ser.iloc[0]) instead



🏃 View run Ridge at: http://127.0.0.1:8080/#/experiments/341249612524648412/runs/e060b7a686ab4884bba99f42c13f963e
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/341249612524648412
Processing Elastic Net.



Calling float on a single element Series is deprecated and will raise a TypeError in the future. Use float(ser.iloc[0]) instead



🏃 View run Elastic Net at: http://127.0.0.1:8080/#/experiments/341249612524648412/runs/ccc4b82a60034d5ab4e3f88b9f79e532
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/341249612524648412
Processing Support Vector Regressor.
🏃 View run Support Vector Regressor at: http://127.0.0.1:8080/#/experiments/341249612524648412/runs/265e123819eb4f39b776f3ded7761686
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/341249612524648412
🏃 View run div3 at: http://127.0.0.1:8080/#/experiments/341249612524648412/runs/630fec5c3655456a88fc443d08438d0f
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/341249612524648412



Calling float on a single element Series is deprecated and will raise a TypeError in the future. Use float(ser.iloc[0]) instead



In [118]:
# Compare the meta-model candidates on CV and validation scores.
md.plot_stacking_scores(
    st_cv_scores,
    st_val_scores,
    mode='mean',
    items_per_row=4,
)

## 6. Predictions

In [None]:
# Load the data used for the final fit and the submission predictions
# (presumably X, y are the full labelled set — TODO confirm in src.data).
X, y, X_test = dt.load_test_data()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,QualArea,CondArea,HouseAge,RemodAge,HasBasement,HasGarage,HasFireplace,HasPool,Has2ndFloor,SalePrice
0,1.535077,0.38336,-0.171116,-0.139931,0.06426,0.221813,-0.701859,-0.048913,0.026189,-0.344991,...,0.65265,0.216973,-1.044891,-0.871377,0.161308,0.242452,-1.054647,-0.06426,1.147633,0.567607
1,0.075528,0.38336,0.525038,0.108635,0.06426,0.221813,-0.701859,-0.048913,0.026189,-0.240652,...,-0.268116,0.673706,-0.185118,0.388527,0.161308,0.242452,0.947534,-0.06426,-0.870761,0.223273
2,1.535077,0.38336,-0.026603,0.419409,0.06426,0.221813,1.026707,-0.048913,0.026189,-0.344991,...,0.744108,0.335535,-0.978754,-0.822919,0.161308,0.242452,0.947534,-0.06426,1.147633,0.739119
3,-0.405824,0.38336,-0.418713,0.098437,0.06426,0.221813,1.026707,-0.048913,0.026189,0.072737,...,0.661212,0.228083,1.798972,0.630816,0.161308,0.242452,0.947534,-0.06426,1.147633,-0.428381
4,1.535077,0.38336,0.700609,0.887718,0.06426,0.221813,1.026707,-0.048913,0.026189,-0.240652,...,1.486971,0.910621,-0.945686,-0.726003,0.161308,0.242452,0.947534,-0.06426,1.147633,1.014375


In [42]:
n = 50
# First n entries of the importance-ordered feature list, used for both frames.
top_features = features_by_importance[:n]
# Stored SVR parameters for this feature number (keys are strings).
svr_params = params['svr'][str(n)]

# NOTE: `pd` is src.predict in this notebook, not pandas.
raw_predictions = pd.get_raw_predictions(
    md.SVR,
    svr_params,
    X[top_features],
    y,
    X_test[top_features],
)
dt.prepare_submission(raw_predictions)

In [29]:
from sklearn.svm import SVR
from sklearn.linear_model import Lasso, Ridge

# Lasso meta-model handed to md.create_stack_regressor along with the base-model selection.
final_estimator = Lasso(alpha=0.0018010627)
stack_regressor = md.create_stack_regressor(
    model_selection,
    final_estimator,
    features_by_importance,
)

# Fit on X, y, predict for X_test and hand the result to the submission helper.
stack_regressor.fit(X, y)
raw_predictions = stack_regressor.predict(X_test)
dt.prepare_submission(raw_predictions)


X does not have valid feature names, but LGBMRegressor was fitted with feature names


X does not have valid feature names, but LGBMRegressor was fitted with feature names


X does not have valid feature names, but LGBMRegressor was fitted with feature names


X does not have valid feature names, but LGBMRegressor was fitted with feature names


X does not have valid feature names, but LGBMRegressor was fitted with feature names


X does not have valid feature names, but LGBMRegressor was fitted with feature names



In [27]:
from sklearn.svm import SVR
from sklearn.linear_model import Lasso, Ridge
from sklearn.metrics import root_mean_squared_error

# Reload the train/validation pieces so the check starts from the stored data.
X_train, X_val, y_train, y_val, full_df = dt.load_train_data()

# Same stack configuration as used for the submission.
final_estimator = Lasso(alpha=0.0018010627)
stack_regressor = md.create_stack_regressor(
    model_selection,
    final_estimator,
    features_by_importance,
)

# Fit on the training part and score the validation part with RMSE.
stack_regressor.fit(X_train, y_train)
raw_predictions = stack_regressor.predict(X_val)

val_rmse = root_mean_squared_error(y_val, raw_predictions)
print(val_rmse)


X does not have valid feature names, but LGBMRegressor was fitted with feature names


X does not have valid feature names, but LGBMRegressor was fitted with feature names


X does not have valid feature names, but LGBMRegressor was fitted with feature names


X does not have valid feature names, but LGBMRegressor was fitted with feature names


X does not have valid feature names, but LGBMRegressor was fitted with feature names



0.2981220750635965



X does not have valid feature names, but LGBMRegressor was fitted with feature names

