In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.model_selection import KFold
import xgboost as xgb

import mlflow

import IPython.display as ipd
import seaborn as sns
import matplotlib.pylab as plt

In [2]:
data = pd.read_csv(os.path.join('..', 'data', 'processed', 'ubaar_features.csv'), encoding="utf-8", index_col="ID")

In [3]:
data.shape

(49371, 52)

In [4]:
features_columns = data.columns
# features_columns = [c for c in features_columns if not c.startswith('cluster_')]
features_columns = [c for c in features_columns if not c.startswith('weight_d')]
features_columns = list(features_columns)
features_columns.remove('price')
features_columns = np.array(features_columns)

In [5]:
# features_columns = data.columns[data.columns  != 'price'].values

In [6]:
def setup_mlflow():
    remote_server_uri = "http://18.185.244.61:5050"
    mlflow.set_tracking_uri(remote_server_uri)
    mlflow.set_experiment("UbaarCV") 
    mlflow.end_run()
    mlflow.start_run(run_name='')

In [9]:
def train_sklearn(x_train, y_train, x_dev, y_dev):
#     model = Ridge(alpha=5.0, fit_intercept=True, normalize=False, copy_X=True, solver='auto', random_state=42)
    model = RandomForestRegressor(n_estimators=20, max_depth=20, min_samples_leaf=8, random_state=42)
    
    
    model.fit(x_train, np.log(y_train * 0.95))

    preds_train = np.exp(model.predict(x_train))
    preds_dev = np.exp(model.predict(x_dev))

    return preds_train, preds_dev, model.__dict__

PARAMS = {'objective': 'reg:squarederror',
          'eval_metric': 'mape',
         'booster': 'gbtree', 'eta': 0.05, 'max_depth': 16,
         'min_child_weight': 0.01,
         'subsample': 0.95, 'colsample_bytree': 0.7,
         'colsample_bylevel': 0.06, 'alpha': 0.0,
         'lambda': 0.5, 'seed': 42, 'gamma': 0.0,
         'max_delta_step': 0, 'nthred': 4,
         'min_split_gain': 0.0, 'early_stopping_rounds': 300}

def train_xgb(x_train, y_train, x_dev, y_dev):
    
    dtrain = xgb.DMatrix(x_train, label=np.log(y_train * 0.95))
    dtest = xgb.DMatrix(x_dev, label=np.log(y_dev * 0.95))
    evallist = [(dtest, 'eval')]
    
    model = xgb.train(PARAMS, dtrain, 3000, evals=evallist, verbose_eval=False)
    preds_train = np.exp(model.predict(dtrain, model.best_iteration+0))
    preds_dev = np.exp(model.predict(dtest, model.best_iteration+0))
#     print(model.best_iteration)
    model_params = PARAMS
    return preds_train, preds_dev, model_params
    

In [12]:
setup_mlflow()
mlflow.log_param('features', features_columns)


y_full = data['price'].values
x_full = data[features_columns].values

kfold = KFold(n_splits=5, shuffle=True, random_state=42)

train_mapes = []
dev_mapes = []
dev_preds = []
dev_refs = []
dev_inds = []

for train_ind, dev_ind in kfold.split(x_full):
    
    x_train = x_full[train_ind]
    y_train = y_full[train_ind]
    x_dev = x_full[dev_ind]
    y_dev = y_full[dev_ind]
    
#     scaler = StandardScaler()
#     scaler.fit(x_train)
#     x_train = scaler.transform(x_train)
#     x_dev = scaler.transform(x_dev)

#     preds_train, preds_dev, model_params = train_sklearn(x_train, y_train, x_dev, y_dev)
    preds_train, preds_dev, model_params = train_xgb(x_train, y_train, x_dev, y_dev)
#     preds_train2, preds_dev2, model_params2 = train_xgb(x_train, y_train, x_dev, y_dev)
    
#     preds_train = (preds_train + preds_train2)/2
#     preds_dev = (preds_dev + preds_dev2)/2
    
    mlflow.log_param('features', features_columns)
    mlflow.log_param('model_params', model_params)

    train_mape = mean_absolute_percentage_error(y_train, preds_train)
    dev_mape = mean_absolute_percentage_error(y_dev, preds_dev)
    
    train_mapes.append(train_mape)
    dev_mapes.append(dev_mape)
    
    dev_preds.extend(list(preds_dev))
    dev_refs.extend(list(y_dev))
    dev_inds.extend(list(dev_ind))
    
    print(f"Train MAPE: {train_mape}")
    print(f"Dev MAPE: {dev_mape}")
#     break
# 
print("================")
print(f"Mean MAPE: {np.mean(dev_mapes)}")
print(f"Std MAPE: {np.std(dev_mapes)}")

mlflow.log_metric("Mean dev MAPE", np.mean(dev_mapes))
mlflow.log_metric("Std dev MAPE", np.std(dev_mapes))
                  
mlflow.end_run()

Parameters: { early_stopping_rounds, min_split_gain, nthred } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Train MAPE: 0.05001071237189561
Dev MAPE: 0.1647930244056891
Parameters: { early_stopping_rounds, min_split_gain, nthred } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Train MAPE: 0.04999155077654669
Dev MAPE: 0.16590442404350966
Parameters: { early_stopping_rounds, min_split_gain, nthred } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used 

In [None]:
results = pd.DataFrame(list(zip(dev_refs, dev_preds, dev_inds)), columns=['refs', 'preds', 'inds'])
results = results.sort_values('inds')
results.head()

In [None]:
sorted_idx = model.feature_importances_.argsort()
plt.figure(figsize=(10,20))
plt.barh(features_columns[sorted_idx], model.feature_importances_[sorted_idx])
plt.xlabel("Random Forest Feature Importance")