In [38]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, StackingRegressor, VotingRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression,Ridge, RidgeCV, Lasso, LassoCV
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler 
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import KFold
from pyearth import Earth
from xgboost import XGBRegressor
import time as time
import xgboost as xgb
import warnings
warnings.filterwarnings("ignore")

In [39]:
# Read Data
X_train = data = pd.read_csv('X_train.csv').drop(columns = 'Unnamed: 0')
X_test = pd.read_csv('X_test.csv').drop(columns = 'Unnamed: 0').drop(3415)
y_train = pd.read_csv('y_train.csv').drop(columns = 'Unnamed: 0').totalyearlycompensation
y_test = pd.read_csv('y_test.csv').drop(columns = 'Unnamed: 0').totalyearlycompensation.drop(3415)

# Transformed Data
y_trans = np.log(y_train)

# Standardized Data
scaler = StandardScaler()
scaler.fit(X_train)
X_train_std = scaler.transform(X_train)

X_test_std = scaler.transform(X_test)

## Individual Models

In [40]:
# Linear Model
lin_model = LinearRegression().fit(X_train, y_trans)

# Ridge Model
ridge_model = Ridge(alpha = 85.06457640936617).fit(X_train_std, y_trans)

# Lasso Model
lasso_model = Lasso(alpha = 0.001).fit(X_train_std, y_trans)

# Random Forest Model
rf_model = RandomForestRegressor(n_estimators=375,
                                 max_leaf_nodes=3500,
                                 max_depth=25,
                                 oob_score=True,
                                 n_jobs=-1,
                                 max_features=22,
                                 random_state=1).fit(X_train, y_trans)
# AdaBoost Model
ada_model = AdaBoostRegressor(base_estimator = DecisionTreeRegressor(max_depth=18),
                              n_estimators = 200,
                              learning_rate=0.0125,
                              random_state=1).fit(X_train, y_trans)
# Gradient Boost Model
grad_model = GradientBoostingRegressor(max_depth=6,
                                       n_estimators=450,
                                       learning_rate=0.1,
                                       subsample=0.75,
                                       random_state=1,
                                       loss='huber').fit(X_train, y_trans)

# XGBoost Model
xgb_model = xgb.XGBRegressor(max_depth = 6,
                             learning_rate = 0.08,
                             n_estimators = 600,
                             reg_lambda = 10,
                             gamma = 0,
                             subsample = 0.45,
                             random_state = 1).fit(X_train, y_trans)

print('Linear MAE:', mean_absolute_error(np.exp(lin_model.predict(X_test)), y_test))
print('Ridge MAE:', mean_absolute_error(np.exp(ridge_model.predict(X_test_std)), y_test))
print('Lasso MAE:', mean_absolute_error(np.exp(lasso_model.predict(X_test_std)), y_test))
print('Random Forest MAE:', mean_absolute_error(np.exp(rf_model.predict(X_test)), y_test))
print('AdaBoost MAE:', mean_absolute_error(np.exp(ada_model.predict(X_test)), y_test))
print('Gradient Boost MAE:', mean_absolute_error(np.exp(grad_model.predict(X_test)), y_test))
print('XGBoost MAE:', mean_absolute_error(np.exp(xgb_model.predict(X_test)), y_test))

Linear MAE: 55484.18242534074
Ridge MAE: 55399.39345522172
Lasso MAE: 55364.52622961471
Random Forest MAE: 48543.843588033524
AdaBoost MAE: 50549.62023194333
Gradient Boost MAE: 47583.10095823529
XGBoost MAE: 47726.58988807753


## Voting Regressor

In [41]:
voting_model = VotingRegressor(estimators=[('linear', lin_model),
                                           ('ridge', ridge_model),
                                           ('lasso', lasso_model),
                                           ('rf', rf_model),
                                           ('ada', ada_model),
                                           ('grad', grad_model),
                                           ('xgb', xgb_model)])

voting_model.fit(X_train, y_trans)
print('Voting Ensemble MAE:',
      mean_absolute_error(np.exp(voting_model.predict(X_test)), y_test))

Voting Ensemble MAE: 48732.00551111569


## Stacking Regressor

In [42]:
# Stacking using Linear Regression as the meta-model
linear_stacked_model = StackingRegressor(estimators = [('linear', lin_model),
                                                       ('ridge', ridge_model),
                                                       ('lasso', lasso_model),
                                                       ('rf', rf_model),
                                                       ('ada', ada_model),
                                                       ('xgb', xgb_model)],
                                         final_estimator=LinearRegression(),
                                         cv = KFold(n_splits = 5, shuffle = True, random_state=1))

linear_stacked_model.fit(X_train, y_trans)

# Stacking using Lasso as the meta-model
lasso_stacked_model = StackingRegressor(estimators = [('linear', lin_model),
                                                      ('ridge', ridge_model),
                                                      ('lasso', lasso_model),
                                                      ('rf', rf_model),
                                                      ('ada', ada_model),
                                                      ('xgb', xgb_model)],
                                        final_estimator=LassoCV(),
                                        cv = KFold(n_splits = 5, shuffle = True, random_state=1))
lasso_stacked_model.fit(X_train, y_trans)

# Stacking using MARS as the meta-model
mars_stacked_model = StackingRegressor(estimators = [('linear', lin_model),
                                                     ('ridge', ridge_model),
                                                     ('lasso', lasso_model),
                                                     ('rf', rf_model),
                                                     ('ada', ada_model),
                                                     ('xgb', xgb_model)],
                                       final_estimator = Earth(max_degree=1),
                                       cv = KFold(n_splits = 5, shuffle = True, random_state=1))
mars_stacked_model.fit(X_train, y_trans)

print('Linear Regression metamodel MAE:',
      mean_absolute_error(np.exp(linear_stacked_model.predict(X_test)), y_test))
print('Lasso metamodel MAE:',
      mean_absolute_error(np.exp(lasso_stacked_model.predict(X_test)), y_test))
print('Mars metamodel MAE:',
      mean_absolute_error(np.exp(mars_stacked_model.predict(X_test)), y_test))

Linear Regression metamodel MAE: 47232.522451909084
Lasso metamodel MAE: 47342.356875929945
Mars metamodel MAE: 47082.329414820604


In [44]:
importances = xgb_model.feature_importances_
feature_names = xgb_model.feature_names_in_
df_importances = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
df_importances = df_importances.sort_values('Importance', ascending=False)
df_importances.head(10)

Unnamed: 0,Feature,Importance
99,dmaid_807.0,0.044386
15,company_facebook,0.035608
59,dmaid_0.0,0.030468
21,company_other,0.021626
82,dmaid_616.0,0.020905
41,title_Software Engineering Manager,0.020873
4,Doctorate_Degree,0.016084
7,Race_Asian,0.015505
16,company_google,0.014828
34,title_Business Analyst,0.014755
