### Boosted Decision Tree Regressor

In [1]:
import pandas as pd

In [2]:
# read in data
df = pd.read_csv('/Users/calebreed/Documents/GitHub/DATA-4950-Capstone/DATA-4950-Capstone/data/external/salaries_clean.csv')
# drop the outliers
df = df[df.annual_base_pay < 1000000]
# drop unneccessary variables
df = df.drop(["index","salary_id","location_latitude","location_longitude","comments","submitted_at"],axis=1)
# job_title_rank, location_state, and location_country have too many missing values so they will be dropped
df = df.drop(['job_title_rank','location_state','location_country'],axis=1)
# drops employer experience years
df = df.drop("employer_experience_years",axis=1)
# drops columns with too many unique values
df = df.drop("employer_name",axis=1)
df = df.drop("job_title",axis=1)
df = df.drop("location_name",axis=1) 
# fill experience missing values with the mean
df.loc[:, "total_experience_years"] = df.loc[:, "total_experience_years"].fillna(df["total_experience_years"].mean())
df.loc[:, "annual_base_pay"] = df.loc[:, "annual_base_pay"].fillna(df["annual_base_pay"].mean())
df.loc[:, "signing_bonus"] = df.loc[:, "signing_bonus"].fillna(method='ffill')
df.loc[:, "annual_bonus"] = df.loc[:, "annual_bonus"].fillna(method='ffill')
# stock bonus is a string so will be dropping it
df = df.drop("stock_value_bonus",axis=1)
# creates dummy variables for job category
dummies = pd.get_dummies(df['job_title_category'], prefix='job_category')
# merges dummy variables with dataframe and drops original column
df = pd.concat([df, dummies], axis=1)
df = df.drop('job_title_category', axis=1)
df.head()

Unnamed: 0,total_experience_years,annual_base_pay,signing_bonus,annual_bonus,job_category_Applied Science,job_category_Data,job_category_Engineering,job_category_Management,job_category_Operations,job_category_Other,job_category_Software,job_category_Web
0,13.0,125000.0,5000.0,0.0,0,0,1,0,0,0,0,0
1,15.0,65000.0,5000.0,5000.0,0,0,0,0,0,0,1,0
2,4.0,86000.0,5000.0,6000.0,0,0,0,0,0,0,1,0
3,4.0,105000.0,5000.0,8500.0,0,0,0,0,0,1,0,0
4,4.0,110000.0,5000.0,7000.0,0,0,0,0,0,0,1,0


In [21]:
from sklearn.datasets import make_regression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split

# seperates features into x and y variables
X = df.drop('annual_base_pay', axis = 1)

y = df['annual_base_pay'] 

X, y = make_regression(random_state=0)

# splits data into 70% training and 30% test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)

# intial model
reg = GradientBoostingRegressor(random_state=0)
reg.fit(X_train, y_train)
pred_val = reg.predict(X_test[1:2])
reg.score(X_test, y_test)


0.3941632660665042

In [4]:
reg.get_params()

{'alpha': 0.9,
 'ccp_alpha': 0.0,
 'criterion': 'friedman_mse',
 'init': None,
 'learning_rate': 0.1,
 'loss': 'squared_error',
 'max_depth': 3,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_iter_no_change': None,
 'random_state': 0,
 'subsample': 1.0,
 'tol': 0.0001,
 'validation_fraction': 0.1,
 'verbose': 0,
 'warm_start': False}

In [18]:
# hyperparameter tuning using grid search
from sklearn.model_selection import GridSearchCV

param_grid = {
    'loss': ['squared_error','absolute_error'],
    'max_depth': [3,4,5,6,7,8],
    'n_estimators': [100,200,300,400],
    'random_state': [0]
}

gb = GradientBoostingRegressor()
grid_search = GridSearchCV(estimator = gb, param_grid = param_grid, cv = 3)
grid_search.fit(X_train, y_train)

GridSearchCV(cv=3, estimator=GradientBoostingRegressor(),
             param_grid={'loss': ['squared_error', 'absolute_error'],
                         'max_depth': [3, 4, 5, 6, 7, 8],
                         'n_estimators': [100, 200, 300, 400],
                         'random_state': [0]})

In [19]:
grid_search.best_params_

{'loss': 'absolute_error',
 'max_depth': 5,
 'n_estimators': 400,
 'random_state': 0}

In [23]:
# grid search best parameters model
final_tree = GradientBoostingRegressor(loss='absolute_error',max_depth=5,n_estimators=400,random_state=0)
final_tree.fit(X_train, y_train)
pred_val2 = final_tree.predict(X_test[1:2])
final_tree.score(X_test, y_test)

0.3298237601108567

In [24]:
final_tree.get_params()

{'alpha': 0.9,
 'ccp_alpha': 0.0,
 'criterion': 'friedman_mse',
 'init': None,
 'learning_rate': 0.1,
 'loss': 'absolute_error',
 'max_depth': 5,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 400,
 'n_iter_no_change': None,
 'random_state': 0,
 'subsample': 1.0,
 'tol': 0.0001,
 'validation_fraction': 0.1,
 'verbose': 0,
 'warm_start': False}

The default model still has a better score. Will use the default model as the final model.