In [41]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import RandomizedSearchCV

In [42]:
np.random.seed(42)
grid = {
    "n_estimators":np.arange(10,100,10),
    "max_depth":[None,3,5,10],
    "min_samples_split":np.arange(2,20,2),
    "min_samples_leaf":np.arange(1,20,2),
    "max_features": [0.5,1,"sqrt","auto"],
    "max_samples":[10000,12000,15000,20000]
}

In [43]:
training_data = pd.read_csv('./data/Train_rev1.zip', compression='zip', header=0, sep=',', quotechar='"')

In [44]:
for label, content in training_data.items():
    if pd.api.types.is_string_dtype(content):
        training_data[label] = content.astype("category").cat.as_ordered()

In [45]:
for label,content in training_data.items():
    if not pd.api.types.is_numeric_dtype(content):
        # Add binary column to indicate whether sample had missing value
        training_data[label+"<unk>"]=pd.isnull(content)
        # Turn categories into numbers and add+1
        training_data[label] = pd.Categorical(content).codes+1

In [46]:
df_copy = training_data.copy()

In [47]:
X = df_copy.drop(columns=["SalaryNormalized","FullDescription"],axis=1)
y = df_copy["SalaryNormalized"]

In [48]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.25,random_state=0)

In [49]:
model = RandomizedSearchCV(
RandomForestRegressor(n_jobs=-1,
                     random_state=42),
                    param_distributions = grid,
                     n_iter=5,
                    cv=5,
                    verbose=True)

In [50]:
model.fit(X_train,y_train)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


RandomizedSearchCV(cv=5,
                   estimator=RandomForestRegressor(n_jobs=-1, random_state=42),
                   n_iter=5,
                   param_distributions={'max_depth': [None, 3, 5, 10],
                                        'max_features': [0.5, 1, 'sqrt',
                                                         'auto'],
                                        'max_samples': [10000, 12000, 15000,
                                                        20000],
                                        'min_samples_leaf': array([ 1,  3,  5,  7,  9, 11, 13, 15, 17, 19]),
                                        'min_samples_split': array([ 2,  4,  6,  8, 10, 12, 14, 16, 18]),
                                        'n_estimators': array([10, 20, 30, 40, 50, 60, 70, 80, 90])},
                   verbose=True)

In [51]:
y_preds = model.predict(X_test)

In [52]:
mae_hyp = mean_absolute_error(y_test,y_preds)

In [53]:
mae_hyp

4582.112930743556