## Import Packages

In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import matplotlib as mpl
import seaborn as sns
import dill
import random

from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer, LabelEncoder, StandardScaler, OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import Ridge, Lasso, LassoCV, RidgeCV, LinearRegression
from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, cross_val_predict
from xgboost import XGBRegressor

from patsy import dmatrices, dmatrix, build_design_matrices

# Set number of CPU cores for parallel algorithms
import os
if "CPU_LIMIT" in os.environ:
    # If you are on JupyterHub, this gives you the right number of CPUs for your virtual machine
    num_cpus = int(os.getenv("CPU_LIMIT").split('.')[0])
else:
    # If you are not on JupyterHub, this gives you the right number for your computer.
    num_cpus = os.cpu_count()

In [2]:
# This sets some nicer defaults for plotting.
# This must be run in a separate cell from importing matplotlib due to a bug.
params = {'legend.fontsize': 'large',
          'figure.figsize': (11.0, 11.0),
          'axes.labelsize': 'x-large',
          'axes.titlesize':'xx-large',
          'xtick.labelsize':'large',
          'ytick.labelsize':'large'}
mpl.rcParams.update(params)

# This makes it so that the pandas dataframes don't get truncated horizontally.
pd.options.display.max_columns = 200

## Load the Data 

In [3]:
col_classes = {"LOAN_IDENTIFIER": np.character, 
               "CHANNEL": 'category', 
               "SELLER_NAME": np.character, 
               "ORIGINAL_INTEREST_RATE": np.float32, 
               "ORIGINAL_UPB": np.float64,
               "ORIGINAL_LOAN_TERM": "Int16", 
               "ORIGINATION_DATE": np.character,
               "FIRST_PAYMENT_DATE": np.character, 
               "ORIGINAL_LTV": np.float32, 
               "ORIGINAL_COMBINED_LTV": np.float32, 
               "NUMBER_OF_BORROWERS": 'category', 
               "DTI": np.float32, 
               "BORROWER_CREDIT_SCORE_AT_ORIGINATION": "UInt16", 
               "COBORROWER_CREDIT_SCORE_AT_ORIGINATION": 'UInt16', 
               "FIRST_TIME_HOME_BUYER_INDICATOR": 'category', 
               "LOAN_PURPOSE": 'category', 
               "PROPERTY_TYPE": 'category',
               "NUMBER_OF_UNITS": "UInt16", 
               "OCCUPANCY_STATUS": 'category', 
               "PROPERTY_STATE": 'category', 
               "MSA": 'category', 
               "ZIP_CODE_SHORT": 'category', 
               "MORTGAGE_INSURANCE_PERCENTAGE": np.float32, 
               "AMORTIZATION_TYPE": np.character,
               "MORTGAGE_INSURANCE_TYPE": 'category', 
               "RELOCATION_MORTGAGE_INDICATOR": 'category',
               "CREDIT_SCORE_MIN": "UInt16",
               "ORIGINAL_VALUE": float,
               "ZERO_BALANCE_CODE": 'category',
               "LOAN_AGE": "Int16",
               "NET_LOSS": float,
               "NET_SEVERITY": float,
               "LAST_STAT": 'category',
               "LOAN_MODIFICATION_COSTS": float,
               "TOTAL_LOSSES": float,
               "MSA_NAME": 'category',
               "CENSUS_2010_POP": float}

date_columns = ["ORIGINATION_DATE",
                "FIRST_PAYMENT_DATE"]

In [4]:
%%time

full_data_set = True

FILES_LOCATION = '../Shared Data (Read Only)/Fannie Mae Data/'

if not full_data_set:
    df_train = pd.read_csv(FILES_LOCATION + "FannieMaeSmallTrain.csv",
                           index_col="LOAN_IDENTIFIER",
                           dtype=col_classes,
                           parse_dates=date_columns,
                           sep='|')
    df_test = pd.read_csv(FILES_LOCATION + "FannieMaeSmallTest.csv",
                          index_col="LOAN_IDENTIFIER",
                          dtype=col_classes,
                          parse_dates=date_columns,
                          sep='|')

if full_data_set:
    # This p is the proportion of the training data you load.
    # You can set it anywhere from 0 to 1.
    p = 0.25
    random.seed(201)
    df_train = pd.read_csv(FILES_LOCATION + "FannieMaeTrain.csv",
                           index_col="LOAN_IDENTIFIER",
                           dtype=col_classes,
                           parse_dates=date_columns,
                           sep='|',
                           skiprows=lambda i: i>0 and random.random() > p)
    df_test = pd.read_csv(FILES_LOCATION + "FannieMaeTest.csv",
                          index_col="LOAN_IDENTIFIER",
                          dtype=col_classes,
                          parse_dates=date_columns,
                          sep='|')

CPU times: user 2min 3s, sys: 7.85 s, total: 2min 11s
Wall time: 2min 13s


In [5]:
df_train.shape

(11094188, 36)

In [6]:
df_test.shape

(4930731, 36)

## Summarize the Data

In [7]:
if 'ZERO_BALANCE_CODE' in df_train:
    df_train.drop(['ZERO_BALANCE_CODE', 'LOAN_AGE', 'NET_SEVERITY', 'LAST_STAT', 'LOAN_MODIFICATION_COSTS', 'TOTAL_LOSSES'],
                  axis=1,
                  inplace=True)
if 'ZERO_BALANCE_CODE' in df_test:
    df_test.drop(['ZERO_BALANCE_CODE', 'LOAN_AGE', 'NET_SEVERITY', 'LAST_STAT', 'LOAN_MODIFICATION_COSTS', 'TOTAL_LOSSES'],
                  axis=1,
                  inplace=True)

In [8]:
def summarize_dataframe(df):
    """Summarize a dataframe, and report missing values."""
    missing_values = pd.DataFrame({'Variable Name': df.columns,
                                   'Data Type': df.dtypes,
                                   'Missing Values': df.isnull().sum(),
                                   'Unique Values': [df[name].nunique() for name in df.columns]}
                                 ).set_index('Variable Name')
    with pd.option_context("display.max_rows", 1000):
        display(pd.concat([missing_values, df.describe(include='all', datetime_is_numeric=True).transpose()], axis=1).fillna(""))

## Engineer Row Based Features

In [9]:
df_train['ORIGINATION_DATE'] = pd.to_datetime(df_train['ORIGINATION_DATE'], format='%Y-%m-%d')

df_test['ORIGINATION_DATE'] = pd.to_datetime(df_test['ORIGINATION_DATE'], format='%Y-%m-%d')

In [10]:
df_train['YEAR'] = df_train['ORIGINATION_DATE'].dt.year
df_test['YEAR'] = df_test['ORIGINATION_DATE'].dt.year

### Date Features

In [11]:
# pull out quarter and month
df_train['QUARTER'] = df_train['ORIGINATION_DATE'].dt.quarter
df_test['QUARTER'] = df_test['ORIGINATION_DATE'].dt.quarter
df_train['MONTH'] = df_train['ORIGINATION_DATE'].dt.month
df_test['MONTH'] = df_test['ORIGINATION_DATE'].dt.month

In [12]:
year_train = df_train['YEAR']
year_test= df_test['YEAR']

In [13]:
df_train['years_since_crash'] = df_train['YEAR'] - 2008
df_test['years_since_crash'] = df_test['YEAR'] - 2008
df_train.years_since_crash

LOAN_IDENTIFIER
384584747416     3
784893300593    -6
98826276        11
732335456818    -6
778665625108    -6
                ..
406816644901     0
902779441341    -6
222954820423     8
100236974       12
322600763099     5
Name: years_since_crash, Length: 11094188, dtype: int64

### Geography Features

In [14]:
#region code
states = {
        'AK': 'O',
        'AL': 'S',
        'AR': 'S',
        'AS': 'O',
        'AZ': 'W',
        'CA': 'W',
        'CO': 'W',
        'CT': 'N',
        'DC': 'N',
        'DE': 'N',
        'FL': 'S',
        'GA': 'S',
        'GU': 'O',
        'HI': 'O',
        'IA': 'M',
        'ID': 'W',
        'IL': 'M',
        'IN': 'M',
        'KS': 'M',
        'KY': 'S',
        'LA': 'S',
        'MA': 'N',
        'MD': 'N',
        'ME': 'N',
        'MI': 'W',
        'MN': 'M',
        'MO': 'M',
        'MP': 'O',
        'MS': 'S',
        'MT': 'W',
        'NA': 'O',
        'NC': 'S',
        'ND': 'M',
        'NE': 'W',
        'NH': 'N',
        'NJ': 'N',
        'NM': 'W',
        'NV': 'W',
        'NY': 'N',
        'OH': 'M',
        'OK': 'S',
        'OR': 'W',
        'PA': 'N',
        'PR': 'O',
        'RI': 'N',
        'SC': 'S',
        'SD': 'M',
        'TN': 'S',
        'TX': 'S',
        'UT': 'W',
        'VA': 'S',
        'VI': 'O',
        'VT': 'N',
        'WA': 'W',
        'WI': 'M',
    
        'WV': 'S',
        'WY': 'W'
}


In [15]:
df_train['REGION'] = df_train['PROPERTY_STATE'].map(states)
df_train[['PROPERTY_STATE','REGION']]

Unnamed: 0_level_0,PROPERTY_STATE,REGION
LOAN_IDENTIFIER,Unnamed: 1_level_1,Unnamed: 2_level_1
384584747416,MD,N
784893300593,UT,W
98826276,VA,S
732335456818,FL,S
778665625108,FL,S
...,...,...
406816644901,CO,W
902779441341,AZ,W
222954820423,LA,S
100236974,OR,W


In [16]:
df_test['REGION'] = df_test['PROPERTY_STATE'].map(states)
df_test[['PROPERTY_STATE','REGION']]

Unnamed: 0_level_0,PROPERTY_STATE,REGION
LOAN_IDENTIFIER,Unnamed: 1_level_1,Unnamed: 2_level_1
101189015,FL,S
321503282919,NV,W
291437756634,MI,W
980873633380,AR,S
435213593495,NV,W
...,...,...
658790054482,AL,S
610939793362,TX,S
787788149067,IL,M
552091843200,MI,W


## Split Into Training and Validation

In [18]:
df_smaller_train, df_validation = train_test_split(df_train, test_size = 0.25, random_state = 201)

In [19]:
df_smaller_train = df_smaller_train.copy()
df_validation = df_validation.copy()

## Impute Missing Values

In [20]:
from sklearn.base import BaseEstimator, TransformerMixin

class CategoricalImputer(BaseEstimator, TransformerMixin):
    """
    Custom defined imputer for categorical data. This allows you to specify an 
    other class where any category that doesn't meet the requirements necessary to
    be in 
    """
    
    def __init__(self, other_threshold=0, 
                 other_label="OTHER",
                 missing_first=True,
                 missing_values=np.nan, 
                 strategy='constant', 
                 fill_value="MISSING", 
                 verbose=0, 
                 copy=True, 
                 add_indicator=False):
        self.add_indicator = add_indicator
        self.copy=copy
        self.verbose=verbose
        self.fill_value=fill_value
        self.missing_first=missing_first
        self.missing_values=missing_values
        self.other_label=other_label
        self.other_threshold=other_threshold
        self.strategy=strategy
        if hasattr(missing_values, "__iter__"):
            self.missing_values = missing_values
        else:
            self.missing_values = [missing_values]
        self._imputer = SimpleImputer(missing_values=missing_values, strategy=strategy, fill_value=fill_value, verbose=verbose, copy=copy, add_indicator=False)
        self._column_categories = {}

        
    def fit(self, X, y=None):
        if type(self.other_threshold) == int or type(self.other_threshold) == float:
            other_threshold = [self.other_threshold]*len(X.columns)
        elif len(self.other_threshold) == len(X.columns):
            other_threshold = self.other_threshold
        else:
            raise TypeError("other_threshold must be either a single number or a list of numbers equal to the number of columns.")

        i = 0
        X = X.copy()
        X = X[:].astype(object)
        if self.missing_first:
            X = pd.DataFrame(self._imputer.fit_transform(X), columns=X.columns, index=X.index)
        column_categories = {}
        for column in X.columns:
            if other_threshold[i] < 1:
                other_threshold[i] = other_threshold[i]*X[column].shape[0]
            
            value_counts = X[column].value_counts()
            categories = [category for category in value_counts.index if value_counts.loc[category] >= other_threshold[i]]
            if value_counts.iloc[-1] >= other_threshold[i]:
                categories[-1] = self.other_label
            else:
                categories.append(self.other_label)
            
            self._column_categories[column] = categories
            i = i + 1
        
        return self
    
    def transform(self, X, y=None):
        X = X.copy()
        X = X[:].astype(object)
        if self.missing_first:
            X = pd.DataFrame(self._imputer.fit_transform(X), columns=X.columns, index=X.index)
        for column in X.columns:
            X.loc[~X[column].isin(self._column_categories[column]) & ~X[column].isin(self.missing_values), column] = self.other_label
        return pd.DataFrame(self._imputer.fit_transform(X), columns=X.columns, index=X.index)[:].astype(str)

In [21]:
imputer_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer_zero = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=0)
categorical_imputer = CategoricalImputer(other_threshold=.01)
imputer_missing = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value='missing')
# Dan imputer
imputer_special = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=700)

In [22]:
continuous_mean = ['ORIGINAL_INTEREST_RATE',
                   'ORIGINAL_UPB',
                   'ORIGINAL_LOAN_TERM',
                   'ORIGINAL_LTV',
                   'ORIGINAL_COMBINED_LTV',
                   'DTI',
                   'ORIGINAL_VALUE',
                   'YEAR', 'years_since_crash']

continuous_zero = ['MORTGAGE_INSURANCE_PERCENTAGE',
                   'CREDIT_SCORE_MIN']

special = ['BORROWER_CREDIT_SCORE_AT_ORIGINATION',
                   'COBORROWER_CREDIT_SCORE_AT_ORIGINATION']

continuous_variables = continuous_mean + continuous_zero + special

categorical_missing = ['FIRST_TIME_HOME_BUYER_INDICATOR',
                       'MORTGAGE_INSURANCE_TYPE']


categorical_variables = ['CHANNEL',
                         'SELLER_NAME',
                         'NUMBER_OF_BORROWERS',
                         'LOAN_PURPOSE',
                         'PROPERTY_TYPE',
                         'NUMBER_OF_UNITS',
                         'OCCUPANCY_STATUS',
                         'PROPERTY_STATE',
                         'ZIP_CODE_SHORT',
                         'AMORTIZATION_TYPE',
                         'RELOCATION_MORTGAGE_INDICATOR',
                         'MSA',
                         'MSA_NAME',
                         'CENSUS_2010_POP']

In [23]:
imputer_mean.fit(df_smaller_train[continuous_mean])
df_smaller_train[continuous_mean] = imputer_mean.transform(df_smaller_train[continuous_mean])
df_validation[continuous_mean] = imputer_mean.transform(df_validation[continuous_mean])

In [24]:
imputer_zero.fit(df_smaller_train[continuous_zero])
df_smaller_train[continuous_zero] = imputer_zero.transform(df_smaller_train[continuous_zero])
df_validation[continuous_zero] = imputer_zero.transform(df_validation[continuous_zero])

In [25]:
categorical_imputer.fit(df_smaller_train[categorical_variables])
df_smaller_train[categorical_variables] = categorical_imputer.transform(df_smaller_train[categorical_variables])
df_validation[categorical_variables] = categorical_imputer.transform(df_validation[categorical_variables])

In [26]:
imputer_special.fit(df_smaller_train[special])
df_smaller_train[special] = imputer_special.transform(df_smaller_train[special])
df_validation[special] = imputer_special.transform(df_validation[special])

In [27]:
imputer_missing.fit(df_smaller_train[categorical_missing])
df_smaller_train[categorical_missing] = imputer_missing.transform(df_smaller_train[categorical_missing])
df_validation[categorical_missing] = imputer_missing.transform(df_validation[categorical_missing])

## Set Up the Evaluation Metric

In [28]:
average_loss = df_train['NET_LOSS'].mean()

In [29]:
def accuracy(y_true, y_pred):
    """Function that returns a table showing RMSE and MAE."""
    RMSE = mean_squared_error(y_true, y_pred)**(1/2)
    naive_RMSE = mean_squared_error(y_true, [average_loss]*len(y_true))**(1/2)
    acc_df = pd.DataFrame(data = {"RMSE": [RMSE],
                                  "Naive - RMSE": [naive_RMSE - RMSE]})
    display(acc_df.style.hide_index())

## Feature Engineering for Tree Based Models

In [30]:
continuous_features_trees = ['ORIGINAL_INTEREST_RATE',
                             'ORIGINAL_UPB',
                             'ORIGINAL_LOAN_TERM',
                             'ORIGINAL_LTV',
                             'ORIGINAL_COMBINED_LTV',
                             'DTI',
                             'BORROWER_CREDIT_SCORE_AT_ORIGINATION',
                             'COBORROWER_CREDIT_SCORE_AT_ORIGINATION',
                             'MORTGAGE_INSURANCE_PERCENTAGE',
                             'CREDIT_SCORE_MIN',
                             'ORIGINAL_VALUE','years_since_crash']

cat_ordinal_features_trees = ['SELLER_NAME', 
                              'NUMBER_OF_BORROWERS', 
                              'PROPERTY_STATE',
                              'CENSUS_2010_POP',
                              'NUMBER_OF_UNITS']

cat_dummy_features_trees = ['FIRST_TIME_HOME_BUYER_INDICATOR',
                            'CHANNEL',
                            'LOAN_PURPOSE',
                            'PROPERTY_TYPE',
                            'OCCUPANCY_STATUS',
                            'AMORTIZATION_TYPE',
                            'MORTGAGE_INSURANCE_TYPE',
                            'RELOCATION_MORTGAGE_INDICATOR',
                            'QUARTER',
                            'MONTH',
                            'YEAR', 
                            'REGION']

In [31]:
X_tree_train = df_smaller_train[continuous_features_trees + cat_ordinal_features_trees]
y_tree_train = df_smaller_train['NET_LOSS']

In [32]:
# patsy matrix to get dummies
formula_tree = "0 + " + " + ".join(cat_dummy_features_trees)

In [33]:
formula_tree

'0 + FIRST_TIME_HOME_BUYER_INDICATOR + CHANNEL + LOAN_PURPOSE + PROPERTY_TYPE + OCCUPANCY_STATUS + AMORTIZATION_TYPE + MORTGAGE_INSURANCE_TYPE + RELOCATION_MORTGAGE_INDICATOR + QUARTER + MONTH + YEAR + REGION'

In [34]:
# add a couple interactions
formula_tree = "0 + " + " + ".join(cat_dummy_features_trees) + " + FIRST_TIME_HOME_BUYER_INDICATOR:BORROWER_CREDIT_SCORE_AT_ORIGINATION + \
ORIGINAL_LTV:years_since_crash"

In [35]:
X_tree_train_patsy = dmatrix(formula_tree, df_smaller_train, return_type="dataframe")

In [36]:
X_tree_train = pd.concat([X_tree_train, X_tree_train_patsy], axis=1)

In [37]:
ordinal_encoder = OrdinalEncoder()
ordinal_encoder.fit(X_tree_train[cat_ordinal_features_trees])
X_tree_train[cat_ordinal_features_trees] = ordinal_encoder.transform(X_tree_train[cat_ordinal_features_trees])

In [38]:
# get variables ready for xgboost
X_tree_train.columns = X_tree_train.columns.str.replace('[', '(').str.replace(']', ')')

  X_tree_train.columns = X_tree_train.columns.str.replace('[', '(').str.replace(']', ')')


In [39]:
# transform validation set
X_tree_validation = df_validation[continuous_features_trees + cat_ordinal_features_trees]
y_tree_validation = df_validation['NET_LOSS']

X_tree_validation_patsy = build_design_matrices([X_tree_train_patsy.design_info], df_validation, return_type="dataframe")[0]

X_tree_validation = pd.concat([X_tree_validation, X_tree_validation_patsy], axis=1)

X_tree_validation[cat_ordinal_features_trees] = ordinal_encoder.transform(X_tree_validation[cat_ordinal_features_trees])

X_tree_validation.columns = X_tree_validation.columns.str.replace('[', '(').str.replace(']', ')')

  X_tree_validation.columns = X_tree_validation.columns.str.replace('[', '(').str.replace(']', ')')


In [40]:
# just pick some variables to start
xgb_model = XGBRegressor(max_depth=5,
                         n_estimators = 500,
                         learning_rate=.1,
                         min_child_weight = 1,
                         colsample_bytree = 0.8,
                         subsample = 0.8,
                         gamma = 0.1,
                         random_state=201,
                         n_jobs=num_cpus)

In [41]:
# fit first iteration with early stopping
xgb_model.fit(X_tree_train, y_tree_train, early_stopping_rounds = 50, verbose = 50,
              eval_set = [(X_tree_train, y_tree_train), (X_tree_validation, y_tree_validation)])

[0]	validation_0-rmse:7413.33398	validation_1-rmse:7377.79688
[50]	validation_0-rmse:7136.19434	validation_1-rmse:7099.67139
[100]	validation_0-rmse:7090.05371	validation_1-rmse:7056.65967
[150]	validation_0-rmse:7064.70508	validation_1-rmse:7035.02197
[200]	validation_0-rmse:7044.48682	validation_1-rmse:7018.87793
[250]	validation_0-rmse:7031.87012	validation_1-rmse:7010.49951
[300]	validation_0-rmse:7021.84375	validation_1-rmse:7005.06738
[350]	validation_0-rmse:7011.79102	validation_1-rmse:6998.58691
[400]	validation_0-rmse:7004.62939	validation_1-rmse:6994.86230
[450]	validation_0-rmse:6997.92432	validation_1-rmse:6992.94287
[499]	validation_0-rmse:6991.65576	validation_1-rmse:6991.54785


XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.8, gamma=0.1, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.1, max_delta_step=0, max_depth=5,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=500, n_jobs=15, num_parallel_tree=1, random_state=201,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=0.8,
             tree_method='approx', validate_parameters=1, verbosity=None)

In [43]:
# tune max_depth and min_child_weight
param_test1 = {'max_depth':np.arange(4,14,1),
               'min_child_weight':np.arange(1,15,1)}

In [None]:
# grid search with parameter grid - chose max_depth = 7 and min_child_weight = 8
gsearch1 = GridSearchCV(estimator = XGBRegressor(learning_rate =0.1, n_estimators=500, max_depth=5,
                                                 min_child_weight=1, gamma=0.1, subsample=0.8, colsample_bytree=0.8, seed=201, 
                                                 n_jobs=num_cpus), 
                        param_grid = param_test1, scoring = 'neg_mean_squared_error', cv=5, n_jobs=num_cpus)
gsearch1.fit(X_tree_train,y_tree_train, early_stopping_rounds = 50, verbose = False,
            eval_set = [(X_tree_train, y_tree_train), (X_tree_validation, y_tree_validation)])
gsearch1.best_params_, gsearch1.best_score_

In [None]:
# tune gamma
param_test2 = {
 'gamma':[i/10.0 for i in range(0,5)]
}

In [None]:
# grid search chose gamma = 0.0
gsearch2 = GridSearchCV(estimator = XGBRegressor(learning_rate =0.1, n_estimators=193, max_depth=7,
                                                 min_child_weight=8, gamma=0.1, subsample=0.8, colsample_bytree=0.8, seed=201, 
                                                 n_jobs=num_cpus), 
                        param_grid = param_test2, scoring = 'neg_mean_squared_error', cv=5, n_jobs=num_cpus)
gsearch2.fit(X_tree_train,y_tree_train, early_stopping_rounds = 50, verbose = False, eval_metric = 'rmse',
            eval_set = [(X_tree_train, y_tree_train), (X_tree_validation, y_tree_validation)])
gsearch2.best_params_, gsearch2.best_score_

In [None]:
# fit second xgboost model with updated parameters
xgb_model2 = XGBRegressor(max_depth=7,
                         n_estimators = 1000,
                         learning_rate=.1,
                         min_child_weight = 8,
                         colsample_bytree = 0.8,
                         gamma = 0.0,
                         random_state=201,
                         n_jobs=num_cpus)

In [None]:
xgb_model2.fit(X_tree_train, y_tree_train, early_stopping_rounds = 50, verbose = 50,
              eval_set = [(X_tree_train, y_tree_train), (X_tree_validation, y_tree_validation)])

In [None]:
# tune subsample and colsample_bytree
param_test3 = {
 'subsample':[i/10.0 for i in range(6,10)],
 'colsample_bytree':[i/10.0 for i in range(6,10)]
}

In [None]:
# grid search chose subsample = .7 and colsample_bytree = .9
gsearch3 = GridSearchCV(estimator = XGBRegressor(learning_rate =0.1, n_estimators=120, max_depth=7,
                                                 min_child_weight=8, gamma=0.0, subsample=0.9, colsample_bytree=0.7, seed=201, 
                                                 n_jobs=num_cpus), 
                        param_grid = param_test3, scoring = 'neg_mean_squared_error', cv=5, n_jobs=num_cpus)
gsearch3.fit(X_tree_train,y_tree_train, early_stopping_rounds = 50, verbose = False,
            eval_set = [(X_tree_train, y_tree_train), (X_tree_validation, y_tree_validation)])
gsearch3.best_params_, gsearch3.best_score_

In [None]:
# re-tune subsample and colsample_bytree using smaller grid chosen from gsearch3
param_test4 = {
 'subsample':[i/100.0 for i in range(85,100,5)],
 'colsample_bytree':[i/100.0 for i in range(65,80,5)]
}

In [None]:
# still 0.7 and 0.9
gsearch4 = GridSearchCV(estimator = XGBRegressor(learning_rate =0.1, n_estimators=120, max_depth=7,
                                                 min_child_weight=8, gamma=0.0, subsample=0.8, colsample_bytree=0.8, seed=201, 
                                                 n_jobs=num_cpus), 
                        param_grid = param_test4, scoring = 'neg_mean_squared_error', cv=5, n_jobs=num_cpus)
gsearch4.fit(X_tree_train,y_tree_train, early_stopping_rounds = 50, verbose = False,
            eval_set = [(X_tree_train, y_tree_train), (X_tree_validation, y_tree_validation)])
gsearch4.best_params_, gsearch4.best_score_

In [None]:
# tune alpha regularization (L1)
param_test5 = {
 'reg_alpha':[0, 0.001, 0.005, 0.01, 0.025, 0.05]
}

In [None]:
# grid search chose alpha = 0.05 indicating need for larger parameters
gsearch5 = GridSearchCV(estimator = XGBRegressor(learning_rate =0.1, n_estimators=120, max_depth=7,
                                                 min_child_weight=8, gamma=0.0, subsample=0.9, colsample_bytree=0.7, seed=201, 
                                                 n_jobs=num_cpus), 
                        param_grid = param_test5, scoring = 'neg_mean_squared_error', cv=5, n_jobs=num_cpus)
gsearch5.fit(X_tree_train,y_tree_train, early_stopping_rounds = 50, verbose = False,
            eval_set = [(X_tree_train, y_tree_train), (X_tree_validation, y_tree_validation)])
gsearch5.best_params_, gsearch5.best_score_

In [None]:
param_test6 = {
 'reg_alpha':[0.75, 0.85, 1.0, 1.1, 1.2, 1.3]
}

In [None]:
# alpha = 0.75
gsearch6 = GridSearchCV(estimator = XGBRegressor(learning_rate =0.1, n_estimators=120, max_depth=7,
                                                 min_child_weight=8, gamma=0.0, subsample=0.9, colsample_bytree=0.7, seed=201, 
                                                 reg_alpha = 0.5, n_jobs=num_cpus), 
                        param_grid = param_test6, scoring = 'neg_mean_squared_error', cv=5, n_jobs=num_cpus)
gsearch6.fit(X_tree_train,y_tree_train, early_stopping_rounds = 50, verbose = False,
            eval_set = [(X_tree_train, y_tree_train), (X_tree_validation, y_tree_validation)])
gsearch6.best_params_, gsearch6.best_score_

In [None]:
# xgboost model 3 with updated parameters
xgb_model3 = XGBRegressor(max_depth=7,
                         n_estimators = 1000,
                         learning_rate=.1,
                         min_child_weight = 8,
                         colsample_bytree = 0.7,
                         subsample = 0.9,
                         gamma = 0.0,
                         reg_alpha = 0.75,
                         random_state=201,
                         n_jobs=num_cpus)

In [None]:
xgb_model3.fit(X_tree_train, y_tree_train, early_stopping_rounds = 50, verbose = 50,
              eval_set = [(X_tree_train, y_tree_train), (X_tree_validation, y_tree_validation)])

In [None]:
# tune learning rate
param_test7 = {
 'learning_rate':[0.0001, 0.001, 0.01, 0.1, 0.2, 0.3]
}

In [None]:
# learning_rate = 0.1
gsearch7 = GridSearchCV(estimator = XGBRegressor(learning_rate =0.1, n_estimators=120, max_depth=7,
                                                 min_child_weight=8, gamma=0.0, subsample=0.9, colsample_bytree=0.7, seed=201, 
                                                 reg_alpha = 0.75, n_jobs=num_cpus), 
                        param_grid = param_test7, scoring = 'neg_mean_squared_error', cv=5, n_jobs=num_cpus)
gsearch7.fit(X_tree_train,y_tree_train, early_stopping_rounds = 50, verbose = False,
            eval_set = [(X_tree_train, y_tree_train), (X_tree_validation, y_tree_validation)])
gsearch7.best_params_, gsearch7.best_score_

In [42]:
# re-fit xgboost with updated parameters
# early_stopping stopped at n_estimators = 632. Will use n_estimators = 300 for computational purposes
xgb_model4 = XGBRegressor(max_depth=7,
                         n_estimators = 5000,
                         learning_rate=.1,
                         min_child_weight = 8,
                         colsample_bytree = 0.7,
                         subsample = 0.9,
                         gamma = 0.0,
                         reg_alpha = 0.75,
                         random_state=201,
                         n_jobs=num_cpus)

In [43]:
xgb_model4.fit(X_tree_train, y_tree_train, early_stopping_rounds = 50, verbose = 50,
              eval_set = [(X_tree_train, y_tree_train), (X_tree_validation, y_tree_validation)])

[0]	validation_0-rmse:7405.80762	validation_1-rmse:7370.45508
[50]	validation_0-rmse:7050.16699	validation_1-rmse:7027.95508
[100]	validation_0-rmse:6992.12695	validation_1-rmse:6988.80566
[150]	validation_0-rmse:6956.40918	validation_1-rmse:6973.25879
[200]	validation_0-rmse:6929.62647	validation_1-rmse:6966.47607
[250]	validation_0-rmse:6903.05908	validation_1-rmse:6962.67627
[300]	validation_0-rmse:6882.24707	validation_1-rmse:6962.21240
[350]	validation_0-rmse:6862.15332	validation_1-rmse:6961.43311
[400]	validation_0-rmse:6842.32178	validation_1-rmse:6961.32617
[450]	validation_0-rmse:6822.42480	validation_1-rmse:6960.71094
[500]	validation_0-rmse:6804.83350	validation_1-rmse:6961.68066
[510]	validation_0-rmse:6801.32910	validation_1-rmse:6961.58203


XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.7, gamma=0.0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.1, max_delta_step=0, max_depth=7,
             min_child_weight=8, missing=nan, monotone_constraints='()',
             n_estimators=5000, n_jobs=15, num_parallel_tree=1,
             random_state=201, reg_alpha=0.75, reg_lambda=1, scale_pos_weight=1,
             subsample=0.9, tree_method='approx', validate_parameters=1,
             verbosity=None)

In [44]:
# fit final xgboost train model
xgb_train_final = XGBRegressor(max_depth=7,
                         n_estimators = 300,
                         learning_rate=.1,
                         min_child_weight = 8,
                         colsample_bytree = 0.7,
                         subsample = 0.9,
                         gamma = 0.0,
                         reg_alpha = 0.75,
                         random_state=201,
                         n_jobs=num_cpus)
xgb_train_final.fit(X_tree_train, y_tree_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.7, gamma=0.0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.1, max_delta_step=0, max_depth=7,
             min_child_weight=8, missing=nan, monotone_constraints='()',
             n_estimators=300, n_jobs=15, num_parallel_tree=1, random_state=201,
             reg_alpha=0.75, reg_lambda=1, scale_pos_weight=1, subsample=0.9,
             tree_method='approx', validate_parameters=1, verbosity=None)

In [45]:
pd.DataFrame({'Importance': xgb_train_final.feature_importances_}, index=X_tree_train.columns).sort_values(['Importance'], ascending=False)

Unnamed: 0,Importance
years_since_crash,0.072932
REGION(T.W),0.071585
YEAR,0.069796
LOAN_PURPOSE(T.P),0.066067
ORIGINAL_LTV,0.064148
PROPERTY_STATE,0.062658
REGION(T.S),0.040347
ORIGINAL_LOAN_TERM,0.039146
ORIGINAL_LTV:years_since_crash,0.035354
BORROWER_CREDIT_SCORE_AT_ORIGINATION,0.033316


In [46]:
xgb_pred = xgb_train_final.predict(X_tree_validation)

In [47]:
accuracy(df_validation['NET_LOSS'], xgb_pred)

RMSE,Naive - RMSE
6962.905084,436.458382


## Re-Fit XGBoost on Full Train Set

In [40]:
imputer_mean_final = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer_zero_final = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=0)
categorical_imputer_final = CategoricalImputer(other_threshold=.01)
special_imputer_final = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=700)
imputer_missing = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value='missing')

In [41]:
imputer_mean_final.fit(df_train[continuous_mean])
df_train[continuous_mean] = imputer_mean_final.transform(df_train[continuous_mean])
df_test[continuous_mean] = imputer_mean_final.transform(df_test[continuous_mean])

In [42]:
imputer_zero_final.fit(df_train[continuous_zero])
df_train[continuous_zero] = imputer_zero_final.transform(df_train[continuous_zero])
df_test[continuous_zero] = imputer_zero_final.transform(df_test[continuous_zero])

In [43]:
categorical_imputer_final.fit(df_train[categorical_variables])
df_train[categorical_variables] = categorical_imputer_final.transform(df_train[categorical_variables])
df_test[categorical_variables] = categorical_imputer_final.transform(df_test[categorical_variables])

In [44]:
special_imputer_final.fit(df_train[special])
df_train[special] = special_imputer_final.transform(df_train[special])
df_test[special] = special_imputer_final.transform(df_test[special])

In [45]:
imputer_missing.fit(df_smaller_train[categorical_missing])
df_train[categorical_missing] = imputer_missing.transform(df_train[categorical_missing])
df_test[categorical_missing] = imputer_missing.transform(df_test[categorical_missing])

In [46]:
X_tree_train_final = df_train[continuous_features_trees + cat_ordinal_features_trees]
y_tree_train_final = df_train['NET_LOSS']

In [47]:
X_tree_train_patsy_final = dmatrix(formula_tree, df_train, return_type="dataframe")
X_tree_train_final = pd.concat([X_tree_train_final, X_tree_train_patsy_final], axis=1)

In [48]:
ordinal_encoder_final = OrdinalEncoder()
ordinal_encoder_final.fit(X_tree_train_final[cat_ordinal_features_trees])
X_tree_train_final[cat_ordinal_features_trees] = ordinal_encoder_final.transform(X_tree_train_final[cat_ordinal_features_trees])

In [49]:
X_tree_train_final.columns = X_tree_train_final.columns.str.replace('[', '(').str.replace(']', ')')

  X_tree_train_final.columns = X_tree_train_final.columns.str.replace('[', '(').str.replace(']', ')')


In [50]:
X_tree_test = df_test[continuous_features_trees + cat_ordinal_features_trees]
y_tree_test = df_test['NET_LOSS']

X_tree_test_patsy = build_design_matrices([X_tree_train_patsy_final.design_info], df_test, return_type="dataframe")[0]

X_tree_test = pd.concat([X_tree_test, X_tree_test_patsy], axis=1)

X_tree_test[cat_ordinal_features_trees] = ordinal_encoder_final.transform(X_tree_test[cat_ordinal_features_trees])

X_tree_test.columns = X_tree_test.columns.str.replace('[', '(').str.replace(']', ')')

  X_tree_test.columns = X_tree_test.columns.str.replace('[', '(').str.replace(']', ')')


In [51]:
# two models - can also sub n_estimators from 300 to 632
xgb_model_final = XGBRegressor(max_depth = 7,
                               n_estimators = 300,
                               learning_rate = 0.1,
                               min_child_weight = 8,
                               colsample_bytree = 0.7,
                               subsample = 0.9,
                               reg_alpha = 0.75,
                               gamma = 0.0,
                               random_state = 201,
                               n_jobs = num_cpus)
xgb_model_final.fit(X_tree_train_final, y_tree_train_final)
xgb_pred_final = xgb_model_final.predict(X_tree_test)

 # different model
 # xgb_model_final = XGBRegressor(max_depth = 13,
                             #  n_estimators = 117,
                             #  learning_rate = 0.0331,
                             #  min_child_weight = 14,
                             #  colsample_bytree = 0.578,
                             #  subsample = 0.396,
                             #  gamma = 0.0449,
                             #  random_state = 201,
                             #  n_jobs = num_cpus)


In [52]:
accuracy(df_test['NET_LOSS'], xgb_pred_final)

RMSE,Naive - RMSE
6961.700841,437.754505


In [53]:
pd.DataFrame({'Importance': xgb_model_final.feature_importances_}, index=X_tree_train_final.columns).sort_values(['Importance'], ascending=False)

Unnamed: 0,Importance
years_since_crash,0.075217
PROPERTY_STATE,0.072003
ORIGINAL_LTV,0.064469
LOAN_PURPOSE(T.P),0.06416
REGION(T.W),0.060023
YEAR,0.05681
ORIGINAL_LTV:years_since_crash,0.049925
ORIGINAL_INTEREST_RATE,0.038653
ORIGINAL_LOAN_TERM,0.037234
REGION(T.S),0.032592


In [55]:
if not full_data_set:
    df_test_fresh = pd.read_csv(FILES_LOCATION + "FannieMaeSmallTest.csv",
                          index_col="LOAN_IDENTIFIER",
                          dtype=col_classes,
                          parse_dates=date_columns,
                          sep='|')
elif full_data_set:
    df_test_fresh = pd.read_csv(FILES_LOCATION + "FannieMaeTest.csv",
                          index_col="LOAN_IDENTIFIER",
                          dtype=col_classes,
                          parse_dates=date_columns,
                          sep='|')

In [56]:
df_test_fresh['PREDICTIONS_NET_LOSS'] = xgb_pred_final

In [57]:
df_test_fresh.to_csv('FannieMaeTestWithPredictionsNetLoss.csv', sep='|')

In [58]:
xgb_pred_final = pd.Series(xgb_pred_final)
xgb_pred_final.to_csv('xgb_pred_final.csv')