In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.linear_model import LinearRegression, BayesianRidge
from sklearn.tree import DecisionTreeRegressor, export_graphviz
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import Binarizer, OneHotEncoder, OrdinalEncoder, add_dummy_feature
from sklearn.metrics import mean_squared_error
from shared_functions import rmsle, train_and_test, try_different_models
import math
import graphviz


# Notebook-wide pandas display settings.
# Option names are spelled out in full: set_option treats its argument as a
# pattern, and an abbreviated key such as 'precision' raises OptionError as
# soon as it matches more than one registered option.
pd.set_option('display.max_colwidth', 100)
pd.set_option('display.precision', 4)
# Render every float with exactly four decimal places.
pd.options.display.float_format = '{:.4f}'.format

In [None]:
# Load clean_data.csv; the first column of the file is used as the DataFrame index.
data_df = pd.read_csv(r"clean_data.csv", index_col=0)

In [None]:
# Add the natural log of SalePrice as a new column; the models below use it as the outcome variable.
data_df['LogSalePrice'] = np.log(data_df['SalePrice'])

In [None]:
# Number of random train/dev splits generated for cross-validation.
NUM_CROSS_VALS = 20

## Modeling

For now, we do a simple split that holds out 15% of our data as a dev set. We also incorporate cross-validation because, with this amount of data, the models and their dev-set scores vary significantly depending on which rows end up in the train and dev sets. Repeated random sub-sampling cross-validation helps us get more consistent results.

In [None]:
# Build NUM_CROSS_VALS random train/dev splits (first 85% of the shuffled rows
# for training, the remaining 15% for dev). Each entry of cross_val_list is a
# dict with keys 'train_df' and 'dev_df'.
cross_val_list = []
# The split position depends only on the row count, so compute it once.
split_idx = int(data_df.shape[0] * .85)
for i in range(NUM_CROSS_VALS):
    # sample(frac=1) returns every row in a new random order: this is the shuffle
    data_df = data_df.sample(frac=1)
    # .copy() gives each split its own frame, so columns added to a split later
    # are not written into a slice of data_df (avoids SettingWithCopyWarning)
    train_df = data_df[:split_idx].copy()
    dev_df = data_df[split_idx:].copy()
    split_dict = {'train_df': train_df,
                  'dev_df': dev_df}
    cross_val_list.append(split_dict)

As our primary error metric, we focus on the root mean squared error of the logarithm of the prices (RMSLE), which is the metric used to rank the leaderboard for this Kaggle competition. See rmsle() in shared_functions.py for the implementation we use, which is based on one we found elsewhere (TODO: cite the source).

When we consulted our resident real estate expert, Hilary's dad, about this problem, he told us that only one of these factors matters - "location, location, location." In the spirit of that insight, we created a baseline "model" which looks at what neighborhood the house is in and takes the mean price of houses from that neighborhood in the training set. 

In [None]:
def baseline_pred(row,
                  train_df):
    '''
    Predict LogSalePrice for one house as the mean LogSalePrice of the
    training houses in the same neighborhood.

    The neighborhood is the first column of train_df whose name contains
    'Neighborhood' and whose value in row equals 1.

    Parameters
    ----------
    row : one record (e.g. a row passed by DataFrame.apply(axis=1)) that can
        be indexed by the column names of train_df
    train_df : DataFrame with the neighborhood indicator columns and a
        'LogSalePrice' column

    Returns
    -------
    Mean 'LogSalePrice' of the matching training rows. Falls back to the mean
    over all of train_df when the row has no neighborhood indicator set or
    when no training row belongs to that neighborhood.
    '''
    neighborhood_var = next(
        (col for col in train_df
         if 'Neighborhood' in col and row[col] == 1),
        None,
    )
    if neighborhood_var is not None:
        same_neighborhood = train_df.loc[train_df[neighborhood_var] == 1,
                                         'LogSalePrice']
        # an empty selection would average to NaN, so only use it when populated
        if len(same_neighborhood) > 0:
            return np.mean(same_neighborhood)
    # no usable neighborhood information: use the overall training mean
    return np.mean(train_df['LogSalePrice'])

def get_baseline_cross_val(cross_val_list):
    '''
    Score the neighborhood baseline on every split and return the mean score.

    cross_val_list is a list of dicts with keys 'train_df' and 'dev_df'.
    For each split, baseline_pred() is applied to every dev row and the
    result is stored in a 'baseline_pred' column on that split's dev frame
    (the column stays on the frame after this function returns). Both the
    true and predicted log prices are passed through np.exp before being
    handed to rmsle().
    '''
    fold_scores = []
    for split in cross_val_list:
        train_fold = split['train_df']
        dev_fold = split['dev_df']
        dev_fold['baseline_pred'] = dev_fold.apply(baseline_pred,
                                                   axis=1,
                                                   args=(train_fold,))
        actual_prices = list(np.exp(dev_fold['LogSalePrice']))
        predicted_prices = list(np.exp(dev_fold['baseline_pred']))
        fold_scores.append(rmsle(actual_prices, predicted_prices))
    return np.mean(fold_scores)

# Mean baseline RMSLE across all cross-validation splits
baseline_rmsle = get_baseline_cross_val(cross_val_list)
print("Baseline RMSLE: {:.3f}".format(baseline_rmsle))

With this as a baseline, we began exploring how different types of models perform on the problem.

todo: should we add a parsimony metric?

#### Linear Regression

We begin with linear regression as the standard choice for a regression problem. In ordinary least squares regression, the regression line is fit by minimizing the sum of squared residuals between the predicted line and the true data points. We can interpret the resulting coefficients on each feature as representing the additional impact of a one-unit change in that feature on the final price.

In [None]:
models = [LinearRegression()]
outcome_vars = ['LogSalePrice']

# Columns excluded from every feature set.
non_feature_cols = ['YrMoSold', 'LogSalePrice', 'SalePrice']
all_feature_cols = [col for col in data_df.columns if col not in non_feature_cols]
no_condition2_cols = [col for col in all_feature_cols if 'Condition2' not in col]

# Three candidate feature sets, in this order:
#   1. everything except the Condition2 columns
#   2. the same, additionally without OverallQual_10
#   3. every feature column
feature_sets = [
    no_condition2_cols,
    [col for col in no_condition2_cols if col != 'OverallQual_10'],
    all_feature_cols,
]

lrdf = try_different_models(cross_val_list, models, outcome_vars, feature_sets)
# Displayed only: the sorted frame is not assigned back to lrdf.
lrdf.sort_values(by='Root MSE', ascending=True)

^ to explain: normalize makes this way worse - I'm guessing that's because of how many dummy cols we have where the mean overall is going to skew to 0? but then the next time, normalize stayed the same....ugh

#### Tree-Based Regressors

The family of tree-based regressors learns a series of simple decision rules to predict the final sale price. The decision tree regressor makes one single Decision Tree, whereas the Random Forest regressor trains an ensemble of decision trees.

In [None]:
# Run a single decision tree and a random forest on the same splits and feature sets.
models = [DecisionTreeRegressor(), RandomForestRegressor()]
df = try_different_models(cross_val_list, models, outcome_vars, feature_sets)
summary_cols = ['Model', 'Num Features', 'Outcome Var', 'Root MSE']
df[summary_cols].sort_values(by='Root MSE', ascending=True)

The random forest regressor shows up in the table as the DecisionTreeRegressor with the parentheses around the whole function. 

In [None]:
# Tree export is disabled: the code below is kept as a string literal and is not executed.
# When run, export_graphviz reports that the model taken from the results table is not fitted yet.
# NOTE(review): presumably try_different_models stores an unfitted estimator in the
# 'Model' column -- confirm against shared_functions.py before re-enabling.
'''dot_data = export_graphviz(df.iloc[1]['Model'], out_file=None) 
graph = graphviz.Source(dot_data) 
graph.render("housing") '''

#### Bayesian Ridge Regression

In [None]:
# Run Bayesian ridge regression on the same splits and feature sets.
models = [BayesianRidge()]
df = try_different_models(cross_val_list, models, outcome_vars, feature_sets)
summary_cols = ['Model', 'Num Features', 'Outcome Var', 'Root MSE']
df[summary_cols].sort_values(by='Root MSE', ascending=True)

#### More sections with particular model types, explanations, visualizations

### Error Analysis

In this section, we'll go into more detail about how we actually iterated on models and chose whichever ones we end up deciding are our best. Our primary tools will be this error correlation table, which we'll use to look at patterns of errors the model is making, and diagnostics to determine whether or not the model is overfitting. We'll compare different models to each other and explain the model or ensemble that we chose as our "best."

In [None]:
# Take the model stored in the first row of the linear-regression results table.
# NOTE(review): lrdf was only sorted for display (the sorted frame was never assigned back),
# so row 0 is whatever try_different_models returned first, not necessarily the lowest
# Root MSE -- confirm this is the intended model.
lr = lrdf.iloc[0]['Model']

In [None]:
def create_error_correlation_table(model,
                                   outcome_var,
                                   feature_set,
                                   dev_df):

    '''
    Correlate the per-row prediction error with every column of dev_df.

    Predicts outcome_var for each row of dev_df from feature_set, scores each
    row on its own with rmsle(), shows a histogram of those scores, and then
    computes the correlation between the absolute score and each column.

    Parameters
    ----------
    model : estimator exposing predict(); must already be fitted on feature_set
    outcome_var : name of the column of dev_df holding the true values
    feature_set : list of column names passed to model.predict
    dev_df : DataFrame of held-out rows; the caller's frame is not modified

    Returns
    -------
    DataFrame with columns 'col' and 'correlation'. Columns for which no
    correlation can be computed are skipped, and NaN correlations are dropped.
    '''

    dev_preds = model.predict(dev_df[feature_set])
    # reset_index returns a new frame (the caller's frame is untouched); the old
    # index becomes a regular column and therefore takes part in the table below
    dev_df = dev_df.reset_index()

    # One score per row, computed by calling rmsle on single-element lists.
    # NOTE(review): get_baseline_cross_val applies np.exp before calling rmsle,
    # while here the values are passed as-is -- confirm which scale rmsle expects.
    rmsles = [rmsle([actual], [pred])
              for actual, pred in zip(dev_df[outcome_var], dev_preds)]

    plt.hist(rmsles, bins=20)
    plt.xlabel("RMSLE")
    plt.ylabel("Number of Occurrences")
    plt.show()

    dev_df['linear_reg_errors'] = rmsles
    # identical for every column, so compute it once outside the loop
    abs_errors = abs(dev_df['linear_reg_errors'])

    cols = []
    corrs = []
    for col in dev_df.columns:
        try:
            cor = np.corrcoef(abs_errors, dev_df[col])[0, 1]
        except (TypeError, ValueError):
            # column cannot be treated as numeric (e.g. strings): skip it,
            # but let unrelated errors and interrupts propagate
            continue
        cols.append(col)
        corrs.append(cor)

    corrs_df = pd.DataFrame(data={'col': cols, 'correlation': corrs})
    corrs_df = corrs_df.dropna(subset=['correlation'])
    return corrs_df
    
# Error correlations for the selected linear model, using the last feature set
# and the dev frame left over from the final cross-validation split.
corrs_df = create_error_correlation_table(lr, 'LogSalePrice', feature_sets[-1], dev_df)
# Show the strongest correlations (by absolute value) first.
strongest_first = corrs_df['correlation'].abs().sort_values(ascending=False).index
corrs_df.reindex(strongest_first)

In [None]:
### Third section: interpretability, deeper dive into certain use cases 