In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.linear_model import LinearRegression, BayesianRidge
from sklearn.tree import DecisionTreeRegressor, export_graphviz
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import Binarizer, OneHotEncoder, OrdinalEncoder, add_dummy_feature
from sklearn.metrics import mean_squared_error
from shared_functions import rmsle, train_and_test, try_different_models
import math
import graphviz


# Notebook display settings: allow up to 100 characters per text cell and show
# floats with four decimal places.
# Options are addressed by their full "display.*" names: the bare 'precision'
# pattern also matches 'styler.format.precision' in newer pandas releases and
# raises an OptionError there, while the full names work on every version.
pd.set_option('display.max_colwidth', 100)
pd.set_option('display.precision', 4)
pd.options.display.float_format = '{:.4f}'.format

In [None]:
# Load the cleaned dataset, using the first CSV column as the row index.
data_df = pd.read_csv(r"clean_data.csv", index_col=0)

In [None]:
# Show the dtype of every column in the loaded frame.
data_df.dtypes

In [None]:
### todo: add log sale price?

## Modeling

For now, we do a simple split of 10% of our data into a dev set. Eventually, we plan to incorporate cross-validation in order to test our models on different subsets, as we have found that the quality of our models (tested on the dev set) varies drastically across different random splits.

In [None]:
# Shuffle every row, then keep the first 90% for training and hold out the
# remaining 10% as the dev set. No seed is set, so each run yields a new split.
data_df = data_df.sample(frac=1)
split_idx = int(len(data_df) * .9)
train_df, dev_df = data_df[:split_idx], data_df[split_idx:]
del data_df  # only the two splits are used from here on
print(train_df.shape, dev_df.shape, sep="\n")

As our primary error metric, we focus on the root mean squared error of the logarithm of the prices (RMSLE), which is the error metric used to build the leaderboard for this Kaggle competition. See rmsle() in shared_functions.py for our implementation of the root mean squared logarithmic error, an implementation we found elsewhere (TODO: cite the source).

As a baseline, we created a simple "model" which just predicts the median price of the training set. We also tried using the mean, but we found that across different train/dev splits, the median consistently gave us slightly lower root mean squared logarithmic error. 

In [None]:
def baseline_pred(features):
    """Return the median training-set sale price, ignoring ``features``.

    Args:
        features: the feature row to predict for; unused, because the baseline
            predicts the same constant for every house.

    Returns:
        The median of ``train_df['SalePrice']``.
    """
    return np.median(train_df['SalePrice'])


# The baseline ignores its input, so compute the median once and repeat it for
# every dev row instead of recomputing it for each row of an iterrows() loop.
baseline_price = baseline_pred(None)
preds = [baseline_price] * len(dev_df)

# baseline RMSLE
print("Baseline RMSLE: {:.3f}".format(rmsle(list(dev_df['SalePrice']), preds)))

With this as a baseline, we began exploring how different types of models perform on the problem.

todo: should we add a parsimony metric?

#### Linear Regression

In [None]:
### TODO: why do I sometimes get a math domain error? Which feature causes it?

We begin with linear regression as the standard choice for a regression problem. In ordinary least squares regression, the regression line is fit by minimizing the sum of squared residuals between the predicted line and the true data points. We can interpret the resulting coefficients on each feature as representing the additional impact of a one-unit change in that feature on the final price.

In [None]:
# Predict the raw sale price from every column except the outcome itself and
# 'YrMoSold', comparing plain OLS with OLS using the estimator's own
# `normalize` option.
# NOTE(review): `normalize=` was deprecated in scikit-learn 1.0 and removed in
# 1.2; on newer versions this constructor call raises a TypeError and needs an
# explicit scaling step instead — confirm the installed version.
outcome_vars = ['SalePrice']
feature_sets = [
    [col for col in train_df.columns if col not in ('YrMoSold', 'SalePrice')]
]
models = [LinearRegression(), LinearRegression(normalize=True)]
lrdf = try_different_models(train_df, dev_df, models, outcome_vars, feature_sets)
# Display the results with the lowest Root MSE first (lrdf itself is not reordered).
lrdf.sort_values('Root MSE', ascending=True)

^ To explain: in one run, normalize made this model much worse. My guess is that this is because we have so many dummy columns, whose overall means skew toward 0. In the next run, however, the normalized model performed the same as the unnormalized one, so this still needs to be investigated.

#### Tree-Based Regressors

The family of tree-based regressors learns a series of simple decision rules to predict the final sale price. The decision tree regressor fits a single decision tree, whereas the random forest regressor trains an ensemble of decision trees.

In [None]:
# Compare a single decision tree with a random forest, both constructed with
# default hyperparameters, on the same outcome variable and feature set as the
# linear models.
models = [DecisionTreeRegressor(), RandomForestRegressor()]
df = try_different_models(train_df, dev_df, models, outcome_vars, feature_sets)
# Display only the summary columns, lowest Root MSE first.
df[['Model', 'Num Features', 'Outcome Var', 'Root MSE']].sort_values('Root MSE', ascending=True)

In the table, the random forest regressor is displayed as a DecisionTreeRegressor with parentheses around the whole expression.

In [None]:
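# TODO: export_graphviz reports that the model stored in the results table is
# not fitted yet, which is unexpected and still needs to be investigated.
# NOTE(review): export_graphviz expects a single decision tree; if row 1 of the
# table holds the random forest, that would also need changing — to confirm.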
'''dot_data = export_graphviz(df.iloc[1]['Model'], out_file=None) 
graph = graphviz.Source(dot_data) 
graph.render("housing") '''

#### Bayesian Ridge Regression

In [None]:
# Evaluate a Bayesian ridge regressor with default hyperparameters on the same
# outcome variable and feature set as the previous models.
models = [BayesianRidge()]
df = try_different_models(train_df, dev_df, models, outcome_vars, feature_sets)
# Display only the summary columns, lowest Root MSE first.
df[['Model', 'Num Features', 'Outcome Var', 'Root MSE']].sort_values('Root MSE', ascending=True)

#### More sections with particular model types, explanations, visualizations

### Error Analysis

In [None]:
# Take the model stored in the first row of the linear-regression results.
# NOTE(review): the earlier sort_values() result was only displayed, never
# assigned back, so lrdf is still in the order returned by
# try_different_models — row 0 is not necessarily the lowest-error model.
lr = lrdf.iloc[0]['Model']

In [None]:
def create_error_correlation_table(model,
                                   feature_set,
                                   dev_df):
    """Correlate the model's per-row error with each numeric dev-set column.

    Predicts on ``dev_df[feature_set]``, scores every row individually with
    ``rmsle``, shows a histogram of those per-row errors, and then computes
    the Pearson correlation between the absolute error and every numeric
    column of the dev set. The former index is included, because
    ``reset_index`` turns it into an ordinary column.

    Args:
        model: object whose ``predict`` method accepts ``dev_df[feature_set]``.
        feature_set: column names passed to the model.
        dev_df: frame holding the ``feature_set`` columns and 'SalePrice'.
            The caller's frame is not modified.

    Returns:
        DataFrame with columns 'col' and 'correlation', one row per numeric
        column whose correlation is defined. Non-numeric columns are skipped,
        undefined (NaN) correlations are dropped, and the error column itself
        is left out because its correlation with itself is trivially 1.
    """
    dev_preds = model.predict(dev_df[feature_set])
    # reset_index returns a new frame, so the column added below stays local.
    dev_df = dev_df.reset_index()

    # Scoring a single (actual, predicted) pair yields that row's own error.
    row_errors = [rmsle([actual], [predicted])
                  for actual, predicted in zip(dev_df['SalePrice'], dev_preds)]

    plt.hist(row_errors, bins=20)
    plt.xlabel("RMSLE")
    plt.ylabel("Number of Occurrences")
    plt.show()

    dev_df['linear_reg_errors'] = row_errors
    abs_errors = dev_df['linear_reg_errors'].abs()

    cols = []
    corrs = []
    for col in dev_df.columns:
        if col == 'linear_reg_errors':
            continue
        # Skip non-numeric columns explicitly instead of swallowing every
        # exception that np.corrcoef might raise on them.
        if not pd.api.types.is_numeric_dtype(dev_df[col]):
            continue
        cols.append(col)
        corrs.append(np.corrcoef(abs_errors, dev_df[col])[0, 1])

    corrs_df = pd.DataFrame(data={'col': cols, 'correlation': corrs})
    # Constant columns (or columns containing NaN) give an undefined correlation.
    corrs_df = corrs_df.dropna(subset=['correlation'])
    return corrs_df
    
# Build the error-correlation table for the selected linear model on the dev
# set, using the first (and only) feature set.
corrs_df = create_error_correlation_table(lr, feature_sets[0], dev_df)
# Display the rows ordered by absolute correlation, strongest first.
corrs_df.reindex(corrs_df.correlation.abs().sort_values(ascending=False).index)