In [None]:
import contextlib
import io
import math

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.linear_model import LinearRegression, BayesianRidge
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import Binarizer, OneHotEncoder, OrdinalEncoder, add_dummy_feature
from sklearn.tree import DecisionTreeRegressor

# Display option: allow up to 100 characters per cell before pandas truncates the text.
pd.set_option('max_colwidth', 100)

In [None]:
# Switch for the exploratory cell below: its printing/plotting body only runs when this is True.
REDO_EDA = False

In [None]:
# Load the training table, using the first CSV column as the row index.
data_df = pd.read_csv(r"../all/train.csv", index_col=0)
# Keep the data description as a list of raw text lines (trailing newlines included);
# the exploratory cell searches these lines for "<column>:" entries.
with open(r"../all/data_description.txt", 'r') as file:
    data_desc = file.readlines()

In [None]:
# Preview the first rows of the freshly loaded table.
data_df.head()

In [None]:
# Remember the column index of the table as loaded (before any features are added), then show it.
og_columns = data_df.columns
og_columns

### Print out analysis graphs 

In [None]:
%%capture cap --no-stderr
if REDO_EDA:
    # TODO: need to remine the descriptions if i want that part to work
    for column in data_df:
        if column in cols_w_descripts:
            entry = [item for item in data_desc if column + ":" in item]
            print(column + ": " + entry[0].split(column + ":")[1].split('\n')[0])
        print(data_df[column].value_counts())
        try:
            plt.hist(data_df[column])
            plt.title(column)
            plt.savefig("../analysis/" + column + ".jpg")
            plt.plot(data_df[column], data_df['SalePrice'], 'bo')
            plt.title(column + " vs SalePrice")
            plt.savefig("../analysis/" + column + "_vs_saleprice.jpg")
            plt.show()
        except:
            pass

    with open('../analysis/output.txt', 'w') as f:
        f.write(cap.stdout)

### Split into train, dev

In [None]:
# Shuffle the rows once, then cut the shuffled table 90% / 10% into train and dev.
SPLIT_SEED = 42        # fixed seed so the shuffle, and therefore the split, is reproducible
TRAIN_FRACTION = .9    # share of rows that goes to the training frame

# The shuffled rows get their own name and `data_df` is left untouched, so this cell can
# be re-run without reloading the CSV.
shuffled_df = data_df.sample(frac=1, random_state=SPLIT_SEED)
split_idx = int(shuffled_df.shape[0] * TRAIN_FRACTION)

# .copy() makes both frames independent objects; later cells add columns to them, which
# would otherwise be an assignment into a slice of the shuffled frame.
train_df = shuffled_df[:split_idx].copy()
dev_df = shuffled_df[split_idx:].copy()

print(train_df.shape)
print(dev_df.shape)
train_df.head()

### Create baseline MSE

In [None]:
def baseline_pred(features):
    """Return the baseline prediction: the median ``SalePrice`` of ``train_df``.

    ``features`` is not read; every input gets the same constant prediction.
    Reads the notebook-level ``train_df`` frame.
    """
    training_prices = train_df['SalePrice']
    return np.median(training_prices)

# The baseline ignores its argument, so the median is computed once and repeated for
# every dev row rather than being recomputed per row inside an iterrows() loop.
baseline_value = baseline_pred(None)
preds = [baseline_value] * len(dev_df)

# baseline MSE (arguments in scikit-learn's documented order: y_true first, then y_pred)
baseline_MSE = mean_squared_error(dev_df['SalePrice'], preds)

### Add new features to dataframe

In [None]:
def full_square_footage(row):
    """Return the sum of the ``'1stFlrSF'`` and ``'2ndFlrSF'`` entries of ``row``.

    ``row`` is anything indexable by those two keys (e.g. one DataFrame row).
    """
    first_floor, second_floor = row['1stFlrSF'], row['2ndFlrSF']
    return first_floor + second_floor

def central_air(row):
    """Return whether the ``'CentralAir'`` entry of ``row`` equals ``'Y'``."""
    has_central_air = row['CentralAir'] == 'Y'
    return has_central_air

def get_dummy_columns(train_df, 
                      dev_df,
                      col_to_dummy):
    """Append one-hot indicator columns for ``col_to_dummy`` to both frames.

    Each frame is encoded separately with ``pd.get_dummies`` and the indicators are
    concatenated to the right of the existing columns. Any indicator that appeared
    for the training frame but not for the dev frame is added to the dev frame as a
    column of zeros.

    Returns ``(train_df, dev_df, added_cols)`` where the frames are new objects and
    ``added_cols`` lists the column names that are new in the training frame.
    """
    original_cols = train_df.columns

    train_indicators = pd.get_dummies(train_df[col_to_dummy])
    dev_indicators = pd.get_dummies(dev_df[col_to_dummy])
    train_df = pd.concat([train_df, train_indicators], axis=1)
    dev_df = pd.concat([dev_df, dev_indicators], axis=1)

    added_cols = [name for name in train_df.columns if name not in original_cols]

    # Categories seen only in train still need a column in dev, otherwise selecting
    # the training feature list from dev would fail.
    # todo: how to make this more robust? what about test data?
    absent_in_dev = [name for name in added_cols if name not in dev_df]
    for name in absent_in_dev:
        dev_df[name] = 0

    return train_df, dev_df, added_cols

    

# Derived columns are added with assign(), which returns a new frame. This avoids
# writing a column into a frame that may itself be a slice of another frame.
# The helpers are passed to apply() directly; wrapping them in a lambda added nothing.
train_df = train_df.assign(
    full_SF=train_df.apply(full_square_footage, axis=1),
    CentralAirBool=train_df.apply(central_air, axis=1),
)
dev_df = dev_df.assign(
    full_SF=dev_df.apply(full_square_footage, axis=1),
    CentralAirBool=dev_df.apply(central_air, axis=1),
)

# One-hot encode the categorical predictors. Each call returns the widened frames plus
# the list of indicator column names it added (used later to build feature lists).
train_df, dev_df, neighborhood_dummy_cols = get_dummy_columns(train_df, dev_df, 'Neighborhood')
train_df, dev_df, bldg_type_dummy_cols = get_dummy_columns(train_df, dev_df, 'BldgType')
train_df, dev_df, overall_qual_dummy_cols = get_dummy_columns(train_df, dev_df, 'OverallQual')
train_df, dev_df, house_style_dummy_cols = get_dummy_columns(train_df, dev_df, 'HouseStyle')
train_df, dev_df, condition_1_dummy_cols = get_dummy_columns(train_df, dev_df, 'Condition1')

In [None]:
def add_secondary_conditions(row, 
                             condition_rows):
    """Flag the indicator column named by a single row's ``Condition2`` value.

    row: a mapping/Series with a ``'Condition2'`` entry and one entry per name in
        ``condition_rows``.
    condition_rows: names of the indicator columns to consider.

    Sets ``row[col] = 1`` for the column equal to ``row['Condition2']`` (if any) and
    returns the same ``row`` object.
    """
    for col in condition_rows:
        if row['Condition2'] == col:
            # Must be an assignment: a bare `row[col] == 1` only compares and has no effect.
            row[col] = 1
    return row


def add_secondary_conditions_to_frame(df, condition_cols):
    """Return a copy of ``df`` with the ``Condition2`` values folded into ``condition_cols``.

    For every column name in ``condition_cols``, rows whose ``'Condition2'`` equals that
    name get a 1 in the column; all other cells keep their current value. The input
    frame is not modified, so re-running the cell gives the same result.
    """
    flagged = df.copy()
    secondary = flagged['Condition2']
    for col in condition_cols:
        flagged[col] = np.where(secondary == col, 1, flagged[col])
    return flagged


# The updated frames are assigned back; the earlier version discarded the result of
# apply(), so the secondary conditions never reached the feature columns.
train_df = add_secondary_conditions_to_frame(train_df, condition_1_dummy_cols)
dev_df = add_secondary_conditions_to_frame(dev_df, condition_1_dummy_cols)

In [None]:
# Collects one (model, mse, rmse, features) tuple per train_and_compare() call below.
outputs = []

def rmsle(y, y_pred):
    """Root mean squared logarithmic error between targets and predictions.

    Computes ``sqrt(mean((log(y_pred + 1) - log(y + 1)) ** 2))``.

    y, y_pred: equal-length array-likes (lists, NumPy arrays or pandas Series).
        Values are paired by position, so a pandas index on either input is ignored.

    Returns a Python float.

    Raises:
        AssertionError: if the inputs differ in length.
        ValueError: if the inputs are empty, or any value is <= -1 (the logarithm of
            ``value + 1`` is undefined there).
    """
    assert len(y) == len(y_pred)
    actual = np.asarray(y, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)

    if actual.size == 0:
        raise ValueError("rmsle() needs at least one observation")
    if (actual <= -1).any() or (predicted <= -1).any():
        raise ValueError("rmsle() is undefined for values <= -1")

    log_diff = np.log1p(predicted) - np.log1p(actual)
    return float(np.sqrt(np.mean(log_diff ** 2)))


def train_and_compare(model, 
                      params,
                      features, 
                      comparison):
    """Fit one estimator on ``train_df`` and score it on ``dev_df``.

    model: estimator class (not an instance); it is instantiated here.
    params: dict of keyword arguments for the estimator constructor; ``None`` or an
        empty dict means "use the defaults".
    features: list of column names used as predictors.
    comparison: reference MSE; the difference to it is printed.

    Prints the dev-set MSE and its change relative to ``comparison``.

    Returns ``(estimator, mse, rmse, features)`` where ``estimator`` is the instance
    that was fitted, ``mse`` is the dev-set mean squared error and ``rmse`` is the
    value returned by ``rmsle`` on the dev set (a root mean squared *log* error).
    Reads the notebook-level ``train_df`` and ``dev_df`` frames.
    """
    # Forward the hyper-parameters to the constructor so `params` is actually used.
    estimator = model(**(params or {}))
    fit_model = estimator.fit(train_df[features], train_df['SalePrice'])
    dev_preds = fit_model.predict(dev_df[features])
    # scikit-learn's documented argument order is (y_true, y_pred).
    mse = mean_squared_error(dev_df['SalePrice'], dev_preds)
    rmse = rmsle(dev_df['SalePrice'], dev_preds)
    print("\nMean squared error: {:.2f} ".format(mse))
    print("Change in MSE from comparison: {:.2f}".format(mse - comparison))
    return estimator, mse, rmse, features
    
# Numeric predictors shared by most of the experiments below.
base_features = ['1stFlrSF', '2ndFlrSF', 'FullBath', 'BedroomAbvGr']
base_with_air = base_features + ['CentralAirBool']

# (estimator class, predictor columns) pairs, evaluated in this order. Every run uses
# default hyper-parameters and is compared against the median baseline.
experiments = [
    (LinearRegression, ['full_SF']),
    (LinearRegression, ['1stFlrSF', '2ndFlrSF']),
    (LinearRegression, list(base_features)),
    (DecisionTreeRegressor, list(base_features)),
    (LinearRegression, base_features + neighborhood_dummy_cols),
    (LinearRegression, (base_features
                        + bldg_type_dummy_cols
                        + neighborhood_dummy_cols)),
    # OverallQual as a single numeric column ...
    (LinearRegression, (base_features + ['OverallQual']
                        + bldg_type_dummy_cols
                        + neighborhood_dummy_cols)),
    # ... versus OverallQual as indicator columns.
    (LinearRegression, (base_features
                        + bldg_type_dummy_cols
                        + neighborhood_dummy_cols
                        + overall_qual_dummy_cols)),
    (LinearRegression, (base_features
                        + bldg_type_dummy_cols
                        + neighborhood_dummy_cols
                        + overall_qual_dummy_cols
                        + house_style_dummy_cols)),
    # NOTE(review): identical to the previous experiment; kept so the results table has
    # the same rows as before. Confirm whether a different feature set was intended.
    (LinearRegression, (base_features
                        + bldg_type_dummy_cols
                        + neighborhood_dummy_cols
                        + overall_qual_dummy_cols
                        + house_style_dummy_cols)),
    (LinearRegression, (base_with_air
                        + bldg_type_dummy_cols
                        + neighborhood_dummy_cols
                        + overall_qual_dummy_cols
                        + house_style_dummy_cols
                        + condition_1_dummy_cols)),
    (BayesianRidge, (base_with_air
                     + bldg_type_dummy_cols
                     + neighborhood_dummy_cols
                     + overall_qual_dummy_cols
                     + house_style_dummy_cols
                     + condition_1_dummy_cols)),
    (DecisionTreeRegressor, base_features + neighborhood_dummy_cols),
]

for estimator_cls, feature_cols in experiments:
    outputs.append(train_and_compare(estimator_cls,
                                     {},
                                     feature_cols,
                                     baseline_MSE))

In [None]:
# Split the (model, mse, rmse, features) result tuples into one list per field.
models = [model for model, _, _, _ in outputs]
mses = [mse for _, mse, _, _ in outputs]
rmses = [rmse for _, _, rmse, _ in outputs]
features = [cols for _, _, _, cols in outputs]

# NOTE(review): the 'Root MSE' column holds the third tuple element, which
# train_and_compare() obtains from rmsle(); it is a root mean squared *log* error.
score_columns = {'Model': models, 'Features': features, 'Root MSE': rmses, 'MSE': mses}
scores_df = pd.DataFrame(data=score_columns)

# Best (lowest) score first.
scores_df.sort_values('Root MSE', ascending=True)