In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.linear_model import LinearRegression, BayesianRidge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import Binarizer, OneHotEncoder, OrdinalEncoder, add_dummy_feature
from sklearn.metrics import mean_squared_error
from shared_functions import rmsle, train_and_test, try_different_models
import math

# Show up to 100 characters of each cell when a DataFrame is displayed.
# The fully-qualified option key is used because pandas resolves abbreviated
# keys by pattern matching, which can become ambiguous in future versions.
pd.set_option('display.max_colwidth', 100)

In [None]:
# Switch for the exploratory plotting cell: when True, the per-column
# scatter plots against LogSalePrice are regenerated and saved; when False
# that cell does nothing.
REDO_EDA = False

In [None]:
# Training table; the first CSV column becomes the row index.
data_df = pd.read_csv(r"../all/train.csv", index_col=0)

# Field descriptions, kept as a list of raw text lines (newlines included).
with open(r"../all/data_description.txt", 'r') as file:
    data_desc = list(file)

In [None]:
# Snapshot of the column names as loaded, taken before any derived columns
# are added to the frame.
og_columns = data_df.columns.tolist()

### Print out analysis graphs 

In [None]:
# Natural log of the sale price; this is the y-axis of the exploratory plots
# and the name listed in `outcome_vars` for the models further down.
data_df['LogSalePrice'] = np.log(data_df['SalePrice'])

In [None]:
%%capture cap --no-stderr
if REDO_EDA:
    # Scatter every column against LogSalePrice and save one image per column.
    # TODO: need to remine the descriptions if i want that part to work
    for column in data_df:
        #if column in cols_w_descripts:
        #    entry = [item for item in data_desc if column + ":" in item]
        #    print(column + ": " + entry[0].split(column + ":")[1].split('\n')[0])
        #print(data_df[column].value_counts())
        try:
            plt.plot(data_df[column], data_df['LogSalePrice'], 'bo')
            plt.title(column + " vs Log SalePrice")
            plt.savefig("../log_analysis/" + column + "_vs_saleprice.jpg")
            plt.show()
        except Exception:
            # Best effort: report the column that could not be plotted and
            # carry on. `Exception` (rather than a bare `except`) lets
            # KeyboardInterrupt / SystemExit stop the loop as expected.
            print(column)
        finally:
            # Release the figure so a failed or partially drawn plot does not
            # bleed into the next column's plot.
            plt.close()

    # Disabled: dump the captured stdout to a file.
    # with open('../analysis/output.txt', 'w') as f:
    #     f.write(cap.stdout)

In [None]:
# Quick look at LotFrontage against the raw sale price, with missing
# frontage values drawn at 0.
plt.plot(data_df['LotFrontage'].fillna(0), data_df['SalePrice'], 'bo')
plt.show()
# Replace every missing value in the whole frame with 0.
# NOTE(review): this also writes the integer 0 into any non-numeric column
# that has gaps — confirm that is intended for the categorical fields.
data_df = data_df.fillna(0)

### Split into train, dev

In [None]:
# Fixed seed so the shuffle — and with it the train/dev split and every score
# computed from it — is identical on each Restart & Run All.
SPLIT_SEED = 42

# Shuffle all rows, then keep the first 90% for training and the rest for dev.
split_idx = int(data_df.shape[0] * .9)
data_df = data_df.sample(frac=1, random_state=SPLIT_SEED)
# .copy() makes each split an independent frame, so the column assignments
# made on them later do not trigger pandas' SettingWithCopyWarning.
train_df = data_df.iloc[:split_idx].copy()
dev_df = data_df.iloc[split_idx:].copy()
del data_df
print(train_df.shape)
print(dev_df.shape)
train_df.head()

### Create baseline MSE

In [None]:
def baseline_pred(features):
    """Baseline model: predict the median training SalePrice.

    `features` is accepted so the function can be called per record, but it
    is not used — the prediction is the same for every input.
    """
    return np.median(train_df['SalePrice'])

# The baseline is constant, so compute the median once and repeat it for each
# dev row instead of recomputing it row by row.
preds = [baseline_pred(None)] * len(dev_df)

# baseline MSE (scikit-learn's argument order is y_true, y_pred)
baseline_MSE = mean_squared_error(dev_df['SalePrice'], preds)

### Add new features to dataframe

In [None]:
def full_square_footage(row):
    """Return the sum of the record's '1stFlrSF' and '2ndFlrSF' values."""
    first_floor = row['1stFlrSF']
    second_floor = row['2ndFlrSF']
    return first_floor + second_floor

def central_air(row):
    """Return whether the record's 'CentralAir' value equals 'Y'."""
    has_central_air = row['CentralAir'] == 'Y'
    return has_central_air

def get_dummy_columns(train_df, 
                      dev_df,
                      col_to_dummy):
    """One-hot encode `col_to_dummy` consistently in the train and dev frames.

    The indicator columns are defined by the categories present in
    `train_df`, and `dev_df` is given exactly that set, in the same order:
    a category seen only in train becomes an all-zero column in dev, and a
    category seen only in dev gets no column (no model fitted on train could
    use it). The same alignment can be applied to any other frame, e.g. test
    data, by reindexing its dummies to the returned `dummy_cols`.

    Args:
        train_df: training frame containing `col_to_dummy`.
        dev_df: held-out frame containing `col_to_dummy`.
        col_to_dummy: name of the categorical column to encode.

    Returns:
        (train_df, dev_df, dummy_cols): new frames with the indicator columns
        appended (the inputs are not modified) and the list of indicator
        column names.
    """
    train_dummies = pd.get_dummies(train_df[col_to_dummy])
    # Take the names straight from the encoder output rather than diffing
    # column lists before/after the concat.
    dummy_cols = list(train_dummies.columns)

    # Align dev to train's categories: add missing ones as 0, drop extras.
    dev_dummies = (pd.get_dummies(dev_df[col_to_dummy])
                   .reindex(columns=dummy_cols, fill_value=0))

    train_df = pd.concat([train_df, train_dummies], axis=1)
    dev_df = pd.concat([dev_df, dev_dummies], axis=1)
    return train_df, dev_df, dummy_cols

    

# Combined first + second floor area, computed per record.
train_df['full_SF'] = train_df.apply(full_square_footage, axis=1)
dev_df['full_SF'] = dev_df.apply(full_square_footage, axis=1)

# Boolean flag for central air.
train_df['CentralAirBool'] = train_df.apply(central_air, axis=1)
dev_df['CentralAirBool'] = dev_df.apply(central_air, axis=1)

# Log-transformed target.
train_df['LogSalePrice'] = np.log(train_df['SalePrice'])
dev_df['LogSalePrice'] = np.log(dev_df['SalePrice'])


# One-hot encode each categorical column, keeping the generated column names
# per source column so they can be combined into feature sets later.
train_df, dev_df, neighborhood_dummy_cols = get_dummy_columns(
    train_df, dev_df, col_to_dummy='Neighborhood')
train_df, dev_df, bldg_type_dummy_cols = get_dummy_columns(
    train_df, dev_df, col_to_dummy='BldgType')
train_df, dev_df, overall_qual_dummy_cols = get_dummy_columns(
    train_df, dev_df, col_to_dummy='OverallQual')
train_df, dev_df, house_style_dummy_cols = get_dummy_columns(
    train_df, dev_df, col_to_dummy='HouseStyle')
train_df, dev_df, condition_1_dummy_cols = get_dummy_columns(
    train_df, dev_df, col_to_dummy='Condition1')
train_df, dev_df, ms_zoning_dummy_cols = get_dummy_columns(
    train_df, dev_df, col_to_dummy='MSZoning')
train_df, dev_df, ms_sub_class_dummy_cols = get_dummy_columns(
    train_df, dev_df, col_to_dummy='MSSubClass')

In [None]:
def add_secondary_conditions(row, 
                             condition_rows):
    """Flag a record's secondary condition among its condition indicators.

    For each name in `condition_rows`, if the record's 'Condition2' value
    equals that name, the record's entry under that name is set to 1.

    Args:
        row: one record (e.g. a DataFrame row or a dict) with a 'Condition2'
            entry and an entry for each name in `condition_rows`.
        condition_rows: names of the indicator entries to check.

    Returns:
        The same `row` object, updated. Callers should use the returned
        value (for instance the result of `DataFrame.apply(..., axis=1)`);
        changing a row handed out by `apply` is not guaranteed to write back
        to the frame it came from.
    """
    for col in condition_rows:
        if row['Condition2'] == col:
            # Assignment, not `==`: a comparison here is evaluated and then
            # discarded, which leaves the record unchanged.
            row[col] = 1
    return row

def _flag_secondary_conditions(frame, condition_cols):
    """Set each indicator in `condition_cols` where 'Condition2' names it.

    Works column-wise and writes the result back into `frame`, so a record
    ends up flagged for a condition when it appears in either its existing
    indicator or its 'Condition2' value. Each column keeps its dtype.
    """
    for col in condition_cols:
        is_secondary = frame['Condition2'] == col
        frame[col] = (frame[col].astype(bool) | is_secondary).astype(frame[col].dtype)


# A row-wise `apply` whose result is thrown away leaves the frames unchanged,
# so the update is done column-wise and stored in train_df / dev_df directly.
_flag_secondary_conditions(train_df, condition_1_dummy_cols)
_flag_secondary_conditions(dev_df, condition_1_dummy_cols)

In [None]:
# NOTE(review): criterion='mae' was renamed to 'absolute_error' in
# scikit-learn 1.0 and the old spelling was removed in 1.2 — switch the
# string if this runs against a newer scikit-learn.
models = [RandomForestRegressor(), RandomForestRegressor(criterion='mae')]
outcome_vars = ['LogSalePrice']

# Column dtypes treated as numeric. This is the same set the previous checks
# accepted (`int` / `float` resolve to these), written without the `np.float`
# alias, which no longer exists in current NumPy releases.
numeric_dtypes = (np.int64, np.int32, np.float64)

# Each entry is one feature set to try on its own: first every group of
# dummy columns, then every numeric original column as a one-element list.
single_feature_sets = (
    [neighborhood_dummy_cols,
     bldg_type_dummy_cols,
     ms_zoning_dummy_cols,
     house_style_dummy_cols,
     overall_qual_dummy_cols,
     condition_1_dummy_cols,
     ms_sub_class_dummy_cols]
    + [[feature] for feature in og_columns
       if train_df[feature].dtype in numeric_dtypes]
)
# SalePrice is the (untransformed) target, not a feature.
single_feature_sets.remove(['SalePrice'])

In [None]:
# Run the shared model-comparison helper over the single-feature sets and
# display its results ordered by ascending 'Root MSE'.
df = try_different_models(train_df, dev_df, models, outcome_vars, single_feature_sets)
df.sort_values(by='Root MSE')

In [None]:
# Column dtypes treated as numeric. This is the same set the previous checks
# accepted (`int` / `float` resolve to these), written without the `np.float`
# alias, which no longer exists in current NumPy releases.
numeric_dtypes = (np.int64, np.int32, np.float64)

# Every numeric original column followed by every group of dummy columns.
all_features = (
    [feature for feature in og_columns
     if train_df[feature].dtype in numeric_dtypes]
    + neighborhood_dummy_cols
    + bldg_type_dummy_cols
    + ms_zoning_dummy_cols
    + house_style_dummy_cols
    + overall_qual_dummy_cols
    + condition_1_dummy_cols
    + ms_sub_class_dummy_cols
)
# SalePrice is the (untransformed) target, not a feature.
all_features.remove('SalePrice')
# The raw OverallQual column is dropped here; its dummy columns stay in.
all_features.remove('OverallQual')

In [None]:
# Display the assembled feature list for a visual check.
all_features

In [None]:
# Hand-picked candidate feature sets, each a flat list of column names.
top_feature_ideas = [
    # Core numeric features + neighborhood dummies.
    (['OverallQual', 'GrLivArea', 'GarageArea', 'GarageCars', 'TotalBsmtSF']
     + neighborhood_dummy_cols),
    # Same, without GarageArea.
    (['OverallQual', 'GrLivArea', 'GarageCars', 'TotalBsmtSF']
     + neighborhood_dummy_cols),
    # Same, without GarageCars.
    (['OverallQual', 'GrLivArea', 'GarageArea', 'TotalBsmtSF']
     + neighborhood_dummy_cols),
    # OverallQual as dummy columns instead of the raw column.
    (['GrLivArea', 'GarageArea', 'GarageCars', 'TotalBsmtSF']
     + neighborhood_dummy_cols
     + overall_qual_dummy_cols),
    # Core set plus YearRemodAdd.
    (['YearRemodAdd', 'OverallQual', 'GrLivArea', 'GarageArea', 'GarageCars', 'TotalBsmtSF']
     + neighborhood_dummy_cols),
    # ... plus MSSubClass dummies.
    (['YearRemodAdd', 'OverallQual', 'GrLivArea', 'GarageArea', 'GarageCars', 'TotalBsmtSF']
     + neighborhood_dummy_cols
     + ms_sub_class_dummy_cols),
    # Numeric core (no raw OverallQual) with every dummy group.
    (['YearRemodAdd', 'GrLivArea', 'GarageArea', 'GarageCars', 'TotalBsmtSF']
     + neighborhood_dummy_cols
     + ms_sub_class_dummy_cols
     + bldg_type_dummy_cols
     + ms_zoning_dummy_cols
     + house_style_dummy_cols
     + overall_qual_dummy_cols
     + condition_1_dummy_cols),
    # Numeric core with zoning, house-style and quality dummies only.
    (['YearRemodAdd', 'GrLivArea', 'GarageArea', 'GarageCars', 'TotalBsmtSF']
     + ms_zoning_dummy_cols
     + house_style_dummy_cols
     + overall_qual_dummy_cols),
    # Raw OverallQual with zoning and house-style dummies.
    (['OverallQual', 'YearRemodAdd', 'GrLivArea', 'GarageArea', 'GarageCars', 'TotalBsmtSF']
     + ms_zoning_dummy_cols
     + house_style_dummy_cols),
    # Everything assembled in `all_features`.
    all_features,
]

# Run the shared model-comparison helper over these sets and display its
# results ordered by ascending 'Root MSE'.
df = try_different_models(train_df, dev_df, models, outcome_vars, top_feature_ideas)
df.sort_values(by='Root MSE')