In [1]:
import pandas as pd
import numpy as np 
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.model_selection import cross_validate, GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor
from scipy.stats.mstats import winsorize
import timeit
import scipy

In [2]:
# Load the Ames housing table; read_csv without index_col gives it a default RangeIndex.
df = pd.read_csv("https://liangfgithub.github.io/Data/Ames_data.csv")
# Whitespace-separated file with no header row; each column is later used as the
# held-out row ids of one train/test split.
# sep=r'\s+' is the documented equivalent of the deprecated boolean flag
# delim_whitespace (which was being handed the truthy string ' ').
testID = pd.read_csv(
    'https://liangfgithub.github.io/Data/project1_testIDs.dat', sep=r'\s+', header=None)

In [195]:
# Build a single train/test split from column 3 of the test-id table.
# NOTE(review): iloc selects by position while drop removes by label; the two agree
# only while df keeps its default RangeIndex. Also assumes the ids are 0-based row
# positions -- TODO confirm against the data source.
test_indices = testID.iloc[:, 3]

train = df.drop(index=test_indices)
test = df.iloc[test_indices]
# test = test.drop(columns="Sale_Price")

In [199]:
# Scratch call: run the "LR" preprocessing on the current training split.
# NOTE(review): numeric_convert is defined in a later cell, so on a fresh kernel
# (Restart & Run All) this line raises NameError unless that cell has been run first.
# `train` is passed whole here, i.e. without removing the target column.
new = numeric_convert(train, "LR")

In [194]:
# Score a previously written submission file against the current test split:
# root-mean-squared error between log actual and log predicted Sale_Price.
# Rows are compared positionally, so the file must be in the same order as `test`.
test_pred = pd.read_csv("mysubmission2.txt")["Sale_Price"].to_numpy()
np.sqrt(np.mean(np.square(np.log(test["Sale_Price"].to_numpy()) - np.log(test_pred))))

0.1353154971048763

In [193]:
# Persist the current split to the working directory (the index is written as the first column).
train.to_csv("train.csv")
test.to_csv("test.csv")

In [251]:
# Columns removed before fitting the linear ("LR") model.
# 'Pool_Area' was previously listed twice; each name now appears exactly once.
bad_cols = [
    'PID', 'Garage_Yr_Blt', 'Street', 'Utilities', 'Condition_2',
    'Roof_Matl', 'Pool_QC', 'Misc_Feature', 'Low_Qual_Fin_SF', 'Pool_Area',
    'Longitude', 'Latitude', 'Alley', 'Land_Contour', 'Land_Slope',
    'Exter_Cond', 'Bsmt_Half_Bath', 'Three_season_porch', 'MS_Zoning',
    "Misc_Val", "Kitchen_AbvGr", "Garage_Cond",
]

# Numeric columns whose upper tail is winsorized by numeric_convert.
limit_cols = [
    "Lot_Frontage", "Lot_Area", "Mas_Vnr_Area", "BsmtFin_SF_2", "Bsmt_Unf_SF",
    "Total_Bsmt_SF", "Second_Flr_SF", 'First_Flr_SF', "Gr_Liv_Area",
    "Garage_Area", "Wood_Deck_SF", "Open_Porch_SF", "Enclosed_Porch",
    "Screen_Porch",
]


def numeric_convert(frame, model, winsor_cols=None, winsor_limits=(0, .05)):
    """Turn a raw feature frame into an all-numeric design matrix.

    Columns that ``pd.to_numeric`` can parse are kept as numbers; every other
    column is one-hot encoded with ``pd.get_dummies``. Selected numeric columns
    are winsorized. The caps are computed from ``frame`` itself, so separate
    calls (for example on a train and a test split) use different thresholds.

    Parameters
    ----------
    frame : pandas.DataFrame
        Input features. Not modified.
    model : str
        ``'LR'`` drops the module-level ``bad_cols``; ``'RF'`` drops only
        ``'PID'`` and ``'Garage_Yr_Blt'``; any other value drops nothing.
    winsor_cols : iterable of str, optional
        Names of numeric columns to winsorize. Defaults to the module-level
        ``limit_cols``.
    winsor_limits : tuple, optional
        ``limits`` argument forwarded to ``scipy.stats.mstats.winsorize``.
        The default ``(0, .05)`` leaves the lower tail alone and caps the
        top 5% of values.

    Returns
    -------
    pandas.DataFrame
        Numeric columns (in their original order) followed by the dummy
        columns, indexed like ``frame``.

    Raises
    ------
    KeyError
        If a column that ``model`` asks to drop is missing from ``frame``.
    """
    if model == 'LR':
        frame = frame.drop(columns=bad_cols)
    elif model == 'RF':
        frame = frame.drop(columns=['PID', 'Garage_Yr_Blt'])

    if winsor_cols is None:
        winsor_cols = limit_cols

    numeric_data = pd.DataFrame(index=frame.index)
    category_cols = []
    for col in frame:
        try:
            converted = pd.to_numeric(frame[col])
        except (ValueError, TypeError):
            # Only "cannot be parsed as a number" means categorical; anything
            # else (KeyboardInterrupt, MemoryError, ...) must propagate.
            category_cols.append(col)
            continue

        if col in winsor_cols:
            # Winsorize the converted values, not the raw column: a column of
            # numeric strings would otherwise be capped by string ordering.
            numeric_data[col] = winsorize(converted.to_numpy(), winsor_limits)
        else:
            numeric_data[col] = converted

    if category_cols:
        categorical_data = pd.get_dummies(frame[category_cols])
    else:
        # Nothing to encode; avoid handing get_dummies an empty selection.
        categorical_data = pd.DataFrame(index=frame.index)

    return pd.concat([numeric_data, categorical_data], axis=1)



In [287]:
# Ridge regression on the log of the last column, evaluated over the ten
# train/test splits defined by the columns of testID.
lr_rmse, lr_times, lr_models, lr_lambda = [], [], [], []
# Log-spaced penalty grid; defined here but not used by this cell.
lambdas = np.exp(np.linspace(-1,-8,80))
for i in range(10):

    # Column i of testID lists the held-out rows for this split.
    test_indices = testID.iloc[:,i]
    train = df.drop(index=test_indices)
    test = df.iloc[test_indices]

    # Features are every column except the last; the target is log(last column)
    # (presumably Sale_Price -- verify against the CSV layout).
    xtrain_frame_LR = numeric_convert(train.iloc[:,:-1], "LR")
    xtest_frame_LR = numeric_convert(test.iloc[:,:-1], "LR")
    ytrain = np.log(train.iloc[:,-1]).values
    ytest = np.log(test.iloc[:,-1]).values

    xtrain_LR = xtrain_frame_LR.values
    # Align test columns to the training layout: dummy levels missing from the
    # test split are zero-filled, columns unseen in training are discarded.
    xtest_LR = xtest_frame_LR.reindex(
        columns=xtrain_frame_LR.columns, fill_value=0).to_numpy()

    lr = Ridge(alpha=5, fit_intercept=True, positive=False, max_iter=4000)

    # Time the fit only, not preprocessing or prediction.
    start = timeit.default_timer()
    lr.fit(xtrain_LR, ytrain)
    stop = timeit.default_timer()

    yhat = lr.predict(xtest_LR)
    lr_rmse.append(np.sqrt(((yhat - ytest) ** 2).mean()))
    lr_times.append(stop - start)
    lr_models.append(lr)
    print(f'Split {i+1} RMSE: {lr_rmse[-1]}, runtime: {lr_times[-1]}')

Split 1 RMSE: 0.12319327197345147, runtime: 0.007176500000241504
Split 2 RMSE: 0.11493195190488027, runtime: 0.00785120000000461
Split 3 RMSE: 0.12447361459404291, runtime: 0.0071770000004107715
Split 4 RMSE: 0.13144664146969853, runtime: 0.00824549999924784
Split 5 RMSE: 0.12960217583049802, runtime: 0.008532800000466523
Split 6 RMSE: 0.1231901352061848, runtime: 0.009077499999875727
Split 7 RMSE: 0.11494398216966434, runtime: 0.00967930000024353
Split 8 RMSE: 0.12443850786993514, runtime: 0.00717259999964881
Split 9 RMSE: 0.13144224549599995, runtime: 0.009057899999788788
Split 10 RMSE: 0.12960418806089813, runtime: 0.007639899999958288


In [282]:

# Gradient-boosted trees on the log of the last column, evaluated over the ten
# train/test splits defined by the columns of testID.
# NOTE(review): the lr_* / *_LR names are shared with the Ridge cell even though
# the estimator here is a GradientBoostingRegressor; running this cell overwrites
# those results.
lr_rmse, lr_times, lr_models, lr_lambda = [], [], [], []
# Log-spaced penalty grid; defined here but not used by this cell.
lambdas = np.exp(np.linspace(-1, -8, 80))
for i in range(10):

    # Column i of testID lists the held-out rows for this split.
    test_indices = testID.iloc[:, i]
    train = df.drop(index=test_indices)
    test = df.iloc[test_indices]

    # "RF" preprocessing drops only PID and Garage_Yr_Blt before encoding.
    xtrain_frame_LR = numeric_convert(train.iloc[:, :-1], "RF")
    xtest_frame_LR = numeric_convert(test.iloc[:, :-1], "RF")
    ytrain = np.log(train.iloc[:, -1]).values
    ytest = np.log(test.iloc[:, -1]).values

    xtrain_LR = xtrain_frame_LR.values
    # Align test columns to the training layout: dummy levels missing from the
    # test split are zero-filled, columns unseen in training are discarded.
    xtest_LR = xtest_frame_LR.reindex(
        columns=xtrain_frame_LR.columns, fill_value=0).to_numpy()

    lr = GradientBoostingRegressor(max_depth=4, n_estimators=500, random_state=1)

    # Time the fit only, not preprocessing or prediction.
    start = timeit.default_timer()
    lr.fit(xtrain_LR, ytrain)
    stop = timeit.default_timer()

    yhat = lr.predict(xtest_LR)
    lr_rmse.append(np.sqrt(((yhat - ytest) ** 2).mean()))
    lr_times.append(stop - start)
    lr_models.append(lr)
    print(f'Split {i+1} RMSE: {lr_rmse[-1]}, runtime: {lr_times[-1]}')


Split 1 RMSE: 0.12348903677598773, runtime: 10.316630199999963
Split 2 RMSE: 0.1179244970746627, runtime: 10.428963600000316
Split 3 RMSE: 0.12467532230854074, runtime: 10.131811899999775
Split 4 RMSE: 0.1344456472040848, runtime: 10.132989699999598
Split 5 RMSE: 0.13176621364142727, runtime: 10.102658599999813
Split 6 RMSE: 0.12351107763707757, runtime: 10.108520599999792
Split 7 RMSE: 0.11759130217922187, runtime: 10.131380499999977
Split 8 RMSE: 0.12180940755571291, runtime: 10.133308299999953
Split 9 RMSE: 0.1291808169632984, runtime: 10.141578399999162
Split 10 RMSE: 0.13225866189047367, runtime: 10.386503700000503
