In [7]:
import pandas as pd
import numpy as np 
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.model_selection import cross_validate, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
import timeit
import scipy
import glmnet_python
from glmnet import glmnet
from glmnetPredict import glmnetPredict
from glmnetCoef import glmnetCoef

In [8]:
df = pd.read_csv("https://liangfgithub.github.io/Data/Ames_data.csv")
testID = pd.read_csv(
    'https://liangfgithub.github.io/Data/project1_testIDs.dat',delim_whitespace=' ',header=None)


In [9]:
def numeric_convert(frame):
    # We may want to normalize data as well 
    for col in frame:
        try:
            frame[col] = pd.to_numeric(frame[col])
        except:
            frame[col] = pd.factorize(frame[col])[0]
    
    return frame

def get_split(frame, index):

    frame = frame.drop('Garage_Yr_Blt', axis=1)

    num_rows = np.arange(len(frame))

    test_index = testID.iloc[:,index]
    train_index = np.array([i for i in num_rows if i not in test_index])

    xtest = numeric_convert(frame.iloc[test_index,1:-1].copy())
    xtrain = numeric_convert(frame.iloc[train_index,1:-1].copy())

    # convert to log to get better model
    ytest = np.log(frame.iloc[test_index,-1].copy())
    ytrain = np.log(frame.iloc[train_index,-1].copy())

    return xtrain,xtest,ytrain,ytest




In [10]:
def clean_data(df, cols):
    for col in cols:
        df = df.drop(col, axis=1)
    return df

In [11]:
bad_cols = ['PID', 'Garage_Yr_Blt', 'Street', 'Utilities', 'Condition_2', 'Roof_Matl', 'Heating', 'Pool_QC', 'Misc_Feature', 'Low_Qual_Fin_SF', 'Pool_Area', 'Longitude','Latitude']
PID = df['PID']
# Set up data for use with scikit-learn
frame = clean_data(df, bad_cols)
cvsplits = []
num_rows = np.arange(len(frame))
for index in range(0,10):
    test_index = testID.iloc[:,index]
    train_index = np.array([i for i in num_rows if i not in test_index])
    cvsplits.append((train_index, test_index.values))

x = numeric_convert(frame.iloc[:,:-1].copy())
for c in x:
    x[c] = x[c] / np.max(np.abs(x[c]))

# convert to log to get better model
y = np.log(frame.iloc[:,-1].copy())

In [None]:
lr_rmse = []
lr_times = []
lr_models = []
lr_lambda = []
lambdas = np.exp(np.linspace(-1,-8,80))
for i in range(0,10):
    lr = Lasso()
    start = timeit.default_timer()
    lr.fit(x.iloc[cvsplits[i][0]], y[cvsplits[i][0]])
    stop = timeit.default_timer()
    yhat = lr.predict(x.iloc[cvsplits[i][1]])
    ytest = y[cvsplits[i][1]]
    rf_models.append(rf)
    rf_times.append(stop - start)
    rf_rmse.append(np.sqrt(np.mean((yhat - ytest)**2)))
    print(f'Split {i} RMSE: {rf_rmse[-1]}, runtime: {rf_times[-1]}')

In [13]:
lr_rmse = []
lr_times = []
lr_lambdas = []
lr_models = []
for i in range(0,10):
    start = timeit.default_timer()
    ytrain = y.iloc[cvsplits[i][0]].to_numpy()
    fit = glmnet(x = x.iloc[cvsplits[i][0]].to_numpy(), y=ytrain, family = 'gaussian')
    lambdau = fit['lambdau']
    fold_rmse = []
    for l in lambdau:
        ytrainhat = (glmnetPredict(fit, x.iloc[cvsplits[i][0]].to_numpy(), s = scipy.float64([l]))).flatten()
        fold_rmse.append(np.sqrt(np.mean((ytrainhat - ytrain)**2)))
    best_idx = np.where(fold_rmse == np.min(np.abs(fold_rmse)))[0][0]
    lr_lambdas.append(lambdau[best_idx])
    stop = timeit.default_timer()
    yhat = glmnetPredict(fit, x.iloc[cvsplits[i][1]].to_numpy(), s = scipy.float64([lr_lambdas[-1]])).flatten()
    ytest = y[cvsplits[i][1]]
    lr_models.append(fit)
    lr_times.append(stop - start)
    lr_rmse.append(np.sqrt(np.mean((yhat - ytest)**2)))
    print(f'Split {i} RMSE: {lr_rmse[-1]}, runtime: {lr_times[-1]}')

Split 0 RMSE: 0.1458660492361605, runtime: 0.34390075400006026
Split 1 RMSE: 0.14875871404828894, runtime: 0.34559665600045264
Split 2 RMSE: 0.16894699503168575, runtime: 0.21791181000025972
Split 3 RMSE: 0.16504560614278746, runtime: 0.2648320230000536
Split 4 RMSE: 0.15623868857774714, runtime: 0.33875934699972277
Split 5 RMSE: 0.14582337257907863, runtime: 0.30205251400002453
Split 6 RMSE: 0.14875736370380688, runtime: 0.1928231970005072
Split 7 RMSE: 0.16888216163770614, runtime: 0.2252748540004177
Split 8 RMSE: 0.164849583012185, runtime: 0.24972158899981878
Split 9 RMSE: 0.15623020650410496, runtime: 0.2389675040003567


In [19]:
lr_lambdas

[0.00018263318833249426,
 0.00018263318833249426,
 0.00018263318833249426,
 0.00018263318833249426,
 0.00018263318833249426,
 0.00018263318833249426,
 0.00018263318833249426,
 0.00018263318833249426,
 0.00018263318833249426,
 0.00018263318833249426]

In [14]:
best_idx = np.where(lr_rmse == np.min(lr_rmse))[0][0]
lr_best = lr_models[best_idx]
yhat_best = np.exp(glmnetPredict(lr_best, x.to_numpy(), s=scipy.float64([lr_lambdas[best_idx]]))).flatten()
pd.Series(yhat_best, name='Sale_Price')

0       218001.477644
1       116608.410042
2        84212.068569
3       283076.667240
4       196584.601125
            ...      
2925    151017.143889
2926    147737.891234
2927    116037.325249
2928    187085.115343
2929    229838.453524
Name: Sale_Price, Length: 2930, dtype: float64

In [15]:
pd.DataFrame(data={'PID': PID, 'Sale_Price': pd.Series(yhat_best, name='Sale_Price')}).to_csv('mysubmission1.txt', index=False)

In [16]:
rf_rmse = []
rf_times = []
rf_models = []
for i in range(0,10):
    rf = RandomForestRegressor(criterion='squared_error')
    start = timeit.default_timer()
    rf.fit(x.iloc[cvsplits[i][0]], y[cvsplits[i][0]])
    stop = timeit.default_timer()
    yhat = rf.predict(x.iloc[cvsplits[i][1]])
    ytest = y[cvsplits[i][1]]
    rf_models.append(rf)
    rf_times.append(stop - start)
    rf_rmse.append(np.sqrt(np.mean((yhat - ytest)**2)))
    print(f'Split {i} RMSE: {rf_rmse[-1]}, runtime: {rf_times[-1]}')

Split 0 RMSE: 0.08560174009149243, runtime: 2.360530913999355
Split 1 RMSE: 0.08755284967027353, runtime: 2.3281028270002935
Split 2 RMSE: 0.09702514716671658, runtime: 2.334553627000787
Split 3 RMSE: 0.09286057760848333, runtime: 2.434900249000748
Split 4 RMSE: 0.0901684012156606, runtime: 2.4509402490002685
Split 5 RMSE: 0.08661063964214978, runtime: 2.432449457000075
Split 6 RMSE: 0.08533644085192464, runtime: 2.36360153699934
Split 7 RMSE: 0.09695785526464452, runtime: 2.3423530660002143
Split 8 RMSE: 0.0923036871921028, runtime: 2.435029413999473
Split 9 RMSE: 0.08794964719264137, runtime: 2.422240172999409


In [17]:
rf_best = rf_models[np.where(rf_rmse == np.min(rf_rmse))[0][0]]
yhat_best = np.exp(rf_best.predict(x))
pd.Series(yhat_best, name='Sale_Price')

0       196490.049809
1       128850.577556
2       155984.480263
3       220791.345294
4       184100.554639
            ...      
2925    143394.537198
2926    132253.127864
2927    129521.132480
2928    172499.626035
2929    204174.396405
Name: Sale_Price, Length: 2930, dtype: float64

In [18]:
pd.DataFrame(data={'PID': PID, 'Sale_Price': pd.Series(yhat_best, name='Sale_Price')}).to_csv('mysubmission2.txt', index=False)