In [5]:
#training and validating data using all the cleaning techniques from the exploration
import pandas as pd
import numpy as np
from typing import Sequence

from sklearn.ensemble import RandomForestRegressor
from pandas.api.types import is_string_dtype, is_object_dtype, is_categorical_dtype
from sklearn.metrics import mean_absolute_error, mean_squared_log_error, mean_squared_error, r2_score

from rfpimp import *
from prep import *
import feather

In [71]:
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, mean_squared_error, r2_score, roc_curve
from sklearn.linear_model import LogisticRegression , Ridge, Lasso, LinearRegression
from sklearn.preprocessing import StandardScaler
import scipy.stats as scs
import statsmodels.api as sm
import statsmodels.formula.api as smf
from glm.glm import GLM
from glm.families import Gaussian
from sklearn.model_selection import train_test_split, KFold, cross_val_score
import re
from clean import *
from score_model import *
from ridge import *
# only techniques allowed, no RF
# Linear Regression.
# Logistic Regression.
# Median Regression (linear regression by minimizing the sum of absolute deviations).
# Any other GLM.
# Regularization: Ridge and LASSO.

In [147]:

# load data

df = feather.read_dataframe("data/bulldozer-train.feather")
df = df.iloc[-100_000:] # same 100,000 records as before
X, y = df.drop('SalePrice', axis=1), df['SalePrice']


In [148]:
# apply log to result, clean and categorize the data as before
y = np.log(y)
clean(X)
X, catencoders = feature_eng(X)
medians = numericalize(X, catencoders)


In [78]:
df_train = feather.read_dataframe("data/bulldozer-valid.feather")
X_train, y_train = df_train.drop('SalePrice', axis=1), df_train['SalePrice']

y_train = np.log(y_train)
clean(X_train)
X_train = feature_eng_test(X_train, catencoders)
numericalize_test(X_train, medians, catencoders)


In [49]:
def RMSE(y_pred, y_true):
    return np.sqrt(MSE(y_pred, y_true))

In [79]:
X_train = X_train.reindex(columns=X.columns)

In [81]:
#linear regression

from sklearn.metrics import mean_squared_error, mean_absolute_error

model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_train)
rmsle_train_baseline = np.sqrt( mean_squared_error(y_train, y_pred) )
r2_train_baseline = model.score(X_train, y_train)
print(f"Validation R^2 {r2_train_baseline:.5f}, "+
      f"RMSLE {rmsle_train_baseline:.5f}")


Validation R^2 0.43461, RMSLE 0.54363


In [142]:
#Ridge

model = Ridge(alpha=0.5)
model.fit(X_train, y_train)
y_pred = model.predict(X_train)
rmsle_train_baseline = np.sqrt( mean_squared_error(y_train, y_pred) )
r2_train_baseline = model.score(X_train, y_train)
print(f"Validation R^2 {r2_train_baseline:.5f}, "+
      f"RMSLE {rmsle_train_baseline:.5f}")


Validation R^2 0.74030, RMSLE 0.36844


Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number2.920072e-36
  overwrite_a=True).T


In [94]:
#Lasso

model = Lasso(alpha=0.5)
model.fit(X_train, y_train)
y_pred = model.predict(X_train)
rmsle_train_baseline = np.sqrt( mean_squared_error(y_train, y_pred) )
r2_train_baseline = model.score(X_train, y_train)
print(f"Validation R^2 {r2_train_baseline:.5f}, "+
      f"RMSLE {rmsle_train_baseline:.5f}")


Validation R^2 0.31890, RMSLE 0.59667


In [109]:
def select_features(X, y, X_valid, y_valid, drop=0.10):
   min_rmsle = 99999
   X_valid = X_valid.reindex(columns=X.columns)
   rf, _, rmsle, _ = test_valid(X, y, X_valid, y_valid,
                                max_features=.3, min_samples_leaf=2)
   I = importances(rf, X_valid, y_valid)
   features = list(I.index)
   keep = best_features = features
   n = int(.9/drop) # how many iterations? get to 90%
   for i in range(1,n+1):
       X2 = X[keep]
       X_valid2 = X_valid[keep]
       print(f"\nNum features = {len(keep)}")
       rf2, _, rmsle, _ = test_valid(X2, y, X_valid2, y_valid,
                                     max_features=.3, min_samples_leaf=2)
       if rmsle < min_rmsle:
           min_rmsle = rmsle
           best_features = keep
       I2 = importances(rf2, X_valid2, y_valid) # recompute since collinear
       features = list(I2.index)
       keep = features[0:int(len(features)*(1-drop))]

   return min_rmsle, best_features


In [110]:
# find best features by importance and drop those not relevant

min_rmsle, best_features = \
    select_features(X, y, X_valid, y_valid, drop=0.10)
print(f"{len(best_features)} features is best:")
print(best_features)


OOB R^2 0.92223 using 19,106,374 tree nodes 40.0 median tree height
Validation R^2 0.84946, RMSLE 0.28051, MAE $6600

Num features = 83
OOB R^2 0.92227 using 19,102,790 tree nodes 41.0 median tree height
Validation R^2 0.85072, RMSLE 0.27934, MAE $6562

Num features = 74
OOB R^2 0.92200 using 19,120,610 tree nodes 40.0 median tree height
Validation R^2 0.84487, RMSLE 0.28475, MAE $6767

Num features = 66
OOB R^2 0.92191 using 19,081,424 tree nodes 41.0 median tree height
Validation R^2 0.84722, RMSLE 0.28260, MAE $6692

Num features = 59
OOB R^2 0.92188 using 19,077,334 tree nodes 40.0 median tree height
Validation R^2 0.84791, RMSLE 0.28196, MAE $6718

Num features = 53
OOB R^2 0.92189 using 19,031,906 tree nodes 40.0 median tree height
Validation R^2 0.84868, RMSLE 0.28124, MAE $6746

Num features = 47
OOB R^2 0.92212 using 19,024,078 tree nodes 40.0 median tree height
Validation R^2 0.84626, RMSLE 0.28348, MAE $6736

Num features = 42
OOB R^2 0.92189 using 18,942,216 tree nodes 40.0

In [150]:
df = feather.read_dataframe("data/bulldozer-train-all.feather")
df = df.query('saledate.dt.year>=2007').copy()
X, y = df.drop('SalePrice', axis=1), df['SalePrice']
y = np.log(y)
clean(X)
X, catencoders = feature_eng(X)
medians = numericalize(X, catencoders)
X = X[best_features]


In [158]:
df_test = feather.read_dataframe("data/bulldozer-test.feather")
X_test, y_test = df_test.drop('SalePrice', axis=1), df_test['SalePrice']
y_test = np.log(y_test)
clean(X_test)
X_test = feature_eng_test(X_test, catencoders)
df_apply_cats(X_test, catencoders)
df_fix_missing_test_nums(X_test, medians)
df_cat_to_catcode(X_test)
X_test = X_test[best_features]


In [165]:
# Looks like Ridge has the best validation score.
# check vs test error
#Ridge

test_soln = pd.read_csv('data/do_not_open/test_soln.csv')
test_soln = np.log(test_soln['SalePrice'])

X_test = X_test.reindex(columns=X.columns)
model = Ridge(alpha=0.5)
model.fit(X, y)
y_pred = model.predict(X_test)
rmsle_test = np.sqrt( mean_squared_error(test_soln, y_pred) )
r2_score_test = model.score(X_test, test_soln)
print(f"Validation R^2 {r2_train_baseline:.5f}, "+
      f"RMSLE {rmsle_train_baseline:.5f}")


Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number1.420980e-39
  overwrite_a=True).T


Validation R^2 0.66884, RMSLE 0.42264


In [166]:
#Best score is .42 with Ridge regression

#If Random Forest was allowed, it would be around .24