# Introduction
In this tutorial, we will train a regression model with Foreshadow using the [House Pricing](https://www.kaggle.com/c/house-prices-advanced-regression-techniques) dataset from Kaggle.


# Getting Started
To get started with foreshadow, install the package using `pip install foreshadow`. This will also install the dependencies. Now create a simple python script that uses all the defaults with Foreshadow. Note that Foreshadow requires `Python >=3.6, <4.0`. 

First import foreshadow related classes. Also import sklearn, pandas and numpy packages. 

In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import ElasticNet
from sklearn.metrics import get_scorer
from sklearn.metrics import mean_squared_log_error

from foreshadow import Foreshadow
from foreshadow.intents import IntentType
from foreshadow.utils import ProblemType

pd.options.display.max_columns=None

RANDOM_SEED=42
np.random.seed(RANDOM_SEED)

# Load the dataset

In [2]:
df_train = pd.read_csv("train.csv")
X_df = df_train.drop(columns="SalePrice")
y_df = df_train[["SalePrice"]]
X_train, X_test, y_train, y_test = train_test_split(X_df, y_df, test_size=0.2)
X_train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
254,255,20,RL,70.0,8400,Pave,,Reg,Lvl,AllPub,Inside,Gtl,NAmes,Norm,Norm,1Fam,1Story,5,6,1957,1957,Gable,CompShg,MetalSd,MetalSd,,0.0,TA,Gd,CBlock,TA,TA,No,Rec,922,Unf,0,392,1314,GasA,TA,Y,SBrkr,1314,0,0,1314,1,0,1,0,3,1,TA,5,Typ,0,,Attchd,1957.0,RFn,1,294,TA,TA,Y,250,0,0,0,0,0,,,,0,6,2010,WD,Normal
1066,1067,60,RL,59.0,7837,Pave,,IR1,Lvl,AllPub,Inside,Gtl,Gilbert,Norm,Norm,1Fam,2Story,6,7,1993,1994,Gable,CompShg,VinylSd,VinylSd,,0.0,Gd,TA,PConc,Gd,TA,No,Unf,0,Unf,0,799,799,GasA,Gd,Y,SBrkr,799,772,0,1571,0,0,2,1,3,1,TA,7,Typ,1,TA,Attchd,1993.0,RFn,2,380,TA,TA,Y,0,40,0,0,0,0,,,,0,5,2009,WD,Normal
638,639,30,RL,67.0,8777,Pave,,Reg,Lvl,AllPub,Inside,Gtl,Edwards,Feedr,Norm,1Fam,1Story,5,7,1910,1950,Gable,CompShg,MetalSd,Wd Sdng,,0.0,TA,TA,CBlock,Fa,TA,No,Unf,0,Unf,0,796,796,GasA,Gd,Y,FuseA,796,0,0,796,0,0,1,0,2,1,TA,4,Typ,0,,,,,0,0,,,P,328,0,164,0,0,0,,MnPrv,,0,5,2008,WD,Normal
799,800,50,RL,60.0,7200,Pave,,Reg,Lvl,AllPub,Corner,Gtl,SWISU,Feedr,Norm,1Fam,1.5Fin,5,7,1937,1950,Gable,CompShg,Wd Sdng,Wd Sdng,BrkFace,252.0,TA,TA,BrkTil,Gd,TA,No,ALQ,569,Unf,0,162,731,GasA,Ex,Y,SBrkr,981,787,0,1768,1,0,1,1,3,1,Gd,7,Typ,2,TA,Detchd,1939.0,Unf,1,240,TA,TA,Y,0,0,264,0,0,0,,MnPrv,,0,6,2007,WD,Normal
380,381,50,RL,50.0,5000,Pave,Pave,Reg,Lvl,AllPub,Inside,Gtl,SWISU,Norm,Norm,1Fam,1.5Fin,5,6,1924,1950,Gable,CompShg,BrkFace,Wd Sdng,,0.0,TA,TA,BrkTil,TA,TA,No,LwQ,218,Unf,0,808,1026,GasA,TA,Y,SBrkr,1026,665,0,1691,0,0,2,0,3,1,Gd,6,Typ,1,Gd,Detchd,1924.0,Unf,1,308,TA,TA,Y,0,0,242,0,0,0,,,,0,5,2010,WD,Normal


# Model Training Iteration 1 - ElasticNet

In [3]:
def measure(model, X_test, y_test):
    y_pred = model.predict(X_test)
    rmsle = np.sqrt(mean_squared_log_error(y_test, y_pred))
    print('root mean squared log error = %5.4f' % rmsle)
    return rmsle

In [4]:
shadow1 = Foreshadow(problem_type=ProblemType.REGRESSION, random_state=RANDOM_SEED, n_jobs=-1, estimator=ElasticNet(random_state=RANDOM_SEED))
_ = shadow1.fit(X_train, y_train)

2020-03-29 12:20:42,876 - foreshadow - INFO - 64366 - Identified columns with over 90% missing values: Alley,PoolQC,MiscFeature and they will be dropped.
2020-03-29 12:20:56,214 - foreshadow - INFO - 64366 - Exported processed data to processed_training_data.csv
  "Columns are not all uniquely named, automatically resolving"


In [5]:
_ = measure(shadow1, X_test, y_test)

2020-03-29 12:20:59,823 - foreshadow - INFO - 64366 - Exported processed data to processed_test_data.csv


root mean squared log error = 0.1794


  "Columns are not all uniquely named, automatically resolving"


### You might be curious how Foreshadow handled the input data. Let's take a look

In [6]:
shadow1.get_data_summary()

Unnamed: 0,SaleCondition,Exterior2nd,MasVnrType,GarageType,FireplaceQu,Fireplaces,Foundation,BsmtQual,BsmtCond,Exterior1st,BsmtExposure,Functional,BsmtFinType2,KitchenAbvGr,BedroomAbvGr,SaleType,HalfBath,CentralAir,FullBath,BsmtFinType1,BsmtHalfBath,RoofMatl,YearRemodAdd,YrSold,MSZoning,Fence,PavedDrive,GarageCond,GarageQual,LandContour,GarageCars,RoofStyle,LotConfig,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,GarageFinish,OverallCond,YearBuilt,LandSlope,BsmtFullBath,KitchenQual,Id,Utilities,Street,LotShape,ExterQual,ExterCond,Heating,HeatingQC,Electrical,BsmtUnfSF,MSSubClass,MoSold,MiscVal,LotFrontage,PoolArea,ScreenPorch,3SsnPorch,EnclosedPorch,OpenPorchSF,WoodDeckSF,2ndFlrSF,TotalBsmtSF,1stFlrSF,GarageArea,LowQualFinSF,OverallQual,GarageYrBlt,MasVnrArea,BsmtFinSF1,TotRmsAbvGrd,BsmtFinSF2,LotArea,GrLivArea,SalePrice
intent,Categorical,Categorical,Categorical,Categorical,Categorical,Categorical,Categorical,Categorical,Categorical,Categorical,Categorical,Categorical,Categorical,Categorical,Categorical,Categorical,Categorical,Categorical,Categorical,Categorical,Categorical,Categorical,Categorical,Categorical,Categorical,Categorical,Categorical,Categorical,Categorical,Categorical,Categorical,Categorical,Categorical,Categorical,Categorical,Categorical,Categorical,Categorical,Categorical,Categorical,Categorical,Categorical,Categorical,Droppable,Droppable,Droppable,Droppable,Droppable,Droppable,Droppable,Droppable,Droppable,Droppable,Numeric,Numeric,Numeric,Numeric,Numeric,Numeric,Numeric,Numeric,Numeric,Numeric,Numeric,Numeric,Numeric,Numeric,Numeric,Numeric,Numeric,Numeric,Numeric,Numeric,Numeric,Numeric,Numeric,Numeric,Label
count,1168,1168,1168,1168,1168,1168,1168,1168,1168,1168,1168,1168,1168,1168,1168,1168,1168,1168,1168,1168,1168,1168,1168,1168,1168,1168,1168,1168,1168,1168,1168,1168,1168,1168,1168,1168,1168,1168,1168,1168,1168,1168,1168,1168,1168,1168,1168,1168,1168,1168,1168,1168,1168,1168,1168,1168,1168,1168,1168,1168,1168,1168,1168,1168,1168,1168,1168,1168,1168,1168,1168,1168,1168,1168,1168,1168,1168,1168
nan_pct,0,0,0.513699,5.47945,46.8322,0,0,2.39726,2.39726,0,2.39726,0,2.39726,0,0,0,0,0,0,2.39726,0,0,0,0,0,80.0514,0,5.47945,5.47945,0,0,0,0,0,0,0,0,0,5.47945,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0856164,0,0,0,0,18.5788,0,0,0,0,0,0,0,0,0,0,0,0,5.47945,0.513699,0,0,0,0,0,0
unique,6,16,4,6,5,4,6,4,4,15,4,7,6,4,8,9,3,2,4,6,3,7,61,5,5,4,3,5,5,4,5,6,5,25,9,8,5,8,3,9,111,3,4,4,1168,2,2,4,4,5,6,5,4,685,15,12,19,107,7,66,17,98,188,244,361,630,657,394,20,10,94,286,549,12,118,890,734,571
#1_value,Normal 82.53%,VinylSd 35.10%,None 57.96%,Attchd 59.33%,Gd 26.11%,0 46.83%,PConc 44.52%,TA 44.61%,TA 89.47%,VinylSd 35.96%,No 65.84%,Typ 92.81%,Unf 86.39%,1 95.21%,3 55.57%,WD 86.64%,0 62.50%,Y 92.89%,2 53.51%,Unf 29.54%,0 94.35%,CompShg 98.37%,1950 12.33%,2009 23.63%,RL 79.11%,MnPrv 10.96%,Y 91.61%,TA 90.84%,TA 89.90%,Lvl 90.67%,2 57.96%,Gable 77.57%,Inside 70.38%,NAmes 15.50%,Norm 85.96%,Norm 99.06%,1Fam 83.73%,1Story 49.40%,Unf 41.10%,5 56.25%,2006 4.45%,Gtl 94.86%,0 58.65%,TA 50.43%,1460 0.09%,AllPub 99.91%,Pave 99.66%,Reg 62.41%,TA 62.24%,TA 87.16%,GasA 97.60%,Ex 49.91%,SBrkr 91.70%,0 7.88%,20 37.16%,6 17.64%,0 96.23%,60.0 9.59%,0 99.49%,0 91.70%,0 98.29%,0 86.30%,0 43.84%,0 52.91%,0 56.25%,0 2.40%,864 1.80%,0 5.48%,0 98.20%,6 26.63%,2005.0 4.45%,0.0 57.71%,0 31.93%,6 27.83%,0 88.78%,9600 1.97%,864 1.54%,140000 1.28%
#2_value,Partial 90.92%,Wd Sdng 49.23%,BrkFace 89.30%,Detchd 85.70%,TA 47.69%,1 91.70%,CBlock 87.67%,Gd 86.82%,Gd 94.18%,HdBoard 51.03%,Av 80.82%,Min2 95.29%,Rec 90.33%,2 99.74%,2 79.54%,New 94.95%,1 99.32%,N 100.00%,1 97.26%,GLQ 57.62%,1 99.91%,Tar&Grv 99.14%,2006 18.92%,2007 45.89%,RM 93.84%,GdPrv 15.24%,N 97.86%,Fa 93.15%,Fa 92.98%,Bnk 94.78%,1 81.85%,Hip 97.69%,Corner 89.30%,CollgCr 25.34%,Feedr 91.61%,Feedr 99.32%,TwnhsE 91.27%,2Story 80.22%,RFn 70.12%,6 73.89%,2005 8.90%,Mod 99.23%,1 98.89%,Gd 90.67%,498 0.17%,NoSeWa 100.00%,Grvl 100.00%,IR1 96.15%,Gd 95.46%,Gd 97.60%,GasW 98.89%,TA 79.62%,FuseA 97.60%,728 8.48%,60 57.71%,7 33.73%,400 96.92%,70.0 14.47%,738 99.57%,192 92.21%,180 98.46%,112 87.41%,36 45.89%,192 55.39%,504 56.93%,864 4.62%,912 2.91%,440 8.73%,80 98.46%,5 53.00%,2006.0 8.30%,108.0 58.30%,24 32.88%,7 50.77%,180 89.21%,7200 3.51%,1040 2.48%,135000 2.40%
#3_value,Abnorml 97.52%,MetalSd 63.36%,Stone 98.37%,BuiltIn 92.04%,Fa 50.00%,2 99.66%,BrkTil 97.60%,Ex 95.12%,Fa 97.52%,MetalSd 65.84%,Gd 89.64%,Min1 97.69%,LwQ 93.41%,3 99.91%,4 94.86%,COD 98.03%,2 100.00%,,3 99.57%,ALQ 72.86%,2 100.00%,WdShngl 99.49%,2005 24.23%,2006 67.29%,FV 98.37%,GdWo 19.18%,P 100.00%,Gd 93.92%,Gd 94.09%,HLS 97.77%,3 94.18%,Flat 98.63%,CulDSac 96.49%,OldTown 33.13%,Artery 95.03%,Artery 99.49%,Duplex 94.78%,1.5Fin 90.58%,Fin 94.52%,7 87.76%,2004 12.84%,Sev 100.00%,2 99.91%,Ex 97.26%,478 0.26%,,,IR2 99.32%,Ex 99.06%,Fa 99.66%,Grav 99.40%,Gd 96.32%,FuseF 99.66%,572 9.08%,50 67.38%,5 48.97%,500 97.60%,80.0 19.09%,648 99.66%,224 92.55%,144 98.63%,96 87.93%,20 47.35%,144 57.71%,728 57.53%,672 5.91%,1040 3.94%,576 11.90%,528 98.54%,7 75.09%,2004.0 12.16%,180.0 58.90%,16 33.56%,5 69.09%,374 89.47%,6000 4.71%,1200 3.25%,190000 3.42%
#4_value,Family 99.06%,HdBoard 77.31%,BrkCmn 99.49%,Basment 93.41%,Ex 51.80%,3 100.00%,Slab 99.32%,Fa 97.60%,Po 97.60%,Wd Sdng 80.48%,Mn 97.60%,Mod 98.80%,BLQ 95.29%,0 100.00%,1 97.77%,ConLD 98.63%,,,0 100.00%,BLQ 83.39%,,WdShake 99.74%,2007 29.28%,2008 88.36%,RH 99.66%,MnWw 19.95%,,Po 94.35%,Ex 94.35%,Low 100.00%,0 99.66%,Gambrel 99.40%,FR2 99.74%,Edwards 40.58%,RRAn 96.66%,PosN 99.66%,Twnhs 97.52%,SLvl 95.03%,,8 92.47%,2007 16.10%,,3 100.00%,Fa 100.00%,481 0.34%,,,IR3 100.00%,Fa 100.00%,Ex 99.91%,Wall 99.74%,Fa 99.91%,FuseP 99.91%,300 9.59%,120 72.86%,4 58.13%,700 98.03%,50.0 23.12%,555 99.74%,120 92.89%,216 98.80%,216 88.27%,40 48.63%,100 59.85%,720 58.13%,912 7.11%,672 4.71%,528 14.38%,53 98.63%,8 86.64%,2007.0 15.58%,200.0 59.42%,20 33.99%,8 82.02%,279 89.64%,10800 5.82%,894 3.94%,110000 4.45%
#5_value,Alloca 99.66%,Plywood 86.90%,,CarPort 94.01%,Po 53.17%,,Stone 99.74%,,,Plywood 87.41%,,Maj1 99.57%,ALQ 96.75%,,5 99.06%,ConLI 98.97%,,,,Rec 92.29%,,Metal 99.83%,2004 33.39%,2010 100.00%,C (all) 100.00%,,,Ex 94.52%,Po 94.52%,,4 100.00%,Mansard 99.83%,FR3 100.00%,Somerst 46.49%,PosN 97.95%,RRAn 99.74%,2fmCon 100.00%,SFoyer 97.43%,,4 96.23%,2003 19.18%,,,,482 0.43%,,,,,Po 100.00%,OthW 99.91%,Po 100.00%,,625 10.10%,70 77.31%,8 66.10%,450 98.37%,75.0 26.71%,519 99.83%,180 93.24%,168 98.97%,192 88.61%,48 49.83%,120 61.99%,546 58.73%,1040 7.96%,848 5.39%,240 16.78%,120 98.72%,4 94.18%,2003.0 19.01%,120.0 59.93%,936 34.42%,4 88.01%,182 89.81%,8400 6.76%,912 4.62%,155000 5.39%
#6_value,AdjLand 100.00%,CmentBd 90.67%,,2Types 94.52%,,,Wood 100.00%,,,CemntBd 91.27%,,Maj2 99.91%,GLQ 97.60%,,6 99.57%,ConLw 99.32%,,,,LwQ 97.60%,,Roll 99.91%,2003 36.90%,,,,,,,,,Shed 100.00%,,NWAmes 52.14%,RRAe 98.80%,PosA 99.83%,,1.5Unf 98.46%,,9 98.03%,1976 21.32%,,,,483 0.51%,,,,,,Floor 100.00%,,,672 10.62%,30 81.59%,3 73.29%,2000 98.72%,65.0 29.71%,512 99.91%,189 93.58%,130 99.06%,144 88.96%,45 51.03%,168 63.78%,672 59.25%,816 8.82%,894 6.08%,484 19.01%,144 98.80%,9 97.26%,1998.0 21.32%,106.0 60.45%,1200 34.76%,9 93.58%,539 89.98%,9000 7.62%,816 5.22%,125000 6.25%


#### Foreshadow use a machine learning model to identify the 'intent' of features. 3 intents are supported as of v1.0 and they are 'Categorical', 'Numeric' and 'Text'. Foreshadow will transform the features intelligently according to its intent and statistics. Features not belonging to these three are tagged as 'Droppable'. For example, the Id is droppable since it has a unique value for each row and will not provide any signal to the model. Also in the above table, 'Label' in the intent row indicate that is the target column.

# Model Training Iteration 2 - Override

In [7]:
shadow2 = Foreshadow(problem_type=ProblemType.REGRESSION, random_state=RANDOM_SEED, n_jobs=-1, estimator=ElasticNet(random_state=RANDOM_SEED))
shadow2.override_intent('ExterQual', IntentType.CATEGORICAL)
shadow2.override_intent('ExterCond', IntentType.CATEGORICAL)
shadow2.override_intent('LotShape', IntentType.CATEGORICAL)
shadow2.override_intent('HeatingQC', IntentType.CATEGORICAL)
shadow2.override_intent('YearBuilt', IntentType.NUMERIC)
shadow2.override_intent('YearRemodAdd', IntentType.NUMERIC)
shadow2.override_intent('YrSold', IntentType.NUMERIC)
_ = shadow2.fit(X_train, y_train)

2020-03-29 12:21:00,114 - foreshadow - INFO - 64366 - The foreshadow object is not trained yet. Please make sure the column ExterQual exist to ensure the override takes effect.
2020-03-29 12:21:00,115 - foreshadow - INFO - 64366 - The foreshadow object is not trained yet. Please make sure the column ExterCond exist to ensure the override takes effect.
2020-03-29 12:21:00,116 - foreshadow - INFO - 64366 - The foreshadow object is not trained yet. Please make sure the column LotShape exist to ensure the override takes effect.
2020-03-29 12:21:00,117 - foreshadow - INFO - 64366 - The foreshadow object is not trained yet. Please make sure the column HeatingQC exist to ensure the override takes effect.
2020-03-29 12:21:00,117 - foreshadow - INFO - 64366 - The foreshadow object is not trained yet. Please make sure the column YearBuilt exist to ensure the override takes effect.
2020-03-29 12:21:00,118 - foreshadow - INFO - 64366 - The foreshadow object is not trained yet. Please make sure the

In [8]:
_ = measure(shadow2, X_test, y_test)

2020-03-29 12:21:15,368 - foreshadow - INFO - 64366 - Exported processed data to processed_test_data.csv


root mean squared log error = 0.1721


# Model Training Iteration 3 - AutoEstimator

#### Instead of trying one estimator, we can leverage AutoEstimator to search ML models and hyper-parameters. When we do not provide an estimator, Foreshadow will create the AutoEstimator automatically. 

In [9]:
shadow3 = Foreshadow(problem_type=ProblemType.REGRESSION, allowed_seconds=300, random_state=RANDOM_SEED, n_jobs=-1)
shadow3.override_intent('ExterQual', IntentType.CATEGORICAL)
shadow3.override_intent('ExterCond', IntentType.CATEGORICAL)
shadow3.override_intent('LotShape', IntentType.CATEGORICAL)
shadow3.override_intent('HeatingQC', IntentType.CATEGORICAL)
shadow3.override_intent('YearBuilt', IntentType.NUMERIC)
shadow3.override_intent('YearRemodAdd', IntentType.NUMERIC)
shadow3.override_intent('YrSold', IntentType.NUMERIC)
_ = shadow3.fit(X_train, y_train)

2020-03-29 12:21:15,412 - foreshadow - INFO - 64366 - The foreshadow object is not trained yet. Please make sure the column ExterQual exist to ensure the override takes effect.
2020-03-29 12:21:15,413 - foreshadow - INFO - 64366 - The foreshadow object is not trained yet. Please make sure the column ExterCond exist to ensure the override takes effect.
2020-03-29 12:21:15,414 - foreshadow - INFO - 64366 - The foreshadow object is not trained yet. Please make sure the column LotShape exist to ensure the override takes effect.
2020-03-29 12:21:15,415 - foreshadow - INFO - 64366 - The foreshadow object is not trained yet. Please make sure the column HeatingQC exist to ensure the override takes effect.
2020-03-29 12:21:15,416 - foreshadow - INFO - 64366 - The foreshadow object is not trained yet. Please make sure the column YearBuilt exist to ensure the override takes effect.
2020-03-29 12:21:15,417 - foreshadow - INFO - 64366 - The foreshadow object is not trained yet. Please make sure the

HBox(children=(FloatProgress(value=0.0, description='Optimization Progress', style=ProgressStyle(description_w…

Generation 1 - Current best internal CV score: -819018849.1253982

5.12 minutes have elapsed. TPOT will close down.
TPOT closed during evaluation in one generation.


TPOT closed prematurely. Will use the current best pipeline.

Best pipeline: RandomForestRegressor(RidgeCV(input_matrix), bootstrap=True, max_features=0.5, min_samples_leaf=6, min_samples_split=14, n_estimators=100)


In [10]:
_ = measure(shadow3, X_test, y_test)

2020-03-29 12:26:40,688 - foreshadow - INFO - 64366 - Exported processed data to processed_test_data.csv


root mean squared log error = 0.1483


# Model Training Iteration 4 - Customize Scoring Function in Search

#### The Kaggle competition use root mean squared log error to rank result. Let's ask the AutoEstimator to optimize for this scoring function

In [11]:
shadow4 = Foreshadow(problem_type=ProblemType.REGRESSION, allowed_seconds=300, random_state=RANDOM_SEED, 
                     n_jobs=-1, auto_estimator_kwargs={"scoring": get_scorer('neg_mean_squared_log_error')})
shadow4.override_intent('ExterQual', IntentType.CATEGORICAL)
shadow4.override_intent('ExterCond', IntentType.CATEGORICAL)
shadow4.override_intent('LotShape', IntentType.CATEGORICAL)
shadow4.override_intent('HeatingQC', IntentType.CATEGORICAL)
shadow4.override_intent('YearBuilt', IntentType.NUMERIC)
shadow4.override_intent('YearRemodAdd', IntentType.NUMERIC)
shadow4.override_intent('YrSold', IntentType.NUMERIC)
_ = shadow4.fit(X_train, y_train)

2020-03-29 12:26:40,747 - foreshadow - INFO - 64366 - The foreshadow object is not trained yet. Please make sure the column ExterQual exist to ensure the override takes effect.
2020-03-29 12:26:40,748 - foreshadow - INFO - 64366 - The foreshadow object is not trained yet. Please make sure the column ExterCond exist to ensure the override takes effect.
2020-03-29 12:26:40,748 - foreshadow - INFO - 64366 - The foreshadow object is not trained yet. Please make sure the column LotShape exist to ensure the override takes effect.
2020-03-29 12:26:40,750 - foreshadow - INFO - 64366 - The foreshadow object is not trained yet. Please make sure the column HeatingQC exist to ensure the override takes effect.
2020-03-29 12:26:40,752 - foreshadow - INFO - 64366 - The foreshadow object is not trained yet. Please make sure the column YearBuilt exist to ensure the override takes effect.
2020-03-29 12:26:40,753 - foreshadow - INFO - 64366 - The foreshadow object is not trained yet. Please make sure the

HBox(children=(FloatProgress(value=0.0, description='Optimization Progress', style=ProgressStyle(description_w…

Generation 1 - Current best internal CV score: -0.01865521912350544

5.29 minutes have elapsed. TPOT will close down.
TPOT closed during evaluation in one generation.


TPOT closed prematurely. Will use the current best pipeline.

Best pipeline: RandomForestRegressor(ExtraTreesRegressor(RidgeCV(input_matrix), bootstrap=True, max_features=0.8, min_samples_leaf=5, min_samples_split=6, n_estimators=100), bootstrap=True, max_features=0.5, min_samples_leaf=12, min_samples_split=14, n_estimators=100)


In [12]:
rmsle = measure(shadow4, X_train, y_train)

2020-03-29 12:32:16,312 - foreshadow - INFO - 64366 - Exported processed data to processed_test_data.csv


root mean squared log error = 0.0885


## Compare Kaggle Leaderboard

In [13]:
leaderboard = pd.read_csv('house-prices-advanced-regression-techniques-publicleaderboard.csv')
leaderboard.sort_values(by='Score', ascending=True, inplace=True)
better_solutions = leaderboard[leaderboard.Score < rmsle]
ranking = len(better_solutions) * 100.0 / len(leaderboard)
print('Our solution ranked at %dth position within top %0.2f%%' % (len(better_solutions), ranking))

Our solution ranked at 26th position within top 0.19%


In [14]:
test = pd.read_csv('test.csv')


In [15]:
test.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,Inside,Gtl,NAmes,Feedr,Norm,1Fam,1Story,5,6,1961,1961,Gable,CompShg,VinylSd,VinylSd,,0.0,TA,TA,CBlock,TA,TA,No,Rec,468.0,LwQ,144.0,270.0,882.0,GasA,TA,Y,SBrkr,896,0,0,896,0.0,0.0,1,0,2,1,TA,5,Typ,0,,Attchd,1961.0,Unf,1.0,730.0,TA,TA,Y,140,0,0,0,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,Corner,Gtl,NAmes,Norm,Norm,1Fam,1Story,6,6,1958,1958,Hip,CompShg,Wd Sdng,Wd Sdng,BrkFace,108.0,TA,TA,CBlock,TA,TA,No,ALQ,923.0,Unf,0.0,406.0,1329.0,GasA,TA,Y,SBrkr,1329,0,0,1329,0.0,0.0,1,1,3,1,Gd,6,Typ,0,,Attchd,1958.0,Unf,1.0,312.0,TA,TA,Y,393,36,0,0,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,Inside,Gtl,Gilbert,Norm,Norm,1Fam,2Story,5,5,1997,1998,Gable,CompShg,VinylSd,VinylSd,,0.0,TA,TA,PConc,Gd,TA,No,GLQ,791.0,Unf,0.0,137.0,928.0,GasA,Gd,Y,SBrkr,928,701,0,1629,0.0,0.0,2,1,3,1,TA,6,Typ,1,TA,Attchd,1997.0,Fin,2.0,482.0,TA,TA,Y,212,34,0,0,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,Inside,Gtl,Gilbert,Norm,Norm,1Fam,2Story,6,6,1998,1998,Gable,CompShg,VinylSd,VinylSd,BrkFace,20.0,TA,TA,PConc,TA,TA,No,GLQ,602.0,Unf,0.0,324.0,926.0,GasA,Ex,Y,SBrkr,926,678,0,1604,0.0,0.0,2,1,3,1,Gd,7,Typ,1,Gd,Attchd,1998.0,Fin,2.0,470.0,TA,TA,Y,360,36,0,0,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,Inside,Gtl,StoneBr,Norm,Norm,TwnhsE,1Story,8,5,1992,1992,Gable,CompShg,HdBoard,HdBoard,,0.0,Gd,TA,PConc,Gd,TA,No,ALQ,263.0,Unf,0.0,1017.0,1280.0,GasA,Ex,Y,SBrkr,1280,0,0,1280,0.0,0.0,2,0,2,1,Gd,5,Typ,0,,Attchd,1992.0,RFn,2.0,506.0,TA,TA,Y,0,82,0,0,144,0,,,,0,1,2010,WD,Normal


In [16]:
pred = shadow4.predict(test)

2020-03-29 12:32:19,032 - foreshadow - INFO - 64366 - Exported processed data to processed_test_data.csv


In [17]:
pred

array([119308.38905396, 149935.97377123, 179095.82853535, ...,
       168496.05743447, 115339.76860167, 233267.59634448])

In [18]:
test['SalePrice'] = pred

In [19]:
test[['Id', 'SalePrice']].to_csv('submission.csv', index=False)