# Imports

In [1]:
import pandas as pd
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

# File to read

In [2]:
# Relative path of the CSV file that is loaded with pd.read_csv further down.
iowa_file_path = "01_input/train.csv"

# Load to pandas

In [3]:
# Read the whole CSV into a DataFrame, relying on pandas' default parsing options.
home_data = pd.read_csv(iowa_file_path)

In [4]:
# Preview the first rows, transposed so that each column is shown as a row.
home_data.head().T

Unnamed: 0,0,1,2,3,4
Id,1,2,3,4,5
MSSubClass,60,20,60,70,60
MSZoning,RL,RL,RL,RL,RL
LotFrontage,65.0,80.0,68.0,60.0,84.0
LotArea,8450,9600,11250,9550,14260
...,...,...,...,...,...
MoSold,2,5,9,2,12
YrSold,2008,2007,2008,2006,2008
SaleType,WD,WD,WD,WD,WD
SaleCondition,Normal,Normal,Normal,Abnorml,Normal


# Creating Target Variable

In [5]:
# Target: the column the model is trained to predict.
y = home_data["SalePrice"]

# Creating Independent Variables

In [6]:
# Columns used as model inputs (one per line so additions diff cleanly).
features = [
    "LotArea",
    "YearBuilt",
    "1stFlrSF",
    "2ndFlrSF",
    "FullBath",
    "BedroomAbvGr",
    "TotRmsAbvGrd",
]
X = home_data[features]

In [7]:
# Quick look at the first rows of the selected feature columns.
X.head()

Unnamed: 0,LotArea,YearBuilt,1stFlrSF,2ndFlrSF,FullBath,BedroomAbvGr,TotRmsAbvGrd
0,8450,2003,856,854,2,3,8
1,9600,1976,1262,0,2,3,6
2,11250,2001,920,866,2,3,6
3,9550,1915,961,756,1,3,7
4,14260,2000,1145,1053,2,4,9


# Split into Train and Test data

In [8]:
# Hold out a quarter of the rows for validation (0.25 is scikit-learn's default
# test fraction, spelled out here); the fixed seed keeps the split reproducible.
train_X, test_X, train_y, test_y = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=1,
)
print(f"Train Size: \t{len(train_X)}\nTest Size: \t{len(test_X)}")

Train Size: 	1095
Test Size: 	365


# Specify Model

In [9]:
# Baseline tree with the library's default size settings; random_state is fixed
# so repeated runs build the same tree.
iowa_model = DecisionTreeRegressor(random_state=1)

# Fit Model

In [10]:
# Fit on the training split only; test_X / test_y are kept aside for scoring.
iowa_model.fit(train_X, train_y)

DecisionTreeRegressor(random_state=1)

# Make Predictions and Calculate Mean Absolute Error

In [11]:
# Score the fitted baseline tree on the held-out rows.
predictions = iowa_model.predict(test_X)
mean_abs_err = mean_absolute_error(y_true=test_y, y_pred=predictions)
print(f"Validation MAE: {mean_abs_err:,.0f}")

Validation MAE: 29,653


# Compare Different Tree Sizes

In [12]:
def get_mae(max_leaf_nodes, train_X=train_X, test_X=test_X, train_y=train_y,
            test_y=test_y):
    """Return the validation MAE of a tree limited to ``max_leaf_nodes`` leaves.

    A new ``DecisionTreeRegressor`` (``random_state=0``) is fitted on
    ``train_X`` / ``train_y`` and its predictions for ``test_X`` are compared
    with ``test_y`` using ``mean_absolute_error``.

    Note: the data defaults are bound when this ``def`` statement runs, so the
    cell must be re-executed after the train/test split is re-created.
    """
    tree = DecisionTreeRegressor(random_state=0, max_leaf_nodes=max_leaf_nodes)
    tree.fit(train_X, train_y)
    return mean_absolute_error(test_y, tree.predict(test_X))

# Determine the Best Size

In [13]:
candidate_max_leaf_nodes = [5, 25, 50, 100, 250, 500]

# Pair every candidate size with the validation MAE get_mae reports for it.
maes = dict(zip(candidate_max_leaf_nodes, map(get_mae, candidate_max_leaf_nodes)))
print(maes)
# The best size is the key whose MAE is the smallest.
print(f"\nBest Tree Size: {min(maes, key=maes.get)}")

{5: 35044.51299744237, 25: 29016.41319191076, 50: 27405.930473214907, 100: 27282.50803885739, 250: 27893.822225701646, 500: 29454.18598068598}

Best Tree Size: 100


# Fit Model Using All Data

In [14]:
# Take the leaf count that scored best on validation from `maes` instead of
# hard-coding it, so this cell stays correct if the data or candidates change.
best_tree_size = min(maes, key=maes.get)

# The size is now chosen, so train on every available row (no hold-out needed).
final_model = DecisionTreeRegressor(max_leaf_nodes=best_tree_size, random_state=0)
final_model.fit(X, y)

DecisionTreeRegressor(max_leaf_nodes=100, random_state=0)