In [47]:
import pandas as pd 
import numpy as np 
house_file = 'Housing_prices.csv'
house_data = pd.read_csv(house_file)

In [48]:
for columns in house_data:
    if house_data[columns].isnull().sum() >1000:
        house_data = house_data.drop(columns, axis = 1)

In [49]:
house_data.fillna(0, inplace= True)

In [50]:
y = house_data.SalePrice.astype(np.float32, copy= False)
features = ['LotFrontage', 'LotArea', 'MoSold','YrSold']
X = house_data[features].astype(np.float32, copy = False)

In [51]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error 

In [52]:
train_X, val_X, train_y, val_y = train_test_split(X,y, random_state=1)

In [53]:
def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y):
    model = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=0)
    model.fit(train_X, train_y)
    predict_val = model.predict(val_X)
    mae = mean_absolute_error(val_y, predict_val)
    return (mae)

In [54]:
possible_max_leaf_nodes = [5,25,50,100,250,500]
for max_leaf_nodes in possible_max_leaf_nodes:
    my_mae = get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y)
    print(max_leaf_nodes, my_mae)

5 51475.11845672607
25 50632.628548172936
50 51577.69772125109
100 54860.18111548307
250 60531.033244187274
500 62620.446878370654


In [55]:
score = {leaf_size: get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y) for leaf_size in possible_max_leaf_nodes}
best_size_nodes = min(score, key= score.get)
print(best_size_nodes)

5


In [56]:
house_final_model = DecisionTreeRegressor(max_leaf_nodes= best_size_nodes, random_state = 0)
house_final_model.fit(X, y)


DecisionTreeRegressor(max_leaf_nodes=5, random_state=0)

In [57]:
#using the best_size_node in all data
predictions_final = house_final_model.predict(X)
val_mae = mean_absolute_error(y, predictions_final)

In [58]:
from sklearn.ensemble import RandomForestRegressor
forest_model = RandomForestRegressor(random_state = 1)
forest_model.fit(train_X, train_y)
forest_predict = forest_model.predict(val_X)
forest_mae = mean_absolute_error(val_y, forest_predict)

In [59]:
forest_model_all = RandomForestRegressor(random_state = 1)
forest_model_all.fit(X, y)
forest_predict_all = forest_model_all.predict(X)
forest_mae_all = mean_absolute_error(y, forest_predict_all)

In [60]:
print(get_mae(5, train_X, val_X, train_y, val_y)) # With best size node in the specific data
print(my_mae)# Using own max_leaf_nodes 
print(val_mae) # MAE in the whole data using the best prediction
print(forest_mae)# Mae in the specific data with Random forest
print(forest_mae_all)# Mae in the whole data with Random forest


51475.11845672607
62620.446878370654
49694.58277767755
50011.58514855404
18382.91454082681
