In [1]:

import pandas as pd
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor


# Path of the file to read
iowa_file_path ="C:/Users/Ripple/Desktop/Data Science/ML/train.csv"

home_data = pd.read_csv(iowa_file_path)
# Create target object and call it y
y = home_data.SalePrice
# Create X
features = ['LotArea', 'YearBuilt', '1stFlrSF', '2ndFlrSF', 'FullBath', 'BedroomAbvGr', 'TotRmsAbvGrd']
X = home_data[features]

# Split into validation and training data
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)

# Specify Model
iowa_model = DecisionTreeRegressor(random_state=1)
# Fit Model
iowa_model.fit(train_X, train_y)

# Make validation predictions and calculate mean absolute error
val_predictions = iowa_model.predict(val_X)
val_mae = mean_absolute_error(val_predictions, val_y)
print("Validation MAE when not specifying max_leaf_nodes: {:,.0f}".format(val_mae))

# Using best value for max_leaf_nodes
iowa_model = DecisionTreeRegressor(max_leaf_nodes=100, random_state=1)
iowa_model.fit(train_X, train_y)
val_predictions = iowa_model.predict(val_X)
val_mae = mean_absolute_error(val_predictions, val_y)
print("Validation MAE for best value of max_leaf_nodes: {:,.0f}".format(val_mae))

Validation MAE when not specifying max_leaf_nodes: 29,653
Validation MAE for best value of max_leaf_nodes: 27,283


In [4]:
#function for mean absolute error
def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y):
    model = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=0)
    model.fit(train_X, train_y)
    preds_val = model.predict(val_X)
    mae = mean_absolute_error(val_y, preds_val)
    return(mae)

In [9]:
#using list conversion mae_vals taking as a list 
candidate_max_leaf_nodes = [5, 25, 50, 100, 250, 500]
mae_vals=[]
# Write loop to find the ideal tree size from candidate_max_leaf_nodes
for i in range(0,6):
 mae_vals.append(get_mae(candidate_max_leaf_nodes[i], train_X, val_X, train_y, val_y))
 min_mae_vals=min(mae_vals)  
 index=mae_vals.index(min_mae_vals)
print(mae_vals)    
# Store the best value of max_leaf_nodes (it will be either 5, 25, 50, 100, 250 or 500)
best_tree_size=candidate_max_leaf_nodes[index]
print(best_tree_size)

[35044.51299744237, 29016.41319191076, 27405.930473214907, 27282.50803885739, 27893.822225701646, 29454.18598068598]
100


In [6]:
#using list comprehension
candidate_max_leaf_nodes = [5, 25, 50, 100, 250, 500]

# Write loop to find the ideal tree size from candidate_max_leaf_nodes

mae_vals=[get_mae(candidate_max_leaf_nodes[i], train_X, val_X, train_y, val_y) for i in range(0,6)]
print(mae_vals)
min_mae_vals=min(mae_vals)  
index=mae_vals.index(min_mae_vals)


# Store the best value of max_leaf_nodes (it will be either 5, 25, 50, 100, 250 or 500)
best_tree_size = candidate_max_leaf_nodes[index]
print(best_tree_size)

[35044.51299744237, 29016.41319191076, 27405.930473214907, 27282.50803885739, 27893.822225701646, 29454.18598068598]
100


In [11]:
# Fill in argument to make optimal size and uncomment
final_model = DecisionTreeRegressor(max_leaf_nodes=best_tree_size,random_state=1)

# fit the final model
final_model.fit(X, y)

In [12]:
#model prediction when not using best tree size
print(iowa_model.predict(val_X.head()))
print(val_y.head())

[186500. 184000. 130000.  92000. 164500.]
258     231500
267     179500
288     122000
649      84500
1233    142000
Name: SalePrice, dtype: int64


In [13]:
#model predictions when using best tree size
print(final_model.predict(val_X.head()))
print(val_y.head())
#results show predictions are much more closer

[182392.31944444 163142.5        125362.57333333  70167.58333333
 146415.0075188 ]
258     231500
267     179500
288     122000
649      84500
1233    142000
Name: SalePrice, dtype: int64


In [24]:
#predicting using Random Forest to compare how much a better model Random forest is for prediction
from sklearn.ensemble import RandomForestRegressor

# Define the model. Set random_state to 1
rf_model = RandomForestRegressor(random_state=1)

# fit your model
rf_model.fit(train_X,train_y)
rf_preds=rf_model.predict(val_X)
# Calculate the mean absolute error of your Random Forest model on the validation data
rf_val_mae = mean_absolute_error(val_y,rf_preds)

print("Validation MAE for Random Forest Model: {:,.0f}".format(rf_val_mae))
#we can see the validation MAE for Random Forest is much better than that of decision tree

Validation MAE for Random Forest Model: 21,857


In [38]:
# model prediction when using Random forest
print(iowa_model.predict(val_X.head()))
print(val_y.head())

[181225.35416667 173500.         122142.35714286  94060.
 148515.31111111]
258     231500
267     179500
288     122000
649      84500
1233    142000
Name: SalePrice, dtype: int64


In [27]:
#The code cell above trains a Random Forest model on train_X and train_y.
#the code cell below builds a Random Forest model and trains it on all of X and y.
# To improve accuracy, create a new Random Forest model which you will train on all training data
rf_model_on_full_data = RandomForestRegressor(random_state=1)
# fit rf_model_on_full_data on all data from the training data
rf_model_on_full_data.fit(X,y)

In [43]:
# path to file you will use for predictions
test_data_path = "C:/Users/Ripple/Desktop/Data Science/ML/test.csv"

# read test data file using pandas
test_data = pd.read_csv(test_data_path)

# create test_X which comes from test_data but includes only the columns you used for prediction.
# The list of columns is stored in a variable called features
test_data_features=['LotArea', 'YearBuilt', '1stFlrSF', '2ndFlrSF', 'FullBath', 'BedroomAbvGr', 'TotRmsAbvGrd']

test_X = test_data[test_data_features]
#make predictions
#test_preds = rf_model_on_full_data.predict(test_X)
#printing the predictions for house prices for first 5 test data
print(rf_model_on_full_data.predict(test_X.head())) 

[122656.58 156789.   182959.   178102.   189049.48]
