# Introduction
**This will be your workspace for Kaggle's Machine Learning education track.**

You will build and continually improve a model to predict housing prices as you work through each tutorial.  Fork this notebook and write your code in it.

The tutorials use the Melbourne housing data, which is not available in this workspace.  You will need to apply the same concepts to the Iowa housing data provided in this notebook.

Visit the [Learn Discussion](https://www.kaggle.com/learn-forum) forum with any questions or comments.

# Write Your Code Below



In [1]:
import pandas as pd
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

# Locations of the training and test CSV files, relative to the notebook.
main_file_path = '../input/train.csv'
test_file_path = '../input/test.csv'

# Load the training set and list its columns so the available fields are visible.
data = pd.read_csv(filepath_or_buffer=main_file_path)
print(data.columns)

def get_mae(max_leaf_nodes, predictors_train, predictors_val, targ_train, targ_val):
    """Return the validation mean absolute error of a size-limited decision tree.

    A ``DecisionTreeRegressor`` (seeded with ``random_state=0``) capped at
    ``max_leaf_nodes`` leaves is fitted on the training data, then scored on
    the validation data.

    Args:
        max_leaf_nodes: Upper bound on the number of leaves in the tree.
        predictors_train: Feature values used to fit the tree.
        predictors_val: Feature values the fitted tree predicts on.
        targ_train: Target values paired with ``predictors_train``.
        targ_val: True target values paired with ``predictors_val``.

    Returns:
        The mean absolute error between ``targ_val`` and the tree's
        predictions for ``predictors_val``.
    """
    tree = DecisionTreeRegressor(random_state=0, max_leaf_nodes=max_leaf_nodes)
    tree.fit(predictors_train, targ_train)
    return mean_absolute_error(targ_val, tree.predict(predictors_val))

# Prediction target: the sale price column (named y by convention).
y = data['SalePrice']

# Feature columns fed to the models (the resulting frame is named X by convention).
predictors = [
    'LotArea',
    'YearBuilt',
    '1stFlrSF',
    '2ndFlrSF',
    'FullBath',
    'BedroomAbvGr',
    'TotRmsAbvGrd',
]
X = data[predictors]

# Hold out part of the rows for validation; the fixed seed keeps the split
# the same from run to run.
train_X, test_X, train_y, test_y = train_test_split(X, y, random_state=0)


# Try several tree sizes and remember the leaf count with the lowest
# validation MAE.
# NOTE(review): the hold-out split (test_X / test_y) used here to pick the leaf
# count is also the one used afterwards to report the chosen tree's error, so
# that reported error is likely optimistic.
lowest_mae = None
node_count_with_lowest_mae = None
for max_leaf_nodes in [5, 50, 500, 5000]:
    mae = get_mae(max_leaf_nodes, train_X, test_X, train_y, test_y)
    print("Max leaf nodes: %d  \t\t Mean Absolute Error:  %d" %(max_leaf_nodes, mae))
    # Test the sentinel with `is None` on the value actually being compared,
    # rather than relying on the truthiness of the stored leaf count.
    # The strict `<` keeps the first candidate on ties.
    if lowest_mae is None or mae < lowest_mae:
        lowest_mae = mae
        node_count_with_lowest_mae = max_leaf_nodes

# Refit a decision tree using the leaf count that scored best above.
print(f"Fit using leaf count of {node_count_with_lowest_mae}")
model = DecisionTreeRegressor(random_state=0, max_leaf_nodes=node_count_with_lowest_mae).fit(train_X, train_y)

# Predict prices for the held-out rows and show them.
predicted_prices = model.predict(test_X)
print("The predictions are:")
print(predicted_prices)

print("") #skip a line
# Report the mean absolute error of the chosen tree on the held-out rows.
best_tree_mae = mean_absolute_error(test_y, predicted_prices)
print("Our Best DecisionTreeRegressor Error Rate is:")
print(best_tree_mae)

# Compare the tuned decision tree against a random forest trained and scored
# on the same split.
print("vs a RandomForestRegressor Error Rate of:")
# Imported here rather than at the top of the file; the submission code further
# down also relies on this import, so it must stay ahead of it.
from sklearn.ensemble import RandomForestRegressor
# Seed the forest like every other randomised step in this script
# (random_state=0) so the printed comparison is reproducible between runs.
forest_model = RandomForestRegressor(random_state=0)
forest_model.fit(train_X, train_y)
forest_predictions = forest_model.predict(test_X)
print(mean_absolute_error(test_y, forest_predictions))

#submission
# Train a random forest on the full training set (no hold-out) and write
# predictions for the competition test file to submission.csv.

# Reuse the training frame and predictor list defined earlier in the script
# instead of re-reading the CSV and repeating the column names, so the
# submission model cannot drift from the columns used during evaluation.
submission_train_data = data
submission_train_y = submission_train_data.SalePrice
submission_prediction_columns = list(predictors)
submission_train_X = submission_train_data[submission_prediction_columns]

# Fixed seed (matching the rest of the script) so the submission file is
# reproducible from run to run.
submission_model = RandomForestRegressor(random_state=0)
submission_model.fit(submission_train_X, submission_train_y)

# Predict on the test file using the same feature columns.
submission_test_data = pd.read_csv(test_file_path)
submission_test_X = submission_test_data[submission_prediction_columns]
submission_prediction = submission_model.predict(submission_test_X)

# One row per test Id with its predicted SalePrice; index=False keeps the
# DataFrame index out of the CSV.
submission = pd.DataFrame({'Id': submission_test_data.Id, 'SalePrice': submission_prediction})
submission.to_csv('submission.csv', index=False)