
**Creating training and validation data sets**

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the training data from the input directory.
main_file_path = "../input/train.csv"
data = pd.read_csv(main_file_path)

# Show the column names in alphabetical order so they are easier to read.
print(data.reindex(columns=sorted(data.columns)).columns)

# Columns fed to the models, and the column the models should predict.
predictor_labels = [
    "LotArea",
    "YearBuilt",
    "1stFlrSF",
    "2ndFlrSF",
    "FullBath",
    "BedroomAbvGr",
    "TotRmsAbvGrd",
]
predictors = data[predictor_labels]
target = data["SalePrice"]

# Hold out part of the rows for validation; the fixed seed keeps the split repeatable.
train_X, val_X, train_y, val_y = train_test_split(predictors, target, random_state=0)

**Set up a decision tree and check mean absolute error**

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error

# Fit a single, unconstrained decision tree on the training split.
# random_state is pinned so the fitted tree, and therefore the error printed
# below, is the same on every run of the notebook.
housing_model = DecisionTreeRegressor(random_state=0)
housing_model.fit(train_X, train_y)

# Score the tree on the held-out validation rows.
predicted_prices = housing_model.predict(val_X)
mae = mean_absolute_error(val_y, predicted_prices)

print(mae)

**Test decision trees with different numbers of leaf nodes**

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

def get_mae(max_leaf_nodes, predictors_train, predictors_val, targ_train, targ_val):
    """Return the validation mean absolute error of a size-limited decision tree.

    A DecisionTreeRegressor limited to ``max_leaf_nodes`` leaves (seeded with
    ``random_state=0``) is fitted on ``predictors_train`` / ``targ_train``.
    Its predictions for ``predictors_val`` are then scored against ``targ_val``.
    """
    tree = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=0)
    val_predictions = tree.fit(predictors_train, targ_train).predict(predictors_val)
    return mean_absolute_error(targ_val, val_predictions)

# Compare the validation error of trees with progressively larger leaf budgets.
leaf_node_options = (5, 50, 500, 5000)
for max_leaf_nodes in leaf_node_options:
    my_mae = get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y)
    # %d prints the error as a whole number.
    report_line = "Max leaf nodes: %d \t\t Mean Absolute Error: %d" % (max_leaf_nodes, my_mae)
    print(report_line)


**Test a random forest, which uses multiple decision trees and averages their results**

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Fit a random forest on the training split.
# random_state is pinned because the forest is built with randomness; without
# a seed the validation error printed below changes from run to run.
forest_model = RandomForestRegressor(random_state=0)
forest_model.fit(train_X, train_y)

# Report the mean absolute error on the validation split.
forest_price_predictions = forest_model.predict(val_X)
print(mean_absolute_error(val_y, forest_price_predictions))

**The first submission**

In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor

# Read the training data.
train = pd.read_csv('../input/train.csv')

# Pull the data into target (y) and predictors (X).
# NOTE: this rebinds train_X and train_y, which the earlier cells bound to the
# train/validation split. From here on they hold every row of train.csv and a
# different set of predictor columns.
train_y = train.SalePrice
predictor_cols = ['LotArea', 'OverallQual', 'YearBuilt', 'TotRmsAbvGrd']

# Create training predictors data.
train_X = train[predictor_cols]

# Seed the forest so that re-running this cell writes the same submission file.
my_model = RandomForestRegressor(random_state=0)
my_model.fit(train_X, train_y)

# Read the test data.
test = pd.read_csv('../input/test.csv')
# Treat the test data in the same way as training data. In this case, pull same columns.
test_X = test[predictor_cols]
# Use the model to make predictions.
predicted_prices = my_model.predict(test_X)
# We will look at the predicted prices to ensure we have something sensible.
print(predicted_prices)

# One row per test Id with its predicted SalePrice.
my_submission = pd.DataFrame({'Id': test.Id, 'SalePrice': predicted_prices})
# You could use any filename. We choose submission here.
my_submission.to_csv('submission.csv', index=False)
