# Introduction
**This will be your workspace for Kaggle's Machine Learning education track.**

You will build and continually improve a model to predict housing prices as you work through each tutorial.  Fork this notebook and write your code in it.

The data from the tutorial, the Melbourne data, is not available in this workspace.  You will need to translate the concepts to work with the data in this notebook, the Iowa data.

Visit the [Learn Discussion](https://www.kaggle.com/learn-forum) forum with any questions or comments.

# Write Your Code Below



In [None]:
import pandas as pd

# Location of the training CSV, relative to the notebook's working directory.
main_file_path = '../input/train.csv'

# Load the whole training table; later cells slice the target and predictors from it.
data = pd.read_csv(main_file_path)

# Show summary statistics for the loaded table.
data_summary = data.describe()
print(data_summary)

In [None]:
# List every column name in the training table.
print("Columns of data: ", data.columns, sep="\n")

# Extract the SalePrice column as a Series (bracket access is equivalent to
# data.SalePrice) and preview its first rows.
data_SalePrice = data['SalePrice']
print("\nSalePrice of data: ", data_SalePrice.head(), sep="\n")

# Copy two columns of interest into their own DataFrame and summarize them.
columns_Condition1_Condition2 = ['Condition1', 'Condition2']
data_Condition1_Condition2 = data[columns_Condition1_Condition2]
print(
    "\ndata_Condition1_Condition2: ",
    data_Condition1_Condition2.describe(),
    sep="\n",
)

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Prediction target: the SalePrice Series extracted in the previous cell.
y = data_SalePrice

# Feature columns fed to the model.
data_predictors = [
    'LotArea', 'YearBuilt', '1stFlrSF', '2ndFlrSF',
    'FullBath', 'BedroomAbvGr', 'TotRmsAbvGrd',
]

# Predictor matrix: just the chosen feature columns.
X = data[data_predictors]

# Build the decision tree and fit it on the full table (fit() returns the estimator).
model = DecisionTreeRegressor().fit(X, y)

# Make a few predictions and show them next to the inputs and the true prices.
first_five_X = X.head()
print("Making predictions for the first 5 houses:")
print(first_five_X, "\n")
print("The first 5 house price in the training set:")
print(y.head(), "\n")
print("The predictions by model are")
print(model.predict(first_five_X))

In [None]:
# validation
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

# Split the predictors and target into a training part and a held-out
# validation part; random_state pins the shuffle so the split is repeatable.
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=0)

# Seed the tree as well (same random_state=0 that get_mae uses) so the MAE
# printed below is identical on every Restart & Run All.
model = DecisionTreeRegressor(random_state=0)

# Fit on the training part only, so validation rows stay unseen.
model.fit(train_X, train_y)

# Predict prices for the held-out validation predictors.
val_predictions = model.predict(val_X)

# Mean absolute error between true and predicted validation prices.
data_mae = mean_absolute_error(val_y, val_predictions)
print(data_mae)

In [None]:
def get_mae(max_leaf_nodes, predictors_train, predictors_val, targ_train, targ_val):
    """Fit a size-limited decision tree and score it on validation data.

    Args:
        max_leaf_nodes: Passed to DecisionTreeRegressor as ``max_leaf_nodes``.
        predictors_train: Predictor values used to fit the tree.
        predictors_val: Predictor values the fitted tree predicts on.
        targ_train: Target values used to fit the tree.
        targ_val: True target values compared against the predictions.

    Returns:
        The mean absolute error between ``targ_val`` and the tree's
        predictions for ``predictors_val``.
    """
    # random_state is fixed at 0 so repeated calls with the same inputs agree.
    tree = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=0)
    tree.fit(predictors_train, targ_train)
    return mean_absolute_error(targ_val, tree.predict(predictors_val))

# Scan candidate tree sizes and remember the one with the lowest validation MAE.
# The starting values act as "nothing found yet" sentinels.
best_mae, best_max_leaf_nodes = 99999999, -1
for max_leaf_nodes in range(5, 5000, 10):
    current_mae = get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y)
    # Only a strictly lower error replaces the current best, so on ties the
    # smallest max_leaf_nodes seen first is kept.
    if not current_mae < best_mae:
        continue
    best_mae, best_max_leaf_nodes = current_mae, max_leaf_nodes

report_template = "Best max leaf nodes: %d  \t\t Mean Absolute Error: %d"
print(report_template % (best_max_leaf_nodes, best_mae))

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Seed the forest (same random_state=0 that get_forest_mae uses) so the
# validation MAE printed below is the same on every run; an unseeded
# RandomForestRegressor produces a different forest each time.
forest_model = RandomForestRegressor(random_state=0)
forest_model.fit(train_X, train_y)

# NOTE(review): the "melb_" prefix looks like a leftover from the Melbourne
# tutorial; the name is kept so any other cell referring to it still works.
melb_preds = forest_model.predict(val_X)
print(mean_absolute_error(val_y, melb_preds))

In [None]:
# Reload the training table and take the sale price as the prediction target.
train_data = pd.read_csv('../input/train.csv')
y = train_data['SalePrice']

# Reuse the predictor list chosen earlier in the notebook.
predictor_cols = data_predictors
X = train_data[predictor_cols]

# Hold out 30% of the rows for validation. This is a single train/validation
# split, not k-fold cross-validation.
train_X, validation_X, train_y, validation_y = train_test_split(
    X, y, test_size=0.3, random_state=0
)

# Candidate max_leaf_nodes values to try.
alter_max_leaf_nodes = range(5, 5000, 5)

def get_forest_mae(max_leaf_nodes, predictors_train, predictors_val, targ_train, targ_val):
    """Fit a size-limited random forest and score it on validation data.

    Args:
        max_leaf_nodes: Passed to RandomForestRegressor as ``max_leaf_nodes``.
        predictors_train: Predictor values used to fit the forest.
        predictors_val: Predictor values the fitted forest predicts on.
        targ_train: Target values used to fit the forest.
        targ_val: True target values compared against the predictions.

    Returns:
        A ``(model, mae)`` tuple: the fitted forest and the mean absolute
        error between ``targ_val`` and its predictions for ``predictors_val``.
    """
    # random_state is fixed at 0 so repeated calls with the same inputs agree.
    forest = RandomForestRegressor(max_leaf_nodes=max_leaf_nodes, random_state=0)
    forest.fit(predictors_train, targ_train)
    validation_error = mean_absolute_error(targ_val, forest.predict(predictors_val))
    return forest, validation_error

# Pick the max_leaf_nodes value whose forest has the lowest validation MAE,
# keeping the fitted forest itself for later prediction.
# The starting values act as "nothing found yet" sentinels.
best_mae = 99999999
best_max_leaf_nodes = -1
best_model = None
for max_leaf_nodes in alter_max_leaf_nodes:
    # Score against validation_X / validation_y, the hold-out produced by the
    # same train_test_split call as the current train_X / train_y. The older
    # val_X / val_y names come from a different, earlier split and must not
    # be mixed with this training set.
    current_model, current_mae = get_forest_mae(
        max_leaf_nodes, train_X, validation_X, train_y, validation_y
    )
    # Only a strictly lower error replaces the current best, so on ties the
    # first candidate encountered is kept.
    if current_mae < best_mae:
        best_mae = current_mae
        best_max_leaf_nodes = max_leaf_nodes
        best_model = current_model
print("Best max leaf nodes: %d  \t\t Mean Absolute Error: %d" % (best_max_leaf_nodes, best_mae))

In [None]:
# Load the competition test table and keep the same predictor columns the
# model was fitted on.
test = pd.read_csv('../input/test.csv')
test_X = test[predictor_cols]

# Predict sale prices with the best forest found by the search.
predicted_prices = best_model.predict(test_X)

# Pair each test Id with its predicted price and write the submission file
# without the DataFrame index column.
submission_columns = {'Id': test['Id'], 'SalePrice': predicted_prices}
submission = pd.DataFrame(submission_columns)
submission.to_csv('submission.csv', index=False)
print("Write to file OK.")
print("Write to file OK.")