# Introduction
**This will be your workspace for Kaggle's Machine Learning education track.**

You will build and continually improve a model to predict housing prices as you work through each tutorial.  Fork this notebook and write your code in it.

The data from the tutorial, the Melbourne data, is not available in this workspace.  You will need to translate the concepts to work with the data in this notebook, the Iowa data.

Come to the [Learn Discussion](https://www.kaggle.com/learn-forum) forum for any questions or comments. 

# Write Your Code Below



## Level 1

In [1]:
# Read the Iowa training data and print its summary statistics

import pandas as pd

# Relative path: resolved against the notebook's working directory
main_file_path = '../input/train.csv'
data = pd.read_csv(main_file_path)
print(data.describe())

In [2]:
# Print the column labels of the training data

print(data.columns)

In [3]:
# Pull the SalePrice column out of the training data

data_price = data["SalePrice"]

In [4]:
# Show the first five sale prices

print(data_price.head(n=5))

In [5]:
# Collect the KitchenAbvGr and Fireplaces columns into their own dataframe

data_misc = data.loc[:, ["KitchenAbvGr", "Fireplaces"]]

In [6]:
# Observe summaries of the KitchenAbvGr and Fireplaces features.
# The bare expression is kept as the cell's last statement so the notebook
# displays the resulting table.

data_misc.describe()

In [7]:
# Identify target variable: y is another name for the SalePrice series
# (plain assignment, not a copy)

y = data_price

In [8]:
# Names of the columns used as model inputs

predictors = [
    "LotArea",
    "YearBuilt",
    "1stFlrSF",
    "2ndFlrSF",
    "FullBath",
    "BedroomAbvGr",
    "TotRmsAbvGrd",
]

In [9]:
# Select every row of the predictor columns as the feature matrix

X = data.loc[:, predictors]

In [10]:
# Instantiate decision tree regressor object with the library's default
# hyperparameters (no arguments are passed)

from sklearn.tree import DecisionTreeRegressor

model = DecisionTreeRegressor()

In [11]:
# Fit model to the full training data (all rows of X and y; no hold-out yet).
# Left as a bare expression so the notebook displays the fitted estimator.

model.fit(X, y)

In [12]:
# Make a few predictions with the model. These are for the first rows of X,
# i.e. rows the model was fitted on, so they are not an estimate of
# out-of-sample accuracy.

model.predict(X.head())

In [13]:
# Split dataset into train and dev sets. No split sizes are passed, so
# train_test_split's default proportions apply; random_state=0 fixes the
# shuffle so the split is the same on every run.

from sklearn.model_selection import train_test_split

X_train, X_dev, y_train, y_dev = train_test_split(X, y, random_state=0)

In [14]:
# Fit a new model on the training split only, leaving the dev split for
# evaluation. The seed is fixed, matching get_mae and train_test_split, so
# the dev MAE and any later predictions from `model` are reproducible.

model = DecisionTreeRegressor(random_state=0)
model.fit(X_train, y_train)

In [15]:
# Make predictions using the model on the dev set (the split that was held
# out when the model was fitted on X_train)

dev_predictions = model.predict(X_dev)

In [16]:
# Report the mean absolute error (MAE) of the dev-set predictions

from sklearn.metrics import mean_absolute_error

print(mean_absolute_error(y_true=y_dev, y_pred=dev_predictions))

In [17]:
# Define function to get MAE of model with given parameters

def get_mae(max_leaf_nodes, predictors_train, predictors_val, targ_train, targ_val):
    """Return the validation MAE of a decision tree limited to max_leaf_nodes.

    A DecisionTreeRegressor (seeded with random_state=0) is fitted on
    predictors_train / targ_train, used to predict predictors_val, and the
    mean absolute error against targ_val is returned.
    """
    tree = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=0)
    tree.fit(predictors_train, targ_train)
    return mean_absolute_error(targ_val, tree.predict(predictors_val))

In [18]:
# Approximate ideal number of max_leaf_nodes for model by trying a coarse
# grid of candidates and keeping the one with the lowest dev-set MAE

# Kept: a later cell still reads sys.maxsize
import sys

# Start from +infinity (a float, like the MAE itself) so that any real MAE,
# however large, replaces the sentinel on the first iteration
best_mae = float("inf")
ideal_max_leaf_nodes = None

for max_leaf_nodes in [5, 50, 500, 5000]:
    mae = get_mae(max_leaf_nodes, X_train, X_dev, y_train, y_dev)
    # Strict comparison: on a tie the earlier (smaller) candidate is kept
    if mae < best_mae:
        best_mae = mae
        ideal_max_leaf_nodes = max_leaf_nodes

print("The lowest MAE of {} was found with a "
      "decision tree with {} max leaf nodes.".format(best_mae,
                                                    ideal_max_leaf_nodes))

In [19]:
# Train a random forest on the training split and report its MAE on the
# dev split

from sklearn.ensemble import RandomForestRegressor

# Seeded like the other models in this notebook (get_mae, score_dataset) so
# the printed MAE is reproducible and comparable between runs
model_forest = RandomForestRegressor(random_state=0)
model_forest.fit(X_train, y_train)
forest_predictions = model_forest.predict(X_dev)
print(mean_absolute_error(y_dev, forest_predictions))

In [20]:
# Predict sale prices for the test set and write them to submission.csv
# NOTE(review): assuming the cells ran in order, `model` is the decision tree
# fitted on X_train, not `model_forest` from the previous cell -- confirm
# which model is meant to be submitted.

test = pd.read_csv("../input/test.csv")
X_test = test.loc[:, predictors]
test_predictions = model.predict(X_test)
submission = pd.DataFrame({"Id": test["Id"], "SalePrice": test_predictions})
submission.to_csv("submission.csv", index=False)

## Level 2

In [123]:
# Reload the training data, split off the SalePrice target, and keep a
# version of the predictors that excludes object-typed columns

train = pd.read_csv("../input/train.csv")
train_predictors = train.drop("SalePrice", axis="columns")
train_target = train["SalePrice"]
train_numeric_predictors = train_predictors.select_dtypes(exclude=["object"])

In [124]:
# Partition training data into train and dev set (70% / 30%), using only the
# numeric predictors. random_state=0 keeps the partition identical across
# runs. This rebinds X_train, X_dev, y_train and y_dev from Level 1.

X_train, X_dev, y_train, y_dev = train_test_split(train_numeric_predictors, 
                                                  train_target, 
                                                  train_size=0.7, 
                                                  test_size=0.3, 
                                                  random_state=0)

In [125]:
# Construct function that calculates MAE score obtained by a random forest 
# model on dev set

def score_dataset(X_train, X_dev, y_train, y_dev):
    """Return the dev-set MAE of a random forest fitted on the training data.

    A RandomForestRegressor seeded with random_state=0 is fitted on
    X_train / y_train; its predictions for X_dev are scored against y_dev
    with mean_absolute_error.
    """
    forest = RandomForestRegressor(random_state=0)
    forest.fit(X_train, y_train)
    return mean_absolute_error(y_dev, forest.predict(X_dev))

In [126]:
# Score the model after removing every column that has at least one missing
# value in the training split (the same columns are removed from the dev
# split so both keep identical features)

features_with_nan = X_train.columns[X_train.isnull().any()].tolist()
X_train_reduced = X_train.drop(features_with_nan, axis="columns")
X_dev_reduced = X_dev.drop(features_with_nan, axis="columns")
print("MAE from dropping features with missing values:")
print(score_dataset(X_train_reduced, X_dev_reduced, y_train, y_dev))

In [127]:
# Fill missing values with feature mean and obtain model score with
# this modification

# The fill values are learned from the training split only and reused for the
# dev split. Filling the dev split with its own means would give the two
# splits different replacement values and let dev-set statistics leak into
# the evaluation.
train_feature_means = X_train.mean()
X_train_filled = X_train.fillna(train_feature_means)
X_dev_filled = X_dev.fillna(train_feature_means)
print("MAE from filling missing values:")
print(score_dataset(X_train_filled, X_dev_filled, y_train, y_dev))

In [128]:
# Fill missing values with feature mean and add features indicating which
# instances were missing features for that given feature. Proceed to
# obtain model score with this modification

X_train_filled_plus = X_train.copy()
X_dev_filled_plus = X_dev.copy()
# A list (not a generator) so the name stays usable after the loop
features_with_nan = [feature for feature in X_train.columns
                     if X_train[feature].isnull().any()]
for feature in features_with_nan:
    # Indicator columns must be recorded before the values are filled in
    X_train_filled_plus[feature + "_missing"] = X_train[feature].isnull()
    X_dev_filled_plus[feature + "_missing"] = X_dev[feature].isnull()
# Fill both splits with means learned from the training split only, so the
# dev split is transformed exactly like the training data and none of its
# own statistics leak into the evaluation
train_feature_means = X_train.mean()
X_train_filled_plus = X_train_filled_plus.fillna(train_feature_means)
X_dev_filled_plus = X_dev_filled_plus.fillna(train_feature_means)
print("MAE from filling missing values with tracking:")
print(score_dataset(X_train_filled_plus, X_dev_filled_plus, y_train, y_dev))

In [129]:
# Add categorical features to train and dev sets: pd.get_dummies encodes the
# non-numeric predictors as indicator columns, then the data is re-split with
# the same sizes and seed as the numeric-only split. This rebinds X_train,
# X_dev, y_train and y_dev.
# NOTE(review): every column of train_predictors other than SalePrice is
# kept, presumably including an Id column -- confirm that is intended.

train_all_predictors = pd.get_dummies(train_predictors)

X_train, X_dev, y_train, y_dev = train_test_split(train_all_predictors, 
                                                  train_target, 
                                                  train_size=0.7, 
                                                  test_size=0.3, 
                                                  random_state=0)

In [130]:
# Again fill missing values with feature mean and add features indicating which
# instances were missing features for that given feature. Proceed to
# obtain model score with this modification

X_train_filled_plus = X_train.copy()
X_dev_filled_plus = X_dev.copy()
# A list (not a generator) so the name stays usable after the loop
features_with_nan = [feature for feature in X_train.columns
                     if X_train[feature].isnull().any()]
for feature in features_with_nan:
    # Indicator columns must be recorded before the values are filled in
    X_train_filled_plus[feature + "_missing"] = X_train[feature].isnull()
    X_dev_filled_plus[feature + "_missing"] = X_dev[feature].isnull()
# Fill both splits with means learned from the training split only, so the
# dev split is transformed exactly like the training data and none of its
# own statistics leak into the evaluation
train_feature_means = X_train.mean()
X_train_filled_plus = X_train_filled_plus.fillna(train_feature_means)
X_dev_filled_plus = X_dev_filled_plus.fillna(train_feature_means)
print("MAE from filling missing values with tracking and using all features:")
print(score_dataset(X_train_filled_plus, X_dev_filled_plus, y_train, y_dev))

In [131]:
# Train and validate XGBoost model on data: try several values of
# n_estimators and keep the fitted model with the lowest dev-set MAE.
# NOTE(review): the dev set is used both as the early-stopping eval_set and
# to score/select the model, so the reported MAE is likely optimistic --
# confirm this is acceptable or hold out a separate set.
# NOTE(review): passing early_stopping_rounds to fit() depends on the
# installed xgboost version -- confirm against the environment.

from xgboost import XGBRegressor

best_model = None
best_num_estimators = None
# sys was imported in an earlier cell; any real MAE is below this sentinel
best_score = sys.maxsize

for num in [100, 500, 1000]:
    model = XGBRegressor(n_estimators=num, random_state=0)
    model.fit(X_train_filled_plus, 
              y_train, 
              early_stopping_rounds=5, 
              eval_set=[(X_dev_filled_plus, y_dev)], 
              verbose=False)
    predictions = model.predict(X_dev_filled_plus)
    mae = mean_absolute_error(y_dev, predictions)
    # Strict comparison: on a tie the earlier candidate is kept
    if mae < best_score:
        best_model = model
        best_num_estimators = num
        best_score = mae
print("MAE using XGBoost with {} estimators:".format(best_num_estimators))
print(best_score)

In [132]:
# Plot partial dependencies of several features on Gradient Boosting
# model of training data
# NOTE(review): the sklearn.ensemble.partial_dependence import path is
# specific to the scikit-learn version this notebook was written against --
# confirm it exists in the target environment.

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble.partial_dependence import plot_partial_dependence

gb_model = GradientBoostingRegressor(random_state=0)
gb_model.fit(X_train_filled_plus, y_train)
# features=[2, 6]: presumably positional column indices into
# X_train_filled_plus, labelled via feature_names -- TODO confirm which
# columns these are
plots = plot_partial_dependence(gb_model, 
                                features=[2, 6], 
                                X=X_train_filled_plus, 
                                feature_names=X_train_filled_plus.columns)

In [145]:
# Cross validate dataset using pipelines, imputation, and
# random forest models with different collection of predictors

from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score

try:
    # Newer scikit-learn ships the imputer as sklearn.impute.SimpleImputer
    # and no longer provides sklearn.preprocessing.Imputer. Both default to
    # mean imputation, so the alias keeps the pipeline's behaviour.
    from sklearn.impute import SimpleImputer as Imputer
except ImportError:
    # Older scikit-learn: fall back to the original location
    from sklearn.preprocessing import Imputer

# Imputation lives inside the pipeline, so cross_val_score fits it as part
# of the estimator on each split
my_pipeline = make_pipeline(Imputer(),
                            RandomForestRegressor(random_state=0))
scores_num = cross_val_score(my_pipeline,
                             train_numeric_predictors,
                             train_target,
                             scoring="neg_mean_absolute_error")
print("Mean MAE score using cross validation, pipelines, imputation, "
      "and random forests with just numeric predictors:")
# The scorer returns negated MAE, so flip the sign before reporting
print(-scores_num.mean())
scores_all = cross_val_score(my_pipeline,
                             train_all_predictors,
                             train_target,
                             scoring="neg_mean_absolute_error")
print("Mean MAE score using cross validation, pipelines, "
      "imputation, and random forests with all predictors:")
print(-scores_all.mean())