Running through the tutorial. 

Import the libraries used in the project.

In [None]:
import pandas as pd
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import Imputer
from sklearn.model_selection import cross_val_score
from xgboost import XGBRegressor

Import the data and describe it.

In [None]:
# Load the training CSV and print the output of DataFrame.describe().
main_file_path = '../input/train.csv'
data = pd.read_csv(main_file_path)
data_summary = data.describe()
print(data_summary)

Look at the columns.

In [None]:
# Show every column label of the training frame.
column_labels = data.columns
print(column_labels)

Look at some of our targets or sale prices.

In [None]:
# Select the SalePrice column and preview its first rows.
data_price = data['SalePrice']
print(data_price.head())

Combine two columns and describe them.

In [None]:
# Restrict the frame to two lot-related columns and describe just those.
columns = ['LotFrontage', 'LotArea']
two_columns_of_data = data[columns]
two_column_summary = two_columns_of_data.describe()
print(two_column_summary)

Create our target

In [None]:
# Prediction target: the SalePrice column of the training frame.
y = data['SalePrice']

Get the columns to use to predict y.

In [None]:
# Hand-picked predictor columns used for the first models.
varible_columns = [
    'LotArea',
    'YearBuilt',
    '1stFlrSF',
    '2ndFlrSF',
    'FullBath',
    'BedroomAbvGr',
    'TotRmsAbvGrd',
]
X = data[varible_columns]

Make a decision tree using X and y.

In [None]:
# Fit a decision tree with default settings on every row of X / y.
# The fit call stays the last expression so the notebook still shows its repr.
data_model = DecisionTreeRegressor()
data_model.fit(X=X, y=y)

Show some of our predictions using the decision tree.

In [None]:
# Predict on the first rows of X (rows the tree was trained on).
first_houses = X.head()
print("Making predictions for the following 5 houses:")
print(first_houses)
print("The predictions are")
print(data_model.predict(first_houses))

Find the mean absolute error for our decision tree predictions.

In [None]:
# Hold out a validation split (seeded), fit a fresh tree on the training part
# and report its mean absolute error on the held-out part.
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=0)
model = DecisionTreeRegressor()
model.fit(train_X, train_y)
val_predictions = model.predict(val_X)
val_mae = mean_absolute_error(val_y, val_predictions)
print(val_mae)

Create a function that returns the mean absolute error, so we can compare different models quickly.

In [None]:
def get_mae(max_leaf_nodes, predictors_train, predictors_val, targ_train, targ_val):
    """Return the validation MAE of a decision tree limited to ``max_leaf_nodes``.

    The tree is built with ``random_state=0``, fitted on
    ``(predictors_train, targ_train)`` and scored with
    ``mean_absolute_error`` against ``targ_val`` using its predictions
    for ``predictors_val``.
    """
    tree = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=0)
    tree.fit(predictors_train, targ_train)
    return mean_absolute_error(targ_val, tree.predict(predictors_val))

Look at multiple decision tree models with a maximum of 5, 50, 500 and 5000 leaf nodes, and print the mean absolute error for each.

In [None]:
# One report line per candidate tree size; %d truncates the MAE to an integer.
report_line = "Max leaf nodes: %d  \t\t Mean Absolute Error:  %d"
for max_leaf_nodes in (5, 50, 500, 5000):
    my_mae = get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y)
    print(report_line % (max_leaf_nodes, my_mae))

Create a random forest model for our data and look at MAE.

In [None]:
# Random forest on the same train/validation split as the decision trees.
# The forest is seeded so the printed MAE is reproducible on a fresh kernel,
# in line with the seeded split and the seeded trees built by get_mae.
forest_model = RandomForestRegressor(random_state=0)
forest_model.fit(train_X, train_y)
preds = forest_model.predict(val_X)
print(mean_absolute_error(val_y, preds))

Fit a final model and store its predictions for the test set in `predicted_prices`.

In [None]:
predictor_cols = ['LotArea', 'YearBuilt', '1stFlrSF', '2ndFlrSF', 'FullBath', 'BedroomAbvGr', 'TotRmsAbvGrd']

# Final model: train on every labelled row of `data` rather than only the
# train_X split, because no rows need to be held out for validation here.
# Selecting the training features through predictor_cols guarantees that the
# training and test frames use the same columns in the same order.
# Seeded so the predictions are reproducible.
my_model = RandomForestRegressor(random_state=0)
my_model.fit(data[predictor_cols], data.SalePrice)

# Predict sale prices for the competition test file.
test = pd.read_csv('../input/test.csv')
test_X = test[predictor_cols]
predicted_prices = my_model.predict(test_X)
print(predicted_prices)

Define `score_dataset`, a helper that fits a random forest and returns its MAE, so we can compare different ways of handling columns with missing values (NAs).

In [None]:
def score_dataset(X_train, X_test, y_train, y_test):
    """Fit a random forest on the training data and return its MAE on the test data.

    Parameters
    ----------
    X_train, y_train : predictors and target used to fit the model.
    X_test, y_test : predictors and target used to score the model.

    Returns
    -------
    The mean absolute error between ``y_test`` and the model's predictions
    for ``X_test``.

    The forest is seeded so that differences between two calls come from the
    data preparation being compared, not from the model's own randomness.
    """
    model = RandomForestRegressor(random_state=0)
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    return mean_absolute_error(y_test, preds)

Look at the MAE after dropping the columns that contain missing values (NAs).

In [None]:
target = data.SalePrice
predictors = data.drop(['SalePrice'], axis=1)
numeric_predictors = predictors.select_dtypes(exclude=['object'])

# Split the numeric predictors themselves. The earlier train_X / val_X hold
# only the hand-picked feature columns (presumably NaN-free, since models
# were fitted on them without imputation), so looking for missing columns
# there would not evaluate the "drop columns" strategy on this data.
X_train, X_test, y_train, y_test = train_test_split(numeric_predictors,
                                                    target,
                                                    train_size=0.7,
                                                    test_size=0.3,
                                                    random_state=0)

# Drop a column if it has a missing value in either split, so the model is
# never fitted or scored on NaNs.
cols_with_missing = [col for col in X_train.columns
                     if X_train[col].isnull().any() or X_test[col].isnull().any()]
reduced_X_train = X_train.drop(cols_with_missing, axis=1)
reduced_X_test = X_test.drop(cols_with_missing, axis=1)
print("Mean Absolute Error from dropping columns with Missing Values:")
print(score_dataset(reduced_X_train, reduced_X_test, y_train, y_test))

In [None]:
target = data.SalePrice
predictors = data.drop(['SalePrice'], axis=1)
numeric_predictors = predictors.select_dtypes(exclude=['object'])

X_train, X_test, y_train, y_test = train_test_split(numeric_predictors,
                                                    target,
                                                    train_size=0.7,
                                                    test_size=0.3,
                                                    random_state=0)

# Work on copies so the raw splits stay untouched.
imputed_X_train_plus = X_train.copy()
imputed_X_test_plus = X_test.copy()

# A list rather than a generator: a generator is exhausted after the loop
# below, so any later use of `cols_with_missing` would silently see nothing.
# Other cells also bind this name to a list.
cols_with_missing = [col for col in X_train.columns
                     if X_train[col].isnull().any()]

# Record which values were missing before imputation overwrites them.
for col in cols_with_missing:
    imputed_X_train_plus[col + '_was_missing'] = imputed_X_train_plus[col].isnull()
    imputed_X_test_plus[col + '_was_missing'] = imputed_X_test_plus[col].isnull()

# Fit the imputer on the training split only, then apply it to both splits.
# NOTE(review): recent scikit-learn releases replace `Imputer` with
# `sklearn.impute.SimpleImputer` — confirm against the installed version.
my_imputer = Imputer()
imputed_X_train_plus = my_imputer.fit_transform(imputed_X_train_plus)
imputed_X_test_plus = my_imputer.transform(imputed_X_test_plus)

print("Mean Absolute Error from Imputation while Track What Was Imputed:")
print(score_dataset(imputed_X_train_plus, imputed_X_test_plus, y_train, y_test))

In [None]:
# Reload both competition files for the categorical-encoding experiment.
train_data = pd.read_csv('../input/train.csv')
test_data = pd.read_csv('../input/test.csv')

# Columns that have at least one missing value in the training file.
cols_with_missing = [name for name in train_data.columns
                     if train_data[name].isnull().any()]

# Remove the identifier, the target (training file only) and the columns
# flagged above from both frames.
candidate_train_predictors = train_data.drop(['Id', 'SalePrice'] + cols_with_missing, axis=1)
candidate_test_predictors = test_data.drop(['Id'] + cols_with_missing, axis=1)

# Object-typed columns with fewer than 10 distinct values.
low_cardinality_cols = [
    name for name in candidate_train_predictors.columns
    if candidate_train_predictors[name].dtype == "object"
    and candidate_train_predictors[name].nunique() < 10
]
# Columns stored as int64 or float64.
numeric_cols = [
    name for name in candidate_train_predictors.columns
    if candidate_train_predictors[name].dtype in ['int64', 'float64']
]

# Keep the same column selection, in the same order, for both frames.
my_cols = low_cardinality_cols + numeric_cols
train_predictors = candidate_train_predictors[my_cols]
test_predictors = candidate_test_predictors[my_cols]

In [None]:
one_hot_encoded_training_predictors = pd.get_dummies(train_predictors)

def get_mae2(X, y):
    """Return the cross-validated mean absolute error of a 50-tree random forest.

    ``cross_val_score`` is called with ``scoring='neg_mean_absolute_error'``,
    which yields negated errors; the mean is multiplied by -1 so the returned
    MAE is positive.
    """
    return -1 * cross_val_score(RandomForestRegressor(n_estimators=50),
                                X, y,
                                scoring = 'neg_mean_absolute_error').mean()

# Baseline: the same predictors with every object-typed column removed.
predictors_without_categoricals = train_predictors.select_dtypes(exclude=['object'])

mae_without_categoricals = get_mae2(predictors_without_categoricals, target)

mae_one_hot_encoded = get_mae2(one_hot_encoded_training_predictors, target)

print('Mean Absolute Error when Dropping Categoricals: ' + str(int(mae_without_categoricals)))
print('Mean Absolute Error with One-Hot Encoding: ' + str(int(mae_one_hot_encoded)))


In [None]:
# One-hot encode the training and test predictors separately, then align the
# two frames on their columns (axis=1) with a left join on the training frame.
one_hot_encoded_training_predictors = pd.get_dummies(train_predictors)
one_hot_encoded_test_predictors = pd.get_dummies(test_predictors)
final_train, final_test = one_hot_encoded_training_predictors.align(
    one_hot_encoded_test_predictors,
    axis=1,
    join='left',
)

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Imputer

# Reload the training file and keep only rows that have a target value.
# Reassigning instead of inplace=True keeps the cell free of in-place mutation.
data = pd.read_csv('../input/train.csv')
data = data.dropna(axis=0, subset=['SalePrice'])
y = data.SalePrice
X = data.drop(['SalePrice'], axis=1).select_dtypes(exclude=['object'])

# `.values` replaces `.as_matrix()`, which is deprecated and absent from
# modern pandas; both return the underlying NumPy array.
# The split is seeded, like the other splits in this notebook, so the XGBoost
# results below are reproducible.
train_X, test_X, train_y, test_y = train_test_split(X.values, y.values,
                                                    test_size=0.25,
                                                    random_state=0)

# Fit the imputer on the training split only, then apply it to both splits.
my_imputer = Imputer()
train_X = my_imputer.fit_transform(train_X)
test_X = my_imputer.transform(test_X)

In [None]:
# Baseline gradient-boosted model with the library's default settings.
# The fit call stays the last expression so the cell output is unchanged.
my_model = XGBRegressor()
my_model.fit(X=train_X, y=train_y, verbose=False)

In [None]:
# Score the fitted model on the held-out split.
predictions = my_model.predict(test_X)

# Arguments are passed as (true values, predictions), matching every other
# mean_absolute_error call in this notebook.
print("Mean Absolute Error : " + str(mean_absolute_error(test_y, predictions)))

In [None]:
# Allow up to 1000 boosting rounds; early_stopping_rounds=5 is evaluated
# against the (test_X, test_y) eval set.
my_model = XGBRegressor(n_estimators=1000)
my_model.fit(
    train_X,
    train_y,
    eval_set=[(test_X, test_y)],
    early_stopping_rounds=5,
    verbose=False,
)

In [None]:
# Same early-stopping configuration as before, now with learning_rate=0.05.
my_model = XGBRegressor(learning_rate=0.05, n_estimators=1000)
my_model.fit(
    train_X,
    train_y,
    eval_set=[(test_X, test_y)],
    early_stopping_rounds=5,
    verbose=False,
)

Create a submission file for the competition.

In [None]:
# Disabled submission export.
# NOTE(review): at this point `predictions` holds the XGBoost predictions for
# the held-out split of train.csv, while `test.Id` comes from test.csv, so the
# two columns are unlikely to have the same length. Predict on the test.csv
# features (e.g. `predicted_prices`) before re-enabling these lines.
#my_submission = pd.DataFrame({'Id': test.Id, 'SalePrice': predictions})
#my_submission.to_csv('submission.csv', index=False)