<h1>Decision Tree Model</h1>

In [None]:
import pandas as pd

# Location of the training set, relative to the notebook.
main_file_path = '../input/train.csv'

# Read the training set and print summary statistics for it.
data = pd.read_csv(filepath_or_buffer=main_file_path)
print(data.describe())

In [None]:
# Show the column labels of the training frame (all features plus the
# SalePrice target) so predictor names can be picked from the list.
print(data.columns)

In [None]:
# Keep the sale prices in their own Series and preview the first rows.
iowa_price_data = data['SalePrice']
print(iowa_price_data.head())

In [None]:
# Columns describing the size of the lot and of each floor.
size_columns = [
    'LotArea',
    '1stFlrSF',
    '2ndFlrSF',
]

# Pull those columns into their own frame and summarise them
# (the summary is the cell's displayed output).
housing_size = data.loc[:, size_columns]
housing_size.describe()

In [None]:
# Target: the sale price is what the model learns to predict.
y = data['SalePrice']

# Predictors: the columns the model is allowed to look at.
iowa_predictors = [
    'LotArea', '1stFlrSF', '2ndFlrSF', 'FullBath',
    'BedroomAbvGr', 'TotRmsAbvGrd', 'YearBuilt',
]
X = data[iowa_predictors]

from sklearn.tree import DecisionTreeRegressor

# Create a decision tree with default settings.
iowa_model = DecisionTreeRegressor()

# Train on the full data set; the fitted estimator is the cell's output.
iowa_model.fit(X, y)

In [None]:
# Show the first five rows of the predictors, then the prices the fitted
# tree predicts for those same rows.
print("Make predictions for the following 5 houses:")
print(X.iloc[:5])
print("The prices are:")
print(iowa_model.predict(X.iloc[:5]))

In [None]:
# test accuracy of model
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

# split data into training set and validation set (seeded so the split is
# the same on every run)
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state = 0)

# define model
# The tree is seeded as well: an unseeded DecisionTreeRegressor can pick
# different splits from run to run, which made the MAE printed below
# non-reproducible even though the split itself was fixed.
iowa_model = DecisionTreeRegressor(random_state = 0)

# fit model with training set
iowa_model.fit(train_X, train_y)

# test accuracy with validation set: mean absolute error between the true
# validation prices and the tree's predictions for the validation rows
val_predictions = iowa_model.predict(val_X)
print(mean_absolute_error(val_y, val_predictions))

In [None]:
# create utility function to help decide max_leaf_nodes for our model
from sklearn.metrics import mean_absolute_error

def get_mae(max_leaf_nodes, predictors_train, predictors_val, targ_train, targ_val):
    """Return the validation MAE of a decision tree limited to ``max_leaf_nodes``.

    A ``DecisionTreeRegressor`` (seeded with ``random_state=0``) is fitted on
    the training predictors/target and scored on the validation
    predictors/target with ``mean_absolute_error``.
    """
    tree = DecisionTreeRegressor(random_state=0, max_leaf_nodes=max_leaf_nodes)
    predicted = tree.fit(predictors_train, targ_train).predict(predictors_val)
    return mean_absolute_error(targ_val, predicted)

# Compare validation error across several tree sizes; %d truncates the MAE
# to a whole number in the printed report.
for max_leaf_nodes in (5, 50, 500, 5000):
    my_mae = get_mae(
        max_leaf_nodes,
        predictors_train=train_X,
        predictors_val=val_X,
        targ_train=train_y,
        targ_val=val_y,
    )
    print("Max leaf nodes: %d  \t\t Mean Absolute Error:  %d" % (max_leaf_nodes, my_mae))

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

# Same seeded train/validation split as used for the decision tree.
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state = 0)

# Seed the forest too: a random forest draws random bootstrap samples, so
# without random_state the MAE printed below changed on every run and could
# not be compared fairly with the decision-tree results.
forest_model = RandomForestRegressor(random_state = 0)
forest_model.fit(train_X, train_y)

# Score on the held-out validation rows.
iowa_pred = forest_model.predict(val_X)
print(mean_absolute_error(val_y, iowa_pred))

<h2>Random Forest Regression (Submission 1)</h2>

In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor

# Reload the training set for the submission model.
train = pd.read_csv('../input/train.csv')

# Predictor columns used for the first submission.
features = [
    'LotArea', '1stFlrSF', '2ndFlrSF', 'FullBath',
    'BedroomAbvGr', 'TotRmsAbvGrd', 'YearBuilt',
]
train_X, train_y = train[features], train['SalePrice']

# Fit a default random forest on every training row; the fitted estimator
# is the cell's displayed output.
my_model = RandomForestRegressor()
my_model.fit(train_X, train_y)

In [None]:
# Load the test set and keep the same feature columns the model was fitted on.
test = pd.read_csv('../input/test.csv')
test_X = test.loc[:, features]

# Predict a sale price for every test row and print the resulting array.
price_predictions = my_model.predict(test_X)
print(price_predictions)

In [None]:
# Build the submission table: one row per test house, holding its Id and the
# predicted SalePrice.
my_submission = pd.DataFrame({'Id': test.Id, 'SalePrice': price_predictions})
# Writing the file is currently disabled (line left commented out).
# NOTE(review): the target name 'submission_csv' has no '.csv' extension -
# confirm that is intended before re-enabling.
#my_submission.to_csv('submission_csv', index=False)

<h1>Cleaning Missing Data by Dropping Categoricals and Imputing Remainder</h1>

In [4]:
import pandas as pd

# Read the training set again so this section starts from the raw data.
data = pd.read_csv('../input/train.csv')

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

# Target is the sale price; predictors are every other column.
target = data['SalePrice']
predictors = data.drop('SalePrice', axis=1)

# Keep only the columns whose dtype is not `object`.
numeric_predictors = predictors.select_dtypes(exclude=['object'])

# 70/30 train/test split with a fixed seed.
train_X, test_X, train_y, test_y = train_test_split(
    numeric_predictors,
    target,
    train_size=0.7,
    test_size=0.3,
    random_state=0,
)

# Work on copies so the split frames stay untouched.
imputed_train_X, imputed_test_X = train_X.copy(), test_X.copy()

# Columns that have at least one missing value in the training split.
cols_with_missing = [col for col in train_X.columns if train_X[col].isnull().any()]

# For each such column add a boolean '<col>_was_missing' indicator to both
# frames, recording where a value is absent.
for col in cols_with_missing:
    for frame in (imputed_train_X, imputed_test_X):
        frame[col + '_was_missing'] = frame[col].isnull()
    
# `sklearn.preprocessing.Imputer` was removed in scikit-learn 0.22; its
# replacement is `sklearn.impute.SimpleImputer`, whose default strategy is
# also mean imputation. Prefer the new class and fall back to the old one on
# older installs. The alias keeps the name `Imputer` available to later cells.
try:
    from sklearn.impute import SimpleImputer as Imputer
except ImportError:
    from sklearn.preprocessing import Imputer

# Learn the fill values from the training split only, then apply the same
# fill values to the test split.
my_imputer = Imputer()
imputed_train_X = my_imputer.fit_transform(imputed_train_X)
imputed_test_X = my_imputer.transform(imputed_test_X)

def score_dataset(X_train, X_test, y_train, y_test, random_state=0):
    """Fit a random forest and return its mean absolute error on the test split.

    Parameters
    ----------
    X_train, y_train : training predictors and target used to fit the model.
    X_test, y_test : held-out predictors and target used for scoring.
    random_state : seed passed to ``RandomForestRegressor`` (default 0). The
        forest was previously unseeded, so two calls on the same data could
        return different scores, which made comparisons between
        preprocessing variants unreliable.

    Returns
    -------
    The value of ``mean_absolute_error(y_test, predictions)``.
    """
    model = RandomForestRegressor(random_state=random_state)
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    return mean_absolute_error(y_test, preds)

# Report the MAE obtained with the imputed numeric predictors and their
# missing-value indicator columns.
print('Mean Absolute Error from Imputation while tracking missing what was Imputed')
print(
    score_dataset(
        X_train=imputed_train_X,
        X_test=imputed_test_X,
        y_train=train_y,
        y_test=test_y,
    )
)


<h1>Using Categorical Data with One Hot Encoding</h1>

In [None]:
# Re-split with a fixed seed, this time from `predictors` rather than the
# numeric-only frame, using train_test_split's default proportions.
# NOTE(review): relies on `predictors` and `target` still holding the values
# set in the missing-data cell above - confirm when running cells out of order.
train_X, test_X, train_y, test_y = train_test_split(predictors, target, random_state=0)

In [28]:
# One-hot encode both splits with pd.get_dummies.
one_hot_encoded_train_X, one_hot_encoded_test_X = (
    pd.get_dummies(split) for split in (train_X, test_X)
)

# Align on columns, keeping the training frame's columns (join='left') so
# both frames end up with the same column set in the same order.
final_train_X, final_test_X = one_hot_encoded_train_X.align(
    one_hot_encoded_test_X,
    join='left',
    axis=1,
)

<h1>Top Dawg Model RN (XGBoost) [Submission 2]</h1>

In [84]:
import pandas as pd

# Reload the training data and separate the target from the predictors.
data = pd.read_csv('../input/train.csv')
y = data['SalePrice']

# Full predictor set: every column except the target, one-hot encoded.
X = pd.get_dummies(data.drop('SalePrice', axis=1))

# Hand-picked short list of predictors, also one-hot encoded.
Brando_Picks = [
    'LotArea', '1stFlrSF', '2ndFlrSF', 'FullBath', 'BedroomAbvGr',
    'TotRmsAbvGrd', 'YearBuilt', 'MSSubClass', 'OverallQual', 'OverallCond',
    'MSZoning', 'Neighborhood', 'HouseStyle', 'BldgType', 'Condition1',
    'Condition2',
]
predictors = pd.get_dummies(data[Brando_Picks])


In [85]:
# `sklearn.preprocessing.Imputer` was removed in scikit-learn 0.22; prefer its
# replacement `SimpleImputer` (same default mean strategy) and fall back to
# the old class on older installs. The alias keeps the name `Imputer` usable
# by the rest of the notebook.
try:
    from sklearn.impute import SimpleImputer as Imputer
except ImportError:
    from sklearn.preprocessing import Imputer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_absolute_error
from xgboost import XGBRegressor

# Only real hyperparameters go to the XGBRegressor constructor. The earlier
# version also passed early_stopping_rounds, eval_set and verbose here, but
# eval_set and verbose are arguments of fit(); given to the constructor they
# never reached fit(), so early stopping was never active (and newer xgboost
# releases reject or warn about them). Dropping them keeps the model that was
# effectively trained: 1000 boosting rounds at learning rate 0.05.
# NOTE(review): to really use early stopping, pass a held-out, already
# imputed eval_set to fit() - evaluating on the training data itself would
# not detect overfitting.
my_pipeline = make_pipeline(
    Imputer(),
    XGBRegressor(n_estimators = 1000, learning_rate = 0.05),
)
my_pipeline.fit(X, y)

# Same pipeline, trained on the hand-picked predictor short list.
my_second_pipeline = make_pipeline(
    Imputer(),
    XGBRegressor(n_estimators = 1000, learning_rate = 0.05),
)
my_second_pipeline.fit(predictors, y)

In [86]:
from sklearn.model_selection import cross_val_score

# Cross-validate both pipelines. The scorer returns negated MAE, so the
# printed values are negative.
scores = cross_val_score(
    estimator=my_pipeline,
    X=X,
    y=y,
    scoring='neg_mean_absolute_error',
)
short_list_scores = cross_val_score(
    estimator=my_second_pipeline,
    X=predictors,
    y=y,
    scoring='neg_mean_absolute_error',
)
print(scores)
print(short_list_scores)

In [87]:
# Flip the sign of the mean negated-MAE scores to report positive errors;
# %d truncates them to whole numbers.
print('XGB Cross Validation MAE: %d' % (-scores.mean()))
print('XGB Short List Cross Val MAE: %d' % (-short_list_scores.mean()))

In [91]:
# Submission build: one-hot encode the test set, then align it to the
# training columns. join='left' keeps exactly X's columns; columns missing
# from the test encoding are added filled with NaN (align's default).
test = pd.read_csv('../input/test.csv')
testX = pd.get_dummies(test)
X, testX = X.align(testX, axis = 1, join = 'left')

# Predict with the full-feature pipeline and print the predictions.
predictions = my_pipeline.predict(testX)
print(predictions)

# Write Id / SalePrice pairs to disk without the index column.
# NOTE(review): the output name 'secondsubmission_csv' has no '.csv'
# extension - confirm that is intended.
my_second_submission = pd.DataFrame({'Id': test['Id'], 'SalePrice': predictions})
my_second_submission.to_csv('secondsubmission_csv', index=False)

<h1>Partial Dependence Plots</h1>

In [14]:
from sklearn.ensemble.partial_dependence import partial_dependence, plot_partial_dependence
from sklearn.ensemble import GradientBoostingRegressor

def get_some_data():
    """Load three imputed predictor columns and the sale price.

    Reads '../input/train.csv', selects LotArea, OverallQual and OverallCond,
    fills missing values with a default ``Imputer`` and returns
    ``(imputed_X, y)`` where ``y`` is the SalePrice column. Relies on ``pd``
    and ``Imputer`` having been imported by earlier cells.
    """
    cols_to_use = ['LotArea', 'OverallQual', 'OverallCond']
    data = pd.read_csv('../input/train.csv')
    imputed_X = Imputer().fit_transform(data[cols_to_use])
    return imputed_X, data.SalePrice
    
# Fit a default gradient-boosting model on the three imputed columns.
X, y = get_some_data()
my_model = GradientBoostingRegressor().fit(X, y)

# Plot partial dependence for the three columns (indices 0-2 of X), labelled
# with their original names and evaluated on a 10-point grid.
my_plot = plot_partial_dependence(
    my_model,
    X=X,
    features=[0, 1, 2],
    feature_names=['LotArea', 'OverallQual', 'OverallCond'],
    grid_resolution=10,
)
