# Introduction
**This will be your workspace for Kaggle's Machine Learning education track.**

You will build and continually improve a model to predict housing prices as you work through each tutorial.  Fork this notebook and write your code in it.

The data from the tutorial, the Melbourne data, is not available in this workspace.  You will need to translate the concepts to work with the data in this notebook, the Iowa data.

Come to the [Learn Discussion](https://www.kaggle.com/learn-forum) forum for any questions or comments. 

# Write Your Code Below



In [20]:
import pandas as pd

# Load the Iowa training data and list the columns it provides.
main_file_path = '../input/train.csv'
print("###################### Columns ##############################\n\n")
data = pd.read_csv(main_file_path)
print(data.columns)
print("\n\n")

# Pull a single column out of the frame and preview its first rows.
print("###################### Single Column output ##############################\n\n")
data_by_price = data['SalePrice']
print(data_by_price.head())
print("\n\n")

# Pull several columns at once, giving a narrower DataFrame.
columns_of_interest = ['SalePrice', 'YrSold']
print("###################### Multi Column output ############################## \n\n")
data_by_two_columns = data.loc[:, columns_of_interest]
print(data_by_two_columns)

# Summary statistics for the selected columns.
print(data_by_two_columns.describe())
print("\n\n")

In [34]:
# Prediction target: the SalePrice column.
y = data['SalePrice']

# Feature columns handed to the model.
predictors = [
    'LotArea',
    'YearBuilt',
    '1stFlrSF',
    '2ndFlrSF',
    'FullBath',
    'BedroomAbvGr',
    'TotRmsAbvGrd',
]
X = data[predictors]


In [6]:
# Use scikit-learn to build a model and make predictions

from sklearn.tree import DecisionTreeRegressor

# Fix the seed so that re-running the notebook rebuilds the same tree; this
# also matches the random_state=0 that get_mae uses further down.
model = DecisionTreeRegressor(random_state=0)

# Fit the model on every row of X, y (no hold-out yet)
model.fit(X,y)

In [35]:
# Spot-check the fitted model on the first few rows of X.
first_rows = X.head()
print("Making predictions for these values")
print(first_rows)

print("Predictions are...")
print(model.predict(first_rows))

In [36]:
from sklearn.metrics import mean_absolute_error

# In-sample check: predict on X and compare with y, i.e. the same data that
# was passed to model.fit earlier in the notebook.
predicted_vals = model.predict(X)
mean_absolute_error(y_true=y, y_pred=predicted_vals)

In [37]:
from sklearn.model_selection import train_test_split

# Hold out part of the data for validation; random_state pins the split so
# the same rows land in each subset on every run.
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=0)

# Seed the tree as well (as get_mae does): with only the split fixed, the
# printed MAE can still drift from one run to the next.
# NOTE: this rebinds `model`, replacing the tree fitted on the full data.
model = DecisionTreeRegressor(random_state=0)
model.fit(train_X, train_y)

# Score on rows that were not used for fitting.
val_predictions = model.predict(val_X)
print(mean_absolute_error(val_y, val_predictions))

In [38]:
# Compare validation error (MAE) across tree sizes to find a good max_leaf_nodes

from sklearn.metrics import mean_absolute_error
from sklearn.tree import DecisionTreeRegressor

def get_mae(max_leaf_nodes,predictors_train,predictors_val,targ_train,targ_val):
    """Return the validation MAE of a decision tree capped at `max_leaf_nodes`.

    A DecisionTreeRegressor (random_state=0) is fitted on
    (predictors_train, targ_train) and then asked to predict
    predictors_val; the result is the mean absolute error of those
    predictions measured against targ_val.
    """
    tree = DecisionTreeRegressor(random_state=0, max_leaf_nodes=max_leaf_nodes)
    tree.fit(predictors_train, targ_train)
    return mean_absolute_error(targ_val, tree.predict(predictors_val))

In [39]:
# Sweep the leaf-count cap from very small to very large and report the
# validation error for each setting (%d drops the fractional part of the MAE).
for max_leaf_nodes in (5, 50, 500, 5000):
    my_mae = get_mae(
        max_leaf_nodes,
        predictors_train=train_X,
        predictors_val=val_X,
        targ_train=train_y,
        targ_val=val_y,
    )
    print("MAx leaf nodes : %d \t\t MAE : %d"%(max_leaf_nodes, my_mae))

In [40]:
# Random forest regressor

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Seed the forest: it is built from random bootstrap samples, so without a
# fixed random_state the validation MAE changes on every run.
forest_model = RandomForestRegressor(random_state=0)
forest_model.fit(train_X, train_y)

# Evaluate on the held-out validation split.
forest_preds = forest_model.predict(val_X)
print(mean_absolute_error(val_y, forest_preds))

In [41]:
# Submitting the kernel

# Reload the full training set: the submission model is fitted on every row,
# not only on the training part of the validation split.
train = pd.read_csv('../input/train.csv')

# NOTE: train_X / train_y are rebound here and no longer hold the validation
# split, so earlier cells that pair them with val_X / val_y must not be
# re-run after this point without re-creating the split first.
train_y = train.SalePrice

predictor_cols = ['LotArea', 'OverallQual','YearBuilt','TotRmsAbvGrd']
train_X = train[predictor_cols]

# Fixed seed so that the same submission is produced on every run.
sub_model = RandomForestRegressor(random_state=0)
sub_model.fit(train_X,train_y)



In [42]:

# Load the test set and keep the same feature columns the submission model
# was fitted on.
test = pd.read_csv('../input/test.csv')
test_X = test[predictor_cols]

# Predict a sale price for every test row.
sub_preds = sub_model.predict(test_X)

# Pair each prediction with the Id of its test row.
my_submission = pd.DataFrame({'Id': test['Id'], 'SalePrice': sub_preds})
print(my_submission)

# Write the submission file, leaving out the DataFrame index.
my_submission.to_csv('submission.csv', index=False)