# Introduction
**This will be your workspace for Kaggle's Machine Learning education track.**

You will build and continually improve a model to predict housing prices as you work through each tutorial.  Fork this notebook and write your code in it.

The data from the tutorial, the Melbourne data, is not available in this workspace.  You will need to translate the concepts to work with the data in this notebook, the Iowa data.

Come to the [Learn Discussion](https://www.kaggle.com/learn-forum) forum for any questions or comments. 

# Write Your Code Below



In [9]:
import pandas as pd

# Location of the training CSV inside the Kaggle workspace.
main_file_path = '../input/train.csv'
# Load the CSV into a DataFrame; later cells reference it as `data`.
data = pd.read_csv(main_file_path)
# Print the summary statistics returned by describe().
data_summary = data.describe()
print(data_summary)

In [10]:
# List every column name available in the loaded DataFrame.
column_index = data.columns
print(column_index)

In [11]:
# Pull the sale price column out on its own.
data_price = data['SalePrice']

# Show the first few sale prices.
price_preview = data_price.head()
print(price_preview)

In [12]:
# Select several columns at once by passing a list of column names.
columns = ['LotArea', 'LotFrontage']
two_columns_data = data.loc[:, columns]

In [13]:
# Summary statistics for the two selected columns (shown as the cell output).
two_columns_summary = two_columns_data.describe()
two_columns_summary

In [14]:
# Prediction target: the sale price of each house.
y = data['SalePrice']

In [16]:
# Columns used as model inputs.
feature_columns = [
    'LotArea',
    'YearBuilt',
    '1stFlrSF',
    '2ndFlrSF',
    'FullBath',
    'BedroomAbvGr',
    'TotRmsAbvGrd',
]
# Feature matrix restricted to those columns.
X = data.loc[:, feature_columns]

In [18]:
# Display the first five rows of the feature matrix.
X.head(n=5)

In [22]:
# import the decision tree regression model
from sklearn.tree import DecisionTreeRegressor

In [23]:
# Seed the tree so that re-running the notebook gives the same fitted model.
model = DecisionTreeRegressor(random_state=0)

# Fit the model on the full feature matrix and target.
model.fit(X, y)

In [25]:
# Take the first five feature rows and show them next to the model's output.
first_rows = X.head()
print(first_rows)

# Predicted values for those same rows.
first_predictions = model.predict(first_rows)
print(first_predictions)

In [28]:
# Measure how far the model's predictions are from the actual sale prices,
# using mean absolute error (MAE) as the metric.

from sklearn.metrics import mean_absolute_error

# Predict on X and compare against y; an earlier cell fitted the model on this
# same X and y, so the figure is an in-sample error.
predicted_labels = model.predict(X)
in_sample_error = mean_absolute_error(y, predicted_labels)
print(in_sample_error)

## Training the model by splitting the data into training and test sets

In [45]:
# train_test_split divides the features and the target into a training part
# and a testing part; random_state=0 makes the split repeatable.
from sklearn.model_selection import train_test_split

split_parts = train_test_split(X, y, random_state=0)
X_train, X_test, y_train, y_test = split_parts


In [46]:
# Report the shapes of both halves of the split.
train_shapes = (X_train.shape, y_train.shape)
test_shapes = (X_test.shape, y_test.shape)
print("Shape of training data:\n\tX:{} \t y:{}".format(*train_shapes))
print("Shape of testing data:\n\tX:{} \t y:{}".format(*test_shapes))

### Fit the model with training data

In [47]:
from sklearn.tree import DecisionTreeRegressor

# Fresh tree for the hold-out experiment; the fixed seed keeps results reproducible.
model = DecisionTreeRegressor(random_state=0)

# Fit on the training portion only, so the test portion stays unseen.
model.fit(X_train, y_train)

In [48]:
# Predict on the testing features; the predictions are stored in
# `prediction_labels` for scoring with mean absolute error.
prediction_labels = model.predict(X_test)

In [49]:
from sklearn.metrics import mean_absolute_error

# Mean absolute error between the true test targets and the model's
# predictions for the testing data.
holdout_pair = (y_test, prediction_labels)
error = mean_absolute_error(*holdout_pair)

In [50]:
# Show the most recently computed mean absolute error.
print(error)

In [51]:
# Utility for tuning tree size by trial and error: fit a decision tree capped
# at a given number of leaves and report its mean absolute error on held-out data.

def get_mae(max_leaf_nodes, X_train, y_train, X_val, y_val):
    """Return the validation MAE of a decision tree limited to `max_leaf_nodes` leaves.

    Parameters
    ----------
    max_leaf_nodes : int
        Upper bound on the number of leaf nodes, forwarded to DecisionTreeRegressor.
    X_train, y_train
        Features and targets the tree is fitted on.
    X_val, y_val
        Held-out features and targets the fitted tree is scored on.

    Returns
    -------
    The value of mean_absolute_error(y_val, predictions for X_val).
    """
    # Forward max_leaf_nodes so each call really trains a differently sized tree;
    # the fixed random_state keeps runs comparable across sizes.
    pred_model = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=0)
    pred_model.fit(X_train, y_train)
    predicted_labels = pred_model.predict(X_val)
    return mean_absolute_error(y_val, predicted_labels)


In [55]:
# Try a range of leaf-node limits and record the MAE that get_mae returns for each.
error_dict = {}
leaf_node_candidates = (5, 50, 500, 5000, 10000, 150000)
for max_leaf_node in leaf_node_candidates:
    error = get_mae(max_leaf_node, X_train, y_train, X_test, y_test)
    error_dict[max_leaf_node] = error
    report_line = "Max leaf node: {} \t mean absolute error: {}".format(max_leaf_node, error)
    print(report_line)

### Prediction using Random Forest Algorithm

In [62]:
# Fit a random forest on the training split and score it on the test split.
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Random forests are stochastic; a fixed seed makes the reported error repeatable.
model = RandomForestRegressor(random_state=0)
model.fit(X_train, y_train)
pred_labels = model.predict(X_test)
# Mean absolute error of the forest's predictions on the test split.
error = mean_absolute_error(y_test, pred_labels)
print(error)

# Writing code for submission


In [64]:
# imports 
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor

In [65]:
# Load the training and test sets.
training_data, test_data = (
    pd.read_csv(csv_path) for csv_path in ('../input/train.csv', '../input/test.csv')
)

# Columns used as predictors for both sets.
predictor_cols = ['LotArea', 'OverallQual', 'YearBuilt', 'TotRmsAbvGrd']

# Training inputs and target.
X_train = training_data[predictor_cols]
y_train = training_data['SalePrice']

# Test inputs restricted to the same predictor columns.
X_test = test_data[predictor_cols]

In [66]:
# Train the submission model; the fixed seed means re-running the notebook
# reproduces the same predictions.
model = RandomForestRegressor(random_state=0)
model.fit(X_train, y_train)
# Predicted values for every row of X_test.
pred_labels = model.predict(X_test)
print(pred_labels)

In [68]:
# Pair each test-set Id with its predicted sale price.
submission_columns = {'Id': test_data['Id'], 'SalePrice': pred_labels}
my_submission = pd.DataFrame(submission_columns)

# Write the submission file, leaving out the DataFrame index.
my_submission.to_csv('submission.csv', index=False)