# Introduction
**This will be your workspace for Kaggle's Machine Learning education track.**

You will build and continually improve a model to predict housing prices as you work through each tutorial.  Fork this notebook and write your code in it.

The data from the tutorial, the Melbourne data, is not available in this workspace.  You will need to translate the concepts to work with the data in this notebook, the Iowa data.

Come to the [Learn Discussion](https://www.kaggle.com/learn-forum) forum for any questions or comments. 

# Write Your Code Below



In [None]:
import pandas as pd

# Location of the training CSV, relative to the notebook's working directory.
train_file_path = "../input/train.csv"
# read_csv parses the CSV file into a pandas DataFrame.
train_data = pd.read_csv(filepath_or_buffer=train_file_path)
# describe() is a method (not an attribute): it returns summary statistics
# for the DataFrame's columns rather than the raw data itself.
print(train_data.describe())

In [None]:


# The DataFrame's `columns` attribute holds its column labels (the header row).
print(train_data.columns)



In [None]:
# Selecting a single column
#
# A column can be pulled out with index-style brackets, train_data['LotFrontage'],
# or equivalently with attribute access, train_data.LotFrontage.
train_LotFrontage_data = train_data['LotFrontage']
# head() returns the first rows of the column; 5 is its default row count.
print(train_LotFrontage_data.head(n=5))

In [None]:
# Selecting several columns
#
# Indexing with a list of column labels (instead of a single label)
# selects all of those columns at once.
columns_of_interest = [
    'OpenPorchSF',
    '2ndFlrSF',
    '1stFlrSF',
    'YearRemodAdd',
]
columns_data_of_interest = train_data[columns_of_interest]
print(columns_data_of_interest.head(n=5))

print(columns_data_of_interest.describe())

In [None]:
# Prediction target
# By convention the prediction target is named y (lower case).
y = train_data['SalePrice']


# Predictors
# By convention the predictors are named X (upper case).
# Any column other than the prediction target may be used as a predictor.
Iowa_predicters = [
    'LotArea',
    'YearBuilt',
    '1stFlrSF',
    '2ndFlrSF',
    'FullBath',
    'BedroomAbvGr',
    'TotRmsAbvGrd',
]
X = train_data[Iowa_predicters]

In [None]:
# Build models
#
# Four steps to building and using a model:
#   Define:   What type of model will it be? A decision tree? Some other type
#             of model? Other parameters of the model type are specified too.
#   Fit:      Capture patterns from provided data. This is the heart of modeling.
#   Predict:  Just what it sounds like.
#   Evaluate: Determine how accurate the model's predictions are.

from sklearn.tree import DecisionTreeRegressor

# Define the model.
# random_state is fixed so that repeated runs build the same tree; this also
# matches the random_state=0 used by get_mae further down in this notebook.
Iowa_model = DecisionTreeRegressor(random_state=0)

# Fit the model on all predictors/targets selected above.
Iowa_model.fit(X, y)

In [None]:
# Measuring model quality with Mean Absolute Error (MAE).
# Per-row error is: error = actual - predicted
from sklearn.metrics import mean_absolute_error

# Predict on X itself, i.e. the same predictor frame used elsewhere in this
# notebook to fit Iowa_model on the full data.
predicted_home_prices = Iowa_model.predict(X)
# Argument order: mean_absolute_error(actual_value, predicted_value)
mean_absolute_error(y_true=y, y_pred=predicted_home_prices)

In [None]:
# Split the data into a training part and a validation part, for both the
# predictors and the target, using scikit-learn's built-in helper.
# The split is driven by a random number generator; passing a numeric
# random_state guarantees the same split every time this cell runs.

from sklearn.model_selection import train_test_split

train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=0)

# Fit on the training part only.
# Note: this re-fits the existing Iowa_model object in place.
Iowa_model.fit(train_X, train_y)

# Predict for the held-out validation rows.
validation_predictions = Iowa_model.predict(val_X)

# Score those predictions with MAE (displayed as the cell's output).
mean_absolute_error(y_true=val_y, y_pred=validation_predictions)

In [None]:



def get_mae(train_predictors, val_predictors, train_target, val_target, max_leaf_nodes):
    """Fit a size-limited decision tree and return its validation MAE.

    A new DecisionTreeRegressor (random_state=0) limited to `max_leaf_nodes`
    is fitted on the training data, then used to predict the validation
    predictors.

    Parameters:
        train_predictors: predictor data passed to fit().
        val_predictors: predictor data passed to predict().
        train_target: target values passed to fit().
        val_target: actual target values the predictions are scored against.
        max_leaf_nodes: forwarded to DecisionTreeRegressor(max_leaf_nodes=...).

    Returns:
        The value of mean_absolute_error(val_target, predictions).
    """
    tree = DecisionTreeRegressor(random_state=0, max_leaf_nodes=max_leaf_nodes)
    tree.fit(train_predictors, train_target)
    predictions = tree.predict(val_predictors)
    return mean_absolute_error(val_target, predictions)


# Compare validation MAE across several tree sizes.
# The value being varied is DecisionTreeRegressor's max_leaf_nodes, so the
# printed label says "leaf nodes" rather than "tree nodes".
for max_leaf_nodes in [5, 50, 100, 500, 800, 1000, 3000]:
    my_mae = get_mae(train_X, val_X, train_y, val_y, max_leaf_nodes)
    # %d shows the MAE as a whole number (fractional part dropped).
    print('Max leaf nodes: %d \t\tMean absolute error: %d' % (max_leaf_nodes, my_mae))

In [None]:
# Random forest: an ensemble of decision trees, scored here on the same
# validation split as the single decision tree for comparison.
from sklearn.ensemble import RandomForestRegressor

# Define the model.
# A random forest is built with randomness, so random_state is fixed to make
# the printed validation MAE reproducible from run to run.
Iowa_new_model = RandomForestRegressor(random_state=0)

# Fit on the training part of the split.
Iowa_new_model.fit(train_X, train_y)

# Predict for the held-out validation rows.
val_pre = Iowa_new_model.predict(val_X)

# Validation MAE of the random forest.
print(mean_absolute_error(val_y, val_pre))

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor

# Read the training file.
train_file_path = '../input/train.csv'
train_data = pd.read_csv(train_file_path)

# Select target and predictors.
# NOTE: y and train_X are re-bound here to the FULL training set; after this
# cell runs, train_X no longer refers to the training part of the earlier
# train/validation split.
y = train_data.SalePrice
Iowa_predictors = ['LotArea', 'YearBuilt', '1stFlrSF', '2ndFlrSF', 'FullBath', 'BedroomAbvGr', 'TotRmsAbvGrd']
train_X = train_data[Iowa_predictors]

# Fit the final model on all training rows.
# random_state is fixed so that re-running the notebook produces the same
# model and therefore the same submission file.
Iowa_model = RandomForestRegressor(random_state=0)
Iowa_model.fit(train_X, y)

# Read the test file.
test_file_path = '../input/test.csv'
test_data = pd.read_csv(test_file_path)
# Treat the test data the same way as the training data: same predictor columns.
test_X = test_data[Iowa_predictors]
# Predict sale prices for the test rows.
predicted_prices = Iowa_model.predict(test_X)

# Put the result into a DataFrame with the test Ids next to the predictions.
my_submission = pd.DataFrame({'Id': test_data.Id, 'SalePrice': predicted_prices})
# Save it as a CSV file without the DataFrame index column.
my_submission.to_csv('submission.csv', index=False)
print(predicted_prices)