Sample of code below.

In [29]:
import pandas as pd

# Path to the training CSV, relative to the notebook's working directory
main_file_path = '../input/train.csv'
# Load the CSV into a DataFrame
data = pd.read_csv(main_file_path)
# NOTE(review): looks like a leftover smoke-test print, and `data` is not
# referenced by any later cell visible in this notebook -- confirm before removing
print('hello world')

In [30]:
import pandas as pd

In [31]:
# Save filepath to variable for easier access
melbourne_file_path = '../input/train.csv'
# Read the CSV and store it in a DataFrame named melbourne_data.
# NOTE(review): this is the same path the first cell already loaded into `data`,
# and later cells read columns such as SalePrice and LotArea from it -- the
# "melbourne" prefix is presumably left over from a tutorial; confirm.
melbourne_data = pd.read_csv(melbourne_file_path)
# Print the summary statistics produced by describe()
print(melbourne_data.describe())

In [32]:
# See what is in this dataset: print the DataFrame's column labels
print(melbourne_data.columns)

In [33]:
## Subsetting data
# Pull a single column out of the frame: the sale prices, kept on their own
melbourne_price_data = melbourne_data['SalePrice']
# Peek at the first few prices
print(melbourne_price_data.head())

In [34]:
# Selecting multiple columns: name the wanted columns, then slice them out together
columns_of_interest = ['YrSold', 'LotArea']
two_columns_of_data = melbourne_data.loc[:, columns_of_interest]
# Summary statistics for just those two columns (shown as the cell output)
two_columns_of_data.describe()

In [35]:
## Build a model
# Step 1 - choose the prediction target.
# We want to predict the price of houses, so the SalePrice column is stored as y.
y = melbourne_data.SalePrice

# Step 2 - choose the predictors.
# Columns we use to predict the price, to start with:
model_predictors = ['LotArea', 'YearBuilt', '1stFlrSF', '2ndFlrSF', 'FullBath', 'BedroomAbvGr', 'TotRmsAbvGrd']
# Store predictors as x
x = melbourne_data[model_predictors]

# Step 3 - build the model using the scikit-learn library.
# Steps for building and using a model:
#   Define:   What type of model will it be? A decision tree? Some other type of
#             model? Some other parameters of the model type are specified too.
#   Fit:      Capture patterns from provided data. This is the heart of modeling.
#   Predict:  Just what it sounds like.
#   Evaluate: Determine how accurate the model's predictions are.
from sklearn.tree import DecisionTreeRegressor

# Define model. random_state is pinned (same seed value that get_mae and
# train_test_split use elsewhere in this notebook) so that re-running the cell
# builds the same tree instead of one that can vary from run to run.
melbourne_model = DecisionTreeRegressor(random_state=0)
# Fit model on the full predictor table and target
melbourne_model.fit(x, y)




In [36]:
# Sanity check on the training data: show five houses, then what the model predicts for them
print("Making predictions for the following 5 houses:", x.head(), sep="\n")
print("The predictions are", melbourne_model.predict(x.head()), sep="\n")


In [37]:
# Model validation using mean absolute error
# In-sample check: the model is scored on the same rows it was fitted on,
# which is a bad idea because it says nothing about unseen data
from sklearn.metrics import mean_absolute_error

predicted_home_prices = melbourne_model.predict(x)
# The cell output is the MAE between the true and the predicted prices
mean_absolute_error(y_true=y, y_pred=predicted_home_prices)

In [42]:
# Need to test model accuracy using validation data
# scikit-learn library has a function train_test_split to break up the data into two pieces
from sklearn.model_selection import train_test_split

# The split is based on a random number generator. Supplying a numeric value to
# the random_state argument guarantees we get the same split every time we
# run this script.
train_x, val_x, train_y, val_y = train_test_split(x, y, random_state=0)
# Define model. The tree is seeded too: a fixed split alone does not make the
# reported error repeatable if the estimator itself is left unseeded.
melbourne_model = DecisionTreeRegressor(random_state=0)
# Fit model on the training portion only
melbourne_model.fit(train_x, train_y)

# Get predicted prices on the validation data and print the resulting MAE
val_predictions = melbourne_model.predict(val_x)
print(mean_absolute_error(val_y, val_predictions))

In [39]:
# Note the difference in error between the flawed method (scoring the model on the data it was trained on)
# and the right method (splitting the data into training and validation sets)

In [44]:
## Experimenting with different models
# Avoid overfitting/underfitting the data using max_leaf_nodes

from sklearn.metrics import mean_absolute_error
from sklearn.tree import DecisionTreeRegressor

def get_mae(max_leaf_nodes, predictors_train, predictors_val, targ_train, targ_val):
    """Return the validation mean absolute error of a size-limited decision tree.

    A DecisionTreeRegressor capped at ``max_leaf_nodes`` leaves (seeded with
    random_state=0) is fitted on ``predictors_train`` / ``targ_train``; its
    predictions for ``predictors_val`` are then scored against ``targ_val``
    with mean_absolute_error, and that score is returned.
    """
    # fit() hands back the fitted estimator, so construction and fitting can be chained
    fitted_tree = DecisionTreeRegressor(
        max_leaf_nodes=max_leaf_nodes, random_state=0
    ).fit(predictors_train, targ_train)
    return mean_absolute_error(targ_val, fitted_tree.predict(predictors_val))

In [45]:
# train_x, val_x, train_y, val_y already hold the train/validation split.
# Sweep several tree sizes and print the validation MAE obtained with each one.
for max_leaf_nodes in (5, 50, 500, 5000):
    my_mae = get_mae(
        max_leaf_nodes=max_leaf_nodes,
        predictors_train=train_x,
        predictors_val=val_x,
        targ_train=train_y,
        targ_val=val_y,
    )
    print("Max leaf nodes: %d  \t\t Mean Absolute Error:  %d" % (max_leaf_nodes, my_mae))


In [None]:
# Best model is with 50 leaf nodes (gives the lowest mean absolute error: 27825)

In [46]:
# Decision trees are basic and have problems: too many leaves - overfitting, not enough leaves - underfitting
# Random forest is a more sophisticated tree-based method. It uses many trees and has better accuracy
# Build a random forest based on the data:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# random_state is pinned so the forest's random sampling -- and therefore the
# MAE printed below -- is the same on every run.
forest_model = RandomForestRegressor(random_state=0)
# Fit on the training split, then score on the held-out validation split
forest_model.fit(train_x, train_y)
melb_preds = forest_model.predict(val_x)
print(mean_absolute_error(val_y, melb_preds))

In [None]:
# Note how much better random forest performed vs decision tree (24708 vs 27825)
# There are parameters to make prediction more accurate but random forest works pretty well without tuning

In [50]:
## Posting model to machine learning competition on kaggle
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor

# Read the data
train = pd.read_csv('../input/train.csv')

# Pull data into target (y) and predictors (X)
# NOTE: this rebinds train_y. In the validation cell it was the training part of
# train_test_split; from here on it is the full SalePrice column, so the earlier
# cells that pair train_x with train_y must not be re-run without redoing the split.
train_y = train.SalePrice
predictor_cols = ['1stFlrSF', '2ndFlrSF','LotArea','YearBuilt','TotRmsAbvGrd']

# Create training predictors data
train_X = train[predictor_cols]

# random_state is pinned so that refitting reproduces the same forest and the
# same submitted predictions.
my_model = RandomForestRegressor(random_state=0)
# Fit on every training row (no hold-out here: this model is used for the submission)
my_model.fit(train_X, train_y)

In [51]:
# Load the competition's test file
test = pd.read_csv('../input/test.csv')
# The test frame gets the same treatment as the training frame: keep the same predictor columns
test_X = test.loc[:, predictor_cols]
# Ask the fitted model for a price per test row
predicted_prices = my_model.predict(test_X)
# Print the predictions so they can be eyeballed for plausibility
print(predicted_prices)

In [54]:
# Build the submission table: one row per test Id with its predicted SalePrice
my_submission = pd.DataFrame(
    {
        'Id': test['Id'],
        'SalePrice': predicted_prices,
    }
)
# Any filename works; 'submission.csv' is used here. The index is not written out.
my_submission.to_csv('submission.csv', index=False)