# House Price Model Using AWS SageMaker

This notebook uses a Random Forest regression to predict house prices.

## Setup
Specify the S3 prefix for the data used to train the model, and create the SageMaker session and execution role.


In [41]:
import sagemaker
from sagemaker import get_execution_role

# Session object used for every SageMaker call made from this notebook.
sagemaker_session = sagemaker.Session()

# Execution role attached to this notebook instance; passed to the estimator.
role = get_execution_role()

# Key prefix under which this project's data is stored in S3.
prefix = 'Scikit-house'





In [42]:
import numpy as np
import sklearn.cluster
import pickle
import gzip
import urllib.request
import json
#import mxnet as mx
import boto3
import time
import io
import os
## New for this Project
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
try:
    # Current location of train_test_split.
    from sklearn.model_selection import train_test_split
except ImportError:
    # Older scikit-learn releases only provide the since-removed module.
    from sklearn.cross_validation import train_test_split



## Data 

Download the data from the S3 bucket and copy it into the '/home/ec2-user/SageMaker/data' folder.

In [43]:
# Resource-level S3 handle; its low-level client is used for the download below.
s3 = boto3.Session().resource('s3')

# Print the name of the default bucket associated with the SageMaker session.
print(sagemaker_session.default_bucket())

sagemaker-us-east-2-029880428228


In [44]:

# Copy the modeling dataset from S3 onto the notebook instance.
local_data_path = '/home/ec2-user/SageMaker/data/DataForModeling.csv'

# The destination directory must exist before the download starts; create it
# so the cell also works on a fresh instance.
os.makedirs(os.path.dirname(local_data_path), exist_ok=True)

s3.meta.client.download_file('mlbuckethose', 'ml_data/DataForModeling.csv', local_data_path)

In [45]:
# Local directory whose contents are uploaded as training input.
# NOTE(review): this is a relative path, while the download cell writes to
# /home/ec2-user/SageMaker/data -- presumably the notebook is expected to run
# from /home/ec2-user/SageMaker; confirm.
WORK_DIRECTORY = 'data'

# Upload under "<prefix>/<WORK_DIRECTORY>"; the returned value is passed to fit().
train_input = sagemaker_session.upload_data(
    WORK_DIRECTORY,
    key_prefix="/".join([prefix, WORK_DIRECTORY]),
)

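Configure the scikit-learn estimator that will run the training script (the data is read into a dataframe later, in the prediction section).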

In [46]:
from sagemaker.sklearn.estimator import SKLearn

# Training script handed to the estimator as its entry point.
script_path = 'sklearn-randomforest.py'

# NOTE(review): the variable name `sklearn` rebinds the `sklearn` module name
# created by `import sklearn.cluster` in the imports cell. Later cells call
# fit/deploy/delete_endpoint on this name, so it is kept as-is here.
sklearn = SKLearn(
    role=role,
    sagemaker_session=sagemaker_session,
    entry_point=script_path,
    train_instance_type="ml.c4.xlarge",
)

In [57]:
# Start the SageMaker training job; the uploaded S3 data is supplied as the
# 'train' input channel.
sklearn.fit({'train': train_input})

INFO:sagemaker:Creating training-job with name: sagemaker-scikit-learn-2019-03-18-01-40-09-396


2019-03-18 01:40:09 Starting - Starting the training job...
2019-03-18 01:40:10 Starting - Launching requested ML instances......
2019-03-18 01:41:11 Starting - Preparing the instances for training...
2019-03-18 01:42:06 Downloading - Downloading input data
2019-03-18 01:42:06 Training - Training image download completed. Training in progress.
2019-03-18 01:42:06 Uploading - Uploading generated training model.
[31m2019-03-18 01:42:01,033 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training[0m
[31m2019-03-18 01:42:01,036 sagemaker-containers INFO     No GPUs detected (normal if no gpus installed)[0m
[31m2019-03-18 01:42:01,050 sagemaker_sklearn_container.training INFO     Invoking user training script.[0m
[31m2019-03-18 01:42:01,301 sagemaker-containers INFO     Module sklearn-randomforest does not provide a setup.py. [0m
[31mGenerating setup.py[0m
[31m2019-03-18 01:42:01,302 sagemaker-containers INFO     Generating setup.cfg[0m
[31m2019-03-

## Using the trained model to make inference requests <a class="anchor" id="inference"></a>

### Deploy the model <a class="anchor" id="deploy"></a>

Deploying the model to SageMaker hosting just requires a `deploy` call on the fitted model. This call takes an instance count and instance type.

In [58]:
# Deploy the fitted estimator to one ml.m4.xlarge hosting instance; the
# returned `predictor` is used for the inference requests below.
predictor = sklearn.deploy(initial_instance_count=1, instance_type="ml.m4.xlarge")

INFO:sagemaker:Creating model with name: sagemaker-scikit-learn-2019-03-18-01-40-09-396
INFO:sagemaker:Creating endpoint with name sagemaker-scikit-learn-2019-03-18-01-40-09-396


---------------------------------------------------------------!

### Choose some data and use it for a prediction <a class="anchor" id="prediction_request"></a>

In order to do some predictions, we'll extract some of the data we used for training and do predictions against it. This is, of course, bad statistical practice, but a good way to see how the mechanism works.

In [60]:
# Read the downloaded dataset and keep only its first 50 rows as a small
# sample for the prediction demo.
df = pd.read_csv('/home/ec2-user/SageMaker/data/DataForModeling.csv').iloc[:50]
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 34 columns):
MSSubClass       50 non-null int64
LotArea          50 non-null int64
OverallQual      50 non-null int64
OverallCond      50 non-null int64
YearBuilt        50 non-null int64
YearRemodAdd     50 non-null int64
BsmtFinSF1       50 non-null int64
BsmtFinSF2       50 non-null int64
BsmtUnfSF        50 non-null int64
TotalBsmtSF      50 non-null int64
1stFlrSF         50 non-null int64
2ndFlrSF         50 non-null int64
LowQualFinSF     50 non-null int64
GrLivArea        50 non-null int64
BsmtFullBath     50 non-null int64
BsmtHalfBath     50 non-null int64
FullBath         50 non-null int64
HalfBath         50 non-null int64
BedroomAbvGr     50 non-null int64
KitchenAbvGr     50 non-null int64
TotRmsAbvGrd     50 non-null int64
Fireplaces       50 non-null int64
GarageCars       50 non-null int64
GarageArea       50 non-null int64
WoodDeckSF       50 non-null int64
OpenPorchSF      50 no

In [61]:
# Separate the sample into model inputs (xData) and the SalePrice target
# (yData). `df` itself is left unmodified, so re-running this cell yields the
# same result instead of finding the column already deleted.
if "SalePrice" in df.columns:
    yData = df["SalePrice"]
    xData = df.drop("SalePrice", axis=1)
else:
    # No target column present: every column is a feature and yData is left
    # as it was.
    xData = df

In [49]:
# NOTE(review): the execution count of this cell (49) is lower than that of
# the cells around it, so it was last run out of order. On a top-to-bottom run
# it rebinds `df` to the full dataset; `xData` and `yData` are not reassigned
# here and still hold the 50-row sample.
df = pd.read_csv('/home/ec2-user/SageMaker/data/DataForModeling.csv')

In [62]:
# Send the sample features to the endpoint and print the returned predictions,
# then print the actual sale prices for a side-by-side comparison.
print(predictor.predict(xData.values))
print(yData.values)

[210670.  177600.  225150.  145290.  257150.  144480.  308190.2 219900.
 141140.  123500.  129950.  355126.1 132450.  259530.  156725.  149697.9
 144850.   93080.  155775.  135400.  316661.9 145135.  218279.3 141310.
 148800.  258429.  131240.  298089.9 187900.   68490.   56850.  144380.
 213984.3 165600.  291906.1 300921.3 144400.  149015.  116500.   87297.1
 158200.  147000.  142050.  122770.  139900.  316788.5 271118.  247270.
 115983.4 127700. ]
[208500 181500 223500 140000 250000 143000 307000 200000 129900 118000
 129500 345000 144000 279500 157000 132000 149000  90000 159000 139000
 325300 139400 230000 129900 154000 256300 134800 306000 207500  68500
  40000 149350 179900 165500 277500 309000 145000 153000 109000  82000
 160000 170000 144000 130250 141000 319900 239686 249700 113000 127000]


### Endpoint cleanup <a class="anchor" id="endpoint_cleanup"></a>

When you're done with the endpoint, you'll want to clean it up.

In [63]:
# Delete the hosted endpoint through the predictor returned by deploy().
# Deleting via the estimator is deprecated in the SageMaker Python SDK.
predictor.delete_endpoint()

INFO:sagemaker:Deleting endpoint with name: sagemaker-scikit-learn-2019-03-18-01-40-09-396


## Appendix: Script to be used by SageMaker

## Create a Scikit-learn script to train with <a class="anchor" id="create_sklearn_script"></a>
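SageMaker can now run a scikit-learn script using the `SKLearn` estimator. The script below is saved as `sklearn-randomforest.py`, the file name passed as `entry_point` to the estimator above.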

```python
from __future__ import print_function

import argparse
import os
import pandas as pd
from sklearn import svm
from sklearn.externals import joblib
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
#from sklearn.cross_validation import train_test_split


# Name of the column that holds the regression target.
TARGET_COLUMN = "SalePrice"


def load_training_frame(train_dir):
    """Read every file in ``train_dir`` as CSV and stack them into one DataFrame.

    Files are read in sorted name order and the result gets a fresh 0..n-1
    index, so the row order does not depend on directory listing order.

    Raises:
        ValueError: if ``train_dir`` contains no files.
    """
    input_files = [os.path.join(train_dir, name) for name in sorted(os.listdir(train_dir))]
    if len(input_files) == 0:
        raise ValueError(('There are no files in {}.\n' +
                          'This usually indicates that the channel ({}) was incorrectly specified,\n' +
                          'the data specification in S3 was incorrectly specified or the role specified\n' +
                          'does not have permission to access the data.').format(train_dir, "train"))
    frames = [pd.read_csv(path, engine="python") for path in input_files]
    return pd.concat(frames, ignore_index=True)


def split_features_target(frame, target=TARGET_COLUMN):
    """Return ``(features, target_values)`` without modifying ``frame``.

    Raises:
        ValueError: if ``frame`` has no ``target`` column. Training cannot
            proceed without labels, so fail here with a clear message.
    """
    if target not in frame.columns:
        raise ValueError("Training data has no '{}' column; found columns: {}".format(
            target, list(frame.columns)))
    return frame.drop(target, axis=1), frame[target]


if __name__ == '__main__':
    parser = argparse.ArgumentParser()

    # SageMaker-specific arguments. Defaults are read from the environment
    # variables of the training container.
    parser.add_argument('--output-data-dir', type=str, default=os.environ['SM_OUTPUT_DATA_DIR'])
    parser.add_argument('--model-dir', type=str, default=os.environ['SM_MODEL_DIR'])
    parser.add_argument('--train', type=str, default=os.environ['SM_CHANNEL_TRAIN'])

    args = parser.parse_args()

    # Load all files of the 'train' channel and separate features from target.
    train_data = load_training_frame(args.train)
    xData, yData = split_features_target(train_data)

    # No hyperparameters are exposed: the random forest regressor is trained
    # with scikit-learn's default settings.
    rf = RandomForestRegressor()
    rf.fit(xData, yData)

    # Persist the fitted model; model_fn loads it back under the same file name.
    joblib.dump(rf, os.path.join(args.model_dir, "model.joblib"))


def model_fn(model_dir):
    """Load and return the fitted model stored in ``model_dir``.

    The file name used here ("model.joblib") has to match the name under which
    the training entry point saved the model.
    """
    model_path = os.path.join(model_dir, "model.joblib")
    return joblib.load(model_path)
```