## What if we want to do this the other way around and use the same dataset for a classification problem?

In [22]:
import pandas as pd 
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression

%matplotlib inline

In [3]:
# Load the house dataset from the working directory, then preview the first
# rows to check that the columns were parsed as expected.
csv_path = 'kc_house_data.csv'
data = pd.read_csv(csv_path)
data.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3,1.0,1180,5650,1.0,0,0,...,7,1180.0,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,20141209T000000,538000.0,3,2.25,2570,7242,2.0,0,0,...,7,2170.0,400,1951,1991,98125,47.721,-122.319,1690,7639
2,5631500400,20150225T000000,180000.0,2,1.0,770,10000,1.0,0,0,...,6,770.0,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,20141209T000000,604000.0,4,3.0,1960,5000,1.0,0,0,...,7,1050.0,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,20150218T000000,510000.0,3,2.0,1680,8080,1.0,0,0,...,8,1680.0,0,1987,0,98074,47.6168,-122.045,1800,7503


### Now we are going to use the same data for classification. Let's make GRADE our target: we will train a model that is able to predict the GRADE of a house based on a few features.

To simplify the example, let's do this based only on the following attributes:
- price
- bedrooms
- sqft_living
- yr_built
- yr_renovated

In [8]:
# Feature columns used to predict the grade. The order matters: the same
# order must be used whenever a record is passed to the trained model.
columns = ['price', 'sqft_living', 'bedrooms', 'yr_built', 'yr_renovated']

# Plain NumPy arrays: one row of features per house, and its grade as the label.
features = data[columns].values
target = data['grade'].values

# Hold out 30% of the rows for testing. random_state is fixed so that the
# split -- and therefore the accuracies reported below -- is the same on
# every "Restart & Run All".
xtrain, xtest, ytrain, ytest = train_test_split(
    features, target, test_size=0.3, random_state=0
)

Cool, now let's create a decision tree algorithm to learn to predict the grade of the house.

In [9]:
# Decision tree with default settings, seeded with random_state=0.
dtc = DecisionTreeClassifier(random_state=0)

# Fit on the training split only; the test split stays unseen until scoring.
dtc.fit(X=xtrain, y=ytrain)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=0,
            splitter='best')

In [16]:
# Predict on both splits first, then score each set of predictions against
# the true grades so seen vs. unseen performance can be compared.
train_predictions = dtc.predict(xtrain)
test_predictions = dtc.predict(xtest)

training_accuracy = accuracy_score(ytrain, train_predictions)
testing_accuracy = accuracy_score(ytest, test_predictions)

# accuracy_score returns a fraction; scale to a percentage for display.
print("Training  set accuracy of the Decision Tree = {:.2f} %".format(100 * training_accuracy))
print("Testing set accuracy of the Decision Tree = {:.2f} %".format(100 * testing_accuracy))

Training  set accuracy of the Decision Tree = 99.95 %
Testing set accuracy of the Decision Tree = 53.50 %


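# Our model is performing poorly on the testing set; this shouldn't happen in a good model. The large gap between training and testing accuracy means the tree is overfitting. But let's try a logistic regression model.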

In [26]:
# Multinomial logistic regression over the grade classes, fitted on the same
# training split as the decision tree.
# NOTE(review): the features are passed in unscaled (StandardScaler is imported
# at the top but never used) -- this may be hurting the lbfgs fit; confirm.
logreg = LogisticRegression(
    multi_class='multinomial',
    solver='lbfgs',
    random_state=0,
)
logreg.fit(X=xtrain, y=ytrain)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='multinomial',
          n_jobs=None, penalty='l2', random_state=0, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [27]:
# Predict on both splits first, then score each set of predictions against
# the true grades so seen vs. unseen performance can be compared.
train_predictions = logreg.predict(xtrain)
test_predictions = logreg.predict(xtest)

training_accuracy = accuracy_score(ytrain, train_predictions)
testing_accuracy = accuracy_score(ytest, test_predictions)

# accuracy_score returns a fraction; scale to a percentage for display.
print("Training  set accuracy of the Logistic Regression = {:.2f} %".format(100 * training_accuracy))
print("Testing set accuracy of the Logistic Regression = {:.2f} %".format(100 * testing_accuracy))

Training  set accuracy of the Logistic Regression = 45.24 %
Testing set accuracy of the Logistic Regression = 44.82 %


### The model is performing awfully here, which of course is bad. But let's go ahead: we want to see how to use this model to predict the grade of a house.

If a house has:
- price = 239000
- sqft_living = 2300
- bedrooms = 4
- year built = 1991
- year renovated = 2002

We want to predict the grade of the house described above.

In [61]:
# Use the trained model through its predict method.
# The model expects a 2-D array with one row per house, columns in this order:
#   'price', 'sqft_living', 'bedrooms', 'yr_built', 'yr_renovated'
house = np.array([[239000, 2300, 4, 1991, 2002]])  # already shape (1, 5)
logreg.predict(house)

array([7])

This house is Grade = 7 according to our not-so-good model.

BUT did you see that we are able to predict using the model we have trained?

Suppose we are happy with the performance, we should think about deploying the model. To deploy we have to:
- Serialize the model (export into a program)
- build an API.

We can do this in the following two steps.

# Serialize the model

In [63]:
# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed
# in 0.23. joblib is installed together with scikit-learn, so import it
# directly -- this works on both old and new versions.
import joblib

# Save/export the fitted model so another program (the API) can load it.
model_filename = 'houseGrade-v1.0.pkl'
print("Saving model to {}...".format(model_filename))
joblib.dump(logreg, model_filename)

Saving model to houseGrade-v1.0.pkl...


['houseGrade-v1.0.pkl']

## Building an API using Flask (a Python web development library)

- create a new file (a plain text file will do) and copy the code from the cell below into it
- save the file as houseapi.py
- go to the terminal or command line
- type:
python houseapi.py

Your API will run and you can open the url in your browser
most probably will run on http://127.0.0.1:5000/

In [None]:
# Flask, request and jsonify are all used below, so this import must be active.
from flask import Flask, request, jsonify
# `sklearn.externals.joblib` was removed in scikit-learn 0.23; import joblib directly.
import joblib

app = Flask(__name__)

# Load the model that the notebook serialized with joblib.dump.
# NOTE: joblib/pickle files can execute arbitrary code when loaded -- only
# load model files that you created yourself.
MODEL = joblib.load('houseGrade-v1.0.pkl')
# Grade values listed for reference.
# NOTE(review): the model's predict() already returns the grade value itself,
# so this list should not be indexed with a prediction.
MODEL_LABELS = [1,2,3,4,5,6,7,8,9,10,11,12]

@app.route('/predict')
def predict():
    """Predict the grade of one house from numeric query parameters.

    Expected query parameters (all required, all numeric):
    price, sqft_living, bedrooms, yr_built, yr_renovated.

    Returns a JSON response with:
      - status: 'complete' on success, 'error' otherwise
      - label: the predicted grade (success only)
      - label_conf: string form of the predicted class probabilities (success only)
      - message: description of the problem (error only, sent with HTTP 400)
    """
    # Same order as the columns the model was trained on:
    # 'price', 'sqft_living', 'bedrooms', 'yr_built', 'yr_renovated'
    param_names = ('price', 'sqft_living', 'bedrooms', 'yr_built', 'yr_renovated')

    # Query-string values arrive as text. type=float converts them, and yields
    # None when a parameter is absent or is not a valid number.
    values = [request.args.get(name, type=float) for name in param_names]

    invalid = [name for name, value in zip(param_names, values) if value is None]
    if invalid:
        message = 'missing or non-numeric query parameter(s): ' + ', '.join(invalid)
        return jsonify(status='error', message=message), 400

    # Our model expects a list of records
    features = [values]

    # Use the model to predict the class and the per-class probabilities
    prediction = MODEL.predict(features)
    label_conf = MODEL.predict_proba(features)

    # predict() already returns the grade value (the model was trained with the
    # grades as labels), so use it directly rather than as an index into a
    # list. int() turns the NumPy integer into a JSON-serializable Python int.
    label = int(prediction[0])

    # Create and send a response to the API caller
    return jsonify(status='complete', label=label, label_conf=str(label_conf))



# Start the Flask server only when this file is run as a script
# (e.g. `python houseapi.py`), not when it is imported as a module.
if __name__ == '__main__':
    # NOTE(review): debug=True is intended for local development -- confirm it
    # is switched off before exposing this API to anyone else.
    app.run(debug=True)

now let's test your API with our new house

price=436000&sqft_living=3000&bedrooms=5&yr_built=2000&yr_renovated=2010

In [75]:
# The API should return the same grade as calling the model directly here.
# Column order: 'price', 'sqft_living', 'bedrooms', 'yr_built', 'yr_renovated'
new_house = np.array([[436000, 3000, 5, 2000, 2010]])  # already shape (1, 5)
logreg.predict(new_house)

array([7])