# Modelización (regresión lineal) y disponibilización como API en EC2

In [1]:
import numpy as np
import pandas as pd
from flask import Flask, jsonify, request
from sklearn import datasets, linear_model
# NOTE: sklearn.externals.joblib was deprecated in scikit-learn 0.21 and
# removed in 0.23; on newer versions use `import joblib` instead.
from sklearn.externals import joblib
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

#### 1. Carga de datos (precios de casas en Boston)

In [2]:
# Load the Boston house-prices dataset bundled with scikit-learn and print
# the description text that ships with it.
boston = datasets.load_boston()
print(boston['DESCR'])

.. _boston_dataset:

Boston house prices dataset
---------------------------

**Data Set Characteristics:**  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pu

#### 2. Observación de los datos

In [3]:
# Show a single observation (row `ind`): every explanatory variable next to
# its name, followed by the target value of that same row.
ind = 0
sample_features = boston['data'][ind]
sample_target = boston['target'][ind]

print("---VARIABLES EXPLICATIVAS---")
for name, observed in zip(boston['feature_names'], sample_features):
    # print() applies str() to each argument, matching the explicit str() concat.
    print(name, observed, sep=": ")

print("\n---VARIABLE OBJETIVO---")
print("MEDV: ", sample_target, sep="")

---VARIABLES EXPLICATIVAS---
CRIM: 0.00632
ZN: 18.0
INDUS: 2.31
CHAS: 0.0
NOX: 0.538
RM: 6.575
AGE: 65.2
DIS: 4.09
RAD: 1.0
TAX: 296.0
PTRATIO: 15.3
B: 396.9
LSTAT: 4.98

---VARIABLE OBJETIVO---
MEDV: 24.0


#### 3. Separación en un conjunto de entrenamiento (70%) y otro de test (30%)

In [4]:
# Feature matrix and target vector exactly as provided by the dataset.
X, y = boston['data'], boston['target']

# Hold out 30% of the rows for testing; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1018,
)

#### 4. Instanciación de un modelo de regresión lineal

In [5]:
# Linear regression estimator created with scikit-learn's default settings
# (no constructor arguments are overridden); it is fitted in a later cell.
regr = linear_model.LinearRegression()

#### 5. Entrenamiento (construcción) y guardado del modelo

In [10]:
# Fit on the training split, then persist the fitted estimator to
# 'model.pkl' so the API cell can reload it from disk.
regr.fit(X_train, y_train)
joblib.dump(regr, 'model.pkl')

# Report one fitted coefficient per explanatory variable, pairing each
# feature name with the coefficient at the same position.
print("   COEFICIENTES DE LA REGRESIÓN   ")
print("----------------------------------")
for name, coefficient in zip(boston['feature_names'], regr.coef_):
    print(name, coefficient, sep=": ")

   COEFICIENTES DE LA REGRESIÓN   
----------------------------------
CRIM: -0.11193902050260902
ZN: 0.05674996301585538
INDUS: 0.06669819526661612
CHAS: 3.405086305207178
NOX: -18.74445023841001
RM: 2.986471486055574
AGE: -0.008142450521635392
DIS: -1.7022330690842205
RAD: 0.3024273631181672
TAX: -0.012318143478286407
PTRATIO: -0.9390676040160599
B: 0.00836367800618106
LSTAT: -0.5558901137172408


#### 6. Validación (testeo) del modelo

In [7]:
# Evaluate on the held-out test split. Both metrics are rounded to two
# decimals; R² is scaled by 100 so it is displayed as a percentage.
y_pred = regr.predict(X_test)

msq = np.round(mean_squared_error(y_test, y_pred), 2)
r2 = np.round(100 * r2_score(y_test, y_pred), 2)

print("Error cuadrático medio: ", msq, sep="")
print("Varianza explicada por el modelo: ", r2, "%", sep="")

Error cuadrático medio: 20.8
Varianza explicada por el modelo: 77.15%


#### 7. Disponibilización del modelo como API

In [None]:
app = Flask(__name__)


@app.route('/predict', methods=['POST'])
def predict():
    """Return model predictions for the observations sent in the request body.

    The JSON body is handed to ``pd.DataFrame`` and only the resulting
    ``.values`` array is passed to the model, so column labels are dropped:
    features are matched to the model purely by position. The model was
    fitted on the raw ``boston['data']`` array, so each observation must
    supply its values in that same column order.

    Returns:
        200 with ``{"prediction": [...]}`` (one value per observation) on
        success, or 400 with ``{"error": "..."}`` when the body is missing,
        is not valid JSON, or cannot be turned into a valid model input.
    """
    # silent=True yields None instead of raising on a missing or malformed
    # body, so the client gets an explicit 400 instead of an opaque 500.
    payload = request.get_json(silent=True)
    if not payload:
        return jsonify({'error': 'Request body must be non-empty JSON.'}), 400

    try:
        query = pd.DataFrame(payload).values
        prediction = regr.predict(query)
    except (ValueError, TypeError) as exc:
        # Raised for payloads pandas cannot tabulate, or for arrays whose
        # shape or dtype the estimator rejects (e.g. wrong number of features).
        return jsonify({'error': str(exc)}), 400

    # tolist() converts NumPy scalars (and nested arrays) into plain Python
    # numbers, which are always JSON serializable.
    return jsonify({'prediction': prediction.tolist()})


if __name__ == '__main__':
    # joblib.load unpickles the file: only load model files you trust.
    regr = joblib.load('model.pkl')
    # Flask's development server; with no host argument it listens on
    # 127.0.0.1 only (see the "Running on" line in the cell output).
    app.run(port=8080)

 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
   Use a production WSGI server instead.
 * Debug mode: off


 * Running on http://127.0.0.1:8080/ (Press CTRL+C to quit)
