# Modelización mediante una regresión lineal

In [1]:
import numpy as np
import pandas as pd
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

# joblib is distributed as a standalone package; the copy vendored under
# sklearn.externals was deprecated in scikit-learn 0.21 and removed in 0.23.
# Prefer the standalone package and fall back to the vendored one on old installs.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

#### 1. Carga de datos (precios de casas en Boston)

In [2]:
# Load the Boston house-prices dataset bundled with scikit-learn and print the
# description text that ships with it.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in 1.2 —
# confirm the installed scikit-learn version still provides it.
boston = datasets.load_boston()
print(boston['DESCR'])

Boston House Prices dataset

Notes
------
Data Set Characteristics:  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive
    
    :Median Value (attribute 14) is usually the target

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pupil-teacher ratio by town
      

#### 2. Observación de los datos

In [3]:
# Display a single observation: every explanatory variable with its value,
# followed by the target value (MEDV) for the same row.
ind = 0  # index of the row to inspect
print("---VARIABLES EXPLICATIVAS---")
for vble_name, value in zip(boston['feature_names'], boston['data'][ind]):
    # print() applies str() to each argument, so this matches manual concatenation.
    print(vble_name, value, sep=": ")
print("\n---VARIABLE OBJETIVO---")
print("MEDV: ", boston['target'][ind], sep="")

---VARIABLES EXPLICATIVAS---
CRIM: 0.00632
ZN: 18.0
INDUS: 2.31
CHAS: 0.0
NOX: 0.538
RM: 6.575
AGE: 65.2
DIS: 4.09
RAD: 1.0
TAX: 296.0
PTRATIO: 15.3
B: 396.9
LSTAT: 4.98

---VARIABLE OBJETIVO---
MEDV: 24.0


#### 3. Separación en un conjunto de entrenamiento (70%) y otro de test (30%)

In [4]:
# Wrap the raw arrays in labelled DataFrames, then hold out 30% of the rows as a
# test set. The fixed random_state keeps the split reproducible across runs.
X = pd.DataFrame(data=boston['data'], columns=boston['feature_names'])
y = pd.DataFrame(data=boston['target'], columns=['MEDV'])
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1018,
)
# Preview the first rows of the training features.
X_train.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
118,0.13058,0.0,10.01,0.0,0.547,5.872,73.1,2.4775,6.0,432.0,17.8,338.63,15.37
392,11.5779,0.0,18.1,0.0,0.7,5.036,97.0,1.77,24.0,666.0,20.2,396.9,25.68
61,0.17171,25.0,5.13,0.0,0.453,5.966,93.4,6.8185,8.0,284.0,19.7,378.08,14.44
381,15.8744,0.0,18.1,0.0,0.671,6.545,99.1,1.5192,24.0,666.0,20.2,396.9,21.08
53,0.04981,21.0,5.64,0.0,0.439,5.998,21.4,6.8147,4.0,243.0,16.8,396.9,8.43


#### 4. Instanciación de un modelo de regresión lineal

In [5]:
# Create an unfitted linear regression estimator using scikit-learn's default settings.
regr = linear_model.LinearRegression()

#### 5. Entrenamiento (construcción) y guardado del modelo

In [6]:
# Fit the regression on the training split and write the fitted estimator
# to 'model.pkl' in the current working directory.
regr.fit(X_train, y_train)
joblib.dump(regr, 'model.pkl')

# Print a banner followed by the fitted coefficient array, one item per line.
print(
    "   COEFICIENTES DE LA REGRESIÓN   ",
    "----------------------------------",
    regr.coef_,
    sep="\n",
)

   COEFICIENTES DE LA REGRESIÓN   
----------------------------------
[[-1.10329111e-01  5.66997361e-02  6.72411687e-02  3.40675302e+00
  -1.87922895e+01  2.97989171e+00 -8.07715162e-03 -1.70215951e+00
   3.01797822e-01 -1.23099758e-02 -9.39754754e-01  8.50202554e-03
  -5.56888218e-01]]


#### 6. Validación (testeo) del modelo

In [7]:
# Predict on the held-out test split and score the predictions.
y_pred = regr.predict(X_test)

# Mean squared error, rounded to two decimals.
msq = np.round(mean_squared_error(y_test, y_pred), decimals=2)
# R^2 score expressed as a percentage, rounded to two decimals.
r2 = np.round(r2_score(y_test, y_pred) * 100, decimals=2)

# `!s` forces str() on the values, matching explicit str() concatenation.
print(f"Error cuadrático medio: {msq!s}")
print(f"Varianza explicada por el modelo: {r2!s}%")

Error cuadrático medio: 20.8
Varianza explicada por el modelo: 77.14%


#### 7. Conversión de los datos de test a JSON

In [8]:
# Serialise the test features as a JSON array with one object per row, then
# preview the first 1000 characters of the resulting string.
data = X_test.to_json(orient="records")
data[:1000]

'[{"CRIM":0.10574,"ZN":0.0,"INDUS":27.74,"CHAS":0.0,"NOX":0.609,"RM":5.983,"AGE":98.8,"DIS":1.8681,"RAD":4.0,"TAX":711.0,"PTRATIO":20.1,"B":390.11,"LSTAT":18.07},{"CRIM":0.37578,"ZN":0.0,"INDUS":10.59,"CHAS":1.0,"NOX":0.489,"RM":5.404,"AGE":88.6,"DIS":3.665,"RAD":4.0,"TAX":277.0,"PTRATIO":18.6,"B":395.24,"LSTAT":23.98},{"CRIM":0.06211,"ZN":40.0,"INDUS":1.25,"CHAS":0.0,"NOX":0.429,"RM":6.49,"AGE":44.4,"DIS":8.7921,"RAD":1.0,"TAX":335.0,"PTRATIO":19.7,"B":396.9,"LSTAT":5.98},{"CRIM":88.9762,"ZN":0.0,"INDUS":18.1,"CHAS":0.0,"NOX":0.671,"RM":6.968,"AGE":91.9,"DIS":1.4165,"RAD":24.0,"TAX":666.0,"PTRATIO":20.2,"B":396.9,"LSTAT":17.21},{"CRIM":0.05023,"ZN":35.0,"INDUS":6.06,"CHAS":0.0,"NOX":0.4379,"RM":5.706,"AGE":28.4,"DIS":6.6407,"RAD":1.0,"TAX":304.0,"PTRATIO":16.9,"B":394.02,"LSTAT":12.43},{"CRIM":0.26169,"ZN":0.0,"INDUS":9.9,"CHAS":0.0,"NOX":0.544,"RM":6.023,"AGE":90.4,"DIS":2.834,"RAD":4.0,"TAX":304.0,"PTRATIO":18.4,"B":396.3,"LSTAT":11.72},{"CRIM":1.61282,"ZN":0.0,"INDUS":8.14,"CHAS":0