# Regresión lineal con Sklearn 

In [None]:
# we need to install libs that are not included int the datascience kernel
pip install matplotlib seaborn

In [None]:
import datetime
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import sklearn
import sklearn.datasets
import sklearn.linear_model
import sklearn.metrics

%matplotlib inline

In [None]:
# Load the Boston housing data as a mapping with 'data', 'target' and
# 'feature_names' entries.
# `load_boston` was deprecated in scikit-learn 1.0 and removed in 1.2, where
# accessing it raises ImportError. AttributeError is caught as well in case a
# release drops the name without that shim. In either case the same arrays are
# rebuilt from the original StatLib file (network access required), following
# the recipe given in scikit-learn's deprecation notice.
try:
    data = sklearn.datasets.load_boston()
except (ImportError, AttributeError):
    boston_url = "http://lib.stat.cmu.edu/datasets/boston"
    # After a 22-line header every record spans two physical lines:
    # 11 values on the first one and the remaining 3 on the second one.
    raw_records = pd.read_csv(boston_url, sep=r"\s+", skiprows=22, header=None)
    data = {
        'data': np.hstack([raw_records.values[::2, :], raw_records.values[1::2, :2]]),
        'target': raw_records.values[1::2, 2],
        'feature_names': [
            'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
            'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT',
        ],
    }

# Keep an untouched frame (features plus the MEDV target) and work on a copy.
raw_dataset = pd.DataFrame(data['data'], columns=data['feature_names'])
raw_dataset.loc[:, 'MEDV'] = data['target']
dataset = raw_dataset.copy()

In [None]:
# Preview the last rows of the working copy (features plus the MEDV target).
dataset.tail()

In [None]:
# (rows, columns) of the full dataset, MEDV target column included.
dataset.shape

### Dataset description

Number of Instances: 506 

Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

Attribute Information:

|Feature|Description|
|---|---| 
|CRIM |     per capita crime rate by town| 
|          ZN  |      proportion of residential land zoned for lots over 25,000 sq.ft.| 
|          INDUS |    proportion of non-retail business acres per town| 
|          CHAS  |    Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)| 
|           NOX  |     nitric oxides concentration (parts per 10 million)| 
|           RM  |      average number of rooms per dwelling| 
|           AGE  |     proportion of owner-occupied units built prior to 1940| 
|           DIS  |     weighted distances to five Boston employment centres| 
|           RAD |      index of accessibility to radial highways| 
|          TAX  |     full-value property-tax rate per 10000 USD | 
|          PTRATIO|   pupil-teacher ratio by town| 
|           B    |     1000(Bk - 0.63)^2 where Bk is the proportion of black people by town| 
|           LSTAT |    % lower status of the population| 
|           MEDV  |    Median value of owner-occupied homes in $1000's| 




## División train y test

In [None]:
# Draw a reproducible 80% random sample of the rows for training ...
train_dataset = dataset.sample(frac=0.8, random_state=0)
# ... and keep every row whose index label was not drawn as the test set.
test_dataset = dataset.drop(index=train_dataset.index)

In [None]:
# (rows, columns) of the training partition.
train_dataset.shape

In [None]:
# (rows, columns) of the held-out test partition.
test_dataset.shape

# Data exploration

In [None]:
import seaborn as sns

In [None]:
# Summary statistics of the training partition, one row per column.
train_dataset.describe().T

In [None]:
# Annotated heatmap of the pairwise correlations between all training columns
# (MEDV included), drawn on an explicitly sized axes.
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(data=train_dataset.corr(), annot=True, ax=ax)

# Data preprocessing

- Separar la etiqueta o valor a predecir de las features.

In [None]:
# Separate the MEDV target from the features of each partition. The source
# frames train_dataset / test_dataset are left unmodified.
y_train = train_dataset['MEDV'].copy()
y_test = test_dataset['MEDV'].copy()

x_train = train_dataset.drop(columns='MEDV')
x_test = test_dataset.drop(columns='MEDV')

In [None]:
# Sanity check: features and labels of each partition must have the same number of rows.
print(f"{x_train.shape} {y_train.shape}")
print(f"{x_test.shape} {y_test.shape}")

## Normalization

In [None]:
# Mean and standard deviation of every training feature, one row per feature.
x_train.describe().loc[['mean', 'std']].T

- Es una buena práctica normalizar las features para que estén todas en la misma escala (media 0 y desviación estándar 1), usando únicamente las estadísticas del conjunto de train.

In [None]:
# Standardise every feature column with statistics computed on the training
# set only; the test set reuses those same values.
train_mean = x_train.mean()
train_std = x_train.std()

x_train = x_train.sub(train_mean).div(train_std)
x_test = x_test.sub(train_mean).div(train_std)

In [None]:
# Per-feature mean and standard deviation of the current x_train, one row per feature.
x_train.describe().loc[['mean', 'std']].T

# Linear Regression Model

In [None]:
# Build the linear regression estimator and train it on the training set
# (fit() returns the estimator itself, so both steps chain).
regr = sklearn.linear_model.LinearRegression().fit(x_train, y_train)

# Predictions for the testing and training sets.
y_pred_test = regr.predict(x_test)
y_pred_train = regr.predict(x_train)

In [None]:
# Fitted coefficients, each labelled with the feature column it belongs to.
coeff = pd.Series(data=regr.coef_, index=x_train.columns)
coeff

In [None]:
# Bar chart of the fitted coefficients, one bar per feature.
coeff.plot(kind='bar')

In [None]:
# Mean squared error on both partitions. scikit-learn metrics take their
# arguments as (y_true, y_pred); the MSE value itself does not depend on the
# order, but following the convention keeps this cell safe to adapt to
# metrics that are not symmetric.
mse_train = sklearn.metrics.mean_squared_error(y_train, y_pred_train)
print(f"Mean squared error train: {mse_train}")
mse_test = sklearn.metrics.mean_squared_error(y_test, y_pred_test)
print(f"Mean squared error test: {mse_test}")

In [None]:
fig, ax = plt.subplots(figsize=(6, 6))
# Plot outputs: predicted values on the x axis against observed values on the
# y axis, for the training set.
ax.scatter(y_pred_train, y_train, color="black")
# Same fixed range on both axes so they are directly comparable.
ax.set_xlim([0, 60])
ax.set_ylim([0, 60])
# Label the axes; without labels predicted and observed values cannot be told apart.
ax.set_xlabel("Predicted MEDV (train)")
ax.set_ylabel("True MEDV (train)");