In [1]:
import numpy as np
import pandas as pd
import math
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from IPython.display import display, Markdown

In [2]:
from sklearn.datasets import load_boston
boston_dataset = load_boston()


    The Boston housing prices dataset has an ethical problem. You can refer to
    the documentation of this function for further details.

    The scikit-learn maintainers therefore strongly discourage the use of this
    dataset unless the purpose of the code is to study and educate about
    ethical issues in data science and machine learning.

    In this special case, you can fetch the dataset from the original
    source::

        import pandas as pd
        import numpy as np


        data_url = "http://lib.stat.cmu.edu/datasets/boston"
        raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
        data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
        target = raw_df.values[1::2, 2]

    Alternative datasets include the California housing dataset (i.e.
    :func:`~sklearn.datasets.fetch_california_housing`) and the Ames housing
    dataset. You can load the datasets as follows::

        from sklearn.datasets import fetch_california_h

In [3]:
boston = pd.DataFrame(boston_dataset.data, columns=boston_dataset.feature_names) # Carregando os dados associados à chave 'data' da dict no dataframe pandas e associando as colunas do dataframe aos valores associados à chave 'feature_names'
boston["MEDV"] = boston_dataset.target
boston

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.0900,1.0,296.0,15.3,396.90,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.90,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.90,5.33,36.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0.0,0.573,6.593,69.1,2.4786,1.0,273.0,21.0,391.99,9.67,22.4
502,0.04527,0.0,11.93,0.0,0.573,6.120,76.7,2.2875,1.0,273.0,21.0,396.90,9.08,20.6
503,0.06076,0.0,11.93,0.0,0.573,6.976,91.0,2.1675,1.0,273.0,21.0,396.90,5.64,23.9
504,0.10959,0.0,11.93,0.0,0.573,6.794,89.3,2.3889,1.0,273.0,21.0,393.45,6.48,22.0


In [4]:
x = pd.DataFrame(np.c_[boston["LSTAT"], boston["RM"]], columns = ["LSTAT","RM"])
y = boston["MEDV"]

In [5]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=5)

y_train = np.array(y_train).reshape(-1,1)
y_test = np.array(y_test).reshape(-1,1)

print(x_train.shape, y_train.shape)
print(x_test.shape, y_test.shape)

(404, 2) (404, 1)
(102, 2) (102, 1)


In [6]:
class LinearRegression:
    def __init__(self, lr=0.001, n_epochs=500):
        self.lr=lr
        self.n_epochs=n_epochs
        
    def fit(self, x_train, y_train):
        self.costs = []
        self.weights = np.zeros((x_train.shape[1], 1))
        
        for _ in range(self.n_epochs):
            output = np.dot(x_train, self.weights)
            error = output - y_train
            
            gradient_vector = (np.dot(x_train.T, error))
            self.weights -= ((self.lr/x.shape[0]) * gradient_vector)
            
            rmse = math.sqrt(np.sum((error**2)) / (x_train.shape[0]))
            self.costs.append(rmse)
            
    def predict(self, x):
        return np.dot(x, self.weights)
        

In [7]:
lr = 0.01
n_epochs = 100

In [8]:
model = LinearRegression(lr=lr, n_epochs=n_epochs)
model.fit(x_train, y_train)

In [9]:
# Avaliação do Modelo com os dados de Treinamento

y_train_predict = model.predict(x_train)  
rmse = (np.sqrt(mean_squared_error(y_train, y_train_predict)))  
r2 = r2_score(y_train, y_train_predict) 

display(Markdown("**Performance do modelo para dados de treinamento**"))
print('RMSE: {}'.format(rmse))
print('R2: {}'.format(r2))
print("\n")
print("--------------------------------------")

# Avaliação do Modelo com os dados de Teste

y_test_predict = model.predict(x_test)
rmse = (np.sqrt(mean_squared_error(y_test, y_test_predict)))
r2 = r2_score(y_test, y_test_predict)

display(Markdown("**Performance do modelo para dados de teste**"))
print('RMSE: {}'.format(rmse))
print('R2: {}'.format(r2))
print("\n")

**Performance do modelo para dados de treinamento**

RMSE: 5.641311742048242
R2: 0.6295253868951043


--------------------------------------


**Performance do modelo para dados de teste**

RMSE: 5.076660395257896
R2: 0.6708237669784485


