# Notebook de Treino de Modelo de Regressão Linear para Previsão de Valores de Imóveis

In [1]:
import pickle
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

## 1) Carregando Dataset

In [2]:
# Read the housing dataset from the project-relative CSV file into a DataFrame.
df = pd.read_csv(filepath_or_buffer='dataset/casas.csv')
# Preview the first five rows as the cell output.
df.head(n=5)

Unnamed: 0,tamanho,ano,garagem,preco
0,159.0,2003,2,208500
1,117.0,1976,2,181500
2,166.0,2001,2,223500
3,160.0,1915,3,140000
4,204.0,2000,3,250000


#### Verificando tipo e quantidade de atributos

In [3]:
# Print each column's name, non-null count and dtype, plus the frame's memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   tamanho  1460 non-null   float64
 1   ano      1460 non-null   int64  
 2   garagem  1460 non-null   int64  
 3   preco    1460 non-null   int64  
dtypes: float64(1), int64(3)
memory usage: 45.8 KB


#### Exibindo uma estatística descritiva da base

In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) per column,
# rounded to two decimals for readability.
df.describe().round(2)

Unnamed: 0,tamanho,ano,garagem,preco
count,1460.0,1460.0,1460.0,1460.0
mean,140.8,1971.27,1.77,180921.2
std,48.82,30.2,0.75,79442.5
min,31.0,1872.0,0.0,34900.0
25%,105.0,1954.0,1.0,129975.0
50%,136.0,1973.0,2.0,163000.0
75%,165.0,2000.0,2.0,214000.0
max,524.0,2010.0,4.0,755000.0


## 2) Separando Atributos Descritivos e Alvo

In [5]:
# Column to predict; every remaining column of df is used as a predictor.
target = 'preco'
features = df.drop(columns=[target]).columns.tolist()

# Extract plain NumPy arrays: the feature matrix and the target vector.
X = df[features].to_numpy()
y = df[target].to_numpy()

# Sanity check: both arrays must have the same number of rows.
print(X.shape, y.shape)

(1460, 3) (1460,)


## 3) Dividindo o dataset em subconjuntos de treino e de teste

In [6]:
# Hold out 20% of the rows for testing; the fixed seed makes the shuffled
# split reproducible across runs.
split_parts = train_test_split(X, y, test_size=0.2, random_state=123, shuffle=True)
X_train, X_test, y_train, y_test = split_parts

# Report the shapes of the training pair first, then the test pair.
for X_part, y_part in ((X_train, y_train), (X_test, y_test)):
    print(X_part.shape, y_part.shape)

(1168, 3) (1168,)
(292, 3) (292,)


#### Escalonando Atributos Preditivos

In [7]:
# Learn the scaling statistics from the training split only, so that no
# information from the test split leaks into the preprocessing.
scaler = StandardScaler().fit(X_train)

# Apply the same fitted transformation to both splits.
# NOTE(review): X_train / X_test are overwritten here, so re-running this cell
# without re-running the split cell would refit the scaler on already-scaled data.
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

## 4) Instanciando Modelo de RLM

In [8]:
# Create an unfitted linear regression estimator with its default settings.
model = LinearRegression()
# Bare expression: shows the estimator's repr as the cell output.
model

LinearRegression()

#### Ajuste do Modelo com a base de Treino

In [9]:
# Fit the regression on the training split (features) against the training prices.
model.fit(X=X_train, y=y_train)
# Bare expression: shows the (now fitted) estimator's repr as the cell output.
model

LinearRegression()

#### Parâmetros Ajustados

In [10]:
# Fitted coefficients, one per column of the training matrix, in the same
# order as `features`. Because the model was fitted on standardized inputs,
# each value is the change in predicted price per one standard deviation
# of the corresponding feature.
model.coef_

array([42151.20471116, 24014.20261195, 18934.28162939])

In [11]:
# Fitted intercept: the predicted price when every (standardized) feature is 0.
model.intercept_

180717.06934931516

## 5) Verificando Assertividade do Modelo Ajustado

#### Assertividade na Base de Treino

In [12]:
# In-sample evaluation: predict on the same rows the model was fitted on.
y_train_pred = model.predict(X_train)

# RMSE is computed as sqrt(MSE) rather than via `squared=False`: that keyword
# was deprecated in scikit-learn 1.4 and removed in 1.6, where it raises a
# TypeError. Taking the square root explicitly works on every version.
rmse_train = np.sqrt(mean_squared_error(y_true=y_train, y_pred=y_train_pred))
# Coefficient of determination (R^2) on the training split.
r2_train = r2_score(y_true=y_train, y_pred=y_train_pred)

#### Assertividade na Base de Teste

In [13]:
# Out-of-sample evaluation: predict on the held-out test rows.
y_test_pred = model.predict(X_test)

# RMSE is computed as sqrt(MSE) rather than via `squared=False`: that keyword
# was deprecated in scikit-learn 1.4 and removed in 1.6, where it raises a
# TypeError. Taking the square root explicitly works on every version.
rmse_test = np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_test_pred))
# Coefficient of determination (R^2) on the test split.
r2_test = r2_score(y_true=y_test, y_pred=y_test_pred)

In [14]:
# Print a small text report with RMSE and R2 for the train and test splits.
print(' >> Assertividade do Modelo Ajustado nas Bases:')

# Each split gets a blank separator line, its label, and its two metrics.
for split_label, split_rmse, split_r2 in (
    ('Treino', rmse_train, r2_train),
    ('Teste', rmse_test, r2_test),
):
    print()
    print(f'    {split_label}:')
    print(f'   - RMSE: {split_rmse:.3f}')
    print(f'   - R2:   {split_r2:.3f}')

 >> Assertividade do Modelo Ajustado nas Bases:

    Treino:
   - RMSE: 45229.794
   - R2:   0.677

    Teste:
   - RMSE: 42263.745
   - R2:   0.711


## 6) Salvando Modelo Ajustado

In [15]:
# Persist everything needed at inference time in a single pickle, as a list
# in this exact order: [fitted model, fitted scaler, feature names].
model_path = Path('model/lr_model.pkl')

# Create the output directory if it is missing; otherwise open() raises
# FileNotFoundError on a fresh checkout where `model/` does not exist yet.
model_path.parent.mkdir(parents=True, exist_ok=True)

with open(file=model_path, mode='wb') as fp:
    pickle.dump(obj=[model, scaler, features], file=fp)