# Linear Regression - Melbourne Housing Market

pt:
- Aprendizado supervisionado
- Regressão linear multivariada

en:
- Supervised Learning
- Multivariate Linear Regression

## Importando Bibliotecas

- Importing libs

In [1]:
import warnings

import plotly.express as px
from sklearn import metrics

import matplotlib.pyplot as plt
import numpy as np
from pandas_profiling import ProfileReport
import pandas as pd
import plotly.express as px
import seaborn as sns
from sklearn import metrics
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

%matplotlib inline

In [2]:
# Disable all Python warnings for the rest of the notebook.

warnings.filterwarnings('ignore')

In [3]:
# Plot settings: set the default figure size to 12 x 8.

sns.set(rc={'figure.figsize':(12, 8)})

In [4]:
# Pandas display settings: show up to 500 rows / 500 columns and use a
# display width of 1000 characters.

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [15]:
# Relative path of the CSV file that is loaded into df_housing.
dir_data = 'DATA/Melbourne_housing_FULL.csv'

## Funções úteis

- Utils Functions

In [6]:
def heatmap_plot_corr(data_corr):
    
    """
    
        Plot a heatmap of a correlation matrix, configured so the
        correlation between variables is easier to read: only the
        cells below the main diagonal are shown, each annotated
        with its value.
        
        # Arguments
            data_corr       - Required : Correlation matrix (DataFrame)
        
        # Returns
            None. The figure is displayed with plt.show().
    
    """
    
    fig, ax = plt.subplots(figsize=(10, 8))
    
    # Mask for the upper triangle (main diagonal included).
    # The builtin `bool` is used because the `np.bool` alias was deprecated
    # in NumPy 1.20 and removed in NumPy 1.24.
    mask = np.triu(np.ones_like(data_corr, dtype=bool))
    
    # The first row and the last column are entirely masked, so drop them
    # from both the mask and the data to avoid an empty band in the plot.
    mask = mask[1:, :-1]
    corr = data_corr.iloc[1:, :-1].copy()
    
    # Draw on the axes created above; colour scale fixed to [-1, 1].
    sns.heatmap(corr, mask=mask, annot=True, fmt=".2f", cmap='Blues',
                vmin=-1, vmax=1, cbar_kws={"shrink": .8}, ax=ax)
    
    # Keep the y tick labels horizontal.
    plt.yticks(rotation=0)
    plt.show()

In [7]:
def get_variables_model_x_y(dataframe, 
                            list_predict=None, 
                            target=None):
    
    """
    
        Split a DataFrame into the predictor data (X) and the
        target data (y) used by the model.
        
        When `list_predict` is a list or tuple it is used to select the
        predictor columns; otherwise every column except `target` is used.
        Any exception raised while selecting is printed, and whatever was
        obtained up to that point is returned.
        
        # Arguments
            dataframe       - Required : Data for the model (DataFrame)
            list_predict    - Optional : Predictor variables (List | Tuple)
            target          - Optional : Target variable (String)
        
        # Returns
            X               - Required : Independent variables (DataFrame),
                                         or None when not obtained
            y               - Required : Dependent variable (column selected
                                         with `target`), or None when not
                                         obtained
    
    """
    
    predictors = None
    response = None
    
    # WITHOUT A TARGET THERE IS NOTHING TO SPLIT
    if target is None:
        return predictors, response
    
    try:
        # EXPLICIT COLUMN SELECTION ONLY WHEN A LIST/TUPLE WAS PROVIDED
        has_explicit_columns = isinstance(list_predict, (list, tuple))
        
        predictors = (dataframe[list_predict]
                      if has_explicit_columns
                      else dataframe.drop(columns=target))
        
        response = dataframe[target]
        
    except Exception as ex:
        print(ex)
    
    return predictors, response

In [8]:
def get_value_corr_between_variables(data_corr, 
                                     variable_one, 
                                     variable_two):
    
    """
    
        Get the correlation value between two variables.
        
        Looks the value up in an already computed correlation matrix:
        `variable_one` selects the row and `variable_two` the column.
        
        # Arguments
            data_corr           - Required : Computed correlations (DataFrame)
            variable_one        - Required : Variable 1, row label (String)
            variable_two        - Required : Variable 2, column label (String)
        
        # Returns
            corr_variables      - Required : Correlation value between the
                                             variables (Float), or None when
                                             either variable is not present
    
    """
    
    # RESULT STAYS None WHEN A VARIABLE IS MISSING
    corr_variables = None
    
    # variable_one IS A ROW LABEL, variable_two IS A COLUMN LABEL
    if (variable_one in data_corr.index) and (variable_two in data_corr.columns):
        
        # READ FROM THE MATRIX RECEIVED AS ARGUMENT (NOT FROM A GLOBAL)
        row_selector = data_corr.index == variable_one
        corr_variables = data_corr.loc[row_selector, variable_two].values[0]
        
    return corr_variables

In [9]:
def get_data_profile(data, title="Report", name_save_html="your_report.html"):
    
    """
    
        Build the profile report of a dataset and save it as an HTML file.
        
        # Arguments
            data              - Required : Data to be analysed (DataFrame)
            title             - Optional : Report title (String)
            name_save_html    - Optional : File name used to save the report (String)
        
        # Returns
            None. The report is written to `name_save_html`.
    
    """
    
    # ANALYSE THE DATA AND EXPORT THE RESULT IN ONE STEP
    ProfileReport(data, title=title).to_file(name_save_html)


In [25]:
def convert_all_columns_uppercase(dataframe):
    
    """
    
        Return a copy of a DataFrame with every column name in uppercase.
        
        If a column name cannot be upper-cased, the exception is printed
        and the original DataFrame is returned unchanged.
        
        # Arguments
            dataframe             - Required : DataFrame to be used (DataFrame)
            
        # Returns
            dataframe_upper       - Required : DataFrame after the conversion (DataFrame)
    
    """
    
    # WORK ON A COPY SO THE INPUT IS NEVER MODIFIED
    renamed = dataframe.copy()
    
    try:
        upper_names = []
        for name in renamed.columns:
            upper_names.append(name.upper())
        
        renamed.columns = upper_names
    
    except Exception as ex:
        print(ex)
        return dataframe
    
    return renamed

In [49]:
def find_outliers_IQR(dataframe, column=None, factor=1.5):
    
    """
    
        Find the outliers of a variable using the
        interquartile range (IQR) method.
        
        A value is an outlier when it is below Q1 - factor * IQR
        or above Q3 + factor * IQR, where IQR = Q3 - Q1.
        
        # Arguments
            dataframe            - Required : DataFrame to be analysed (DataFrame)
            column               - Required : Column to be analysed (String)
            factor               - Optional : IQR multiplier used to build
                                              the bounds (Float, default 1.5)
            
        # Returns
            lower                - Required : Lower bound, or None when the
                                              column is not available (Float)
            upper                - Required : Upper bound, or None when the
                                              column is not available (Float)
            outliers             - Required : Values outside the bounds (Series);
                                              an empty list when the column is
                                              not available
    
    """
    
    # DEFAULT RESULT WHEN THE COLUMN IS NOT AVAILABLE
    outliers = []
    lower = upper = None
    
    # `is not None` (INSTEAD OF TRUTHINESS) KEEPS FALSY LABELS SUCH AS 0 VALID
    if column is not None and column in dataframe.columns:
        
        values = dataframe[column]
        
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        
        IQR = q3 - q1
        
        lower = q1 - factor * IQR
        upper = q3 + factor * IQR
        
        outliers = values[(values < lower) | (values > upper)]
    
    return lower, upper, outliers

## Variáveis Globais

- Global Variables

In [10]:
# Seed for the pseudo-random shuffling of the data
SEED = 42

In [11]:
# Split proportion between model training data and test data:
# fraction of the data reserved for testing
test_size = 0.3

In [12]:
# Target variable (initialised as an empty string here)
variable_target = ""

In [13]:
# List of predictor columns (independent variables); initialised empty here
list_columns_predict = []

## Obter dados

- Get data

pt:
- Suburb: Subúrbio
- Address: Endereço
- Rooms: Número de cômodos
- Type: Tipo
    1. h - Casa, Chalé, Vila, Semi, Terraço;
    2. u - Unidade, Duplex;
    3. t - Sobrado;

en:
- Suburb: Suburb
- Address: Address
- Rooms: Number of rooms
- Type:
    1. h - House, Cottage, Villa, Semi, Terrace;
    2. u - Unit, Duplex;
    3. t - Townhouse;

In [16]:
# Load the CSV file at dir_data into a DataFrame
df_housing = pd.read_csv(dir_data)

In [26]:
# Convert all column names to uppercase

df_housing = convert_all_columns_uppercase(df_housing)

In [27]:
# Preview the first rows of the dataset
df_housing.head()

Unnamed: 0,SUBURB,ADDRESS,ROOMS,TYPE,PRICE,METHOD,SELLERG,DATE,DISTANCE,POSTCODE,BEDROOM2,BATHROOM,CAR,LANDSIZE,BUILDINGAREA,YEARBUILT,COUNCILAREA,LATTITUDE,LONGTITUDE,REGIONNAME,PROPERTYCOUNT
0,Abbotsford,68 Studley St,2,h,,SS,Jellis,3/09/2016,2.5,3067.0,2.0,1.0,1.0,126.0,,,Yarra City Council,-37.8014,144.9958,Northern Metropolitan,4019.0
1,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,2.0,1.0,1.0,202.0,,,Yarra City Council,-37.7996,144.9984,Northern Metropolitan,4019.0
2,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,2.0,1.0,0.0,156.0,79.0,1900.0,Yarra City Council,-37.8079,144.9934,Northern Metropolitan,4019.0
3,Abbotsford,18/659 Victoria St,3,u,,VB,Rounds,4/02/2016,2.5,3067.0,3.0,2.0,1.0,0.0,,,Yarra City Council,-37.8114,145.0116,Northern Metropolitan,4019.0
4,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,3.0,2.0,0.0,134.0,150.0,1900.0,Yarra City Council,-37.8093,144.9944,Northern Metropolitan,4019.0


In [28]:
# Summary statistics of the numeric columns
df_housing.describe()

Unnamed: 0,ROOMS,PRICE,DISTANCE,POSTCODE,BEDROOM2,BATHROOM,CAR,LANDSIZE,BUILDINGAREA,YEARBUILT,LATTITUDE,LONGTITUDE,PROPERTYCOUNT
count,34857.0,27247.0,34856.0,34856.0,26640.0,26631.0,26129.0,23047.0,13742.0,15551.0,26881.0,26881.0,34854.0
mean,3.031012,1050173.0,11.184929,3116.062859,3.084647,1.624798,1.728845,593.598993,160.2564,1965.289885,-37.810634,145.001851,7572.888306
std,0.969933,641467.1,6.788892,109.023903,0.98069,0.724212,1.010771,3398.841946,401.26706,37.328178,0.090279,0.120169,4428.090313
min,1.0,85000.0,0.0,3000.0,0.0,0.0,0.0,0.0,0.0,1196.0,-38.19043,144.42379,83.0
25%,2.0,635000.0,6.4,3051.0,2.0,1.0,1.0,224.0,102.0,1940.0,-37.86295,144.9335,4385.0
50%,3.0,870000.0,10.3,3103.0,3.0,2.0,2.0,521.0,136.0,1970.0,-37.8076,145.0078,6763.0
75%,4.0,1295000.0,14.0,3156.0,4.0,2.0,2.0,670.0,188.0,2000.0,-37.7541,145.0719,10412.0
max,16.0,11200000.0,48.1,3978.0,30.0,12.0,26.0,433014.0,44515.0,2106.0,-37.3902,145.52635,21650.0


In [29]:
# Column dtypes and non-null counts (shows which columns have missing values)
df_housing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34857 entries, 0 to 34856
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   SUBURB         34857 non-null  object 
 1   ADDRESS        34857 non-null  object 
 2   ROOMS          34857 non-null  int64  
 3   TYPE           34857 non-null  object 
 4   PRICE          27247 non-null  float64
 5   METHOD         34857 non-null  object 
 6   SELLERG        34857 non-null  object 
 7   DATE           34857 non-null  object 
 8   DISTANCE       34856 non-null  float64
 9   POSTCODE       34856 non-null  float64
 10  BEDROOM2       26640 non-null  float64
 11  BATHROOM       26631 non-null  float64
 12  CAR            26129 non-null  float64
 13  LANDSIZE       23047 non-null  float64
 14  BUILDINGAREA   13742 non-null  float64
 15  YEARBUILT      15551 non-null  float64
 16  COUNCILAREA    34854 non-null  object 
 17  LATTITUDE      26881 non-null  float64
 18  LONGTI

### Obtendo o PROFILE da base

In [21]:
# Generate the profile report of df_housing and save it to RLT_HOUSING_MARKET.html
get_data_profile(data=df_housing, 
                 title="RLT_HOUSING_MARKET", 
                 name_save_html="RLT_HOUSING_MARKET.html")

Summarize dataset:   0%|          | 0/34 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

## Análise exploratória de dados

- Exploratory data analysis

### Cômodos

- Rooms

In [32]:
# Histogram of the ROOMS column (nbins=20)

fig = px.histogram(df_housing, x="ROOMS", nbins=20)
fig.show()

In [35]:
# Box plot of the ROOMS column, with all sample points drawn (points="all")

fig = px.box(df_housing, y="ROOMS", points="all")
fig.show()

In [53]:
# Use the IQR method to get the lower/upper bounds and the outliers of ROOMS

lower_outlier, upper_outlier, outliers = find_outliers_IQR(df_housing, column="ROOMS")

In [54]:
# Show the IQR bounds and the number of ROOMS values outside them
lower_outlier, upper_outlier, len(outliers)

(-1.0, 7.0, 33)