In [1]:

# imports
import os
import sys
import types
import json

# figure size/format
fig_width = 5.5
fig_height = 3.5
fig_format = 'pdf'
fig_dpi = 300
interactivity = ''
is_shiny = False
is_dashboard = False
plotly_connected = True

# matplotlib defaults / format
# Best-effort: any failure (for example matplotlib not being installed) is
# deliberately swallowed so this setup cell never aborts the render.
try:
  import matplotlib.pyplot as plt
  plt.rcParams['figure.figsize'] = (fig_width, fig_height)
  plt.rcParams['figure.dpi'] = fig_dpi
  plt.rcParams['savefig.dpi'] = fig_dpi
  try:
    # Preferred location: the IPython.display alias is deprecated since
    # IPython 7.23 and emits a warning into the rendered output.
    from matplotlib_inline.backend_inline import set_matplotlib_formats
  except ImportError:
    # Older environments that do not ship matplotlib_inline.
    from IPython.display import set_matplotlib_formats
  set_matplotlib_formats(fig_format)
except Exception:
  pass

# plotly use connected mode
# Best-effort: if plotly is missing (or anything else fails) the block is a no-op.
try:
  import plotly.io as pio
  # pick the notebook renderer according to the plotly_connected flag
  pio.renderers.default = "notebook_connected" if plotly_connected else "notebook"
  # give every registered template the same tight margins
  for template_name in pio.templates.keys():
    pio.templates[template_name].layout.margin = dict(t=30, r=0, b=0, l=0)
except Exception:
  pass

# disable itables paging for dashboards
# Only runs when is_dashboard is truthy. Any failure (for example itables not
# being installed) is swallowed so the rest of the setup still executes.
if is_dashboard:
  try:
    from itables import options
    # Each assignment below sets one default on the shared itables `options`
    # object; paging and the length-change control are turned off, while
    # searching, ordering and the info line stay on.
    options.dom = 'fiBrtlp'
    options.maxBytes = 1024 * 1024
    options.language = dict(info = "Showing _TOTAL_ entries")
    options.classes = "display nowrap compact"
    options.paging = False
    options.searching = True
    options.ordering = True
    options.info = True
    options.lengthChange = False
    options.autoWidth = False
    options.responsive = True
    options.keys = True
    options.buttons = []
  except Exception:
    pass
  
  try:
    import altair as alt
    # By default, dashboards will have container sized
    # vega visualizations which allows them to flow reasonably
    theme_sentinel = '_quarto-dashboard-internal'
    def make_theme(name):
        """Wrap the Altair theme registered under `name` with dashboard defaults.

        The wrapped theme is looked up once, when this function is called.
        The returned callable invokes it and fills in only the settings the
        theme did not define itself: container-sized width/height and default
        font sizes for axis, legend, header, title and marks.
        """
        nonTheme = alt.themes._plugins[name]

        # Configure the default font sizes
        title_font_size = 15
        header_font_size = 13
        axis_font_size = 12
        legend_font_size = 12
        mark_font_size = 12
        tooltip = False

        # (config section, {property: default}) pairs, in the order applied
        section_defaults = (
            ('axis', dict(labelFontSize=axis_font_size,
                          titleFontSize=axis_font_size)),
            ('legend', dict(labelFontSize=legend_font_size,
                            titleFontSize=legend_font_size)),
            ('header', dict(labelFontSize=header_font_size,
                            titleFontSize=header_font_size)),
            ('title', dict(fontSize=title_font_size)),
            ('mark', dict(fontSize=mark_font_size)),
        )

        def patch_theme(*args, **kwargs):
            # arguments are accepted but ignored; the wrapped theme takes none
            theme = nonTheme()

            # container sizing lets the visualization flow with the dashboard
            theme.setdefault('height', 'container')
            theme.setdefault('width', 'container')

            config = theme.setdefault('config', dict())
            for section, defaults in section_defaults:
                section_config = config.setdefault(section, dict())
                for prop, size in defaults.items():
                    # never override a value the theme already provides
                    section_config.setdefault(prop, size)

            # Mark tooltips (disabled while `tooltip` is False)
            mark = config['mark']
            if tooltip and 'tooltip' not in mark:
                mark['tooltip'] = dict(content="encoding")

            return theme

        return patch_theme

    # We can only do this once per session
    # The sentinel theme name doubles as an "already patched" marker: if it is
    # registered, every theme has been wrapped before and we must not wrap again.
    if theme_sentinel not in alt.themes.names():
      # re-register each known theme under its own name, wrapped by make_theme
      for name in alt.themes.names():
        alt.themes.register(name, make_theme(name))

      # register a sentinel theme so we only do this once
      alt.themes.register(theme_sentinel, make_theme('default'))
      # make the (now wrapped) 'default' theme the active one
      alt.themes.enable('default')

  except Exception:
    pass

# enable pandas latex repr when targeting pdfs
# Best-effort: skipped silently when pandas is missing or the option is rejected.
try:
  import pandas as pd
  if fig_format == 'pdf':
    pd.set_option('display.latex.repr', True)
except Exception:
  pass

# interactivity
# When `interactivity` is a non-empty string it is assigned as the class-level
# ast_node_interactivity setting of IPython's InteractiveShell.
if interactivity:
  from IPython.core.interactiveshell import InteractiveShell
  InteractiveShell.ast_node_interactivity = interactivity

# NOTE: the kernel_deps code is repeated in the cleanup.py file
# (we can't easily share this code b/c of the way it is run).
# If you edit this code also edit the same code in cleanup.py!

# output kernel dependencies
# Builds a {file path: modification time} mapping for every currently loaded
# module that has an existing file on disk, then prints it as JSON on stdout.
kernel_deps = dict()
# iterate over a snapshot (list) of sys.modules rather than the live dict
for module in list(sys.modules.values()):
  # Some modules play games with sys.modules (e.g. email/__init__.py
  # in the standard library), and occasionally this can cause strange
  # failures in getattr.  Just ignore anything that's not an ordinary
  # module.
  if not isinstance(module, types.ModuleType):
    continue
  path = getattr(module, "__file__", None)
  # modules without a __file__ (or with an empty one) are skipped
  if not path:
    continue
  if path.endswith(".pyc") or path.endswith(".pyo"):
    # drop the trailing "c"/"o" so the path points at the ".py" file
    path = path[:-1]
  if not os.path.exists(path):
    continue
  kernel_deps[path] = os.stat(path).st_mtime
print(json.dumps(kernel_deps))

# set run_path if requested
# The condition is a non-empty string literal, so it is always true here.
# NOTE(review): this is an absolute, machine-specific path; presumably it is
# injected by the rendering tool at execution time — confirm before relying on
# this cell on another machine.
if r'/home/cowvin/Documents/tcc_realestate/tcc_escrita':
  os.chdir(r'/home/cowvin/Documents/tcc_realestate/tcc_escrita')

# reset state
# IPython magic: clears the names defined above from the interactive namespace.
%reset

# shiny
# Checking for shiny by using False directly because we're after the %reset. We don't want
# to set a variable that stays in global scope.
# With the literal False below, nothing inside this block runs.
if False:
  try:
    import htmltools as _htmltools
    import ast as _ast

    _htmltools.html_dependency_render_mode = "json"

    # This decorator will be added to all function definitions
    # It displays the decorated object as HTML when it exposes _repr_html_,
    # and always returns the object unchanged.
    def _display_if_has_repr_html(x):
      try:
        # IPython 7.14 preferred import
        from IPython.display import display, HTML
      except:
        from IPython.core.display import display, HTML

      if hasattr(x, '_repr_html_'):
        display(HTML(x._repr_html_()))
      return x

    # ideally we would undo the call to ast_transformers.append
    # at the end of this block whenever an error occurs, we do
    # this for now as it will only be a problem if the user
    # switches from shiny to not-shiny mode (and even then likely
    # won't matter)
    # The decorator is published on builtins so that the bare name inserted
    # by the AST transformer below resolves from any namespace.
    import builtins
    builtins._display_if_has_repr_html = _display_if_has_repr_html

    # AST transformer that prepends the decorator above to every (async)
    # function definition it visits. Child nodes are not visited because
    # generic_visit() is not called.
    class _FunctionDefReprHtml(_ast.NodeTransformer):
      def visit_FunctionDef(self, node):
        node.decorator_list.insert(
          0,
          _ast.Name(id="_display_if_has_repr_html", ctx=_ast.Load())
        )
        return node

      def visit_AsyncFunctionDef(self, node):
        node.decorator_list.insert(
          0,
          _ast.Name(id="_display_if_has_repr_html", ctx=_ast.Load())
        )
        return node

    # register the transformer with the running IPython shell
    ip = get_ipython()
    ip.ast_transformers.append(_FunctionDefReprHtml())

  except:
    # best-effort: any failure above leaves the session untouched
    pass

def ojs_define(**kwargs):
  """Publish Python values to Observable JS through an `ojs-define` script tag.

  Each keyword argument becomes one ``{"name": ..., "value": ...}`` entry.
  pandas Series and DataFrames (including subclasses) are converted to a
  ``{column: [values, ...]}`` mapping; every other value is passed through
  and must be serializable by ``json.dumps``.

  The JSON payload is emitted with IPython's ``display`` as an HTML
  ``<script type="ojs-define">`` element tagged with ``ojs_define`` metadata.
  """
  import json
  try:
    # IPython 7.14 preferred import
    from IPython.display import display, HTML
  except ImportError:
    # only an import problem should trigger the legacy location
    from IPython.core.display import display, HTML

  # do some minor magic for convenience when handling pandas
  # dataframes
  def convert(v):
    try:
      import pandas as pd
    except ModuleNotFoundError: # don't do the magic when pandas is not available
      return v
    # isinstance (rather than an exact type match) so that DataFrame/Series
    # subclasses are converted too instead of failing later in json.dumps
    if isinstance(v, pd.Series):
      v = pd.DataFrame(v)
    if isinstance(v, pd.DataFrame):
      # after transposing, "index" holds the column names and "data" holds
      # one list of values per column
      j = json.loads(v.T.to_json(orient='split'))
      return dict(zip(j["index"], j["data"]))
    return v

  v = dict(contents=[dict(name=key, value=convert(value)) for (key, value) in kwargs.items()])
  display(HTML('<script type="ojs-define">' + json.dumps(v) + '</script>'), metadata=dict(ojs_define = True))
globals()["ojs_define"] = ojs_define



`set_matplotlib_formats` is deprecated since IPython 7.23, directly use `matplotlib_inline.backend_inline.set_matplotlib_formats()`





In [2]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import os
from sklearn.model_selection import KFold

# Display names for the columns whose automatically prettified label still
# needs accents or different wording.
rename = {
    'Area': 'Área',
    'Area servico': 'Área de serviço',
    'Condominio': 'Condomínio',
    'Espaco gourmet': 'Espaço gourmet',
    'Iptu': 'IPTU',
    'Salao de festa': 'Salão de festa',
    'Area aluguel': 'Área de aluguel',
    'Valor aluguel': 'Valor de aluguel',
}

# Both splits get the same treatment: drop 'qnt_beneficio', capitalize the
# column names and replace underscores with spaces, then apply `rename`.
train_df = (
    pd.read_csv('../data/cleaned/train.csv')
    .drop(columns=['qnt_beneficio'])
    .rename(columns=lambda col: col.capitalize().replace('_', ' '))
    .rename(columns=rename)
)
test_df = (
    pd.read_csv('../data/cleaned/test.csv')
    .drop(columns=['qnt_beneficio'])
    .rename(columns=lambda col: col.capitalize().replace('_', ' '))
    .rename(columns=rename)
)

In [3]:
#| echo: true
#| eval: false

import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Hold out 20% of the rows for testing (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(
  X, y,
  test_size=0.2,
  random_state=42)

# Split the columns by dtype so each group gets its own preprocessing.
numerical_features = X.select_dtypes(include=np.number).columns
categorical_features = X.select_dtypes(include=object).columns

pipeline = Pipeline([
    ('preprocessor', ColumnTransformer([
        ('num', StandardScaler(), numerical_features),
        # handle_unknown='ignore': a category that appears only outside the
        # training split is encoded as all zeros instead of raising an error
        # at prediction time.
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])),
    ('model', RandomForestRegressor(
        n_estimators=100,
        random_state=42))
])

# Preprocessing is fitted on the training split only, as part of the pipeline.
pipeline.fit(X_train, y_train)

In [4]:
def plot_cv_indices(cv, X, ax, n_splits, lw=10, cmap="coolwarm"):
    """Draw one row per cross-validation split, coloring train vs. test samples.

    Parameters
    ----------
    cv : object
        Splitter exposing ``split(X=...)`` that yields ``(train_idx, test_idx)``
        index arrays.
    X : sized
        Data being split; only ``len(X)`` is used by the drawing code.
    ax : matplotlib.axes.Axes
        Axes to draw on.
    n_splits : int
        Number of rows (y ticks) to label.
    lw : float, default 10
        Line width of the ``"_"`` markers.
    cmap : str or Colormap, default "coolwarm"
        Colormap for the train (0) / test (1) coding. It used to be read from
        a module-level ``cmap_cv`` variable, which made the function fail
        unless a later cell had already been executed; the default keeps the
        same colormap.

    Returns
    -------
    The ``ax`` that was passed in.
    """
    n_samples = len(X)
    for ii, (tr, tt) in enumerate(cv.split(X=X)):
        # NaN marks samples that belong to neither side of this split
        indices = np.full(n_samples, np.nan)
        indices[tt] = 1
        indices[tr] = 0

        ax.scatter(
            range(n_samples),
            [ii + 0.5] * n_samples,
            c=indices,
            marker="_",
            lw=lw,
            cmap=cmap,
            vmin=-0.2,
            vmax=1.2,
        )

    ax.set(
        yticks=np.arange(n_splits) + 0.5,
        yticklabels=list(range(n_splits)),
        xlabel="Posição da observação",
        ylabel="CV iteração",
        # inverted y axis: first split at the top
        ylim=[n_splits + 0.2, -0.2],
        # follow the data size instead of a hard-coded 100 observations
        xlim=[0, n_samples],
    )
    ax.set_title("K-Fold", fontsize=15)
    return ax

In [5]:
#| label: fig-kfold
#| fig-cap: Visualização de K-Fold com 20 folds.

# colormaps for the figure
cmap_data, cmap_cv = plt.cm.Paired, plt.cm.coolwarm

# synthetic data: 100 observations x 10 features, drawn with a fixed seed
rng = np.random.RandomState(1338)
X = rng.randn(100, 10)

# 20 folds, one row per fold in the plot
cv = KFold(n_splits=20)

fig, ax = plt.subplots()
plot_cv_indices(cv=cv, X=X, ax=ax, n_splits=20)
plt.show()

<Figure size 1650x1050 with 1 Axes>

In [6]:
#| echo: true
#| eval: false

from sklearn.model_selection import cross_val_score, KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris

X, y = load_iris(return_X_y=True)

model = RandomForestClassifier()

# 20 consecutive folds
kf = KFold(n_splits=20)

scores = cross_val_score(
  estimator=model,
  X=X, y=y,
  scoring='neg_mean_squared_error',
  cv=kf)

# the scorer returns the negated MSE, so flip the sign back
mse_scores = - scores
mean_mse = mse_scores.mean()

# the value reported is the mean MSE across folds, not an accuracy
print(f'Média do MSE: {mean_mse:.4f}')

In [7]:
#| echo: true
#| eval: false

import optuna
import numpy as np
import pandas as pd
from sklearn import ensemble
from sklearn.model_selection import cross_val_score, KFold

def objective(trial):
    """Optuna objective: cross-validated error of a random forest regressor.

    Samples ``n_estimators`` (1 to 1000) and ``max_depth`` (20 to 1000) from
    ``trial``, evaluates the model with 20-fold cross-validation, and returns
    the mean over folds of ``expm1(sqrt(MSE))``.

    NOTE(review): the ``expm1`` suggests the target is log1p-transformed;
    confirm against how ``variavel_dependente`` is built.
    """
    X = train_df[variaveis_independentes]
    y = train_df.variavel_dependente

    params = dict(
        n_estimators=trial.suggest_int(
          name='n_estimators',
          low=1,
          high=1000),
        max_depth=trial.suggest_int(
          name='max_depth',
          low=20,
          high=1000),
        max_features='sqrt',
        random_state=42
    )

    # ** passes the entries as keyword arguments; a single * would pass the
    # dictionary KEYS (strings) as positional arguments.
    model = ensemble.RandomForestRegressor(**params)

    # No explicit model.fit() here: cross_val_score fits its own clone of the
    # estimator on each fold, so a fit on the full data would be wasted work.
    cv_scores = np.expm1(np.sqrt(-cross_val_score(
        estimator=model,
        X=X,
        y=y,
        scoring="neg_mean_squared_error",
        n_jobs=3,
        cv=KFold(n_splits=20))))

    return np.mean(cv_scores)

# Create the study and run 100 trials of `objective`.
# NOTE(review): no `direction` is passed to create_study — confirm that the
# Optuna default matches the intent of minimizing the error returned above.
# NOTE(review): n_jobs=-1 here together with n_jobs=3 inside cross_val_score
# may oversubscribe CPU cores — confirm on the target machine.
study = optuna.create_study()
study.optimize(objective, n_trials=100, n_jobs=-1)

In [8]:
#| eval: false
#| echo: true

from sklearn.inspection import PartialDependenceDisplay

# Partial dependence display for `features` of the fitted `model` on `df`,
# with centered curves and a fixed seed.
PartialDependenceDisplay.from_estimator(
    model,
    df,
    features,
    kind="both",
    centered=True,
    random_state=set_seed,
)

In [9]:
#| eval: false
#| echo: true

import shap

# Background sample of 1000 training rows used to mask features.
X1000 = shap.utils.sample(train_df, 1000)

# The background data must be passed as `masker`; with the misspelled keyword
# `mask` it is not used as the masker and the explainer has no background data.
explainer_stacking = shap.Explainer(
    model=stacking.predict,
    masker=X1000,
)
shap_values_stacking = explainer_stacking(test_df)

# summary plot with the default plot type
shap.summary_plot(shap_values_stacking, test_df)

# same summary as a bar chart
shap.summary_plot(shap_values_stacking, test_df, plot_type="bar")

# dependence plot for a single variable, without an interaction coloring
shap.dependence_plot(
    variavel,
    shap_values_stacking.values,
    test_df.values,
    interaction_index=None,
)

In [10]:
#| label: fig-miss
#| fig-cap: Proporção de valores ausentes por variáveis

# Horizontal stacked bars: for each column of train_df, the share of rows
# that are missing vs. present.
g_missing = sns.displot(
    data=(
        train_df.isnull()
        # long format: one row per (column name, is-missing flag)
        .melt(value_name="Valores ausentes")
        .replace([False, True], ["Não é ausente", "Ausente"])
        .groupby(["variable", "Valores ausentes"])
        .size()
        .reset_index(name="count")
        # convert the counts into within-variable proportions
        .assign(
            proportion=lambda counts: counts
            .groupby("variable")["count"]
            .transform(lambda group: group / group.sum())
        )
    ),
    y="variable",
    hue="Valores ausentes",
    weights="proportion",
    multiple="fill",
    height=5,
    aspect=1.1,
    palette='crest',
)

# legend below the plot, in two columns and without a title
sns.move_legend(
    obj=g_missing,
    loc="upper center",
    bbox_to_anchor=(.5, -.0001),
    ncol=2,
    title="",
)
g_missing.set(xlabel="Proporção de valores ausentes (%)", ylabel="")
g_missing.ax.set_xlabel("Proporção de valores ausentes (%)", fontsize=10)
plt.show()

<Figure size 2101.75x1500 with 1 Axes>

In [11]:
#| label: fig-violin
#| fig-cap: Distribuição das variáveis numéricas.

fig, axes = plt.subplots(2, 3, figsize=(20, 10))

# long-format view of the numeric variables, indexed by property type
dists_var = (
    train_df[['Valor', 'Área', 'Área de aluguel',
              'Valor de aluguel', 'Vaga', 'Quarto',
              'Banheiro', 'Tipo']]
    .set_index('Tipo')
    .stack()
    .reset_index()
    .rename(columns={'level_1': 'Variável', 0: 'Valor'})
)

# one violin panel per variable, filling the 2x3 grid row by row; the
# per-panel legends are removed because a single shared legend is added below
for ax, column in zip(
    axes.flat,
    ('Vaga', 'Banheiro', 'Quarto',
     'Área', 'Valor de aluguel', 'Área de aluguel'),
):
    sns.violinplot(
        data=train_df,
        x=column,
        hue='Tipo',
        ax=ax,
        palette='crest',
    ).legend_.remove()

# same major/minor grid styling on every panel
for ax in axes.flat:
    ax.grid(True, which="major", axis="both", linestyle="-", color="lightgray", linewidth=0.8, alpha=0.9)
    ax.grid(True, which="minor", axis="both", linestyle=":", color="lightgray", linewidth=0.5, alpha=0.8)

# shared legend, with the raw category codes replaced by readable labels
l = plt.legend(loc="upper center", bbox_to_anchor=(-.66, -.17), ncol=7)
for position, label in enumerate((
    'Apartamentos',
    'Casas de condomínio',
    'Casas',
    'Flats',
    'Terrenos, lotes e condomínios',
    'Terrenos e lotes comerciais',
)):
    l.get_texts()[position].set_text(label)
# l.get_texts()[6].set_text('Casas comerciais')
plt.show()

<Figure size 6000x3000 with 6 Axes>

In [12]:
#| label: fig-densitarg
#| fig-cap: Comparação entre distribuição dos valores dos imóveis antes e depois da transformação logarítmica.

fig, axes = plt.subplots(1, 2, figsize=(20, 10))

# left panel: density of the raw values
sns.kdeplot(
    data=train_df, x='Valor', ax=axes[0],
    fill=True, alpha=.5, color="#00708d",
)

# right panel: density after the log1p transformation
dens_plot = sns.kdeplot(
    data=train_df.assign(Valor=lambda x: np.log1p(x.Valor)),
    x='Valor', ax=axes[1],
    fill=True, alpha=.5, color="#00708d",
)

# identical grid and axis labels on both panels
for ax in axes:
    ax.grid(True, which="major", axis="both", linestyle="-", color="gray", linewidth=0.8, alpha=0.9)
    ax.grid(True, which="minor", axis="both", linestyle=":", color="gray", linewidth=0.5, alpha=0.8)
    ax.set_ylabel(ylabel='Densidade', fontsize=13)
    ax.set_xlabel(xlabel='Valor', fontsize=13)
plt.show()

<Figure size 6000x3000 with 2 Axes>

In [13]:
#| label: fig-corplot
#| fig-cap: Gráfico de correlação de Spearman das variáveis independentes.

fig = plt.figure(figsize=(20, 10))

# Spearman correlation matrix; 'Valor' enters on the log1p scale
mat_plot = (
    train_df[['Valor', 'Área', 'Área de aluguel',
              'Valor de aluguel', 'Vaga', 'Quarto',
              'Banheiro', 'Latitude', 'Longitude']]
    .assign(Valor=lambda x: np.log1p(x.Valor))
    .corr(method='spearman')
)

heatmap = sns.heatmap(
    mat_plot,
    cmap='crest',
    annot=True,
    annot_kws=dict(size=17),
)
plt.tick_params(axis='both', which='major', labelsize=17)
# the last axes of the figure is used for the colorbar tick size
heatmap.figure.axes[-1].tick_params(labelsize=16)
# remove the x ticks; the row labels identify the variables
plt.xticks([])
plt.show()

<Figure size 6000x3000 with 2 Axes>

In [14]:
import io
import geopandas as gpd
import folium
import pandas
import requests
import imgkit
from PIL import Image
from branca.colormap import linear

# Neighbourhood polygons; names are stripped so they can be matched exactly.
state_geo = (
    gpd.read_file('includes/bairros.geojson')
    .assign(nome=lambda x: x.nome.str.strip())
)

# Train and test rows together, with 'Bairro' normalized to the spelling used
# by the `nome` property of the GeoJSON file.
df_inteiro = pd.concat([train_df, test_df])
df_inteiro = (
    df_inteiro
    .assign(Bairro=lambda x: (
        x.Bairro
        .str.replace('_', ' ')
        .str.upper()
        # strip BEFORE the lookup so values with stray whitespace still
        # match a key of the mapping below
        .str.strip()
        .replace(
          {
            'ALTIPLANO': 'ALTIPLANO CABO BRANCO',
            'JOSE AMERICO': 'JOSÉ AMÉRICO',
            'TAMBAU': 'TAMBAÚ',
            'JARDIM CIDADE UNIVERSITARIA': 'JARDIM CIDADE UNIVERSITÁRIA',
            'MANAIRA': 'MANAÍRA',
            'VARJAO': 'VARJÃO',
            'BANCARIOS': 'BANCÁRIOS',
            'EXPEDICIONARIOS': 'EXPEDICIONÁRIOS',
            'JOAO AGRIPINO': 'JOÃO AGRIPINO',
            'MANDACARU': 'MANDACARÚ',
            'JARDIM SAO PAULO': 'JARDIM SÃO PAULO',
            'COLIBRIS': 'CIDADE DOS COLIBRIS',
            'ANATOLIA': 'ANATÓLIA',
            'MUCUMAGRO': 'MUÇUMAGRO',
            'AGUA FRIA': 'ÁGUA FRIA',
            'BAIRRO DOS IPES': 'BAIRRO DOS IPÊS',
            'JARDIM LUNA': 'BRISAMAR',
            'VALENTINA DE FIGUEIREDO': 'VALENTINA',
            'PLANALTO BOA ESPERANCA': 'PLANALTO DA BOA ESPERANÇA',
            'CUIA': 'CUIÁ',
            'MANGABEIRA IV': 'MANGABEIRA',
            'GEISEL': 'ERNESTO GEISEL',
            'JOAO PAULO II': 'JOÃO PAULO II',
            'TAMBIA': 'TAMBIÁ',
            'ALTO DO CEU': 'ALTO DO CÉU',
            'QUADRAMARES': 'PORTAL DO SOL',
            'CIDADE UNIVERSITARIA': 'JARDIM CIDADE UNIVERSITÁRIA',
            'MANGABEIRA VII': 'MANGABEIRA',
            'SAO JOSE': 'SÃO JOSÉ',
            'JARDIM 13 DE MAIO': 'TREZE DE MAIO',
            'PRAIA DO SOL': 'COSTA DO SOL',
            'SEIXAS': 'PONTA DO SEIXAS',
            'RANGEL': 'VARJÃO',
            'COLINAS DO SUL': 'GRAMAME',
            'AREA RURAL DE JOAO PESSOA': 'MANAÍRA'}
        )
    ))
    # "INTERMARES" was misspelled as "INTERMATES", so those rows were never
    # filtered out and still influenced the colour scale of the map.
    .query('Bairro not in ["INTERMARES", "PRAIA DE INTERMARES"]')
)


# mean area per neighbourhood, as a two-column frame for the choropleth
average_values_area = df_inteiro.groupby("Bairro")["Área"].mean().reset_index(name='Área')

# Base map with a dark tile layer and without zoom/attribution controls.
m = folium.Map(
  location=[-7.15, -34.85],
  zoom_control=False,
  tiles="CartoDB dark_matter",
  attribution_control=False,
  zoom_start=12.49994)

# Colour each neighbourhood polygon by its mean area. Rows of
# average_values_area are joined to the polygons through the `nome` property;
# polygons without a value are drawn in gray.
folium.Choropleth(
    geo_data=state_geo,
    name="choropleth",
    nan_fill_color="gray",
    data=average_values_area,
    columns=["Bairro", "Área"],
    key_on="feature.properties.nome",
    fill_color='YlGnBu',
    fill_opacity=0.7,
    line_opacity=0.2,
    legend_name="Variação da média da área dos imóveis",
).add_to(m)

# Make the legend text white so it stays readable on the dark tiles.
legend_css = """
<style>
    .legend text {
        fill: white;
    }
</style>
"""
m.get_root().html.add_child(folium.Element(legend_css))

# NOTE(review): _to_png is a private folium method; the argument is presumably
# the rendering delay in seconds — confirm against the installed folium version.
img_data = m._to_png(20)
img = Image.open(io.BytesIO(img_data))
# img.save('includes/map_area_dark.png')

In [15]:
import io
import geopandas as gpd
import folium
import pandas
import requests
import imgkit
from PIL import Image
from branca.colormap import linear

# Neighbourhood polygons; names are stripped so they can be matched exactly.
state_geo = (
    gpd.read_file('includes/bairros.geojson')
    .assign(nome=lambda x: x.nome.str.strip())
)

# Train and test rows together, with 'Bairro' normalized to the spelling used
# by the `nome` property of the GeoJSON file.
df_inteiro = pd.concat([train_df, test_df])
df_inteiro = (
    df_inteiro
    .assign(Bairro=lambda x: (
        x.Bairro
        .str.replace('_', ' ')
        .str.upper()
        # strip BEFORE the lookup so values with stray whitespace still
        # match a key of the mapping below
        .str.strip()
        .replace(
          {
            'ALTIPLANO': 'ALTIPLANO CABO BRANCO',
            'JOSE AMERICO': 'JOSÉ AMÉRICO',
            'TAMBAU': 'TAMBAÚ',
            'JARDIM CIDADE UNIVERSITARIA': 'JARDIM CIDADE UNIVERSITÁRIA',
            'MANAIRA': 'MANAÍRA',
            'VARJAO': 'VARJÃO',
            'BANCARIOS': 'BANCÁRIOS',
            'EXPEDICIONARIOS': 'EXPEDICIONÁRIOS',
            'JOAO AGRIPINO': 'JOÃO AGRIPINO',
            'MANDACARU': 'MANDACARÚ',
            'JARDIM SAO PAULO': 'JARDIM SÃO PAULO',
            'COLIBRIS': 'CIDADE DOS COLIBRIS',
            'ANATOLIA': 'ANATÓLIA',
            'MUCUMAGRO': 'MUÇUMAGRO',
            'AGUA FRIA': 'ÁGUA FRIA',
            'BAIRRO DOS IPES': 'BAIRRO DOS IPÊS',
            'JARDIM LUNA': 'BRISAMAR',
            'VALENTINA DE FIGUEIREDO': 'VALENTINA',
            'PLANALTO BOA ESPERANCA': 'PLANALTO DA BOA ESPERANÇA',
            'CUIA': 'CUIÁ',
            'MANGABEIRA IV': 'MANGABEIRA',
            'GEISEL': 'ERNESTO GEISEL',
            'JOAO PAULO II': 'JOÃO PAULO II',
            'TAMBIA': 'TAMBIÁ',
            'ALTO DO CEU': 'ALTO DO CÉU',
            'QUADRAMARES': 'PORTAL DO SOL',
            'CIDADE UNIVERSITARIA': 'JARDIM CIDADE UNIVERSITÁRIA',
            'MANGABEIRA VII': 'MANGABEIRA',
            'SAO JOSE': 'SÃO JOSÉ',
            'JARDIM 13 DE MAIO': 'TREZE DE MAIO',
            'PRAIA DO SOL': 'COSTA DO SOL',
            'SEIXAS': 'PONTA DO SEIXAS',
            'RANGEL': 'VARJÃO',
            'COLINAS DO SUL': 'GRAMAME',
            'AREA RURAL DE JOAO PESSOA': 'MANAÍRA'}
        )
    ))
    # "INTERMARES" was misspelled as "INTERMATES"; with the spelling fixed the
    # rows are dropped here, so no extra filter is needed after aggregating.
    .query('Bairro not in ["INTERMARES", "PRAIA DE INTERMARES"]')
)


# mean price per square metre (Valor / Área, row by row) per neighbourhood
average_values = (
    df_inteiro
    .assign(**{"Valor $m^2$": lambda x: x.Valor / x["Área"]})
    .groupby("Bairro")["Valor $m^2$"]
    .mean()
    .reset_index(name='Valor $m^2$')
)

# Base map (default tiles) without zoom/attribution controls.
m = folium.Map(
  location=[-7.15, -34.85],
  zoom_control=False,
  attribution_control=False,
  zoom_start=12.49994)

# Colour each neighbourhood polygon by its mean price per square metre. Rows
# of average_values are joined to the polygons through the `nome` property;
# polygons without a value are drawn in gray.
folium.Choropleth(
    geo_data=state_geo,
    name="choropleth",
    nan_fill_color="gray",
    data=average_values,
    columns=["Bairro", "Valor $m^2$"],
    key_on="feature.properties.nome",
    fill_color='YlGnBu',
    fill_opacity=0.7,
    line_opacity=0.2,
    legend_name="Variação do valor médio do m² dos imóveis",
).add_to(m)

# legend_css = """
# <style>
#     .legend text {
#         fill: white; /* Change legend text color to white */
#     }
# </style>
# """
# m.get_root().html.add_child(folium.Element(legend_css))

# NOTE(review): _to_png is a private folium method; the argument is presumably
# the rendering delay in seconds — confirm against the installed folium version.
img_data = m._to_png(20)
img = Image.open(io.BytesIO(img_data))
# Overwrites the PNG on every run of this cell.
img.save('includes/map_valor_m2.png')

In [16]:
import optuna.visualization
import joblib

# Previously saved Optuna studies, one per model family (rf, gdt, lgbm, xgb).
# joblib.load unpickles arbitrary objects: only load files produced by this project.
study1, study2, study3, study4 = (
    joblib.load(study_path)
    for study_path in (
        '../app/modeling/study_pkl/study_rf.pkl',
        '../app/modeling/study_pkl/study_gdt.pkl',
        '../app/modeling/study_pkl/study_lgbm.pkl',
        '../app/modeling/study_pkl/study_xgb.pkl',
    )
)

In [17]:
#| label: fig-rf_slice

fig_rf_slice = optuna.visualization.matplotlib.plot_slice(study1)
plt.suptitle('')

fig = plt.gcf()
ax = plt.gca()

# the last axes of the figure is relabelled and recoloured as the trial scale
fig.axes[-1].collections[1].set_cmap("crest")
fig.axes[-1].set_ylabel("Tentativa")

# same colormap on every collection of the remaining axes
for ax in fig.axes[:-1]:
    for coll in ax.collections:
        coll.set_cmap("crest")

first_panel = fig_rf_slice[0]
first_panel.set_ylabel('RMSE')
first_panel.set_title('RMSE')

# uniform look for all panels: no title, faint grid, white background, full frame
for panel in fig_rf_slice:
    panel.set_title('')
    panel.grid(True, color='grey', linewidth=.05)
    panel.set_facecolor('white')
    for side in ('top', 'right', 'left', 'bottom'):
        panel.spines[side].set_visible(True)

plt.show()


plot_slice is experimental (supported from v2.2.0). The interface can change in the future.



<Figure size 1650x1050 with 3 Axes>

In [18]:
#| label: fig-rf_import

fig_rf_import = optuna.visualization.matplotlib.plot_param_importances(study1)

# single colour for all bars
for bar in fig_rf_import.patches:
    bar.set_color("#00708d")

fig_rf_import.grid(True, color="grey", linewidth=0.05)
fig_rf_import.set_facecolor("white")
# blank the text artist found fourth from the end of the axes children
fig_rf_import.get_children()[-4].set_text('')
for side in ('top', 'right', 'left', 'bottom'):
    fig_rf_import.spines[side].set_visible(True)
fig_rf_import.set_xlabel('Importância do Hiperparâmetro')
fig_rf_import.set_ylabel('Hiperparâmetro')

# keep the existing legend handles, relabel the entry
handles, labels = fig_rf_import.get_legend_handles_labels()
fig_rf_import.legend(handles, ['RMSE'])

plt.show()


plot_param_importances is experimental (supported from v2.2.0). The interface can change in the future.



<Figure size 1650x1050 with 1 Axes>

In [19]:
#| label: fig-rf_history

fig_rf_hist = optuna.visualization.matplotlib.plot_optimization_history(study1)

# single colour for all collections
for collection in fig_rf_hist.collections:
    collection.set_color("#00708d")

fig_rf_hist.grid(True, color="grey", linewidth=0.05)
fig_rf_hist.set_facecolor("white")
# blank the text artist found fifth from the end of the axes children
fig_rf_hist.get_children()[-5].set_text('')
for side in ('top', 'right', 'left', 'bottom'):
    fig_rf_hist.spines[side].set_visible(True)
fig_rf_hist.set_xlabel('Trial')
fig_rf_hist.set_ylabel('RMSE')

# keep the existing legend handles, relabel the entries
handles, labels = fig_rf_hist.get_legend_handles_labels()
fig_rf_hist.legend(handles, ['RMSE', 'Melhor trial'])

plt.show()


plot_optimization_history is experimental (supported from v2.2.0). The interface can change in the future.



<Figure size 1650x1050 with 1 Axes>

In [20]:
#| label: fig-rf_contour

# Contour plot of the objective over the hyperparameters of study1.
fig_rf_contour = optuna.visualization.matplotlib.plot_contour(study1)

fig, ax = plt.gcf(), plt.gca()

# The last axes of the figure gets the "RMSE" label.
# NOTE(review): presumably this is the colorbar axes — confirm.
fig.axes[-1].set_ylabel("RMSE")
# NOTE(review): index-based lookup of the artist to recolour; this depends on
# the order in which Optuna adds artists and may break across versions.
fig_rf_contour.get_children()[1].set_cmap('crest')

# NOTE(review): presumably the axes title, blanked by position — confirm.
fig_rf_contour.get_children()[-4].set_text('')

plt.show()


plot_contour is experimental (supported from v2.2.0). The interface can change in the future.

[W 2025-01-21 21:07:24,048] Output figures of this Matplotlib-based `plot_contour` function would be different from those of the Plotly-based `plot_contour`.


<Figure size 1650x1050 with 2 Axes>

In [21]:
#| label: fig-gdt_slice

fig_gdt_slice = optuna.visualization.matplotlib.plot_slice(study2)
plt.suptitle('')

fig = plt.gcf()
ax = plt.gca()

# the last axes of the figure is relabelled and recoloured as the trial scale
fig.axes[-1].collections[1].set_cmap("crest")
fig.axes[-1].set_ylabel("Tentativa")

# same colormap on every collection of the remaining axes
for ax in fig.axes[:-1]:
    for coll in ax.collections:
        coll.set_cmap("crest")

first_panel = fig_gdt_slice[0]
first_panel.set_ylabel('RMSE')
first_panel.set_title('RMSE')

# uniform look for all panels: no title, faint grid, white background, full frame
for panel in fig_gdt_slice:
    panel.set_title('')
    panel.grid(True, color='grey', linewidth=.05)
    panel.set_facecolor('white')
    for side in ('top', 'right', 'left', 'bottom'):
        panel.spines[side].set_visible(True)

plt.show()


plot_slice is experimental (supported from v2.2.0). The interface can change in the future.



<Figure size 2475x1050 with 4 Axes>

In [22]:
#| label: fig-gdt_import

fig_gdt_import = optuna.visualization.matplotlib.plot_param_importances(study2)

# single colour for all bars
for bar in fig_gdt_import.patches:
    bar.set_color("#00708d")

fig_gdt_import.grid(True, color="grey", linewidth=0.05)
fig_gdt_import.set_facecolor("white")
# blank the text artist found fourth from the end of the axes children
fig_gdt_import.get_children()[-4].set_text('')
for side in ('top', 'right', 'left', 'bottom'):
    fig_gdt_import.spines[side].set_visible(True)
fig_gdt_import.set_xlabel('Importância do Hiperparâmetro')
fig_gdt_import.set_ylabel('Hiperparâmetro')

# keep the existing legend handles, relabel the entry
handles, labels = fig_gdt_import.get_legend_handles_labels()
fig_gdt_import.legend(handles, ['RMSE'])

plt.show()


plot_param_importances is experimental (supported from v2.2.0). The interface can change in the future.



<Figure size 1650x1050 with 1 Axes>

In [23]:
#| label: fig-gdt_history
fig_gdt_hist = optuna.visualization.matplotlib.plot_optimization_history(study2)

# single colour for all collections
for collection in fig_gdt_hist.collections:
    collection.set_color("#00708d")

fig_gdt_hist.grid(True, color="grey", linewidth=0.05)
fig_gdt_hist.set_facecolor("white")
# blank the text artist found fifth from the end of the axes children
fig_gdt_hist.get_children()[-5].set_text('')
for side in ('top', 'right', 'left', 'bottom'):
    fig_gdt_hist.spines[side].set_visible(True)
fig_gdt_hist.set_xlabel('Trial')
fig_gdt_hist.set_ylabel('RMSE')

# keep the existing legend handles, relabel the entries
handles, labels = fig_gdt_hist.get_legend_handles_labels()
fig_gdt_hist.legend(handles, ['RMSE', 'Melhor trial'])

plt.show()


plot_optimization_history is experimental (supported from v2.2.0). The interface can change in the future.



<Figure size 1650x1050 with 1 Axes>

In [24]:
#| label: fig-gdt_contour

fig_gdt_contour = optuna.visualization.matplotlib.plot_contour(study2)
plt.suptitle('')

# fig_gdt_contour is indexed as a 2-D grid of axes
n_rows = fig_gdt_contour.shape[0]
n_cols = fig_gdt_contour.shape[1]

for i in range(n_rows):
    for j in range(n_cols):
        ax = fig_gdt_contour[i, j]

        if i != j:
            # off-diagonal panels: recolour the artist at child index 1
            ax.get_children()[1].set_cmap('crest')
        else:
            # diagonal panels: faint grid, full frame, white background
            ax.grid(True, color='grey', linewidth=.05)
            for side in ('top', 'right', 'left', 'bottom'):
                ax.spines[side].set_visible(True)
            ax.set_facecolor('white')

        # first column: horizontal, right-aligned y labels
        if j == 0:
            ax.set_ylabel(ax.get_ylabel(), rotation=0, ha='right', fontsize=10)

        # last row: slanted, right-aligned x labels
        if i == n_rows - 1:
            ax.set_xlabel(ax.get_xlabel(), rotation=45, ha='right', fontsize=10)

fig = plt.gcf()
ax = plt.gca()

# the last axes of the figure gets the "RMSE" label
fig.axes[-1].set_ylabel("RMSE")

plt.show()


plot_contour is experimental (supported from v2.2.0). The interface can change in the future.

[W 2025-01-21 21:07:25,774] Output figures of this Matplotlib-based `plot_contour` function would be different from those of the Plotly-based `plot_contour`.


<Figure size 1650x1050 with 10 Axes>

In [25]:
#| label: fig-lgbm_slice

fig_lgbm_slice = optuna.visualization.matplotlib.plot_slice(study3)
plt.suptitle('')

fig = plt.gcf()
ax = plt.gca()

# the last axes of the figure is relabelled and recoloured as the trial scale
fig.axes[-1].collections[1].set_cmap("crest")
fig.axes[-1].set_ylabel("Tentativa")

# same colormap on every collection of the remaining axes
for ax in fig.axes[:-1]:
    for coll in ax.collections:
        coll.set_cmap("crest")

first_panel = fig_lgbm_slice[0]
first_panel.set_ylabel('RMSE')
first_panel.set_title('RMSE')

# uniform look for all panels: no title, faint grid, white background, full frame
for panel in fig_lgbm_slice:
    panel.set_title('')
    panel.grid(True, color='grey', linewidth=.05)
    panel.set_facecolor('white')
    for side in ('top', 'right', 'left', 'bottom'):
        panel.spines[side].set_visible(True)

plt.show()


plot_slice is experimental (supported from v2.2.0). The interface can change in the future.



<Figure size 3300x1050 with 5 Axes>

In [26]:
#| label: fig-lgbm_import

# Hyperparameter-importance chart of study3. Despite the `fig_` prefix, the
# returned object is used as a single Axes below (patches, spines, labels).
fig_lgbm_import = optuna.visualization.matplotlib.plot_param_importances(study3)

# Recolour every bar with the report's accent colour.
for bar in fig_lgbm_import.patches:
    bar.set_color("#00708d")

fig_lgbm_import.grid(True, color="grey", linewidth=0.05)
fig_lgbm_import.set_facecolor("white")

# Blank the Axes title through the public API rather than by position in
# get_children(), whose layout depends on which artists (e.g. a legend) exist.
# All three title slots are cleared so the result does not depend on which
# slot the library used.
for title_loc in ('left', 'center', 'right'):
    fig_lgbm_import.set_title('', loc=title_loc)

# Draw the full frame.
for side in ('top', 'right', 'left', 'bottom'):
    fig_lgbm_import.spines[side].set_visible(True)

fig_lgbm_import.set_xlabel('Importância do Hiperparâmetro')
fig_lgbm_import.set_ylabel('Hiperparâmetro')

# Keep the existing legend handles but replace their labels.
handles, _labels = fig_lgbm_import.get_legend_handles_labels()
fig_lgbm_import.legend(handles, ['RMSE'])

plt.show()


plot_param_importances is experimental (supported from v2.2.0). The interface can change in the future.



<Figure size 1650x1050 with 1 Axes>

In [27]:
#| label: fig-lgbm_history
# Optimization-history chart of study3. Despite the `fig_` prefix, the
# returned object is used as a single Axes below (collections, spines, labels).
fig_lgbm_hist = optuna.visualization.matplotlib.plot_optimization_history(study3)

# Recolour every collection on the Axes (presumably the per-trial scatter
# points -- confirm) with the report's accent colour.
for collection in fig_lgbm_hist.collections:
    collection.set_color("#00708d")

fig_lgbm_hist.grid(True, color="grey", linewidth=0.05)
fig_lgbm_hist.set_facecolor("white")

# Blank the Axes title through the public API rather than by position in
# get_children(), whose layout depends on which artists (e.g. a legend) exist.
# All three title slots are cleared so the result does not depend on which
# slot the library used.
for title_loc in ('left', 'center', 'right'):
    fig_lgbm_hist.set_title('', loc=title_loc)

# Draw the full frame.
for side in ('top', 'right', 'left', 'bottom'):
    fig_lgbm_hist.spines[side].set_visible(True)

fig_lgbm_hist.set_xlabel('Trial')
fig_lgbm_hist.set_ylabel('RMSE')

# Keep the existing legend handles but replace their labels.
handles, _labels = fig_lgbm_hist.get_legend_handles_labels()
fig_lgbm_hist.legend(handles, ['RMSE', 'Melhor trial'])

plt.show()


plot_optimization_history is experimental (supported from v2.2.0). The interface can change in the future.



<Figure size 1650x1050 with 1 Axes>

In [28]:
#| label: fig-lgbm_contour

# Contour plot of study3. The returned object is walked below as a 2-D grid
# of Axes (row index i, column index j).
fig_lgbm_contour = optuna.visualization.matplotlib.plot_contour(study3)
plt.suptitle('')  # blank the figure-level title

last_row = fig_lgbm_contour.shape[0] - 1

for i in range(fig_lgbm_contour.shape[0]):
    for j in range(fig_lgbm_contour.shape[1]):
        ax = fig_lgbm_contour[i, j]

        if j == i:
            # Diagonal panel: faint grid, full frame, white background.
            ax.grid(True, color='grey', linewidth=.05)
            for side in ('top', 'right', 'left', 'bottom'):
                ax.spines[side].set_visible(True)
            ax.set_facecolor('white')
        else:
            # Off-diagonal panel: switch the artist at child index 1 to 'crest'.
            # NOTE(review): presumably the filled contour, and 'crest' is
            # presumably registered by seaborn -- confirm both.
            ax.get_children()[1].set_cmap('crest')

        # First column: horizontal, right-aligned y labels.
        if j == 0:
            ax.set_ylabel(ax.get_ylabel(), rotation=0, ha='right', fontsize=10)

        # Last row: x labels tilted by 45 degrees.
        if i == last_row:
            ax.set_xlabel(ax.get_xlabel(), rotation=45, ha='right', fontsize=10)

fig, ax = plt.gcf(), plt.gca()

# Label the last Axes of the figure (presumably the colorbar -- confirm).
fig.axes[-1].set_ylabel("RMSE")

plt.show()


plot_contour is experimental (supported from v2.2.0). The interface can change in the future.

[W 2025-01-21 21:07:28,526] Output figures of this Matplotlib-based `plot_contour` function would be different from those of the Plotly-based `plot_contour`.


<Figure size 1650x1050 with 17 Axes>

In [29]:
#| label: fig-xgb_slice

# Slice plot of study4.
# NOTE(review): indexing with [0] and iterating below assume plot_slice
# returned a sequence of Axes (more than one hyperparameter) -- confirm.
fig_xgb_slice = optuna.visualization.matplotlib.plot_slice(study4)
plt.suptitle('')  # blank the figure-level title

fig = plt.gcf()

# The last Axes of the figure is handled separately from the panels
# (presumably the colorbar -- confirm): recolour its second collection and
# relabel it.
colorbar_ax = fig.axes[-1]
colorbar_ax.collections[1].set_cmap("crest")
colorbar_ax.set_ylabel("Tentativa")

# Apply the same colormap to every collection of the remaining Axes.
for ax in fig.axes[:-1]:
    for coll in ax.collections:
        coll.set_cmap("crest")

# Only the first panel carries the shared y label.
fig_xgb_slice[0].set_ylabel('RMSE')

# Uniform panel styling: no title, faint grid, white background, full frame.
# A named loop variable is used instead of `_`, which IPython reserves for
# the previous cell output.
for panel in fig_xgb_slice:
    panel.set_title('')
    panel.grid(True, color='grey', linewidth=.05)
    panel.set_facecolor('white')
    for side in ('top', 'right', 'left', 'bottom'):
        panel.spines[side].set_visible(True)

plt.show()


plot_slice is experimental (supported from v2.2.0). The interface can change in the future.



<Figure size 2475x1050 with 4 Axes>

In [30]:
#| label: fig-xgb_import

# Hyperparameter-importance chart of study4. Despite the `fig_` prefix, the
# returned object is used as a single Axes below (patches, spines, labels).
fig_xgb_import = optuna.visualization.matplotlib.plot_param_importances(study4)

# Recolour every bar with the report's accent colour.
for bar in fig_xgb_import.patches:
    bar.set_color("#00708d")

fig_xgb_import.grid(True, color="grey", linewidth=0.05)
fig_xgb_import.set_facecolor("white")

# Blank the Axes title through the public API rather than by position in
# get_children(), whose layout depends on which artists (e.g. a legend) exist.
# All three title slots are cleared so the result does not depend on which
# slot the library used.
for title_loc in ('left', 'center', 'right'):
    fig_xgb_import.set_title('', loc=title_loc)

# Draw the full frame.
for side in ('top', 'right', 'left', 'bottom'):
    fig_xgb_import.spines[side].set_visible(True)

fig_xgb_import.set_xlabel('Importância do Hiperparâmetro')
fig_xgb_import.set_ylabel('Hiperparâmetro')

# Keep the existing legend handles but replace their labels.
handles, _labels = fig_xgb_import.get_legend_handles_labels()
fig_xgb_import.legend(handles, ['RMSE'])

plt.show()


plot_param_importances is experimental (supported from v2.2.0). The interface can change in the future.



<Figure size 1650x1050 with 1 Axes>

In [31]:
#| label: fig-xgb_history
# Optimization-history chart of study4. Despite the `fig_` prefix, the
# returned object is used as a single Axes below (collections, spines, labels).
fig_xgb_hist = optuna.visualization.matplotlib.plot_optimization_history(study4)

# Recolour every collection on the Axes (presumably the per-trial scatter
# points -- confirm) with the report's accent colour.
for collection in fig_xgb_hist.collections:
    collection.set_color("#00708d")

fig_xgb_hist.grid(True, color="grey", linewidth=0.05)
fig_xgb_hist.set_facecolor("white")

# Blank the Axes title through the public API rather than by position in
# get_children(), whose layout depends on which artists (e.g. a legend) exist.
# All three title slots are cleared so the result does not depend on which
# slot the library used.
for title_loc in ('left', 'center', 'right'):
    fig_xgb_hist.set_title('', loc=title_loc)

# Draw the full frame.
for side in ('top', 'right', 'left', 'bottom'):
    fig_xgb_hist.spines[side].set_visible(True)

fig_xgb_hist.set_xlabel('Trial')
fig_xgb_hist.set_ylabel('RMSE')

# Keep the existing legend handles but replace their labels.
handles, _labels = fig_xgb_hist.get_legend_handles_labels()
fig_xgb_hist.legend(handles, ['RMSE', 'Melhor trial'])

plt.show()


plot_optimization_history is experimental (supported from v2.2.0). The interface can change in the future.



<Figure size 1650x1050 with 1 Axes>

In [32]:
#| label: fig-xgb_contour

# Contour plot of study4. The returned object is walked below as a 2-D grid
# of Axes (row index i, column index j).
fig_xgb_contour = optuna.visualization.matplotlib.plot_contour(study4)
plt.suptitle('')  # blank the figure-level title

for i, grid_row in enumerate(fig_xgb_contour):
    for j, panel in enumerate(grid_row):
        if i != j:
            # Off-diagonal panel: switch the artist at child index 1 to 'crest'.
            # NOTE(review): presumably the filled contour, and 'crest' is
            # presumably registered by seaborn -- confirm both.
            panel.get_children()[1].set_cmap('crest')
        else:
            # Diagonal panel: white background, faint grid, full frame.
            panel.set_facecolor('white')
            panel.grid(True, color='grey', linewidth=.05)
            for side in ('top', 'right', 'left', 'bottom'):
                panel.spines[side].set_visible(True)

        # First column: horizontal, right-aligned y labels.
        if j == 0:
            panel.set_ylabel(panel.get_ylabel(), rotation=0, ha='right')

        # NOTE(review): bottom-row x labels are left exactly as plot_contour
        # produced them (no rotation), and no fontsize is set on the y labels;
        # confirm this is intended for this figure.

fig = plt.gcf()
ax = plt.gca()

# Label the last Axes of the figure (presumably the colorbar -- confirm).
fig.axes[-1].set_ylabel("RMSE")

plt.show()


plot_contour is experimental (supported from v2.2.0). The interface can change in the future.

[W 2025-01-21 21:07:32,443] Output figures of this Matplotlib-based `plot_contour` function would be different from those of the Plotly-based `plot_contour`.


<Figure size 1650x1050 with 10 Axes>