docs/notebooks/11_regression.html


<!DOCTYPE html>

<html lang="en">
  <head>
    <meta charset="utf-8" />
    <!-- Single viewport declaration for the page (a second, equivalent one further down was removed). -->
    <meta name="viewport" content="width=device-width, initial-scale=1.0" />
    <title>Spatial Regression &#8212; Geographic Data Science with Python</title>
    
  <link rel="stylesheet" href="../_static/css/index.f658d18f9b420779cfdf24aa0a7e2d77.css">

    
  <link rel="stylesheet"
    href="../_static/vendor/fontawesome/5.13.0/css/all.min.css">
  <link rel="preload" as="font" type="font/woff2" crossorigin
    href="../_static/vendor/fontawesome/5.13.0/webfonts/fa-solid-900.woff2">
  <link rel="preload" as="font" type="font/woff2" crossorigin
    href="../_static/vendor/fontawesome/5.13.0/webfonts/fa-brands-400.woff2">

    
  <link rel="stylesheet"
    href="../_static/vendor/open-sans_all/1.44.1/index.css">
  <link rel="stylesheet"
    href="../_static/vendor/lato_latin-ext/1.44.1/index.css">

    
    <link rel="stylesheet" href="../_static/pygments.css" type="text/css" />
    <link rel="stylesheet" href="../_static/sphinx-book-theme.e7340bb3dbd8dde6db86f25597f54a1b.css" type="text/css" />
    <link rel="stylesheet" type="text/css" href="../_static/togglebutton.css" />
    <link rel="stylesheet" type="text/css" href="../_static/copybutton.css" />
    <link rel="stylesheet" type="text/css" href="../_static/mystnb.css" />
    <link rel="stylesheet" type="text/css" href="../_static/sphinx-thebe.css" />
    <link rel="stylesheet" type="text/css" href="../_static/custom.css" />
    <link rel="stylesheet" type="text/css" href="../_static/panels-main.c949a650a448cc0ae9fd3441c0e17fb0.css" />
    <link rel="stylesheet" type="text/css" href="../_static/panels-variables.06eb56fa6e07937060861dad626602ad.css" />
    
  <link rel="preload" as="script" href="../_static/js/index.d3f166471bb80abb5163.js">

    <script id="documentation_options" data-url_root="../" src="../_static/documentation_options.js"></script>
    <script src="../_static/jquery.js"></script>
    <script src="../_static/underscore.js"></script>
    <script src="../_static/doctools.js"></script>
    <script src="../_static/togglebutton.js"></script>
    <script src="../_static/clipboard.min.js"></script>
    <script src="../_static/copybutton.js"></script>
    <!-- Selector read by togglebutton.js to decide which elements get a show/hide toggle. -->
    <script>var togglebuttonSelector = '.toggle, .admonition.dropdown, .tag_hide_input div.cell_input, .tag_hide-input div.cell_input, .tag_hide_output div.cell_output, .tag_hide-output div.cell_output, .tag_hide_cell.cell, .tag_hide-cell.cell';</script>
    <script src="../_static/sphinx-book-theme.7d483ff0a819d6edff12ce0b1ead3928.js"></script>
    <script async="async" src="https://unpkg.com/thebelab@latest/lib/index.js"></script>
    <!-- Selectors consumed by sphinx-thebe.js to locate executable cells, their inputs and outputs. -->
    <script>
        const thebe_selector = ".thebe"
        const thebe_selector_input = "pre"
        const thebe_selector_output = ".output"
    </script>
    <script async="async" src="../_static/sphinx-thebe.js"></script>
    <!-- MathJax 2 reads text/x-mathjax-config blocks at start-up, so the configuration must precede the loader script. -->
    <script type="text/x-mathjax-config">MathJax.Hub.Config({"tex2jax": {"inlineMath": [["\\(", "\\)"]], "displayMath": [["\\[", "\\]"]], "processRefs": false, "processEnvironments": false}})</script>
    <script async="async" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.7/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
    <link rel="canonical" href="https://geographicdata.science/book/notebooks/11_regression.html" />
    <link rel="shortcut icon" href="../_static/favicon.ico"/>
    <link rel="index" title="Index" href="../genindex.html" />
    <link rel="search" title="Search" href="../search.html" />
    <link rel="next" title="Spatial Feature Engineering" href="12_feature_engineering.html" />
    <link rel="prev" title="Clustering &amp; Regionalization" href="10_clustering_and_regionalization.html" />

    <meta name="docsearch:language" content="en" />


<!-- Opengraph tags -->
<meta property="og:url"         content="https://geographicdata.science/book/notebooks/11_regression.html" />
<meta property="og:type"        content="article" />
<meta property="og:title"       content="Spatial Regression" />
<meta property="og:description" content="Spatial Regression  Introduction  What is spatial regression and why should I care?  Regression (and prediction more generally) provides us a perfect case to ex" />
<meta property="og:image"       content="https://geographicdata.science/book/_static/logo.png" />

<meta name="twitter:card" content="summary" />


  </head>
  <body data-spy="scroll" data-target="#bd-toc-nav" data-offset="80">
    

    <div class="container-xl">
      <div class="row">
          
<div class="col-12 col-md-3 bd-sidebar site-navigation show" id="site-navigation">
    
        <div class="navbar-brand-box">
<a class="navbar-brand text-wrap" href="../index.html">
  
  <img src="../_static/logo.png" class="logo" alt="logo">
  
  
  <h1 class="site-logo" id="site-title">Geographic Data Science with Python</h1>
  
</a>
</div><form class="bd-search d-flex align-items-center" action="../search.html" method="get">
  <i class="icon fas fa-search"></i>
  <input type="search" class="form-control" name="q" id="search-input" placeholder="Search this book..." aria-label="Search this book..." autocomplete="off" >
</form>
<nav class="bd-links" id="bd-docs-nav" aria-label="Main navigation">
    <ul class="nav sidenav_l1">
 <li class="toctree-l1">
  <a class="reference internal" href="../intro.html">
   Home
  </a>
 </li>
</ul>
<p class="caption collapsible-parent">
 <span class="caption-text">
  Preface
 </span>
</p>
<ul class="nav sidenav_l1">
 <li class="toctree-l1">
  <a class="reference internal" href="00_toc.html">
   Table of Contents
  </a>
 </li>
 <li class="toctree-l1">
  <a class="reference internal" href="references.html">
   References
  </a>
 </li>
</ul>
<p class="caption collapsible-parent">
 <span class="caption-text">
  Part I - Building Blocks
 </span>
</p>
<ul class="nav sidenav_l1">
 <li class="toctree-l1">
  <a class="reference internal" href="../intro_part_i.html">
   Overview
  </a>
 </li>
 <li class="toctree-l1">
  <a class="reference internal" href="01_geospatial_computational_environment.html">
   Geospatial Computational Environment
  </a>
 </li>
 <li class="toctree-l1">
  <a class="reference internal" href="02_geo_thinking.html">
   Geographic thinking for data scientists
  </a>
 </li>
 <li class="toctree-l1">
  <a class="reference internal" href="03_spatial_data.html">
   Spatial Data
  </a>
 </li>
 <li class="toctree-l1">
  <a class="reference internal" href="04_spatial_weights.html">
   Spatial Weights
  </a>
 </li>
</ul>
<p class="caption collapsible-parent">
 <span class="caption-text">
  Part II - Spatial Data Analysis
 </span>
</p>
<ul class="nav sidenav_l1">
 <li class="toctree-l1">
  <a class="reference internal" href="../intro_part_ii.html">
   Overview
  </a>
 </li>
 <li class="toctree-l1">
  <a class="reference internal" href="05_choropleth.html">
   Choropleth Mapping
  </a>
 </li>
 <li class="toctree-l1">
  <a class="reference internal" href="06_spatial_autocorrelation.html">
   Global Spatial Autocorrelation
  </a>
 </li>
 <li class="toctree-l1">
  <a class="reference internal" href="07_local_autocorrelation.html">
   Local Spatial Autocorrelation
  </a>
 </li>
 <li class="toctree-l1">
  <a class="reference internal" href="08_point_pattern_analysis.html">
   Point Pattern Analysis
  </a>
 </li>
</ul>
<p class="caption collapsible-parent">
 <span class="caption-text">
  Part III - Advanced Topics
 </span>
</p>
<ul class="current nav sidenav_l1">
 <li class="toctree-l1">
  <a class="reference internal" href="../intro_part_ii.html">
   Overview
  </a>
 </li>
 <li class="toctree-l1">
  <a class="reference internal" href="09_spatial_inequality.html">
   Spatial Inequality
  </a>
 </li>
 <li class="toctree-l1">
  <a class="reference internal" href="10_clustering_and_regionalization.html">
   Clustering &amp; Regionalization
  </a>
 </li>
 <li class="toctree-l1 current active">
  <a class="current reference internal" href="#">
   Spatial Regression
  </a>
 </li>
 <li class="toctree-l1">
  <a class="reference internal" href="12_feature_engineering.html">
   Spatial Feature Engineering
  </a>
 </li>
</ul>
<p class="caption collapsible-parent">
 <span class="caption-text">
  Datasets
 </span>
</p>
<ul class="nav sidenav_l1">
 <li class="toctree-l1">
  <a class="reference internal" href="../data/README.html">
   Overview
  </a>
 </li>
 <li class="toctree-l1">
  <a class="reference internal" href="../data/airbnb/regression_cleaning.html">
   AirBnb
  </a>
 </li>
 <li class="toctree-l1">
  <a class="reference internal" href="../data/airports/airports_cleaning.html">
   Airports
  </a>
 </li>
 <li class="toctree-l1">
  <a class="reference internal" href="../data/brexit/brexit_cleaning.html">
   Brexit
  </a>
 </li>
 <li class="toctree-l1">
  <a class="reference internal" href="../data/countries/countries_cleaning.html">
   Countries
  </a>
 </li>
 <li class="toctree-l1">
  <a class="reference internal" href="../data/h3_grid/build_sd_h3_grid.html">
   H3 Grid
  </a>
 </li>
 <li class="toctree-l1">
  <a class="reference internal" href="../data/mexico/README.html">
   Mexico
  </a>
 </li>
 <li class="toctree-l1">
  <a class="reference internal" href="../data/nasadem/build_nasadem_sd.html">
   NASA DEM
  </a>
 </li>
 <li class="toctree-l1">
  <a class="reference internal" href="../data/sandiego/sandiego_tracts_cleaning.html">
   San Diego Tracts
  </a>
 </li>
 <li class="toctree-l1">
  <a class="reference internal" href="../data/texas/README.html">
   Texas
  </a>
 </li>
 <li class="toctree-l1">
  <a class="reference internal" href="../data/tokyo/tokyo_cleaning.html">
   Tokyo Photographs
  </a>
 </li>
 <li class="toctree-l1">
  <a class="reference internal" href="../data/us_county_income/README.html">
   US County Income 1969-2017
  </a>
 </li>
</ul>

</nav> <!-- To handle the deprecated key -->

<div class="navbar_extra_footer">
  Powered by <a href="https://jupyterbook.org">Jupyter Book</a>
</div>

</div>


<main class="col py-md-3 pl-md-4 bd-content overflow-auto" role="main">
    
    <div class="topbar container-xl fixed-top">
    <div class="topbar-contents row">
        <div class="col-12 col-md-3 bd-topbar-whitespace site-navigation show"></div>
        <div class="col pl-md-4 topbar-main">
            
            <!-- Sidebar toggle: collapses the elements matched by data-target (.site-navigation).
                 Each attribute appears once; aria-controls names the sidebar's id. -->
            <button id="navbar-toggler" class="navbar-toggler ml-0" type="button" data-toggle="collapse"
                data-placement="bottom" data-target=".site-navigation" aria-controls="site-navigation"
                aria-expanded="true" aria-label="Toggle navigation"
                title="Toggle navigation">
                <i class="fas fa-bars"></i>
                <i class="fas fa-arrow-left"></i>
                <i class="fas fa-arrow-up"></i>
            </button>
            
            
<div class="dropdown-buttons-trigger">
    <button id="dropdown-buttons-trigger" class="btn btn-secondary topbarbtn" aria-label="Download this page"><i
            class="fas fa-download"></i></button>

    <div class="dropdown-buttons">
        <!-- ipynb file if we had a myst markdown file -->
        
        <!-- Download raw file -->
        <a class="dropdown-buttons" href="../_sources/notebooks/11_regression.ipynb"><button type="button"
                class="btn btn-secondary topbarbtn" title="Download source file" data-toggle="tooltip"
                data-placement="left">.ipynb</button></a>
        <!-- Download PDF via print -->
        <button type="button" id="download-print" class="btn btn-secondary topbarbtn" title="Print to PDF"
            onClick="window.print()" data-toggle="tooltip" data-placement="left">.pdf</button>
    </div>
</div>

            <!-- Source interaction buttons -->


            <!-- Full screen (wrap in <a> to have style consistency -->
            <a class="full-screen-button"><button type="button" class="btn btn-secondary topbarbtn" data-toggle="tooltip"
                    data-placement="bottom" onclick="toggleFullScreen()" aria-label="Fullscreen mode"
                    title="Fullscreen mode"><i
                        class="fas fa-expand"></i></button></a>

            <!-- Launch buttons -->

<div class="dropdown-buttons-trigger">
    <button id="dropdown-buttons-trigger" class="btn btn-secondary topbarbtn"
        aria-label="Launch interactive content"><i class="fas fa-rocket"></i></button>
    <div class="dropdown-buttons">
        
        <a class="binder-button" href="https://mybinder.org/v2/gh/gdsbook/book/master?urlpath=lab/tree/notebooks/11_regression.ipynb"><button type="button"
                class="btn btn-secondary topbarbtn" title="Launch Binder" data-toggle="tooltip"
                data-placement="left"><img class="binder-button-logo"
                    src="../_static/images/logo_binder.svg"
                    alt="Interact on binder">Binder</button></a>
        
        
        <a class="colab-button" href="https://colab.research.google.com/github/gdsbook/book/blob/master/notebooks/11_regression.ipynb"><button type="button" class="btn btn-secondary topbarbtn"
                title="Launch Colab" data-toggle="tooltip" data-placement="left"><img class="colab-button-logo"
                    src="../_static/images/logo_colab.png"
                    alt="Interact on Colab">Colab</button></a>
        
        
    </div>
</div>

        </div>

        <!-- Table of contents -->
        <div class="d-none d-md-block col-md-2 bd-toc show">
            
        <div class="tocsection onthispage pt-5 pb-3">
            <i class="fas fa-list"></i>
            Contents
        </div>
        <nav id="bd-toc-nav">
            <ul class="nav section-nav flex-column">
 <li class="toc-h2 nav-item toc-entry">
  <a class="reference internal nav-link" href="#introduction">
   Introduction
  </a>
  <ul class="nav section-nav flex-column">
   <li class="toc-h3 nav-item toc-entry">
    <a class="reference internal nav-link" href="#what-is-spatial-regression-and-why-should-i-care">
     <em>
      What
     </em>
     is spatial regression and
     <em>
      why
     </em>
     should I care?
    </a>
   </li>
   <li class="toc-h3 nav-item toc-entry">
    <a class="reference internal nav-link" href="#the-data-san-diego-airbnb">
     The Data: San Diego AirBnB
    </a>
   </li>
  </ul>
 </li>
 <li class="toc-h2 nav-item toc-entry">
  <a class="reference internal nav-link" href="#non-spatial-regression-a-very-quick-refresh">
   Non-spatial regression, a (very) quick refresh
  </a>
  <ul class="nav section-nav flex-column">
   <li class="toc-h3 nav-item toc-entry">
    <a class="reference internal nav-link" href="#hidden-structures">
     Hidden Structures
    </a>
   </li>
  </ul>
 </li>
 <li class="toc-h2 nav-item toc-entry">
  <a class="reference internal nav-link" href="#bringing-space-into-the-regression-framework">
   Bringing space into the regression framework
  </a>
  <ul class="nav section-nav flex-column">
   <li class="toc-h3 nav-item toc-entry">
    <a class="reference internal nav-link" href="#spatial-feature-engineering">
     Spatial Feature Engineering
    </a>
    <ul class="nav section-nav flex-column">
     <li class="toc-h4 nav-item toc-entry">
      <a class="reference internal nav-link" href="#proximity-variables">
       Proximity variables
      </a>
     </li>
    </ul>
   </li>
   <li class="toc-h3 nav-item toc-entry">
    <a class="reference internal nav-link" href="#spatial-heterogeneity">
     Spatial Heterogeneity
    </a>
    <ul class="nav section-nav flex-column">
     <li class="toc-h4 nav-item toc-entry">
      <a class="reference internal nav-link" href="#spatial-regimes">
       Spatial Regimes
      </a>
     </li>
    </ul>
   </li>
   <li class="toc-h3 nav-item toc-entry">
    <a class="reference internal nav-link" href="#spatial-dependence">
     Spatial Dependence
    </a>
    <ul class="nav section-nav flex-column">
     <li class="toc-h4 nav-item toc-entry">
      <a class="reference internal nav-link" href="#exogenous-effects-the-slx-model">
       Exogenous effects: The SLX Model
      </a>
     </li>
     <li class="toc-h4 nav-item toc-entry">
      <a class="reference internal nav-link" href="#spatial-error">
       Spatial Error
      </a>
     </li>
     <li class="toc-h4 nav-item toc-entry">
      <a class="reference internal nav-link" href="#spatial-lag">
       Spatial Lag
      </a>
     </li>
     <li class="toc-h4 nav-item toc-entry">
      <a class="reference internal nav-link" href="#other-ways-of-bringing-space-into-regression">
       Other ways of bringing space into regression
      </a>
     </li>
    </ul>
   </li>
  </ul>
 </li>
 <li class="toc-h2 nav-item toc-entry">
  <a class="reference internal nav-link" href="#questions">
   Questions
  </a>
  <ul class="nav section-nav flex-column">
   <li class="toc-h3 nav-item toc-entry">
    <a class="reference internal nav-link" href="#challenge-questions">
     Challenge Questions
    </a>
    <ul class="nav section-nav flex-column">
     <li class="toc-h4 nav-item toc-entry">
      <a class="reference internal nav-link" href="#the-random-coast">
       The random coast
      </a>
     </li>
     <li class="toc-h4 nav-item toc-entry">
      <a class="reference internal nav-link" href="#the-k-neighbor-correlogram">
       The K-neighbor correlogram
      </a>
     </li>
    </ul>
   </li>
  </ul>
 </li>
</ul>

        </nav>
        
        </div>
    </div>
</div>
    <div id="main-content" class="row">
        <div class="col-12 col-md-9 pl-md-3 pr-md-0">
        
              <div>
                
  <div class="section" id="spatial-regression">
<h1>Spatial Regression<a class="headerlink" href="#spatial-regression" title="Permalink to this headline">¶</a></h1>
<div class="section" id="introduction">
<h2>Introduction<a class="headerlink" href="#introduction" title="Permalink to this headline">¶</a></h2>
<div class="section" id="what-is-spatial-regression-and-why-should-i-care">
<h3><em>What</em> is spatial regression and <em>why</em> should I care?<a class="headerlink" href="#what-is-spatial-regression-and-why-should-i-care" title="Permalink to this headline">¶</a></h3>
<p>Regression (and prediction more generally) provides us a perfect case to examine how spatial structure can help us understand and analyze our data.
Usually, spatial structure helps models in one of two ways.
The first (and most clear) way space can have an impact on our data is when the process <em>generating</em> the data is itself explicitly spatial.
Here, think of something like the prices for single family homes.
It’s often the case that individuals pay a premium on their house price in order to live in a better school district for the same quality house.
Alternatively, homes closer to noise or chemical polluters like waste water treatment plants, recycling facilities, or wide highways, may actually be cheaper than we would otherwise anticipate.
Finally, in cases like asthma incidence, the locations individuals tend to travel to throughout the day, such as their places of work or recreation, may have more impact on their health than their residential addresses.
In this case, it may be necessary to use data <em>from other sites</em> to predict the asthma incidence at a given site.
Regardless of the specific case at play, here, <em>geography is a feature</em>: it directly helps us make predictions about outcomes <em>because those outcomes obtain from geographical processes</em>.</p>
<p>An alternative (and more skeptical) understanding reluctantly acknowledges geography’s instrumental value.
Often, in the analysis of predictive methods and classifiers, we are interested in analyzing what we get wrong.
This is common in econometrics; an analyst may be concerned that the model <em>systematically</em> mis-predicts some types of observations.
If we know our model routinely performs poorly on a known set of observations or type of input, we might make a better model if we can account for this.
Among other kinds of error diagnostics, geography provides us with an exceptionally-useful embedding to assess structure in our errors.
Mapping classification/prediction error can help show whether or not there are <em>clusters of error</em> in our data.
If we <em>know</em> that errors tend to be larger in some areas than in other areas (or if error is “contagious” between observations), then we might be able to exploit this structure to make better predictions.</p>
<p>Spatial structure in our errors might arise when geography <em>should be</em> an attribute somehow, but we are not sure exactly how to include it in our model.
It might also arise because there is some <em>other</em> feature whose omission causes the spatial patterns in the error we see; if this additional feature were included, the structure would disappear.
Or, it might arise from the complex interactions and interdependences between the features that we have chosen to use as predictors, resulting in intrinsic structure in mis-prediction.
Most of the predictors we use in models of social processes contain <em>embodied</em> spatial information: patterning intrinsic to the feature that we get for free in the model.
Whether we intend to or not, using a spatially-patterned predictor in a model can result in spatially-patterned errors; using more than one can amplify this effect.
Thus, <em>regardless of whether or not the true process is explicitly geographic</em>, additional information about the spatial relationships between our observations or more information about nearby sites can make our predictions better.</p>
</div>
<div class="section" id="the-data-san-diego-airbnb">
<h3>The Data: San Diego AirBnB<a class="headerlink" href="#the-data-san-diego-airbnb" title="Permalink to this headline">¶</a></h3>
<p>To learn a little more about how regression works, we’ll examine some information about AirBnB in San Diego, CA.
This dataset contains house intrinsic characteristics, both continuous (number of beds as in <code class="docutils literal notranslate"><span class="pre">beds</span></code>) and categorical (type of renting or, in AirBnb jargon, property group as in the series of <code class="docutils literal notranslate"><span class="pre">pg_X</span></code> binary variables), but also variables that explicitly refer to the location and spatial configuration of the dataset (e.g. distance to Balboa Park, <code class="docutils literal notranslate"><span class="pre">d2balboa</span></code>, or the neighborhood of the listing, <code class="docutils literal notranslate"><span class="pre">neighborhood</span></code>).</p>
<div class="cell docutils container">
<div class="cell_input docutils container">
<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="o">%</span><span class="k">matplotlib</span> inline

<span class="kn">from</span> <span class="nn">pysal.model</span> <span class="kn">import</span> <span class="n">spreg</span>
<span class="kn">from</span> <span class="nn">pysal.lib</span> <span class="kn">import</span> <span class="n">weights</span>
<span class="kn">from</span> <span class="nn">pysal.explore</span> <span class="kn">import</span> <span class="n">esda</span>
<span class="kn">from</span> <span class="nn">scipy</span> <span class="kn">import</span> <span class="n">stats</span>
<span class="kn">import</span> <span class="nn">statsmodels.formula.api</span> <span class="k">as</span> <span class="nn">sm</span>
<span class="kn">import</span> <span class="nn">numpy</span>
<span class="kn">import</span> <span class="nn">pandas</span>
<span class="kn">import</span> <span class="nn">geopandas</span>
<span class="kn">import</span> <span class="nn">matplotlib.pyplot</span> <span class="k">as</span> <span class="nn">plt</span>
<span class="kn">import</span> <span class="nn">seaborn</span>
</pre></div>
</div>
</div>
<div class="cell_output docutils container">
<div class="output stderr highlight-myst-ansi notranslate"><div class="highlight"><pre><span></span>/opt/conda/lib/python3.8/site-packages/spglm/utils.py:367: SyntaxWarning: &quot;is not&quot; with a literal. Did you mean &quot;!=&quot;?
  if resetlist is not ():
/opt/conda/lib/python3.8/site-packages/spvcm/utils.py:149: SyntaxWarning: &quot;is&quot; with a literal. Did you mean &quot;==&quot;?
  if np.isinf(ldet) or sgn is 0:
/opt/conda/lib/python3.8/site-packages/spvcm/abstracts.py:268: SyntaxWarning: &quot;is&quot; with a literal. Did you mean &quot;==&quot;?
  if chains is () and kwargs != dict():
/opt/conda/lib/python3.8/site-packages/spvcm/abstracts.py:270: SyntaxWarning: &quot;is not&quot; with a literal. Did you mean &quot;!=&quot;?
  if chains is not ():
/opt/conda/lib/python3.8/site-packages/spvcm/plotting.py:37: SyntaxWarning: &quot;is&quot; with a literal. Did you mean &quot;==&quot;?
  if thin is None or thin is 0:
</pre></div>
</div>
</div>
</div>
<div class="cell docutils container">
<div class="cell_input docutils container">
<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">db</span> <span class="o">=</span> <span class="n">geopandas</span><span class="o">.</span><span class="n">read_file</span><span class="p">(</span><span class="s1">&#39;../data/airbnb/regression_db.geojson&#39;</span><span class="p">)</span>
<span class="n">db</span><span class="o">.</span><span class="n">info</span><span class="p">()</span>
</pre></div>
</div>
</div>
<div class="cell_output docutils container">
<div class="output stream highlight-myst-ansi notranslate"><div class="highlight"><pre><span></span>&lt;class &#39;geopandas.geodataframe.GeoDataFrame&#39;&gt;
RangeIndex: 6110 entries, 0 to 6109
Data columns (total 20 columns):
 #   Column              Non-Null Count  Dtype   
---  ------              --------------  -----   
 0   accommodates        6110 non-null   int64   
 1   bathrooms           6110 non-null   float64 
 2   bedrooms            6110 non-null   float64 
 3   beds                6110 non-null   float64 
 4   neighborhood        6110 non-null   object  
 5   pool                6110 non-null   int64   
 6   d2balboa            6110 non-null   float64 
 7   coastal             6110 non-null   int64   
 8   price               6110 non-null   float64 
 9   log_price           6110 non-null   float64 
 10  id                  6110 non-null   int64   
 11  pg_Apartment        6110 non-null   int64   
 12  pg_Condominium      6110 non-null   int64   
 13  pg_House            6110 non-null   int64   
 14  pg_Other            6110 non-null   int64   
 15  pg_Townhouse        6110 non-null   int64   
 16  rt_Entire_home/apt  6110 non-null   int64   
 17  rt_Private_room     6110 non-null   int64   
 18  rt_Shared_room      6110 non-null   int64   
 19  geometry            6110 non-null   geometry
dtypes: float64(6), geometry(1), int64(12), object(1)
memory usage: 954.8+ KB
</pre></div>
</div>
</div>
</div>
<p>These are the explanatory variables we will use throughout the chapter.</p>
<div class="cell docutils container">
<div class="cell_input docutils container">
<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">variable_names</span> <span class="o">=</span> <span class="p">[</span><span class="s1">&#39;accommodates&#39;</span><span class="p">,</span> <span class="s1">&#39;bathrooms&#39;</span><span class="p">,</span> <span class="s1">&#39;bedrooms&#39;</span><span class="p">,</span> 
                  <span class="s1">&#39;beds&#39;</span><span class="p">,</span> <span class="s1">&#39;rt_Private_room&#39;</span><span class="p">,</span> <span class="s1">&#39;rt_Shared_room&#39;</span><span class="p">,</span>
                  <span class="s1">&#39;pg_Condominium&#39;</span><span class="p">,</span> <span class="s1">&#39;pg_House&#39;</span><span class="p">,</span> 
                  <span class="s1">&#39;pg_Other&#39;</span><span class="p">,</span> <span class="s1">&#39;pg_Townhouse&#39;</span><span class="p">]</span>
</pre></div>
</div>
</div>
</div>
</div>
</div>
<div class="section" id="non-spatial-regression-a-very-quick-refresh">
<h2>Non-spatial regression, a (very) quick refresh<a class="headerlink" href="#non-spatial-regression-a-very-quick-refresh" title="Permalink to this headline">¶</a></h2>
<p>Before we discuss how to explicitly include space into the linear regression framework, let us show how basic regression can be carried out in Python, and how one can begin to interpret the results. By no means is this a formal and complete introduction to regression so, if that is what you are looking for, we recommend <a class="reference internal" href="references.html">Gelman and Hill (2006)</a>, in particular chapters 3 and 4, which provide a fantastic, non-spatial introduction.</p>
<p>The core idea of linear regression is to explain the variation in a given (<em>dependent</em>) variable as a linear function of a collection of other (<em>explanatory</em>) variables. For example, in our case, we may want to express/explain the price of a house as a function of the number of bedrooms it has and whether it is a condominium or not. At the individual level, we can express this as:</p>
<div class="math notranslate nohighlight">
\[
P_i = \alpha + \sum_k \mathbf{X}_{ik}\beta_k  + \epsilon_i
\]</div>
<p>where <span class="math notranslate nohighlight">\(P_i\)</span> is the AirBnb price of house <span class="math notranslate nohighlight">\(i\)</span>, and <span class="math notranslate nohighlight">\(X\)</span> is a set of covariates that we use to explain such price. <span class="math notranslate nohighlight">\(\beta\)</span> is a vector of parameters that give us information about in which way and to what extent each variable is related to the price, and <span class="math notranslate nohighlight">\(\alpha\)</span>, the constant term, is the average house price when all the other variables are zero. The term <span class="math notranslate nohighlight">\(\epsilon_i\)</span> is usually referred to as “error” and captures elements that influence the price of a house but are not included in <span class="math notranslate nohighlight">\(X\)</span>. We can also express this relation in matrix form, excluding subindices for <span class="math notranslate nohighlight">\(i\)</span>, which yields:</p>
<div class="math notranslate nohighlight">
\[
P = \alpha + \mathbf{X}\beta + \epsilon
\]</div>
<p>A regression can be seen as a multivariate extension of bivariate correlations. Indeed, one way to interpret the <span class="math notranslate nohighlight">\(\beta_k\)</span> coefficients in the equation above is as the degree of correlation between the explanatory variable <span class="math notranslate nohighlight">\(k\)</span> and the dependent variable, <em>keeping all the other explanatory variables constant</em>. When one calculates bivariate correlations, the coefficient of a variable is picking up the correlation between the variables, but it is also subsuming into it variation associated with other correlated variables – also called confounding factors. Regression allows us to isolate the distinct effect that a single variable has on the dependent one, once we <em>control</em> for those other variables.</p>
<p>Practically speaking, linear regressions in Python are rather streamlined and easy to work with. There are also several packages which will run them (e.g. <code class="docutils literal notranslate"><span class="pre">statsmodels</span></code>, <code class="docutils literal notranslate"><span class="pre">scikit-learn</span></code>, <code class="docutils literal notranslate"><span class="pre">PySAL</span></code>). In the context of this chapter, it makes sense to start with <code class="docutils literal notranslate"><span class="pre">PySAL</span></code> as that is the only library that will allow us to move into explicitly spatial econometric models. To fit the model specified in the equation above with <span class="math notranslate nohighlight">\(X\)</span> as the list defined, we only need the following line of code:</p>
<div class="cell docutils container">
<div class="cell_input docutils container">
<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">m1</span> <span class="o">=</span> <span class="n">spreg</span><span class="o">.</span><span class="n">OLS</span><span class="p">(</span><span class="n">db</span><span class="p">[[</span><span class="s1">&#39;log_price&#39;</span><span class="p">]]</span><span class="o">.</span><span class="n">values</span><span class="p">,</span> <span class="n">db</span><span class="p">[</span><span class="n">variable_names</span><span class="p">]</span><span class="o">.</span><span class="n">values</span><span class="p">,</span>
                <span class="n">name_y</span><span class="o">=</span><span class="s1">&#39;log_price&#39;</span><span class="p">,</span> <span class="n">name_x</span><span class="o">=</span><span class="n">variable_names</span><span class="p">)</span>
</pre></div>
</div>
</div>
</div>
<p>We use the command <code class="docutils literal notranslate"><span class="pre">OLS</span></code>, part of the <code class="docutils literal notranslate"><span class="pre">spreg</span></code> sub-package, and specify the dependent variable (the log of the price, so we can interpret results in terms of percentage change) and the explanatory ones. Note that both objects need to be arrays, so we extract them from the <code class="docutils literal notranslate"><span class="pre">pandas.DataFrame</span></code> object using <code class="docutils literal notranslate"><span class="pre">.values</span></code>.</p>
<p>In order to inspect the results of the model, we can call <code class="docutils literal notranslate"><span class="pre">summary</span></code>:</p>
<div class="cell docutils container">
<div class="cell_input docutils container">
<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="nb">print</span><span class="p">(</span><span class="n">m1</span><span class="o">.</span><span class="n">summary</span><span class="p">)</span>
</pre></div>
</div>
</div>
<div class="cell_output docutils container">
<div class="output stream highlight-myst-ansi notranslate"><div class="highlight"><pre><span></span>REGRESSION
----------
SUMMARY OF OUTPUT: ORDINARY LEAST SQUARES
-----------------------------------------
Data set            :     unknown
Weights matrix      :        None
Dependent Variable  :   log_price                Number of Observations:        6110
Mean dependent var  :      4.9958                Number of Variables   :          11
S.D. dependent var  :      0.8072                Degrees of Freedom    :        6099
R-squared           :      0.6683
Adjusted R-squared  :      0.6678
Sum squared residual:    1320.148                F-statistic           :   1229.0564
Sigma-square        :       0.216                Prob(F-statistic)     :           0
S.E. of regression  :       0.465                Log likelihood        :   -3988.895
Sigma-square ML     :       0.216                Akaike info criterion :    7999.790
S.E of regression ML:      0.4648                Schwarz criterion     :    8073.685

------------------------------------------------------------------------------------
            Variable     Coefficient       Std.Error     t-Statistic     Probability
------------------------------------------------------------------------------------
            CONSTANT       4.3883830       0.0161147     272.3217773       0.0000000
        accommodates       0.0834523       0.0050781      16.4336318       0.0000000
           bathrooms       0.1923790       0.0109668      17.5419773       0.0000000
            bedrooms       0.1525221       0.0111323      13.7009195       0.0000000
                beds      -0.0417231       0.0069383      -6.0134430       0.0000000
     rt_Private_room      -0.5506868       0.0159046     -34.6244758       0.0000000
      rt_Shared_room      -1.2383055       0.0384329     -32.2198992       0.0000000
      pg_Condominium       0.1436347       0.0221499       6.4846529       0.0000000
            pg_House      -0.0104894       0.0145315      -0.7218393       0.4704209
            pg_Other       0.1411546       0.0228016       6.1905633       0.0000000
        pg_Townhouse      -0.0416702       0.0342758      -1.2157316       0.2241342
------------------------------------------------------------------------------------

REGRESSION DIAGNOSTICS
MULTICOLLINEARITY CONDITION NUMBER           11.964

TEST ON NORMALITY OF ERRORS
TEST                             DF        VALUE           PROB
Jarque-Bera                       2        2671.611           0.0000

DIAGNOSTICS FOR HETEROSKEDASTICITY
RANDOM COEFFICIENTS
TEST                             DF        VALUE           PROB
Breusch-Pagan test               10         322.532           0.0000
Koenker-Bassett test             10         135.581           0.0000
================================ END OF REPORT =====================================
</pre></div>
</div>
</div>
</div>
<p>A full detailed explanation of the output is beyond the scope of this chapter, so we will focus on the relevant bits for our main purpose. This is concentrated on the <code class="docutils literal notranslate"><span class="pre">Coefficients</span></code> section, which gives us the estimates for <span class="math notranslate nohighlight">\(\beta_k\)</span> in our model. In other words, these numbers express the relationship between each explanatory variable and the dependent one, once the effect of confounding factors has been accounted for. Keep in mind however that regression is no magic; we are only discounting the effect of confounding factors that we include in the model, not of <em>all</em> potentially confounding factors.</p>
<p>Results are largely as expected: houses tend to be significantly more expensive if they accommodate more people (<code class="docutils literal notranslate"><span class="pre">accommodates</span></code>), if they have more bathrooms and bedrooms, and if they are a condominium or part of the “other” category of house type. Conversely, given a number of rooms, houses with more beds (i.e., listings that are more “crowded”) tend to go for less, as is the case for properties where one does not rent the entire house but only a room (<code class="docutils literal notranslate"><span class="pre">rt_Private_room</span></code>) or even shares it (<code class="docutils literal notranslate"><span class="pre">rt_Shared_room</span></code>). Of course, you might conceptually doubt the assumption that it is possible to <em>arbitrarily</em> change the number of beds within an Airbnb without eventually changing the number of people it accommodates, but methods to address these concerns using <em>interaction effects</em> won’t be discussed here.</p>
<div class="section" id="hidden-structures">
<h3>Hidden Structures<a class="headerlink" href="#hidden-structures" title="Permalink to this headline">¶</a></h3>
<p>In general, our model performs well, being able to predict slightly more than 65% (<span class="math notranslate nohighlight">\(R^2=0.67\)</span>) of the variation in the nightly (log) price using the covariates we’ve discussed above.
But, our model might display some clustering in errors.
To interrogate this, we can do a few things.
One simple concept might be to look at the correlation between the error in predicting an airbnb and the error in predicting its nearest neighbor.
To examine this, we first might want to split our data up by regions and see if we’ve got some spatial structure in our residuals.
One reasonable theory might be that our model does not include any information about <em>beaches</em>, a critical aspect of why people live and vacation in San Diego.
Therefore, we might want to see whether or not our errors are higher or lower depending on whether or not an airbnb is in a “beach” neighborhood, a neighborhood near the ocean:</p>
<div class="cell docutils container">
<div class="cell_input docutils container">
<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">is_coastal</span> <span class="o">=</span> <span class="n">db</span><span class="o">.</span><span class="n">coastal</span><span class="o">.</span><span class="n">astype</span><span class="p">(</span><span class="nb">bool</span><span class="p">)</span>
<span class="n">coastal</span> <span class="o">=</span> <span class="n">m1</span><span class="o">.</span><span class="n">u</span><span class="p">[</span><span class="n">is_coastal</span><span class="p">]</span>
<span class="n">not_coastal</span> <span class="o">=</span> <span class="n">m1</span><span class="o">.</span><span class="n">u</span><span class="p">[</span><span class="o">~</span><span class="n">is_coastal</span><span class="p">]</span>
<span class="n">plt</span><span class="o">.</span><span class="n">hist</span><span class="p">(</span><span class="n">coastal</span><span class="p">,</span> <span class="n">density</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">label</span><span class="o">=</span><span class="s1">&#39;Coastal&#39;</span><span class="p">)</span>
<span class="n">plt</span><span class="o">.</span><span class="n">hist</span><span class="p">(</span><span class="n">not_coastal</span><span class="p">,</span> <span class="n">histtype</span><span class="o">=</span><span class="s1">&#39;step&#39;</span><span class="p">,</span>
         <span class="n">density</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">linewidth</span><span class="o">=</span><span class="mi">4</span><span class="p">,</span> <span class="n">label</span><span class="o">=</span><span class="s1">&#39;Not Coastal&#39;</span><span class="p">)</span>
<span class="n">plt</span><span class="o">.</span><span class="n">vlines</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span><span class="mi">0</span><span class="p">,</span><span class="mi">1</span><span class="p">,</span> <span class="n">linestyle</span><span class="o">=</span><span class="s2">&quot;:&quot;</span><span class="p">,</span> <span class="n">color</span><span class="o">=</span><span class="s1">&#39;k&#39;</span><span class="p">,</span> <span class="n">linewidth</span><span class="o">=</span><span class="mi">4</span><span class="p">)</span>
<span class="n">plt</span><span class="o">.</span><span class="n">legend</span><span class="p">()</span>
<span class="n">plt</span><span class="o">.</span><span class="n">show</span><span class="p">()</span>
</pre></div>
</div>
</div>
<div class="cell_output docutils container">
<img alt="../_images/11_regression_11_0.png" src="../_images/11_regression_11_0.png" />
</div>
</div>
<p>While it appears that the neighborhoods on the coast have only slightly higher average errors (and have lower variance in their prediction errors), the two distributions are significantly distinct from one another when compared using a classic <span class="math notranslate nohighlight">\(t\)</span> test:</p>
<div class="cell docutils container">
<div class="cell_input docutils container">
<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">stats</span><span class="o">.</span><span class="n">ttest_ind</span><span class="p">(</span><span class="n">coastal</span><span class="p">,</span> 
             <span class="n">not_coastal</span><span class="p">,</span>
<span class="c1">#             permutations=9999 not yet available in scipy</span>
             <span class="p">)</span>
</pre></div>
</div>
</div>
<div class="cell_output docutils container">
<div class="output text_plain highlight-myst-ansi notranslate"><div class="highlight"><pre><span></span>Ttest_indResult(statistic=array([13.98193858]), pvalue=array([9.442438e-44]))
</pre></div>
</div>
</div>
</div>
<p>There are more sophisticated (and harder to fool) tests that may be applicable for this data, however. We cover them in the <a class="reference external" href="#Challenge">Challenge</a> section.</p>
<p>Additionally, it might be the case that some neighborhoods are more desirable than other neighborhoods due to unmodeled latent preferences or marketing.
For instance, despite its proximity to the sea, living near Camp Pendleton — a Marine base in the north of the city — may incur some significant penalties on area desirability due to noise and pollution.
For us to determine whether this is the case, we might be interested in the full distribution of model residuals within each neighborhood.</p>
<p>To make this more clear, we’ll first sort the data by the median residual in that neighborhood, and then make a box plot, which shows the distribution of residuals in each neighborhood:</p>
<div class="cell docutils container">
<div class="cell_input docutils container">
<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">db</span><span class="p">[</span><span class="s1">&#39;residual&#39;</span><span class="p">]</span> <span class="o">=</span> <span class="n">m1</span><span class="o">.</span><span class="n">u</span>
<span class="n">medians</span> <span class="o">=</span> <span class="n">db</span><span class="o">.</span><span class="n">groupby</span><span class="p">(</span><span class="s2">&quot;neighborhood&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">residual</span><span class="o">.</span><span class="n">median</span><span class="p">()</span><span class="o">.</span><span class="n">to_frame</span><span class="p">(</span><span class="s1">&#39;hood_residual&#39;</span><span class="p">)</span>

<span class="n">f</span> <span class="o">=</span> <span class="n">plt</span><span class="o">.</span><span class="n">figure</span><span class="p">(</span><span class="n">figsize</span><span class="o">=</span><span class="p">(</span><span class="mi">15</span><span class="p">,</span><span class="mi">3</span><span class="p">))</span>
<span class="n">ax</span> <span class="o">=</span> <span class="n">plt</span><span class="o">.</span><span class="n">gca</span><span class="p">()</span>
<span class="n">seaborn</span><span class="o">.</span><span class="n">boxplot</span><span class="p">(</span><span class="s1">&#39;neighborhood&#39;</span><span class="p">,</span> <span class="s1">&#39;residual&#39;</span><span class="p">,</span> <span class="n">ax</span> <span class="o">=</span> <span class="n">ax</span><span class="p">,</span>
                <span class="n">data</span><span class="o">=</span><span class="n">db</span><span class="o">.</span><span class="n">merge</span><span class="p">(</span><span class="n">medians</span><span class="p">,</span> <span class="n">how</span><span class="o">=</span><span class="s1">&#39;left&#39;</span><span class="p">,</span>
                              <span class="n">left_on</span><span class="o">=</span><span class="s1">&#39;neighborhood&#39;</span><span class="p">,</span>
                              <span class="n">right_index</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
                   <span class="o">.</span><span class="n">sort_values</span><span class="p">(</span><span class="s1">&#39;hood_residual&#39;</span><span class="p">),</span> <span class="n">palette</span><span class="o">=</span><span class="s1">&#39;bwr&#39;</span><span class="p">)</span>
<span class="n">f</span><span class="o">.</span><span class="n">autofmt_xdate</span><span class="p">()</span>
<span class="n">plt</span><span class="o">.</span><span class="n">show</span><span class="p">()</span>
</pre></div>
</div>
</div>
<div class="cell_output docutils container">
<div class="output stderr highlight-myst-ansi notranslate"><div class="highlight"><pre><span></span>/opt/conda/lib/python3.8/site-packages/seaborn/_decorators.py:36: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
  warnings.warn(
</pre></div>
</div>
<img alt="../_images/11_regression_16_1.png" src="../_images/11_regression_16_1.png" />
</div>
</div>
<p>No neighborhood’s residuals are entirely disjoint from those of the others, but some do appear to be higher than others, such as the well-known downtown tourist neighborhoods of the Gaslamp Quarter, Little Italy, or The Core.
Thus, there may be a distinctive effect of intangible neighborhood fashionableness that matters in this model.</p>
<p>Noting that many of the most over- and under-predicted neighborhoods are near one another in the city, it may also be the case that there is some sort of <em>contagion</em> or spatial spillovers in the nightly rent price.
This often is apparent when individuals seek to price their airbnb listings to compete with similar nearby listings.
Since our model is not aware of this behavior, its errors may tend to cluster.
One exceptionally simple way we can look into this structure is by examining the relationship between an observation’s residuals and its surrounding residuals.</p>
<p>To do this, we will use <em>spatial weights</em> to represent the geographic relationships between observations.
We cover spatial weights in detail in another chapter, so we will not repeat ourselves here.
For this example, we’ll start off with a <span class="math notranslate nohighlight">\(KNN\)</span> matrix where <span class="math notranslate nohighlight">\(k=1\)</span>, meaning we’re focusing only on the linkages of each airbnb to their closest other listing.</p>
<div class="cell docutils container">
<div class="cell_input docutils container">
<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">knn</span> <span class="o">=</span> <span class="n">weights</span><span class="o">.</span><span class="n">KNN</span><span class="o">.</span><span class="n">from_dataframe</span><span class="p">(</span><span class="n">db</span><span class="p">,</span> <span class="n">k</span><span class="o">=</span><span class="mi">1</span><span class="p">)</span>
</pre></div>
</div>
</div>
<div class="cell_output docutils container">
<div class="output stderr highlight-myst-ansi notranslate"><div class="highlight"><pre><span></span>/opt/conda/lib/python3.8/site-packages/libpysal/weights/weights.py:172: UserWarning: The weights matrix is not fully connected: 
 There are 1849 disconnected components.
  warnings.warn(message)
</pre></div>
</div>
</div>
</div>
<p>This means that, when we compute the <em>spatial lag</em> of that knn weight and the residual, we get the residual of the airbnb listing closest to each observation.</p>
<div class="cell docutils container">
<div class="cell_input docutils container">
<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">lag_residual</span> <span class="o">=</span> <span class="n">weights</span><span class="o">.</span><span class="n">spatial_lag</span><span class="o">.</span><span class="n">lag_spatial</span><span class="p">(</span><span class="n">knn</span><span class="p">,</span> <span class="n">m1</span><span class="o">.</span><span class="n">u</span><span class="p">)</span>
<span class="n">ax</span> <span class="o">=</span> <span class="n">seaborn</span><span class="o">.</span><span class="n">regplot</span><span class="p">(</span><span class="n">m1</span><span class="o">.</span><span class="n">u</span><span class="o">.</span><span class="n">flatten</span><span class="p">(),</span> <span class="n">lag_residual</span><span class="o">.</span><span class="n">flatten</span><span class="p">(),</span> 
                     <span class="n">line_kws</span><span class="o">=</span><span class="nb">dict</span><span class="p">(</span><span class="n">color</span><span class="o">=</span><span class="s1">&#39;orangered&#39;</span><span class="p">),</span>
                     <span class="n">ci</span><span class="o">=</span><span class="kc">None</span><span class="p">)</span>
<span class="n">ax</span><span class="o">.</span><span class="n">set_xlabel</span><span class="p">(</span><span class="s1">&#39;Model Residuals - $u$&#39;</span><span class="p">)</span>
<span class="n">ax</span><span class="o">.</span><span class="n">set_ylabel</span><span class="p">(</span><span class="s1">&#39;Spatial Lag of Model Residuals - $W u$&#39;</span><span class="p">);</span>
</pre></div>
</div>
</div>
<div class="cell_output docutils container">
<div class="output stderr highlight-myst-ansi notranslate"><div class="highlight"><pre><span></span>/opt/conda/lib/python3.8/site-packages/seaborn/_decorators.py:36: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
  warnings.warn(
</pre></div>
</div>
<img alt="../_images/11_regression_20_1.png" src="../_images/11_regression_20_1.png" />
</div>
</div>
<p>In this plot, we see that our prediction errors tend to cluster!
Above, we show the relationship between our prediction error at each site and the prediction error at the site nearest to it.
Here, we’re using this nearest site to stand in for the <em>surroundings</em> of that Airbnb.
This means that, when the model tends to overpredict a given Airbnb’s nightly log price, sites around that Airbnb are more likely to <em>also be overpredicted</em>.</p>
<p>An interesting property of this relationship is that it tends to stabilize as the number of nearest neighbors used to construct each Airbnb’s surroundings increases.
Consult the <a class="reference external" href="#Challenge">Challenge</a> section for more on this property.</p>
<p>Given this behavior, let’s look at the stable <span class="math notranslate nohighlight">\(k=20\)</span> number of neighbors.
Examining the relationship between this stable <em>surrounding</em> average and the focal Airbnb, we can even find clusters in our model error.
Recalling the <em>local Moran</em> statistics, we can identify certain areas where our predictions of the nightly (log) Airbnb price tend to be significantly off:</p>
<div class="cell docutils container">
<div class="cell_input docutils container">
<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">knn</span><span class="o">.</span><span class="n">reweight</span><span class="p">(</span><span class="n">k</span><span class="o">=</span><span class="mi">20</span><span class="p">,</span> <span class="n">inplace</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
<span class="n">outliers</span> <span class="o">=</span> <span class="n">esda</span><span class="o">.</span><span class="n">moran</span><span class="o">.</span><span class="n">Moran_Local</span><span class="p">(</span><span class="n">m1</span><span class="o">.</span><span class="n">u</span><span class="p">,</span> <span class="n">knn</span><span class="p">,</span> <span class="n">permutations</span><span class="o">=</span><span class="mi">9999</span><span class="p">)</span>
<span class="n">error_clusters</span> <span class="o">=</span> <span class="p">(</span><span class="n">outliers</span><span class="o">.</span><span class="n">q</span> <span class="o">%</span> <span class="mi">2</span> <span class="o">==</span> <span class="mi">1</span><span class="p">)</span> <span class="c1"># only the cluster cores</span>
<span class="n">error_clusters</span> <span class="o">&amp;=</span> <span class="p">(</span><span class="n">outliers</span><span class="o">.</span><span class="n">p_sim</span> <span class="o">&lt;=</span> <span class="o">.</span><span class="mi">001</span><span class="p">)</span> <span class="c1"># filtering out non-significant clusters</span>
<span class="n">db</span><span class="o">.</span><span class="n">assign</span><span class="p">(</span><span class="n">error_clusters</span> <span class="o">=</span> <span class="n">error_clusters</span><span class="p">,</span>
          <span class="n">local_I</span> <span class="o">=</span> <span class="n">outliers</span><span class="o">.</span><span class="n">Is</span><span class="p">)</span>\
  <span class="o">.</span><span class="n">query</span><span class="p">(</span><span class="s2">&quot;error_clusters&quot;</span><span class="p">)</span>\
  <span class="o">.</span><span class="n">sort_values</span><span class="p">(</span><span class="s1">&#39;local_I&#39;</span><span class="p">)</span>\
  <span class="o">.</span><span class="n">plot</span><span class="p">(</span><span class="s1">&#39;local_I&#39;</span><span class="p">,</span> <span class="n">cmap</span><span class="o">=</span><span class="s1">&#39;bwr&#39;</span><span class="p">,</span> <span class="n">marker</span><span class="o">=</span><span class="s1">&#39;.&#39;</span><span class="p">);</span>
</pre></div>
</div>
</div>
<div class="cell_output docutils container">
<div class="output stderr highlight-myst-ansi notranslate"><div class="highlight"><pre><span></span>/opt/conda/lib/python3.8/site-packages/libpysal/weights/weights.py:172: UserWarning: The weights matrix is not fully connected: 
 There are 3 disconnected components.
  warnings.warn(message)
</pre></div>
</div>
<img alt="../_images/11_regression_23_1.png" src="../_images/11_regression_23_1.png" />
</div>
</div>
<p>Thus, these areas tend to be locations where our model significantly mis-predicts (either under- or over-predicting) the nightly airbnb price both for that specific observation and observations in its immediate surroundings.
This is critical since, if we can identify how these areas are structured — if they have a <em>consistent geography</em> that we can model — then we might make our predictions even better, or at least not systematically mis-predict prices in some areas while correctly predicting prices in other areas.</p>
<p>Since significant under- and over-predictions do appear to cluster in a highly structured way, we might be able to use a better model to fix the geography of our model errors.</p>
</div>
</div>
<div class="section" id="bringing-space-into-the-regression-framework">
<h2>Bringing space into the regression framework<a class="headerlink" href="#bringing-space-into-the-regression-framework" title="Permalink to this headline">¶</a></h2>
<p>There are many different ways that spatial structure shows up in our models, predictions, and our data, even if we do not explicitly intend to study it.
Fortunately, there are nearly as many techniques, called <em>spatial regression</em> methods, that are designed to handle these sorts of structures.
Spatial regression is about <em>explicitly</em> introducing space or geographical context into the statistical framework of a regression.
Conceptually, we want to introduce space into our model whenever we think it plays an important role in the process we are interested in, or when space can act as a reasonable proxy for other factors we cannot but should include in our model.
As an example of the former, we can imagine how houses at the seafront are probably more expensive than those in the second row, given their better views.
To illustrate the latter, we can think of how the character of a neighborhood is important in determining the price of a house; however, it is very hard to identify and quantify “character” <em>per se,</em> although it might be easier to get at its spatial variation, hence a case of space as a proxy.</p>
<p>Spatial regression is a large field of development in the econometrics and statistics literatures.
In this brief introduction, we will consider two related but very different processes that give rise to spatial effects: spatial heterogeneity and spatial dependence.
For more rigorous treatments of the topics introduced here, the reader is
referred to <span id="id4">[Anselin_2003, Anselin_2014, Gelman_2006]</span>.</p>
<div class="section" id="spatial-feature-engineering">
<h3>Spatial Feature Engineering<a class="headerlink" href="#spatial-feature-engineering" title="Permalink to this headline">¶</a></h3>
<p>Using geographic information to “construct” new data is a common approach to bring in spatial information into geographic analysis.
Often, this reflects the fact that processes are not the same everywhere in the map of analysis, or that geographical information may be useful to predict our outcome of interest. In this section, we will briefly present how to use <em>spatial features</em>, or <span class="math notranslate nohighlight">\(X\)</span> variables that are constructed from geographical relationships, in a standard linear model. We discuss spatial feature engineering extensively in the next chapter, though, and the depth and extent of spatial feature engineering are difficult to overstate. Here, we will consider only the simplest of spatial features: proximity variables.</p>
<div class="section" id="proximity-variables">
<h4>Proximity variables<a class="headerlink" href="#proximity-variables" title="Permalink to this headline">¶</a></h4>
<p>For a start, one relevant proximity-driven variable that could influence our model is based on the listing’s proximity to Balboa Park. A common tourist destination, Balboa Park is a central recreation hub for the city of San Diego, containing many museums and the San Diego Zoo. Thus, it could be the case that people searching for Airbnbs in San Diego are willing to pay a premium to live closer to the park. If this were true <em>and</em> we omitted this from our model, we may indeed see a significant spatial pattern caused by this distance decay effect.</p>
<p>This is sometimes called a <em>spatially-patterned omitted covariate</em>: geographic information our model needs to make good predictions but which we have left out of our model. Therefore, let’s build a new model containing this distance-to-Balboa-Park covariate. First, though, it helps to visualize the structure of this distance covariate itself:</p>
<div class="cell docutils container">
<div class="cell_input docutils container">
<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">db</span><span class="o">.</span><span class="n">plot</span><span class="p">(</span><span class="s1">&#39;d2balboa&#39;</span><span class="p">,</span> <span class="n">marker</span><span class="o">=</span><span class="s1">&#39;.&#39;</span><span class="p">,</span> <span class="n">s</span><span class="o">=</span><span class="mi">5</span><span class="p">)</span>
</pre></div>
</div>
</div>
<div class="cell_output docutils container">
<div class="output text_plain highlight-myst-ansi notranslate"><div class="highlight"><pre><span></span>&lt;AxesSubplot:&gt;
</pre></div>
</div>
<img alt="../_images/11_regression_26_1.png" src="../_images/11_regression_26_1.png" />
</div>
</div>
<div class="cell docutils container">
<div class="cell_input docutils container">
<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">base_names</span> <span class="o">=</span> <span class="n">variable_names</span>
<span class="n">balboa_names</span> <span class="o">=</span> <span class="n">variable_names</span> <span class="o">+</span> <span class="p">[</span><span class="s1">&#39;d2balboa&#39;</span><span class="p">]</span>
</pre></div>
</div>
</div>
</div>
<div class="cell docutils container">
<div class="cell_input docutils container">
<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">m2</span> <span class="o">=</span> <span class="n">spreg</span><span class="o">.</span><span class="n">OLS</span><span class="p">(</span><span class="n">db</span><span class="p">[[</span><span class="s1">&#39;log_price&#39;</span><span class="p">]]</span><span class="o">.</span><span class="n">values</span><span class="p">,</span> <span class="n">db</span><span class="p">[</span><span class="n">balboa_names</span><span class="p">]</span><span class="o">.</span><span class="n">values</span><span class="p">,</span> 
                  <span class="n">name_y</span> <span class="o">=</span> <span class="s1">&#39;log_price&#39;</span><span class="p">,</span> <span class="n">name_x</span> <span class="o">=</span> <span class="n">balboa_names</span><span class="p">)</span>
</pre></div>
</div>
</div>
</div>
<p>Unfortunately, when you inspect the regression diagnostics and output, you see that this covariate is not quite as helpful as we might anticipate. It is not statistically significant at conventional significance levels, and the model fit does not substantially change:</p>
<div class="cell docutils container">
<div class="cell_input docutils container">
<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="nb">print</span><span class="p">(</span><span class="n">m2</span><span class="o">.</span><span class="n">summary</span><span class="p">)</span>
</pre></div>
</div>
</div>
<div class="cell_output docutils container">
<div class="output stream highlight-myst-ansi notranslate"><div class="highlight"><pre><span></span>REGRESSION
----------
SUMMARY OF OUTPUT: ORDINARY LEAST SQUARES
-----------------------------------------
Data set            :     unknown
Weights matrix      :        None
Dependent Variable  :   log_price                Number of Observations:        6110
Mean dependent var  :      4.9958                Number of Variables   :          12
S.D. dependent var  :      0.8072                Degrees of Freedom    :        6098
R-squared           :      0.6685
Adjusted R-squared  :      0.6679
Sum squared residual:    1319.522                F-statistic           :   1117.9338
Sigma-square        :       0.216                Prob(F-statistic)     :           0
S.E. of regression  :       0.465                Log likelihood        :   -3987.446
Sigma-square ML     :       0.216                Akaike info criterion :    7998.892
S.E of regression ML:      0.4647                Schwarz criterion     :    8079.504

------------------------------------------------------------------------------------
            Variable     Coefficient       Std.Error     t-Statistic     Probability
------------------------------------------------------------------------------------
            CONSTANT       4.3796237       0.0169152     258.9162210       0.0000000
        accommodates       0.0836436       0.0050786      16.4698200       0.0000000
           bathrooms       0.1907912       0.0110047      17.3371724       0.0000000
            bedrooms       0.1507462       0.0111794      13.4842887       0.0000000
                beds      -0.0414762       0.0069387      -5.9774814       0.0000000
     rt_Private_room      -0.5529958       0.0159599     -34.6490178       0.0000000
      rt_Shared_room      -1.2355206       0.0384618     -32.1232754       0.0000000
      pg_Condominium       0.1404588       0.0222251       6.3198282       0.0000000
            pg_House      -0.0133019       0.0146230      -0.9096565       0.3630396
            pg_Other       0.1411756       0.0227980       6.1924442       0.0000000
        pg_Townhouse      -0.0457839       0.0343557      -1.3326417       0.1826992
            d2balboa       0.0016453       0.0009673       1.7008587       0.0890205
------------------------------------------------------------------------------------

REGRESSION DIAGNOSTICS
MULTICOLLINEARITY CONDITION NUMBER           12.745

TEST ON NORMALITY OF ERRORS
TEST                             DF        VALUE           PROB
Jarque-Bera                       2        2710.322           0.0000

DIAGNOSTICS FOR HETEROSKEDASTICITY
RANDOM COEFFICIENTS
TEST                             DF        VALUE           PROB
Breusch-Pagan test               11         317.519           0.0000
Koenker-Bassett test             11         132.860           0.0000
================================ END OF REPORT =====================================
</pre></div>
</div>
</div>
</div>
<p>And, there still appears to be spatial structure in our model’s errors:</p>
<div class="cell docutils container">
<div class="cell_input docutils container">
<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">lag_residual</span> <span class="o">=</span> <span class="n">weights</span><span class="o">.</span><span class="n">spatial_lag</span><span class="o">.</span><span class="n">lag_spatial</span><span class="p">(</span><span class="n">knn</span><span class="p">,</span> <span class="n">m2</span><span class="o">.</span><span class="n">u</span><span class="p">)</span>
<span class="n">seaborn</span><span class="o">.</span><span class="n">regplot</span><span class="p">(</span><span class="n">m2</span><span class="o">.</span><span class="n">u</span><span class="o">.</span><span class="n">flatten</span><span class="p">(),</span> <span class="n">lag_residual</span><span class="o">.</span><span class="n">flatten</span><span class="p">(),</span> 
                <span class="n">line_kws</span><span class="o">=</span><span class="nb">dict</span><span class="p">(</span><span class="n">color</span><span class="o">=</span><span class="s1">&#39;orangered&#39;</span><span class="p">),</span>
                <span class="n">ci</span><span class="o">=</span><span class="kc">None</span><span class="p">);</span>
</pre></div>
</div>
</div>
<div class="cell_output docutils container">
<div class="output stderr highlight-myst-ansi notranslate"><div class="highlight"><pre><span></span>/opt/conda/lib/python3.8/site-packages/seaborn/_decorators.py:36: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
  warnings.warn(
</pre></div>
</div>
<img alt="../_images/11_regression_32_1.png" src="../_images/11_regression_32_1.png" />
</div>
</div>
<p>Finally, the distance to Balboa Park variable does not fit our theory about how distance to an amenity should affect the price of an Airbnb; the coefficient estimate is <em>positive</em>, meaning that people are paying a premium to be <em>further</em> from the Park. We will revisit this result later on, when we consider spatial heterogeneity and will be able to shed some light on this. Further, the next chapter is an extensive treatment of spatial fixed effects, presenting many more spatial feature engineering methods. Here, we have only shown how to include these engineered features in a standard linear modelling framework.</p>
</div>
</div>
<div class="section" id="spatial-heterogeneity">
<h3>Spatial Heterogeneity<a class="headerlink" href="#spatial-heterogeneity" title="Permalink to this headline">¶</a></h3>
<p>So far, we’ve assumed that our proximity variable might stand in for a difficult-to-measure premium individuals pay when they’re close to a recreational zone. However, not all neighborhoods are created equal; some neighborhoods may be more lucrative than others, regardless of their proximity to Balboa Park. When this is the case, we need some way to account for the fact that each neighborhood may experience these kinds of <em>gestalt</em>, unique effects. One way to do this is by capturing <em>spatial heterogeneity</em>. At its most basic, <em>spatial heterogeneity</em> means that parts of the model may change in different places. For example, changes to the intercept, <span class="math notranslate nohighlight">\(\alpha\)</span>, may reflect the fact that different areas have different baseline exposures to a given process. Changes to the slope terms, <span class="math notranslate nohighlight">\(\beta\)</span>, may indicate some kind of geographical mediating factor, such as when a governmental policy is not consistently applied across jurisdictions. Finally, changes to the variance of the residuals, commonly denoted <span class="math notranslate nohighlight">\(\sigma^2\)</span>, can introduce spatial heteroskedasticity. We deal with the first two in this section.</p>
<p>To illustrate spatial fixed effects, let us consider the house price example from the previous section to introduce a more general illustration for “space as a proxy”. Given we are only including a limited set of explanatory variables in the model, it is likely we are missing some important factors that play a role in determining the price at which a house is sold. Some of them, however, are likely to vary systematically over space (e.g. different neighborhood characteristics). If that is the case, we can control for those unobserved factors by using traditional dummy variables but basing their creation on a spatial rule. For example, let us include a binary variable for every neighborhood, indicating whether a given house is located within that area (<code class="docutils literal notranslate"><span class="pre">1</span></code>) or not (<code class="docutils literal notranslate"><span class="pre">0</span></code>). Mathematically, we are now fitting the following equation:</p>
<div class="math notranslate nohighlight">
\[
\log{P_i} = \alpha_r + \sum_k \mathbf{X}_{ik}\beta_k  + \epsilon_i
\]</div>
<p>where the main difference is that we are now allowing the constant term, <span class="math notranslate nohighlight">\(\alpha\)</span>, to vary by neighbourhood <span class="math notranslate nohighlight">\(r\)</span>, <span class="math notranslate nohighlight">\(\alpha_r\)</span>.</p>
<p>Programmatically, we will show two different ways we can estimate this: one,
using <code class="docutils literal notranslate"><span class="pre">statsmodels</span></code>; and two, with <code class="docutils literal notranslate"><span class="pre">PySAL</span></code>. First, we will use <code class="docutils literal notranslate"><span class="pre">statsmodels</span></code>. This package provides a formula-like API, which allows us to express the <em>equation</em> we wish to estimate directly:</p>
<div class="cell docutils container">
<div class="cell_input docutils container">
<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">f</span> <span class="o">=</span> <span class="s1">&#39;log_price ~ &#39;</span> <span class="o">+</span> <span class="s1">&#39; + &#39;</span><span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="n">variable_names</span><span class="p">)</span> <span class="o">+</span> <span class="s1">&#39; + neighborhood - 1&#39;</span>
<span class="nb">print</span><span class="p">(</span><span class="n">f</span><span class="p">)</span>
</pre></div>
</div>
</div>
<div class="cell_output docutils container">
<div class="output stream highlight-myst-ansi notranslate"><div class="highlight"><pre><span></span>log_price ~ accommodates + bathrooms + bedrooms + beds + rt_Private_room + rt_Shared_room + pg_Condominium + pg_House + pg_Other + pg_Townhouse + neighborhood - 1
</pre></div>
</div>
</div>
</div>
<p>The <em>tilde</em> operator in this statement is usually read as “log price is a function of …”, to account for the fact that many different model specifications can be fit according to that functional relationship between <code class="docutils literal notranslate"><span class="pre">log_price</span></code> and our covariate list. Critically, note that the trailing <code class="docutils literal notranslate"><span class="pre">-1</span></code> term means that we are fitting this model without an intercept term. This is necessary, since including an intercept term alongside unique means for every neighborhood would make the underlying system of equations underspecified.</p>
<p>Using this expression, we can estimate the unique effects of each neighborhood, fitting the model in <code class="docutils literal notranslate"><span class="pre">statsmodels</span></code>:</p>
<div class="cell docutils container">
<div class="cell_input docutils container">
<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">m3</span> <span class="o">=</span> <span class="n">sm</span><span class="o">.</span><span class="n">ols</span><span class="p">(</span><span class="n">f</span><span class="p">,</span> <span class="n">data</span><span class="o">=</span><span class="n">db</span><span class="p">)</span><span class="o">.</span><span class="n">fit</span><span class="p">()</span>
<span class="nb">print</span><span class="p">(</span><span class="n">m3</span><span class="o">.</span><span class="n">summary2</span><span class="p">())</span>
</pre></div>
</div>
</div>
<div class="cell_output docutils container">
<div class="output stream highlight-myst-ansi notranslate"><div class="highlight"><pre><span></span>                           Results: Ordinary least squares
======================================================================================
Model:                      OLS                    Adj. R-squared:           0.709    
Dependent Variable:         log_price              AIC:                      7229.6640
Date:                       2021-03-17 22:29       BIC:                      7599.1365
No. Observations:           6110                   Log-Likelihood:           -3559.8  
Df Model:                   54                     F-statistic:              276.9    
Df Residuals:               6055                   Prob (F-statistic):       0.00     
R-squared:                  0.712                  Scale:                    0.18946  
--------------------------------------------------------------------------------------
                                       Coef.  Std.Err.    t     P&gt;|t|   [0.025  0.975]
--------------------------------------------------------------------------------------
neighborhood[Balboa Park]              4.2808   0.0333 128.5836 0.0000  4.2155  4.3460
neighborhood[Bay Ho]                   4.1983   0.0769  54.6089 0.0000  4.0475  4.3490
neighborhood[Bay Park]                 4.3292   0.0510  84.9084 0.0000  4.2293  4.4292
neighborhood[Carmel Valley]            4.3893   0.0566  77.6126 0.0000  4.2784  4.5001
neighborhood[City Heights West]        4.0535   0.0584  69.4358 0.0000  3.9391  4.1680
neighborhood[Clairemont Mesa]          4.0953   0.0477  85.8559 0.0000  4.0018  4.1888
neighborhood[College Area]             4.0337   0.0583  69.2386 0.0000  3.9195  4.1479
neighborhood[Core]                     4.7262   0.0526  89.7775 0.0000  4.6230  4.8294
neighborhood[Cortez Hill]              4.6081   0.0515  89.4322 0.0000  4.5071  4.7091
neighborhood[Del Mar Heights]          4.4969   0.0543  82.7599 0.0000  4.3904  4.6034
neighborhood[East Village]             4.5455   0.0294 154.7473 0.0000  4.4879  4.6031
neighborhood[Gaslamp Quarter]          4.7758   0.0473 100.9589 0.0000  4.6831  4.8685
neighborhood[Grant Hill]               4.3067   0.0524  82.2442 0.0000  4.2041  4.4094
neighborhood[Grantville]               4.0533   0.0714  56.7719 0.0000  3.9133  4.1933
neighborhood[Kensington]               4.3027   0.0772  55.7511 0.0000  4.1514  4.4540
neighborhood[La Jolla]                 4.6821   0.0258 181.4137 0.0000  4.6315  4.7327
neighborhood[La Jolla Village]         4.3303   0.0772  56.0653 0.0000  4.1789  4.4817
neighborhood[Linda Vista]              4.1911   0.0569  73.6380 0.0000  4.0796  4.3027
neighborhood[Little Italy]             4.6667   0.0468  99.6364 0.0000  4.5749  4.7586
neighborhood[Loma Portal]              4.3019   0.0332 129.4346 0.0000  4.2368  4.3671
neighborhood[Marina]                   4.5583   0.0480  94.9761 0.0000  4.4642  4.6524
neighborhood[Midtown]                  4.3667   0.0284 153.7902 0.0000  4.3110  4.4223
neighborhood[Midtown District]         4.5849   0.0651  70.4436 0.0000  4.4573  4.7125
neighborhood[Mira Mesa]                3.9896   0.0561  71.1135 0.0000  3.8796  4.0995
neighborhood[Mission Bay]              4.5155   0.0224 201.3850 0.0000  4.4715  4.5594
neighborhood[Mission Valley]           4.2760   0.0742  57.6031 0.0000  4.1304  4.4215
neighborhood[Moreno Mission]           4.4009   0.0567  77.5773 0.0000  4.2897  4.5122
neighborhood[Normal Heights]           4.0974   0.0490  83.5821 0.0000  4.0013  4.1935
neighborhood[North Clairemont]         3.9844   0.0691  57.6209 0.0000  3.8489  4.1200
neighborhood[North Hills]              4.2534   0.0255 166.9470 0.0000  4.2035  4.3034
neighborhood[Northwest]                4.1738   0.0697  59.8572 0.0000  4.0371  4.3104
neighborhood[Ocean Beach]              4.4372   0.0301 147.4709 0.0000  4.3782  4.4961
neighborhood[Old Town]                 4.4202   0.0419 105.5098 0.0000  4.3380  4.5023
neighborhood[Otay Ranch]               4.1859   0.0816  51.2999 0.0000  4.0260  4.3459
neighborhood[Pacific Beach]            4.4388   0.0224 198.0136 0.0000  4.3949  4.4828
neighborhood[Park West]                4.4409   0.0448  99.1988 0.0000  4.3531  4.5287
neighborhood[Rancho Bernadino]         4.1809   0.0720  58.0598 0.0000  4.0397  4.3221
neighborhood[Rancho Penasquitos]       4.1624   0.0618  67.3789 0.0000  4.0413  4.2835
neighborhood[Roseville]                4.3870   0.0586  74.8346 0.0000  4.2721  4.5019
neighborhood[San Carlos]               4.3350   0.0830  52.2035 0.0000  4.1722  4.4978
neighborhood[Scripps Ranch]            4.0824   0.0762  53.5440 0.0000  3.9329  4.2318
neighborhood[Serra Mesa]               4.3130   0.0599  71.9725 0.0000  4.1955  4.4304
neighborhood[South Park]               4.2253   0.0536  78.7676 0.0000  4.1202  4.3305
neighborhood[University City]          4.1937   0.0370 113.4516 0.0000  4.1213  4.2662
neighborhood[West University Heights]  4.2977   0.0431  99.6359 0.0000  4.2131  4.3822
accommodates                           0.0728   0.0048  15.0672 0.0000  0.0633  0.0822
bathrooms                              0.1702   0.0105  16.2171 0.0000  0.1496  0.1908
bedrooms                               0.1686   0.0106  15.8731 0.0000  0.1478  0.1894
beds                                  -0.0416   0.0065  -6.3508 0.0000 -0.0544 -0.0287
rt_Private_room                       -0.4873   0.0154 -31.6225 0.0000 -0.5175 -0.4570
rt_Shared_room                        -1.2396   0.0368 -33.6657 0.0000 -1.3118 -1.1674
pg_Condominium                         0.1329   0.0210   6.3333 0.0000  0.0918  0.1741
pg_House                               0.0400   0.0144   2.7868 0.0053  0.0119  0.0681
pg_Other                               0.0610   0.0224   2.7290 0.0064  0.0172  0.1048
pg_Townhouse                          -0.0075   0.0324  -0.2323 0.8163 -0.0710  0.0560
--------------------------------------------------------------------------------------
Omnibus:                   1215.551             Durbin-Watson:                1.835   
Prob(Omnibus):             0.000                Jarque-Bera (JB):             4115.510
Skew:                      0.989                Prob(JB):                     0.000   
Kurtosis:                  6.500                Condition No.:                132     
======================================================================================
</pre></div>
</div>
</div>
</div>
<p>The approach above shows how spatial FE are a particular case of a linear regression with a categorical  variable. Neighborhood membership is modeled using binary dummy variables. Thanks to the formula grammar used in <code class="docutils literal notranslate"><span class="pre">statsmodels</span></code>, we can express the model abstractly, and Python parses it, appropriately creating binary variables as required.</p>
<p>The second approach leverages <code class="docutils literal notranslate"><span class="pre">PySAL</span></code> Regimes functionality, which allows the user to specify which variables are to be estimated separately for each “regime”. In this case however, instead of describing the model in a formula, we need to pass each element of the model as separate arguments.</p>
<div class="cell docutils container">
<div class="cell_input docutils container">
<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="c1"># PySAL implementation</span>
<span class="n">m4</span> <span class="o">=</span> <span class="n">spreg</span><span class="o">.</span><span class="n">OLS_Regimes</span><span class="p">(</span><span class="n">db</span><span class="p">[[</span><span class="s1">&#39;log_price&#39;</span><span class="p">]]</span><span class="o">.</span><span class="n">values</span><span class="p">,</span> <span class="n">db</span><span class="p">[</span><span class="n">variable_names</span><span class="p">]</span><span class="o">.</span><span class="n">values</span><span class="p">,</span>
                       <span class="n">db</span><span class="p">[</span><span class="s1">&#39;neighborhood&#39;</span><span class="p">]</span><span class="o">.</span><span class="n">tolist</span><span class="p">(),</span>
                       <span class="n">constant_regi</span><span class="o">=</span><span class="s1">&#39;many&#39;</span><span class="p">,</span> <span class="n">cols2regi</span><span class="o">=</span><span class="p">[</span><span class="kc">False</span><span class="p">]</span><span class="o">*</span><span class="nb">len</span><span class="p">(</span><span class="n">variable_names</span><span class="p">),</span>
                       <span class="n">regime_err_sep</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span>
                       <span class="n">name_y</span><span class="o">=</span><span class="s1">&#39;log_price&#39;</span><span class="p">,</span> <span class="n">name_x</span><span class="o">=</span><span class="n">variable_names</span><span class="p">)</span>
<span class="nb">print</span><span class="p">(</span><span class="n">m4</span><span class="o">.</span><span class="n">summary</span><span class="p">)</span>
</pre></div>
</div>
</div>
<div class="cell_output docutils container">
<div class="output stream highlight-myst-ansi notranslate"><div class="highlight"><pre><span></span>REGRESSION
----------
SUMMARY OF OUTPUT: ORDINARY LEAST SQUARES - REGIMES
---------------------------------------------------
Data set            :     unknown
Weights matrix      :        None
Dependent Variable  :   log_price                Number of Observations:        6110
Mean dependent var  :      4.9958                Number of Variables   :          55
S.D. dependent var  :      0.8072                Degrees of Freedom    :        6055
R-squared           :      0.7118
Adjusted R-squared  :      0.7092
Sum squared residual:    1147.169                F-statistic           :    276.9408
Sigma-square        :       0.189                Prob(F-statistic)     :           0
S.E. of regression  :       0.435                Log likelihood        :   -3559.832
Sigma-square ML     :       0.188                Akaike info criterion :    7229.664
S.E of regression ML:      0.4333                Schwarz criterion     :    7599.137

------------------------------------------------------------------------------------
            Variable     Coefficient       Std.Error     t-Statistic     Probability
------------------------------------------------------------------------------------
Balboa Park_CONSTANT       4.2807664       0.0332917     128.5835585       0.0000000
     Bay Ho_CONSTANT       4.1982505       0.0768784      54.6089479       0.0000000
   Bay Park_CONSTANT       4.3292234       0.0509870      84.9083655       0.0000000
Carmel Valley_CONSTANT       4.3892614       0.0565535      77.6125622       0.0000000
City Heights West_CONSTANT       4.0535183       0.0583780      69.4357707       0.0000000
Clairemont Mesa_CONSTANT       4.0952589       0.0476992      85.8558747       0.0000000
College Area_CONSTANT       4.0336972       0.0582579      69.2386376       0.0000000
       Core_CONSTANT       4.7261863       0.0526433      89.7775229       0.0000000
Cortez Hill_CONSTANT       4.6080896       0.0515261      89.4322167       0.0000000
Del Mar Heights_CONSTANT       4.4969102       0.0543368      82.7599068       0.0000000
East Village_CONSTANT       4.5454690       0.0293735     154.7473234       0.0000000
Gaslamp Quarter_CONSTANT       4.7757987       0.0473044     100.9588995       0.0000000
 Grant Hill_CONSTANT       4.3067425       0.0523653      82.2441742       0.0000000
 Grantville_CONSTANT       4.0532975       0.0713962      56.7718990       0.0000000
 Kensington_CONSTANT       4.3026710       0.0771765      55.7510746       0.0000000
   La Jolla_CONSTANT       4.6820840       0.0258089     181.4136961       0.0000000
La Jolla Village_CONSTANT       4.3303114       0.0772369      56.0652857       0.0000000
Linda Vista_CONSTANT       4.1911487       0.0569155      73.6380443       0.0000000
Little Italy_CONSTANT       4.6667423       0.0468377      99.6363950       0.0000000
Loma Portal_CONSTANT       4.3019094       0.0332362     129.4346151       0.0000000
     Marina_CONSTANT       4.5582979       0.0479941      94.9761422       0.0000000
    Midtown_CONSTANT       4.3666608       0.0283936     153.7902257       0.0000000
Midtown District_CONSTANT       4.5849382       0.0650866      70.4436292       0.0000000
  Mira Mesa_CONSTANT       3.9895616       0.0561013      71.1135365       0.0000000
Mission Bay_CONSTANT       4.5154791       0.0224221     201.3849675       0.0000000
Mission Valley_CONSTANT       4.2759604       0.0742315      57.6030636       0.0000000
Moreno Mission_CONSTANT       4.4009417       0.0567298      77.5773078       0.0000000
Normal Heights_CONSTANT       4.0973996       0.0490225      83.5820603       0.0000000
North Clairemont_CONSTANT       3.9844398       0.0691492      57.6208858       0.0000000
North Hills_CONSTANT       4.2534252       0.0254777     166.9470009       0.0000000
  Northwest_CONSTANT       4.1737520       0.0697284      59.8572467       0.0000000
Ocean Beach_CONSTANT       4.4371642       0.0300884     147.4709376       0.0000000
   Old Town_CONSTANT       4.4201603       0.0418934     105.5097966       0.0000000
 Otay Ranch_CONSTANT       4.1859412       0.0815974      51.2999205       0.0000000
Pacific Beach_CONSTANT       4.4388288       0.0224168     198.0136040       0.0000000
  Park West_CONSTANT       4.4409072       0.0447677      99.1988153       0.0000000
Rancho Bernadino_CONSTANT       4.1809062       0.0720103      58.0598088       0.0000000
Rancho Penasquitos_CONSTANT       4.1624276       0.0617764      67.3788989       0.0000000
  Roseville_CONSTANT       4.3869921       0.0586225      74.8346070       0.0000000
 San Carlos_CONSTANT       4.3349911       0.0830403      52.2034885       0.0000000
Scripps Ranch_CONSTANT       4.0823805       0.0762435      53.5439686       0.0000000
 Serra Mesa_CONSTANT       4.3129674       0.0599252      71.9725317       0.0000000
 South Park_CONSTANT       4.2253108       0.0536428      78.7675791       0.0000000
University City_CONSTANT       4.1937181       0.0369648     113.4516038       0.0000000
West University Heights_CONSTANT       4.2976715       0.0431338      99.6358857       0.0000000
_Global_accommodates       0.0727766       0.0048301      15.0671860       0.0000000
   _Global_bathrooms       0.1702080       0.0104956      16.2171367       0.0000000
    _Global_bedrooms       0.1685720       0.0106200      15.8731267       0.0000000
        _Global_beds      -0.0415809       0.0065474      -6.3507569       0.0000000
_Global_rt_Private_room      -0.4872544       0.0154085     -31.6225002       0.0000000
_Global_rt_Shared_room      -1.2395926       0.0368206     -33.6656955       0.0000000
_Global_pg_Condominium       0.1329341       0.0209896       6.3333214       0.0000000
    _Global_pg_House       0.0399982       0.0143528       2.7867915       0.0053399
    _Global_pg_Other       0.0610112       0.0223565       2.7290143       0.0063707
_Global_pg_Townhouse      -0.0075250       0.0323876      -0.2323436       0.8162790
------------------------------------------------------------------------------------
Regimes variable: unknown

REGRESSION DIAGNOSTICS
MULTICOLLINEARITY CONDITION NUMBER           12.143

TEST ON NORMALITY OF ERRORS
TEST                             DF        VALUE           PROB
Jarque-Bera                       2        4115.510           0.0000

DIAGNOSTICS FOR HETEROSKEDASTICITY
RANDOM COEFFICIENTS
TEST                             DF        VALUE           PROB
Breusch-Pagan test               54         854.587           0.0000
Koenker-Bassett test             54         310.744           0.0000

REGIMES DIAGNOSTICS - CHOW TEST
                 VARIABLE        DF        VALUE           PROB
                 CONSTANT        44         913.016           0.0000
              Global test        44         913.016           0.0000
================================ END OF REPORT =====================================
</pre></div>
</div>
</div>
</div>
<p>Econometrically speaking, what the neighborhood FEs we have introduced imply is that, instead of comparing all house prices across San Diego as equal, we only derive variation from within each neighborhood. Remember that the interpretation of <span class="math notranslate nohighlight">\(\beta_k\)</span> is the effect of variable <span class="math notranslate nohighlight">\(k\)</span>, <em>given all the other explanatory variables included remain constant</em>. By including a single variable for each area, we are effectively forcing the model to compare as equal only house prices that share the same value for each variable; or, in other words, only houses located within the same area. Introducing FE affords a higher degree of isolation of the effects of the variables we introduce in the model because we can control for unobserved effects that align spatially with the distribution of the FE introduced (by neighborhood, in our case).</p>
<p>To make a map of neighborhood fixed effects, we need to process the results from our model slightly.</p>
<p>First, we extract only the effects pertaining to the neighborhoods:</p>
<div class="cell docutils container">
<div class="cell_input docutils container">
<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">neighborhood_effects</span> <span class="o">=</span> <span class="n">m3</span><span class="o">.</span><span class="n">params</span><span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">like</span><span class="o">=</span><span class="s1">&#39;neighborhood&#39;</span><span class="p">)</span>
<span class="n">neighborhood_effects</span><span class="o">.</span><span class="n">head</span><span class="p">()</span>
</pre></div>
</div>
</div>
<div class="cell_output docutils container">
<div class="output text_plain highlight-myst-ansi notranslate"><div class="highlight"><pre><span></span>neighborhood[Balboa Park]          4.280766
neighborhood[Bay Ho]               4.198251
neighborhood[Bay Park]             4.329223
neighborhood[Carmel Valley]        4.389261
neighborhood[City Heights West]    4.053518
dtype: float64
</pre></div>
</div>
</div>
</div>
<p>Then, we need to extract just the neighborhood name from the index of this Series. A simple way to do this is to strip all the characters that come before and after our neighborhood names:</p>
<div class="cell docutils container">
<div class="cell_input docutils container">
<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">stripped</span> <span class="o">=</span> <span class="n">neighborhood_effects</span><span class="o">.</span><span class="n">index</span><span class="o">.</span><span class="n">str</span><span class="o">.</span><span class="n">strip</span><span class="p">(</span><span class="s1">&#39;neighborhood[&#39;</span><span class="p">)</span><span class="o">.</span><span class="n">str</span><span class="o">.</span><span class="n">strip</span><span class="p">(</span><span class="s1">&#39;]&#39;</span><span class="p">)</span>
<span class="n">neighborhood_effects</span><span class="o">.</span><span class="n">index</span> <span class="o">=</span> <span class="n">stripped</span>
<span class="n">neighborhood_effects</span> <span class="o">=</span> <span class="n">neighborhood_effects</span><span class="o">.</span><span class="n">to_frame</span><span class="p">(</span><span class="s1">&#39;fixed_effect&#39;</span><span class="p">)</span>
<span class="n">neighborhood_effects</span><span class="o">.</span><span class="n">head</span><span class="p">()</span>
</pre></div>
</div>
</div>
<div class="cell_output docutils container">
<div class="output text_html"><div>
<style scoped>
    .dataframe tbody tr th:only-of-type {
        vertical-align: middle;
    }

    .dataframe tbody tr th {
        vertical-align: top;
    }

    .dataframe thead th {
        text-align: right;
    }
</style>
<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th></th>
      <th>fixed_effect</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>Balboa Park</th>
      <td>4.280766</td>
    </tr>
    <tr>
      <th>Bay Ho</th>
      <td>4.198251</td>
    </tr>
    <tr>
      <th>Bay Park</th>
      <td>4.329223</td>
    </tr>
    <tr>
      <th>Carmel Valley</th>
      <td>4.389261</td>
    </tr>
    <tr>
      <th>City Heights West</th>
      <td>4.053518</td>
    </tr>
  </tbody>
</table>
</div></div></div>
</div>
<p>Good, we’re back to our raw neighborhood names. Now, we can join them back up with the neighborhood shapes:</p>
<div class="cell docutils container">
<div class="cell_input docutils container">
<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">sd_path</span> <span class="o">=</span> <span class="s1">&#39;../data/airbnb/neighbourhoods.geojson&#39;</span>
<span class="n">neighborhoods</span> <span class="o">=</span> <span class="n">geopandas</span><span class="o">.</span><span class="n">read_file</span><span class="p">(</span><span class="n">sd_path</span><span class="p">)</span>
</pre></div>
</div>
</div>
</div>
<div class="cell docutils container">
<div class="cell_input docutils container">
<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">ax</span> <span class="o">=</span> <span class="n">neighborhoods</span><span class="o">.</span><span class="n">plot</span><span class="p">(</span><span class="n">color</span><span class="o">=</span><span class="s1">&#39;k&#39;</span><span class="p">,</span> 
                        <span class="n">alpha</span><span class="o">=</span><span class="mf">0.5</span><span class="p">,</span>
                        <span class="n">figsize</span><span class="o">=</span><span class="p">(</span><span class="mi">12</span><span class="p">,</span><span class="mi">6</span><span class="p">))</span>
<span class="n">neighborhoods</span><span class="o">.</span><span class="n">merge</span><span class="p">(</span><span class="n">neighborhood_effects</span><span class="p">,</span> <span class="n">how</span><span class="o">=</span><span class="s1">&#39;left&#39;</span><span class="p">,</span>
                    <span class="n">left_on</span><span class="o">=</span><span class="s1">&#39;neighbourhood&#39;</span><span class="p">,</span> 
                    <span class="n">right_index</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>\
                  <span class="o">.</span><span class="n">dropna</span><span class="p">(</span><span class="n">subset</span><span class="o">=</span><span class="p">[</span><span class="s1">&#39;fixed_effect&#39;</span><span class="p">])</span>\
                  <span class="o">.</span><span class="n">plot</span><span class="p">(</span><span class="s1">&#39;fixed_effect&#39;</span><span class="p">,</span>
                        <span class="n">ax</span><span class="o">=</span><span class="n">ax</span><span class="p">)</span>
<span class="n">ax</span><span class="o">.</span><span class="n">set_title</span><span class="p">(</span><span class="s2">&quot;San Diego Neighborhood Fixed Effects&quot;</span><span class="p">)</span>
<span class="n">plt</span><span class="o">.</span><span class="n">show</span><span class="p">()</span>
</pre></div>
</div>
</div>
<div class="cell_output docutils container">
<img alt="Map of San Diego neighborhoods shaded by their estimated fixed effect" src="../_images/11_regression_46_0.png" />
</div>
</div>
<div class="section" id="spatial-regimes">
<h4>Spatial Regimes<a class="headerlink" href="#spatial-regimes" title="Permalink to this headline">¶</a></h4>
<p>At the core of estimating spatial FEs is the idea that, instead of assuming the dependent variable behaves uniformly over space, there are systematic effects following a geographical pattern that affect its behaviour. In other words, spatial FEs introduce econometrically the notion of spatial heterogeneity. They do this in the simplest possible form: by allowing the constant term to vary geographically. The other elements of the regression are left untouched and hence apply uniformly across space. The idea of spatial regimes (SRs) is to generalize the spatial FE approach to allow not only the constant term to vary but also any other explanatory variable. This implies that the equation we will be estimating is:</p>
<div class="math notranslate nohighlight">
\[
\log{P_i} = \alpha_r + \sum_k \mathbf{X}_{ki}\beta_{k-r} + \epsilon_i
\]</div>
<p>where we are not only allowing the constant term to vary by region (<span class="math notranslate nohighlight">\(\alpha_r\)</span>), but also every other parameter (<span class="math notranslate nohighlight">\(\beta_{k-r}\)</span>).</p>
<p>To illustrate this approach, we will use the “spatial differentiator” of whether a house is in a coastal neighbourhood or not (<code class="docutils literal notranslate"><span class="pre">coastal</span></code>) to define the regimes. The rationale behind this choice is that renting a house close to the ocean might be a strong enough pull that people might be willing to pay at different <em>rates</em> for each of the house’s characteristics.</p>
<p>To implement this in Python, we use the <code class="docutils literal notranslate"><span class="pre">OLS_Regimes</span></code> class in <code class="docutils literal notranslate"><span class="pre">PySAL</span></code>, which does most of the heavy lifting for us:</p>
<div class="cell docutils container">
<div class="cell_input docutils container">
<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">m4</span> <span class="o">=</span> <span class="n">spreg</span><span class="o">.</span><span class="n">OLS_Regimes</span><span class="p">(</span><span class="n">db</span><span class="p">[[</span><span class="s1">&#39;log_price&#39;</span><span class="p">]]</span><span class="o">.</span><span class="n">values</span><span class="p">,</span> <span class="n">db</span><span class="p">[</span><span class="n">variable_names</span><span class="p">]</span><span class="o">.</span><span class="n">values</span><span class="p">,</span>
                          <span class="n">db</span><span class="p">[</span><span class="s1">&#39;coastal&#39;</span><span class="p">]</span><span class="o">.</span><span class="n">tolist</span><span class="p">(),</span>
                          <span class="n">constant_regi</span><span class="o">=</span><span class="s1">&#39;many&#39;</span><span class="p">,</span>
                          <span class="n">regime_err_sep</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span>
                          <span class="n">name_y</span><span class="o">=</span><span class="s1">&#39;log_price&#39;</span><span class="p">,</span> <span class="n">name_x</span><span class="o">=</span><span class="n">variable_names</span><span class="p">)</span>
<span class="nb">print</span><span class="p">(</span><span class="n">m4</span><span class="o">.</span><span class="n">summary</span><span class="p">)</span>
</pre></div>
</div>
</div>
<div class="cell_output docutils container">
<div class="output stream highlight-myst-ansi notranslate"><div class="highlight"><pre><span></span>REGRESSION
----------
SUMMARY OF OUTPUT: ORDINARY LEAST SQUARES - REGIMES
---------------------------------------------------
Data set            :     unknown
Weights matrix      :        None
Dependent Variable  :   log_price                Number of Observations:        6110
Mean dependent var  :      4.9958                Number of Variables   :          22
S.D. dependent var  :      0.8072                Degrees of Freedom    :        6088
R-squared           :      0.6853
Adjusted R-squared  :      0.6843
Sum squared residual:    1252.489                F-statistic           :    631.4283
Sigma-square        :       0.206                Prob(F-statistic)     :           0
S.E. of regression  :       0.454                Log likelihood        :   -3828.169
Sigma-square ML     :       0.205                Akaike info criterion :    7700.339
S.E of regression ML:      0.4528                Schwarz criterion     :    7848.128

------------------------------------------------------------------------------------
            Variable     Coefficient       Std.Error     t-Statistic     Probability
------------------------------------------------------------------------------------
          0_CONSTANT       4.4072424       0.0215156     204.8392695       0.0000000
      0_accommodates       0.0901860       0.0064737      13.9311338       0.0000000
         0_bathrooms       0.1433760       0.0142680      10.0487871       0.0000000
          0_bedrooms       0.1129626       0.0138273       8.1695568       0.0000000
              0_beds      -0.0262719       0.0088380      -2.9726102       0.0029644
   0_rt_Private_room      -0.5293343       0.0189179     -27.9805699       0.0000000
    0_rt_Shared_room      -1.2244586       0.0425969     -28.7452834       0.0000000
    0_pg_Condominium       0.1053065       0.0281309       3.7434523       0.0001832
          0_pg_House      -0.0454471       0.0179571      -2.5308637       0.0114032
          0_pg_Other       0.0607526       0.0276365       2.1982715       0.0279673
      0_pg_Townhouse      -0.0103973       0.0456730      -0.2276456       0.8199294
          1_CONSTANT       4.4799043       0.0250938     178.5260014       0.0000000
      1_accommodates       0.0484639       0.0078806       6.1497397       0.0000000
         1_bathrooms       0.2474779       0.0165661      14.9388057       0.0000000
          1_bedrooms       0.1897404       0.0179229      10.5864676       0.0000000
              1_beds      -0.0506077       0.0107429      -4.7107925       0.0000025
   1_rt_Private_room      -0.5586281       0.0283122     -19.7309699       0.0000000
    1_rt_Shared_room      -1.0528541       0.0841745     -12.5079997       0.0000000
    1_pg_Condominium       0.2044470       0.0339434       6.0231780       0.0000000
          1_pg_House       0.0753534       0.0233783       3.2232188       0.0012743
          1_pg_Other       0.2954848       0.0386455       7.6460385       0.0000000
      1_pg_Townhouse      -0.0735077       0.0493672      -1.4889984       0.1365396
------------------------------------------------------------------------------------
Regimes variable: unknown

REGRESSION DIAGNOSTICS
MULTICOLLINEARITY CONDITION NUMBER           14.033

TEST ON NORMALITY OF ERRORS
TEST                             DF        VALUE           PROB
Jarque-Bera                       2        3977.425           0.0000

DIAGNOSTICS FOR HETEROSKEDASTICITY
RANDOM COEFFICIENTS
TEST                             DF        VALUE           PROB
Breusch-Pagan test               21         443.593           0.0000
Koenker-Bassett test             21         164.276           0.0000

REGIMES DIAGNOSTICS - CHOW TEST
                 VARIABLE        DF        VALUE           PROB
                 CONSTANT         1           4.832           0.0279
             accommodates         1          16.736           0.0000
                bathrooms         1          22.671           0.0000
                 bedrooms         1          11.504           0.0007
                     beds         1           3.060           0.0802
           pg_Condominium         1           5.057           0.0245
                 pg_House         1          16.793           0.0000
                 pg_Other         1          24.410           0.0000
             pg_Townhouse         1           0.881           0.3480
          rt_Private_room         1           0.740           0.3896
           rt_Shared_room         1           3.309           0.0689
              Global test        11         328.869           0.0000
================================ END OF REPORT =====================================
</pre></div>
</div>
</div>
</div>
</div>
</div>
<div class="section" id="spatial-dependence">
<h3>Spatial Dependence<a class="headerlink" href="#spatial-dependence" title="Permalink to this headline">¶</a></h3>
<p>As we have just discussed, SH is about effects of phenomena that are <em>explicitly linked</em>
to geography and that hence cause spatial variation and clustering. This
encompasses many of the kinds of spatial effects we may be interested in when we fit
linear regressions. However, in other cases, our focus is on the effect of the <em>spatial
configuration</em> of the observations, and the extent to which that has an effect on the
outcome we are considering. For example, we might think that the price of a house not
only depends on whether it is a townhouse or an apartment, but also on
whether it is surrounded by many more townhouses than skyscrapers with more
apartments. This, we could hypothesise, might be related to the different “look and feel” a
neighbourhood with low-height, historic buildings has as compared to one with
modern highrises. To the extent these two different spatial configurations
enter differently the house price determination process, we will be
interested in capturing not only the characteristics of a house, but also of
its surrounding ones.
This kind of spatial effect is fundamentally different
from SH in that it is not related to inherent characteristics of the geography but relates
to the characteristics of the observations in our dataset and, especially, to their spatial
arrangement. We call this phenomenon by which the values of observations are related to
each other through distance <em>spatial dependence</em> <span id="id6">(Anselin, 1988)</span>.</p>
<p>There are several ways to introduce spatial dependence in an econometric
framework, with varying degrees of econometric sophistication (see
<span id="id8">Anselin (2002)</span> for a good overview). Common to all of them, however, is the way space is
formally encapsulated: through <em>spatial weights matrices (<span class="math notranslate nohighlight">\(\mathbf{W}\)</span>)</em>, which we discussed in Chapter 4.</p>
<div class="section" id="exogenous-effects-the-slx-model">
<h4>Exogenous effects: The SLX Model<a class="headerlink" href="#exogenous-effects-the-slx-model" title="Permalink to this headline">¶</a></h4>
<p>Let us come back to the house price example we have been working with. So far, we
have hypothesized that the price of a house rented in San Diego through AirBnb can
be explained using information about its own characteristics as well as some
relating to its location such as the neighborhood or the distance to the main
park in the city. However, it is also reasonable to think that prospective renters
care about the larger area around a house, not only about the house itself, and would
be willing to pay more for a house that was surrounded by certain types of houses,
and less if it was located in the middle of other types. How could we test this idea?</p>
<p>The most straightforward way to introduce spatial dependence in a regression is by
considering not only a given explanatory variable, but also its spatial lag. In our
example case, in addition to including a dummy for the type of house (<code class="docutils literal notranslate"><span class="pre">pg_XXX</span></code>), we
can also include the spatial lag of each type of house. This addition implies
we are also including, as an explanatory factor of the price of a given house, the proportion
of neighbouring houses of each type. Mathematically, this implies estimating the following model:</p>
<div class="math notranslate nohighlight">
\[
\log(P_i) = \alpha + \sum^{p}_{k=1}X_{ik}\beta_k + \sum^{p}_{k=1}\left(\sum^{N}_{j=1}w_{ij}x_{jk}\right)\gamma_k + \epsilon_i
\]</div>
<p>where <span class="math notranslate nohighlight">\(\sum_{j=1}^N w_{ij}x_{jk}\)</span> represents the spatial lag of the <span class="math notranslate nohighlight">\(k\)</span>th explanatory variable.
This can be stated in <em>matrix</em> form using the spatial weights matrix, <span class="math notranslate nohighlight">\(\mathbf{W}\)</span>, as:
<span class="math notranslate nohighlight">\(
\log(\mathbf{P}) = \alpha + \mathbf{X}\beta + \mathbf{WX}\gamma + \epsilon
\)</span></p>
<p>This splits the model to focus on two main effects: <span class="math notranslate nohighlight">\(\beta\)</span> and <span class="math notranslate nohighlight">\(\gamma\)</span>. The
<span class="math notranslate nohighlight">\(\beta\)</span> effect describes the change in <span class="math notranslate nohighlight">\(y_i\)</span> when <span class="math notranslate nohighlight">\(X_{ik}\)</span> changes by one.
(Since we use the log price for a <span class="math notranslate nohighlight">\(y\)</span> variable, our
<span class="math notranslate nohighlight">\(\beta\)</span> coefficients are still all interpreted as <em>elasticities</em>, meaning that a
unit change in the <span class="math notranslate nohighlight">\(x_i\)</span> variate results in a change of roughly <span class="math notranslate nohighlight">\(100 \times \beta\)</span> percent in the
price <span class="math notranslate nohighlight">\(y_i\)</span>.) The subscript for site <span class="math notranslate nohighlight">\(i\)</span> is important here: since we’re dealing
with a <span class="math notranslate nohighlight">\(\mathbf{W}\)</span> matrix, it’s useful to be clear about where the change occurs.</p>
<p>Indeed, this matters for the <span class="math notranslate nohighlight">\(\gamma\)</span> effect, which represents an
<em>indirect</em> effect of a change in <span class="math notranslate nohighlight">\(X_i\)</span>. This can be conceptualized in two ways.
First, one could think of <span class="math notranslate nohighlight">\(\gamma\)</span> as simply <em>the effect of a unit change in your average surroundings.</em>
This is useful and simple. But, this interpretation ignores where this change
might occur. In truth, a change in a variable at site <span class="math notranslate nohighlight">\(i\)</span> will result in a <em>spillover</em> to its surroundings:
when <span class="math notranslate nohighlight">\(x_i\)</span> changes, so too does the <em>spatial lag</em> of any site near <span class="math notranslate nohighlight">\(i\)</span>.
The precise size of this will depend on the structure of <span class="math notranslate nohighlight">\(\mathbf{W}\)</span>, and can be
different for every site. For example, think of a very highly-connected “focal” site in a
row-standardized weight matrix. This focal site will not be strongly affected
if a neighbor changes by a single unit, since each site only contributes a
small amount to the lag at the focal site. Alternatively, consider a site with only
one neighbor: its lag will change by <em>exactly</em> the amount its sole neighbor changes.
Thus, to discover the exact indirect effect on <span class="math notranslate nohighlight">\(y\)</span> caused by the change
at a specific site <span class="math notranslate nohighlight">\(x_i\)</span> you would need to compute the <em>change in the spatial lag</em>,
and then use that as your <em>change</em> in <span class="math notranslate nohighlight">\(X\)</span>. We will discuss this in the following section.</p>
<p>In Python, we can calculate the spatial lag of each variable whose name starts with <code class="docutils literal notranslate"><span class="pre">pg_</span></code>
by first creating a list of all of those names, and then applying <code class="docutils literal notranslate"><span class="pre">PySAL</span></code>’s
<code class="docutils literal notranslate"><span class="pre">lag_spatial</span></code> to each of them:</p>
<div class="cell docutils container">
<div class="cell_input docutils container">
<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">wx</span> <span class="o">=</span> <span class="n">db</span><span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">like</span><span class="o">=</span><span class="s1">&#39;pg&#39;</span><span class="p">)</span>\
        <span class="o">.</span><span class="n">apply</span><span class="p">(</span><span class="k">lambda</span> <span class="n">y</span><span class="p">:</span> <span class="n">weights</span><span class="o">.</span><span class="n">spatial_lag</span><span class="o">.</span><span class="n">lag_spatial</span><span class="p">(</span><span class="n">knn</span><span class="p">,</span> <span class="n">y</span><span class="p">))</span>\
        <span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="n">columns</span><span class="o">=</span><span class="k">lambda</span> <span class="n">c</span><span class="p">:</span> <span class="s1">&#39;w_&#39;</span><span class="o">+</span><span class="n">c</span><span class="p">)</span><span class="o">.</span><span class="n">drop</span><span class="p">(</span><span class="s1">&#39;w_pg_Apartment&#39;</span><span class="p">,</span> <span class="n">axis</span><span class="o">=</span><span class="mi">1</span><span class="p">)</span>
</pre></div>
</div>
</div>
</div>
<p>Once computed, we can run the model using OLS estimation because, in this
context, the spatial  lags included do not violate any of the assumptions OLS
relies on (they are essentially additional exogenous variables):</p>
<div class="cell docutils container">
<div class="cell_input docutils container">
<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">slx_exog</span> <span class="o">=</span> <span class="n">db</span><span class="p">[</span><span class="n">variable_names</span><span class="p">]</span><span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="n">wx</span><span class="p">)</span>
<span class="n">m5</span> <span class="o">=</span> <span class="n">spreg</span><span class="o">.</span><span class="n">OLS</span><span class="p">(</span><span class="n">db</span><span class="p">[[</span><span class="s1">&#39;log_price&#39;</span><span class="p">]]</span><span class="o">.</span><span class="n">values</span><span class="p">,</span> 
                  <span class="n">slx_exog</span><span class="o">.</span><span class="n">values</span><span class="p">,</span>
                  <span class="n">name_y</span><span class="o">=</span><span class="s1">&#39;l_price&#39;</span><span class="p">,</span> 
               <span class="n">name_x</span><span class="o">=</span><span class="n">slx_exog</span><span class="o">.</span><span class="n">columns</span><span class="o">.</span><span class="n">tolist</span><span class="p">())</span>
<span class="nb">print</span><span class="p">(</span><span class="n">m5</span><span class="o">.</span><span class="n">summary</span><span class="p">)</span>
</pre></div>
</div>
</div>
<div class="cell_output docutils container">
<div class="output stream highlight-myst-ansi notranslate"><div class="highlight"><pre><span></span>REGRESSION
----------
SUMMARY OF OUTPUT: ORDINARY LEAST SQUARES
-----------------------------------------
Data set            :     unknown
Weights matrix      :        None
Dependent Variable  :     l_price                Number of Observations:        6110
Mean dependent var  :      4.9958                Number of Variables   :          15
S.D. dependent var  :      0.8072                Degrees of Freedom    :        6095
R-squared           :      0.6800
Adjusted R-squared  :      0.6792
Sum squared residual:    1273.933                F-statistic           :    924.9423
Sigma-square        :       0.209                Prob(F-statistic)     :           0
S.E. of regression  :       0.457                Log likelihood        :   -3880.030
Sigma-square ML     :       0.208                Akaike info criterion :    7790.061
S.E of regression ML:      0.4566                Schwarz criterion     :    7890.826

------------------------------------------------------------------------------------
            Variable     Coefficient       Std.Error     t-Statistic     Probability
------------------------------------------------------------------------------------
            CONSTANT       4.3205814       0.0234977     183.8727044       0.0000000
        accommodates       0.0809972       0.0050046      16.1843874       0.0000000
           bathrooms       0.1893447       0.0108059      17.5224026       0.0000000
            bedrooms       0.1635998       0.0109764      14.9047058       0.0000000
                beds      -0.0451529       0.0068249      -6.6159365       0.0000000
     rt_Private_room      -0.5293783       0.0157308     -33.6524367       0.0000000
      rt_Shared_room      -1.2892590       0.0381443     -33.7995105       0.0000000
      pg_Condominium       0.1063490       0.0221782       4.7952003       0.0000017
            pg_House       0.0327806       0.0156954       2.0885538       0.0367893
            pg_Other       0.0861857       0.0239774       3.5944620       0.0003276
        pg_Townhouse      -0.0277116       0.0338485      -0.8186965       0.4129916
    w_pg_Condominium       0.5928369       0.0689612       8.5966706       0.0000000
          w_pg_House      -0.0774462       0.0318830      -2.4290766       0.0151661
          w_pg_Other       0.4851047       0.0551461       8.7967121       0.0000000
      w_pg_Townhouse      -0.2724493       0.1223388      -2.2270058       0.0259833
------------------------------------------------------------------------------------

REGRESSION DIAGNOSTICS
MULTICOLLINEARITY CONDITION NUMBER           14.277

TEST ON NORMALITY OF ERRORS
TEST                             DF        VALUE           PROB
Jarque-Bera                       2        2458.006           0.0000

DIAGNOSTICS FOR HETEROSKEDASTICITY
RANDOM COEFFICIENTS
TEST                             DF        VALUE           PROB
Breusch-Pagan test               14         393.052           0.0000
Koenker-Bassett test             14         169.585           0.0000
================================ END OF REPORT =====================================
</pre></div>
</div>
</div>
</div>
<p>The way to interpret the table of results is similar to that of any other
non-spatial regression. The variables we included in the original regression
display similar behaviour, albeit with small changes in size, and can be
interpreted also in a similar way. The spatial lag of each type of property
(<code class="docutils literal notranslate"><span class="pre">w_pg_XXX</span></code>) is the new addition. We observe that, except for the case
of townhouses (same as with the binary variable, <code class="docutils literal notranslate"><span class="pre">pg_Townhouse</span></code>), they are all
significant, suggesting our initial hypothesis on the role of the surrounding
houses might indeed be at work here.</p>
<p>As an illustration, let’s look at some of the direct/indirect effects.
The direct effect of the <code class="docutils literal notranslate"><span class="pre">pg_Condominium</span></code> variable means that condominiums are
typically 11% more expensive (<span class="math notranslate nohighlight">\(\beta_{pg\_Condominium}=0.1063\)</span>) than the benchmark
property type, apartments. More relevant to this section, any given house surrounded by
condominiums <em>also</em> receives a price premium. But, since <code class="docutils literal notranslate"><span class="pre">pg_Condominium</span></code> is a dummy variable,
the spatial lag at site <span class="math notranslate nohighlight">\(i\)</span> represents the <em>percentage</em> of properties near <span class="math notranslate nohighlight">\(i\)</span> that are
condominiums, which is between <span class="math notranslate nohighlight">\(0\)</span> and <span class="math notranslate nohighlight">\(1\)</span>. (Discover this for yourself: what is the average of <code class="docutils literal notranslate"><span class="pre">numpy.array([True,</span> <span class="pre">True,</span> <span class="pre">True,</span> <span class="pre">False,</span> <span class="pre">False,</span> <span class="pre">True])</span></code>?)
So, a <em>unit</em> change in this variable means that you would increase the condominium
percentage by 100%. Thus, a <span class="math notranslate nohighlight">\(.1\)</span> increase in <code class="docutils literal notranslate"><span class="pre">w_pg_Condominium</span></code> (a change of ten percentage points)
would result in a 5.92% increase in the house price (<span class="math notranslate nohighlight">\(\gamma_{w\_pg\_Condominium} = 0.5928\)</span>).
Similar interpretations can be derived for all other spatially lagged variables to derive the
<em>indirect</em> effect of a change in the spatial lag.</p>
<p>However, to compute the indirect change for a given site <span class="math notranslate nohighlight">\(i\)</span>, you may need to examine the predicted values for <span class="math notranslate nohighlight">\(y_i\)</span>. In this example, since we are using a row-standardized weights matrix with twenty nearest neighbors, the impact of changing <span class="math notranslate nohighlight">\(x_i\)</span> is the same for all of its neighbors and for any site <span class="math notranslate nohighlight">\(i\)</span>. Thus, the effect is always <span class="math notranslate nohighlight">\(\frac{\gamma}{20}\)</span>, or about <span class="math notranslate nohighlight">\(0.0296\)</span>. However, this would not be the same for many other kinds of weights (like <code class="docutils literal notranslate"><span class="pre">Kernel</span></code>, <code class="docutils literal notranslate"><span class="pre">Queen</span></code>, <code class="docutils literal notranslate"><span class="pre">Rook</span></code>, <code class="docutils literal notranslate"><span class="pre">DistanceBand</span></code>, or <code class="docutils literal notranslate"><span class="pre">Voronoi</span></code>), so we will demonstrate how to construct the indirect effect for a specific <span class="math notranslate nohighlight">\(i\)</span>:</p>
<p>First, predicted values for <span class="math notranslate nohighlight">\(y_i\)</span> are stored in the <code class="docutils literal notranslate"><span class="pre">predy</span></code> attribute of any model:</p>
<div class="cell docutils container">
<div class="cell_input docutils container">
<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">m5</span><span class="o">.</span><span class="n">predy</span>
</pre></div>
</div>
</div>
<div class="cell_output docutils container">
<div class="output text_plain highlight-myst-ansi notranslate"><div class="highlight"><pre><span></span>array([[5.43610121],
       [5.38596868],
       [4.25377454],
       ...,
       [4.29145318],
       [4.89174746],
       [4.85867698]])
</pre></div>
</div>
</div>
</div>
<p>To build new predictions, we need to follow the equation stated above.</p>
<p>Showing this process below, let’s first change a property to be a condominium. Consider the third observation, which is the first apartment in the data:</p>
<div class="cell docutils container">
<div class="cell_input docutils container">
<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">db</span><span class="o">.</span><span class="n">loc</span><span class="p">[</span><span class="mi">2</span><span class="p">]</span>
</pre></div>
</div>
</div>
<div class="cell_output docutils container">
<div class="output text_plain highlight-myst-ansi notranslate"><div class="highlight"><pre><span></span>accommodates                                                     2
bathrooms                                                      1.0
bedrooms                                                       1.0
beds                                                           1.0
neighborhood                                           North Hills
pool                                                             0
d2balboa                                                  2.493893
coastal                                                          0
price                                                         99.0
log_price                                                  4.59512
id                                                            9553
pg_Apartment                                                     1
pg_Condominium                                                   0
pg_House                                                         0
pg_Other                                                         0
pg_Townhouse                                                     0
rt_Entire_home/apt                                               0
rt_Private_room                                                  1
rt_Shared_room                                                   0
geometry              POINT (-117.1412083878189 32.75326632438691)
residual                                                  0.287341
Name: 2, dtype: object
</pre></div>
</div>
</div>
</div>
<p>This is an apartment. Let’s make a copy of our data and change this apartment into a condominium:</p>
<div class="cell docutils container">
<div class="cell_input docutils container">
<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">db_scenario</span> <span class="o">=</span> <span class="n">db</span><span class="o">.</span><span class="n">copy</span><span class="p">()</span>
<span class="n">db_scenario</span><span class="o">.</span><span class="n">loc</span><span class="p">[</span><span class="mi">2</span><span class="p">,</span> <span class="p">[</span><span class="s1">&#39;pg_Apartment&#39;</span><span class="p">,</span> <span class="s1">&#39;pg_Condominium&#39;</span><span class="p">]]</span> <span class="o">=</span> <span class="p">[</span><span class="mi">0</span><span class="p">,</span><span class="mi">1</span><span class="p">]</span> <span class="c1"># make Apartment 0 and condo 1</span>
</pre></div>
</div>
</div>
</div>
<p>We’ve successfully made the change:</p>
<div class="cell docutils container">
<div class="cell_input docutils container">
<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">db_scenario</span><span class="o">.</span><span class="n">loc</span><span class="p">[</span><span class="mi">2</span><span class="p">]</span>
</pre></div>
</div>
</div>
<div class="cell_output docutils container">
<div class="output text_plain highlight-myst-ansi notranslate"><div class="highlight"><pre><span></span>accommodates                                                     2
bathrooms                                                      1.0
bedrooms                                                       1.0
beds                                                           1.0
neighborhood                                           North Hills
pool                                                             0
d2balboa                                                  2.493893
coastal                                                          0
price                                                         99.0
log_price                                                  4.59512
id                                                            9553
pg_Apartment                                                     0
pg_Condominium                                                   1
pg_House                                                         0
pg_Other                                                         0
pg_Townhouse                                                     0
rt_Entire_home/apt                                               0
rt_Private_room                                                  1
rt_Shared_room                                                   0
geometry              POINT (-117.1412083878189 32.75326632438691)
residual                                                  0.287341
Name: 2, dtype: object
</pre></div>
</div>
</div>
</div>
<p>Now, we need to <em>also</em> update the spatial lag variates:</p>
<div class="cell docutils container">
<div class="cell_input docutils container">
<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">wx_scenario</span> <span class="o">=</span> <span class="n">db_scenario</span><span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">like</span><span class="o">=</span><span class="s1">&#39;pg&#39;</span><span class="p">)</span>\
                         <span class="o">.</span><span class="n">apply</span><span class="p">(</span><span class="k">lambda</span> <span class="n">y</span><span class="p">:</span> <span class="n">weights</span><span class="o">.</span><span class="n">spatial_lag</span><span class="o">.</span><span class="n">lag_spatial</span><span class="p">(</span><span class="n">knn</span><span class="p">,</span> <span class="n">y</span><span class="p">))</span>\
                         <span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="n">columns</span><span class="o">=</span><span class="k">lambda</span> <span class="n">c</span><span class="p">:</span> <span class="s1">&#39;w_&#39;</span><span class="o">+</span><span class="n">c</span><span class="p">)</span><span class="o">.</span><span class="n">drop</span><span class="p">(</span><span class="s1">&#39;w_pg_Apartment&#39;</span><span class="p">,</span> <span class="n">axis</span><span class="o">=</span><span class="mi">1</span><span class="p">)</span>
</pre></div>
</div>
</div>
</div>
<p>And build a new exogenous <span class="math notranslate nohighlight">\(\mathbf{X}\)</span> matrix by joining the explanatory variables with their updated spatial lags; the constant term is added separately when we compute the prediction:</p>
<div class="cell docutils container">
<div class="cell_input docutils container">
<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">slx_exog_scenario</span> <span class="o">=</span> <span class="n">db_scenario</span><span class="p">[</span><span class="n">variable_names</span><span class="p">]</span><span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="n">wx_scenario</span><span class="p">)</span>
</pre></div>
</div>
</div>
</div>
<p>Now, our new prediction (in the scenario where we have changed site <code class="docutils literal notranslate"><span class="pre">2</span></code> from an apartment into a condominium), is:</p>
<div class="cell docutils container">
<div class="cell_input docutils container">
<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">y_pred_scenario</span> <span class="o">=</span> <span class="n">m5</span><span class="o">.</span><span class="n">betas</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> <span class="o">+</span> <span class="n">slx_exog_scenario</span> <span class="o">@</span> <span class="n">m5</span><span class="o">.</span><span class="n">betas</span><span class="p">[</span><span class="mi">1</span><span class="p">:]</span>
</pre></div>
</div>
</div>
</div>
<p>This prediction will be exactly the same for all sites, except site <code class="docutils literal notranslate"><span class="pre">2</span></code> and its neighbors. So, the <em>neighbors</em> to site <code class="docutils literal notranslate"><span class="pre">2</span></code> are:</p>
<div class="cell docutils container">
<div class="cell_input docutils container">
<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">knn</span><span class="o">.</span><span class="n">neighbors</span><span class="p">[</span><span class="mi">2</span><span class="p">]</span>
</pre></div>
</div>
</div>
<div class="cell_output docutils container">
<div class="output text_plain highlight-myst-ansi notranslate"><div class="highlight"><pre><span></span>[772,
 2212,
 139,
 4653,
 2786,
 1218,
 138,
 808,
 1480,
 4241,
 1631,
 3617,
 2612,
 1162,
 135,
 23,
 5528,
 3591,
 407,
 6088]
</pre></div>
</div>
</div>
</div>
<p>And the effect of changing site <code class="docutils literal notranslate"><span class="pre">2</span></code> into a condominium is associated with the following changes to <span class="math notranslate nohighlight">\(y_i\)</span>:</p>
<div class="cell docutils container">
<div class="cell_input docutils container">
<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="p">(</span><span class="n">y_pred_scenario</span> <span class="o">-</span> <span class="n">m5</span><span class="o">.</span><span class="n">predy</span><span class="p">)</span><span class="o">.</span><span class="n">loc</span><span class="p">[[</span><span class="mi">2</span><span class="p">]</span> <span class="o">+</span> <span class="n">knn</span><span class="o">.</span><span class="n">neighbors</span><span class="p">[</span><span class="mi">2</span><span class="p">]]</span>
</pre></div>
</div>
</div>
<div class="cell_output docutils container">
<div class="output text_html"><div>
<style scoped>
    .dataframe tbody tr th:only-of-type {
        vertical-align: middle;
    }

    .dataframe tbody tr th {
        vertical-align: top;
    }

    .dataframe thead th {
        text-align: right;
    }
</style>
<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th></th>
      <th>0</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>2</th>
      <td>0.106349</td>
    </tr>
    <tr>
      <th>772</th>
      <td>0.029642</td>
    </tr>
    <tr>
      <th>2212</th>
      <td>0.029642</td>
    </tr>
    <tr>
      <th>139</th>
      <td>0.029642</td>
    </tr>
    <tr>
      <th>4653</th>
      <td>0.029642</td>
    </tr>
    <tr>
      <th>2786</th>
      <td>0.029642</td>
    </tr>
    <tr>
      <th>1218</th>
      <td>0.029642</td>
    </tr>
    <tr>
      <th>138</th>
      <td>0.029642</td>
    </tr>
    <tr>
      <th>808</th>
      <td>0.029642</td>
    </tr>
    <tr>
      <th>1480</th>
      <td>0.029642</td>
    </tr>
    <tr>
      <th>4241</th>
      <td>0.029642</td>
    </tr>
    <tr>
      <th>1631</th>
      <td>0.029642</td>
    </tr>
    <tr>
      <th>3617</th>
      <td>0.029642</td>
    </tr>
    <tr>
      <th>2612</th>
      <td>0.029642</td>
    </tr>
    <tr>
      <th>1162</th>
      <td>0.029642</td>
    </tr>
    <tr>
      <th>135</th>
      <td>0.029642</td>
    </tr>
    <tr>
      <th>23</th>
      <td>0.029642</td>
    </tr>
    <tr>
      <th>5528</th>
      <td>0.029642</td>
    </tr>
    <tr>
      <th>3591</th>
      <td>0.029642</td>
    </tr>
    <tr>
      <th>407</th>
      <td>0.029642</td>
    </tr>
    <tr>
      <th>6088</th>
      <td>0.029642</td>
    </tr>
  </tbody>
</table>
</div></div></div>
</div>
<p>We see the first row, representing the direct effect, is exactly equal to the estimate for <code class="docutils literal notranslate"><span class="pre">pg_Condominium</span></code>. For the other effects, though, we have only changed <code class="docutils literal notranslate"><span class="pre">w_pg_Condominium</span></code> by <span class="math notranslate nohighlight">\(.05\)</span>:</p>
<div class="cell docutils container">
<div class="cell_input docutils container">
<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">scenario_near_2</span> <span class="o">=</span> <span class="n">slx_exog_scenario</span><span class="o">.</span><span class="n">loc</span><span class="p">[</span><span class="n">knn</span><span class="o">.</span><span class="n">neighbors</span><span class="p">[</span><span class="mi">2</span><span class="p">],</span> <span class="p">[</span><span class="s1">&#39;w_pg_Condominium&#39;</span><span class="p">]]</span>
<span class="n">orig_near_2</span> <span class="o">=</span> <span class="n">slx_exog</span><span class="o">.</span><span class="n">loc</span><span class="p">[</span><span class="n">knn</span><span class="o">.</span><span class="n">neighbors</span><span class="p">[</span><span class="mi">2</span><span class="p">],</span> <span class="p">[</span><span class="s1">&#39;w_pg_Condominium&#39;</span><span class="p">]]</span>
<span class="n">scenario_near_2</span><span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="n">orig_near_2</span><span class="p">,</span> <span class="n">lsuffix</span><span class="o">=</span><span class="s1">&#39;_scenario&#39;</span><span class="p">,</span> <span class="n">rsuffix</span><span class="o">=</span> <span class="s1">&#39;_original&#39;</span><span class="p">)</span>
</pre></div>
</div>
</div>
<div class="cell_output docutils container">
<div class="output text_html"><div>
<style scoped>
    .dataframe tbody tr th:only-of-type {
        vertical-align: middle;
    }

    .dataframe tbody tr th {
        vertical-align: top;
    }

    .dataframe thead th {
        text-align: right;
    }
</style>
<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th></th>
      <th>w_pg_Condominium_scenario</th>
      <th>w_pg_Condominium_original</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>772</th>
      <td>0.10</td>
      <td>0.05</td>
    </tr>
    <tr>
      <th>2212</th>
      <td>0.10</td>
      <td>0.05</td>
    </tr>
    <tr>
      <th>139</th>
      <td>0.10</td>
      <td>0.05</td>
    </tr>
    <tr>
      <th>4653</th>
      <td>0.10</td>
      <td>0.05</td>
    </tr>
    <tr>
      <th>2786</th>
      <td>0.10</td>
      <td>0.05</td>
    </tr>
    <tr>
      <th>1218</th>
      <td>0.10</td>
      <td>0.05</td>
    </tr>
    <tr>
      <th>138</th>
      <td>0.10</td>
      <td>0.05</td>
    </tr>
    <tr>
      <th>808</th>
      <td>0.05</td>
      <td>0.00</td>
    </tr>
    <tr>
      <th>1480</th>
      <td>0.10</td>
      <td>0.05</td>
    </tr>
    <tr>
      <th>4241</th>
      <td>0.10</td>
      <td>0.05</td>
    </tr>
    <tr>
      <th>1631</th>
      <td>0.10</td>
      <td>0.05</td>
    </tr>
    <tr>
      <th>3617</th>
      <td>0.10</td>
      <td>0.05</td>
    </tr>
    <tr>
      <th>2612</th>
      <td>0.10</td>
      <td>0.05</td>
    </tr>
    <tr>
      <th>1162</th>
      <td>0.05</td>
      <td>0.00</td>
    </tr>
    <tr>
      <th>135</th>
      <td>0.05</td>
      <td>0.00</td>
    </tr>
    <tr>
      <th>23</th>
      <td>0.10</td>
      <td>0.05</td>
    </tr>
    <tr>
      <th>5528</th>
      <td>0.05</td>
      <td>0.00</td>
    </tr>
    <tr>
      <th>3591</th>
      <td>0.05</td>
      <td>0.00</td>
    </tr>
    <tr>
      <th>407</th>
      <td>0.05</td>
      <td>0.00</td>
    </tr>
    <tr>
      <th>6088</th>
      <td>0.10</td>
      <td>0.05</td>
    </tr>
  </tbody>
</table>
</div></div></div>
</div>
<p>Introducing a spatial lag of an explanatory variable, as we have just seen, is the most straightforward way of incorporating the notion of spatial dependence in a linear regression framework. It does not require additional changes, it can be estimated with OLS, and the interpretation is rather similar to interpreting non-spatial variables, so long as aggregate changes are required.</p>
<p>The field of spatial econometrics, however, is a much broader one and has produced over the last decades many techniques to deal with spatial effects and spatial dependence in different ways. Although this might be an oversimplification, one can say that most of such efforts for the case of a single cross-section are focused on two main variations: the spatial lag and the spatial error model. Both are similar to the case we have seen in that they are based on the introduction of a spatial lag, but they differ in the component of the model they modify and affect.</p>
</div>
<div class="section" id="spatial-error">
<h4>Spatial Error<a class="headerlink" href="#spatial-error" title="Permalink to this headline">¶</a></h4>
<p>The spatial error model includes a spatial lag in the <em>error</em> term of the equation:</p>
<div class="math notranslate nohighlight">
\[
\log{P_i} = \alpha + \sum_k \beta_k X_{ki} + u_i
\]</div>
<div class="math notranslate nohighlight">
\[
u_i = \lambda u_{lag-i} + \epsilon_i
\]</div>
<p>where <span class="math notranslate nohighlight">\(u_{lag-i} = \sum_j w_{i,j} u_j\)</span>.
Although it appears similar, this specification violates the assumptions about the
error term in a classical OLS model. Hence, alternative estimation methods are
required. <code class="docutils literal notranslate"><span class="pre">PySAL</span></code> incorporates functionality to estimate several of the most
advanced techniques developed by the literature on spatial econometrics. For
example, we can use a generalized method of moments that accounts for
heteroskedasticity (Arraiz et al., 2010):</p>
<div class="cell docutils container">
<div class="cell_input docutils container">
<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">m6</span> <span class="o">=</span> <span class="n">spreg</span><span class="o">.</span><span class="n">GM_Error_Het</span><span class="p">(</span><span class="n">db</span><span class="p">[[</span><span class="s1">&#39;log_price&#39;</span><span class="p">]]</span><span class="o">.</span><span class="n">values</span><span class="p">,</span> <span class="n">db</span><span class="p">[</span><span class="n">variable_names</span><span class="p">]</span><span class="o">.</span><span class="n">values</span><span class="p">,</span>
                           <span class="n">w</span><span class="o">=</span><span class="n">knn</span><span class="p">,</span> <span class="n">name_y</span><span class="o">=</span><span class="s1">&#39;log_price&#39;</span><span class="p">,</span> <span class="n">name_x</span><span class="o">=</span><span class="n">variable_names</span><span class="p">)</span>
<span class="nb">print</span><span class="p">(</span><span class="n">m6</span><span class="o">.</span><span class="n">summary</span><span class="p">)</span>
</pre></div>
</div>
</div>
<div class="cell_output docutils container">
<div class="output stream highlight-myst-ansi notranslate"><div class="highlight"><pre><span></span>REGRESSION
----------
SUMMARY OF OUTPUT: SPATIALLY WEIGHTED LEAST SQUARES (HET)
---------------------------------------------------------
Data set            :     unknown
Weights matrix      :     unknown
Dependent Variable  :   log_price                Number of Observations:        6110
Mean dependent var  :      4.9958                Number of Variables   :          11
S.D. dependent var  :      0.8072                Degrees of Freedom    :        6099
Pseudo R-squared    :      0.6655
N. of iterations    :           1                Step1c computed       :          No

------------------------------------------------------------------------------------
            Variable     Coefficient       Std.Error     z-Statistic     Probability
------------------------------------------------------------------------------------
            CONSTANT       4.4262033       0.0217088     203.8898738       0.0000000
        accommodates       0.0695536       0.0063268      10.9934495       0.0000000
           bathrooms       0.1614101       0.0151312      10.6673765       0.0000000
            bedrooms       0.1739251       0.0146697      11.8560847       0.0000000
                beds      -0.0377710       0.0088293      -4.2779023       0.0000189
     rt_Private_room      -0.4947947       0.0163843     -30.1993140       0.0000000
      rt_Shared_room      -1.1613985       0.0515304     -22.5381175       0.0000000
      pg_Condominium       0.1003761       0.0213148       4.7092198       0.0000025
            pg_House       0.0308334       0.0147100       2.0960849       0.0360747
            pg_Other       0.0861768       0.0254942       3.3802547       0.0007242
        pg_Townhouse      -0.0074515       0.0292873      -0.2544285       0.7991646
              lambda       0.6448728       0.0186626      34.5543739       0.0000000
------------------------------------------------------------------------------------
================================ END OF REPORT =====================================
</pre></div>
</div>
</div>
</div>
</div>
<div class="section" id="spatial-lag">
<h4>Spatial Lag<a class="headerlink" href="#spatial-lag" title="Permalink to this headline">¶</a></h4>
<p>The spatial lag model introduces a spatial lag of the <em>dependent</em> variable. In the example we have covered, this would translate into:</p>
<div class="math notranslate nohighlight">
\[
\log{P_i} = \alpha + \rho \log{P_{lag-i}} + \sum_k \beta_k X_{ki} + \epsilon_i
\]</div>
<p>Although it might not seem very different from the previous equation, this model violates
the exogeneity assumption, crucial for OLS to work.
Put simply, this occurs when <span class="math notranslate nohighlight">\(P_i\)</span> exists on both “sides” of the equals sign.
In theory, since <span class="math notranslate nohighlight">\(P_i\)</span> is included in computing <span class="math notranslate nohighlight">\(P_{lag-i}\)</span>, exogeneity is violated.
Similarly to the case of
the spatial error, several techniques have been proposed to overcome this
limitation, and <code class="docutils literal notranslate"><span class="pre">PySAL</span></code> implements several of them. In the example below, we
use a two-stage least squares estimation (Anselin, 1988), where the spatial
lags of all the explanatory variables are used as instruments for the endogenous
lag:</p>
<div class="cell docutils container">
<div class="cell_input docutils container">
<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">m7</span> <span class="o">=</span> <span class="n">spreg</span><span class="o">.</span><span class="n">GM_Lag</span><span class="p">(</span><span class="n">db</span><span class="p">[[</span><span class="s1">&#39;log_price&#39;</span><span class="p">]]</span><span class="o">.</span><span class="n">values</span><span class="p">,</span> <span class="n">db</span><span class="p">[</span><span class="n">variable_names</span><span class="p">]</span><span class="o">.</span><span class="n">values</span><span class="p">,</span>
                     <span class="n">w</span><span class="o">=</span><span class="n">knn</span><span class="p">,</span> <span class="n">name_y</span><span class="o">=</span><span class="s1">&#39;log_price&#39;</span><span class="p">,</span> <span class="n">name_x</span><span class="o">=</span><span class="n">variable_names</span><span class="p">)</span>
<span class="nb">print</span><span class="p">(</span><span class="n">m7</span><span class="o">.</span><span class="n">summary</span><span class="p">)</span>
</pre></div>
</div>
</div>
<div class="cell_output docutils container">
<div class="output stream highlight-myst-ansi notranslate"><div class="highlight"><pre><span></span>REGRESSION
----------
SUMMARY OF OUTPUT: SPATIAL TWO STAGE LEAST SQUARES
--------------------------------------------------
Data set            :     unknown
Weights matrix      :     unknown
Dependent Variable  :   log_price                Number of Observations:        6110
Mean dependent var  :      4.9958                Number of Variables   :          12
S.D. dependent var  :      0.8072                Degrees of Freedom    :        6098
Pseudo R-squared    :      0.7057
Spatial Pseudo R-squared:  0.6883

------------------------------------------------------------------------------------
            Variable     Coefficient       Std.Error     z-Statistic     Probability
------------------------------------------------------------------------------------
            CONSTANT       2.7440254       0.0727290      37.7294400       0.0000000
        accommodates       0.0697596       0.0048157      14.4859187       0.0000000
           bathrooms       0.1626725       0.0104007      15.6405467       0.0000000
            bedrooms       0.1604137       0.0104823      15.3033012       0.0000000
                beds      -0.0365065       0.0065336      -5.5874750       0.0000000
     rt_Private_room      -0.4981415       0.0151396     -32.9031780       0.0000000
      rt_Shared_room      -1.1157392       0.0365563     -30.5210777       0.0000000
      pg_Condominium       0.1072995       0.0209048       5.1327614       0.0000003
            pg_House      -0.0004017       0.0136828      -0.0293598       0.9765777
            pg_Other       0.1207503       0.0214771       5.6222929       0.0000000
        pg_Townhouse      -0.0185543       0.0322730      -0.5749190       0.5653461
         W_log_price       0.3416482       0.0147787      23.1175620       0.0000000
------------------------------------------------------------------------------------
Instrumented: W_log_price
Instruments: W_accommodates, W_bathrooms, W_bedrooms, W_beds,
             W_pg_Condominium, W_pg_House, W_pg_Other, W_pg_Townhouse,
             W_rt_Private_room, W_rt_Shared_room
================================ END OF REPORT =====================================
</pre></div>
</div>
</div>
</div>
<p>Similarly to the effects in the SLX regression, changes in the spatial lag regression need to be interpreted with care. Here, <code class="docutils literal notranslate"><span class="pre">w_log_price</span></code> applies consistently over all observations, and actually changes the effective strength of each of the <span class="math notranslate nohighlight">\(\beta\)</span> coefficients. Thus, it is useful to use predictions and scenario-building to predict <span class="math notranslate nohighlight">\(y\)</span> when changing <span class="math notranslate nohighlight">\(X\)</span>, which allows you to analyze the <em>direct</em> and <em>indirect</em> components.</p>
</div>
<div class="section" id="other-ways-of-bringing-space-into-regression">
<h4>Other ways of bringing space into regression<a class="headerlink" href="#other-ways-of-bringing-space-into-regression" title="Permalink to this headline">¶</a></h4>
<p>While these are some kinds of spatial regressions, many other advanced spatial regression methods see routine use in statistics, data science, and applied analysis. For example, Generalized Additive Models <a href="#id11"><span class="problematic" id="id12">:cite:`Gibbons_2015,Wood_2006`</span></a> have been used to apply spatial kernel smoothing directly within a regression function. Other similar smoothing methods, such as spatial Gaussian process models <a href="#id13"><span class="problematic" id="id14">:cite:`Brunsdon_2010`</span></a> or Kriging, conceptualize the dependence between locations as smooth as well. Other methods in spatial regression that consider graph-based geographies (rather than distance/kernel effects) include variations on the conditional autoregressive model, which examines spatial relationships at locations <em>conditional</em> on their surroundings, rather than as jointly co-emergent with them. Full coverage of these topics is beyond the scope of this book, though <a href="#id15"><span class="problematic" id="id16">:cite:`Banerjee_2008`</span></a> provides a detailed and comprehensive discussion.</p>
</div>
</div>
</div>
<div class="section" id="questions">
<h2>Questions<a class="headerlink" href="#questions" title="Permalink to this headline">¶</a></h2>
<ol class="simple">
<li><p>One common kind of spatial econometric model is the “Spatial Durbin Model,” which combines the SLX model with the spatial lag model. Alternatively, the “Spatial Durbin Error Model” combines the SLX model with the spatial error model. Fit a Spatial Durbin variant of the spatial models fit in this chapter.</p>
<ul class="simple">
<li><p>Do these variants improve the model fit?</p></li>
<li><p>What happens to the spatial autocorrelation parameters (<span class="math notranslate nohighlight">\(\rho\)</span>, <span class="math notranslate nohighlight">\(\lambda\)</span>) when the SLX term is added? Why might this occur?</p></li>
</ul>
</li>
<li><p>Fortunately for us, spatial error models recover the same estimates (asymptotically) as a typical OLS estimate, although their confidence intervals will change. Statistically, this occurs because OLS is <em>inefficient</em> when there is spatial correlation and/or spatial heteroskedasticity. How much do the confidence intervals change when the spatial error model is fit?</p></li>
<li><p>One common justification for the SLX model (and the Spatial Durbin variants) is about <em>omitted, spatially-patterned variables</em>. That is, if an omitted variable is associated with the included variables <em>and</em> is spatially-patterned, then we can use the spatial structure of our existing variables to mimic the omitted variable. In our spatial lag model,</p>
<ul class="simple">
<li><p>what variables might we be missing that are important to predict the price of an AirBnB?</p></li>
<li><p>would these omitted variables have a similar spatial pattern to our included variables? why or why not?</p></li>
</ul>
</li>
<li><p>Where <em>spatial</em> regression models generally focus on how nearby observations are similar to one another, <em>platial</em> models focus on how observations in the same spatial group are similar to one another. These are often dealt with using multilevel or spatial mixed-effect models. When do these two ideas work together well? And, when might these disagree?</p></li>
</ol>
<div class="section" id="challenge-questions">
<h3>Challenge Questions<a class="headerlink" href="#challenge-questions" title="Permalink to this headline">¶</a></h3>
<p>The following discussions are a bit challenging, but reflect extra enhancements to the discussions in the chapter that may solidify or enhance an advanced understanding of the material.</p>
<div class="section" id="the-random-coast">
<h4>The random coast<a class="headerlink" href="#the-random-coast" title="Permalink to this headline">¶</a></h4>
<p>In the section analyzing our naive model residuals, we ran a classic two-sample <span class="math notranslate nohighlight">\(t\)</span>-test to identify whether or not our coastal and not-coastal residential districts tended to have the same prediction errors. Often, though, it’s better to use straightforward, data-driven testing and simulation methods than assuming that the mathematical assumptions of the <span class="math notranslate nohighlight">\(t\)</span>-statistic are met.</p>
<p>To do this, we can shuffle our assignments to coast and not-coast, and check whether or not there are differences in the distributions of the <em>observed</em> residual distributions and random distributions. In this way, we shuffle the observations that are on the coast, and plot the resulting cumulative distributions.</p>
<p>Below, we do this, running 100 simulated re-assignments of districts to either “coast” or “not coast,” and comparing the distributions of randomly-assigned residuals to the observed distributions of residuals. Further, we do this plotting by the <em>empirical cumulative distribution function</em>, not the histogram directly. This is because the <em>empirical cumulative distribution function</em> is usually easier to examine visually, especially for subtle differences.</p>
<p>Below, the black lines represent our simulations, and the colored patches below them represent the observed distribution of residuals. If the black lines tend to be on the left of the colored patch, then, the simulations (where prediction error is totally random with respect to our categories of “coastal” and “not coastal”) tend to have more negative residuals than our actual model. If the black lines tend to be on the right, then they tend to have more positive residuals. As a refresher, positive residuals mean that our model is underpredicting, and negative residuals mean that our model is overpredicting. Below, our simulations provide direct evidence for the claim that our model may be systematically underpredicting coastal prices and overpredicting non-coastal prices.</p>
<div class="cell docutils container">
<div class="cell_input docutils container">
<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">n_simulations</span> <span class="o">=</span> <span class="mi">100</span>
<span class="n">f</span><span class="p">,</span> <span class="n">ax</span> <span class="o">=</span> <span class="n">plt</span><span class="o">.</span><span class="n">subplots</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span><span class="mi">2</span><span class="p">,</span><span class="n">figsize</span><span class="o">=</span><span class="p">(</span><span class="mi">12</span><span class="p">,</span><span class="mi">3</span><span class="p">),</span> <span class="n">sharex</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">sharey</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
<span class="n">ax</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">hist</span><span class="p">(</span><span class="n">coastal</span><span class="p">,</span> <span class="n">color</span><span class="o">=</span><span class="s1">&#39;r&#39;</span><span class="p">,</span> <span class="n">alpha</span><span class="o">=.</span><span class="mi">5</span><span class="p">,</span> 
           <span class="n">density</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">bins</span><span class="o">=</span><span class="mi">30</span><span class="p">,</span> <span class="n">label</span><span class="o">=</span><span class="s1">&#39;Coastal&#39;</span><span class="p">,</span> 
           <span class="n">cumulative</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
<span class="n">ax</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span><span class="o">.</span><span class="n">hist</span><span class="p">(</span><span class="n">not_coastal</span><span class="p">,</span> <span class="n">color</span><span class="o">=</span><span class="s1">&#39;b&#39;</span><span class="p">,</span> <span class="n">alpha</span><span class="o">=.</span><span class="mi">5</span><span class="p">,</span>
           <span class="n">density</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">bins</span><span class="o">=</span><span class="mi">30</span><span class="p">,</span> <span class="n">label</span><span class="o">=</span><span class="s1">&#39;Not Coastal&#39;</span><span class="p">,</span> 
           <span class="n">cumulative</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
<span class="k">for</span> <span class="n">simulation</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="n">n_simulations</span><span class="p">):</span>
    <span class="n">shuffled_residuals</span> <span class="o">=</span> <span class="n">m1</span><span class="o">.</span><span class="n">u</span><span class="p">[</span><span class="n">numpy</span><span class="o">.</span><span class="n">random</span><span class="o">.</span><span class="n">permutation</span><span class="p">(</span><span class="n">m1</span><span class="o">.</span><span class="n">n</span><span class="p">)]</span>
    <span class="n">random_coast</span><span class="p">,</span> <span class="n">random_notcoast</span> <span class="o">=</span> <span class="p">(</span><span class="n">shuffled_residuals</span><span class="p">[</span><span class="n">is_coastal</span><span class="p">],</span> 
                                     <span class="n">shuffled_residuals</span><span class="p">[</span><span class="o">~</span><span class="n">is_coastal</span><span class="p">])</span>
    <span class="k">if</span> <span class="n">simulation</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span>
        <span class="n">label</span> <span class="o">=</span> <span class="s1">&#39;Simulations&#39;</span>
    <span class="k">else</span><span class="p">:</span>
        <span class="n">label</span> <span class="o">=</span> <span class="kc">None</span>
    <span class="n">ax</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">hist</span><span class="p">(</span><span class="n">random_coast</span><span class="p">,</span> 
                <span class="n">density</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> 
                <span class="n">histtype</span><span class="o">=</span><span class="s1">&#39;step&#39;</span><span class="p">,</span>
                <span class="n">color</span><span class="o">=</span><span class="s1">&#39;k&#39;</span><span class="p">,</span> <span class="n">alpha</span><span class="o">=.</span><span class="mi">05</span><span class="p">,</span> <span class="n">bins</span><span class="o">=</span><span class="mi">30</span><span class="p">,</span> 
                <span class="n">label</span><span class="o">=</span><span class="n">label</span><span class="p">,</span> 
                <span class="n">cumulative</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
    <span class="n">ax</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span><span class="o">.</span><span class="n">hist</span><span class="p">(</span><span class="n">random_notcoast</span><span class="p">,</span> 
                <span class="n">density</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> 
                <span class="n">histtype</span><span class="o">=</span><span class="s1">&#39;step&#39;</span><span class="p">,</span>
                <span class="n">color</span><span class="o">=</span><span class="s1">&#39;k&#39;</span><span class="p">,</span> <span class="n">alpha</span><span class="o">=.</span><span class="mi">05</span><span class="p">,</span> <span class="n">bins</span><span class="o">=</span><span class="mi">30</span><span class="p">,</span> 
                <span class="n">label</span><span class="o">=</span><span class="n">label</span><span class="p">,</span> 
                <span class="n">cumulative</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
<span class="n">ax</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">legend</span><span class="p">()</span>
<span class="n">ax</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span><span class="o">.</span><span class="n">legend</span><span class="p">()</span>
<span class="n">plt</span><span class="o">.</span><span class="n">tight_layout</span><span class="p">()</span>
<span class="n">plt</span><span class="o">.</span><span class="n">show</span><span class="p">()</span>
</pre></div>
</div>
</div>
<div class="cell_output docutils container">
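<img alt="Cumulative distributions of observed coastal (left) and not-coastal (right) residuals, overlaid with 100 randomly shuffled simulations" src="../_images/11_regression_80_0.png" />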
</div>
</div>
</div>
<div class="section" id="the-k-neighbor-correlogram">
<h4>The K-neighbor correlogram<a class="headerlink" href="#the-k-neighbor-correlogram" title="Permalink to this headline">¶</a></h4>
<p>Further, it might be the case that spatial dependence in our mis-predictions only matters for sites that are extremely close to one another, and decays quickly with distance.
To investigate this, we can examine the correlation between each site’s residual and the <em>average</em> of its <span class="math notranslate nohighlight">\(k\)</span> nearest neighbors’ residuals, increasing <span class="math notranslate nohighlight">\(k\)</span> until the estimate stabilizes.
This main idea is central to the geostatistical concept, the <em>correlogram</em>, which gives the correlation between sites of an attribute being studied as distance increases.</p>
<p>One quick way to check whether or not what we’ve seen is <em>unique</em> or <em>significant</em> is to compare it to what happens when we just assign neighbors randomly.
If what we observe is substantially different from what emerges when neighbors are random, then the structure of the neighbors embeds a structure in the residuals.
We won’t spend too much time on this theory specifically, but we can quickly and efficiently compute the correlation between our observed residuals and the spatial lag of an increasing <span class="math notranslate nohighlight">\(k\)</span>-nearest neighbor set:</p>
<div class="cell docutils container">
<div class="cell_input docutils container">
<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">correlations</span> <span class="o">=</span> <span class="p">[]</span>
<span class="n">nulls</span> <span class="o">=</span> <span class="p">[]</span>
<span class="k">for</span> <span class="n">order</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="mi">51</span><span class="p">,</span> <span class="mi">5</span><span class="p">):</span>
    <span class="n">knn</span><span class="o">.</span><span class="n">reweight</span><span class="p">(</span><span class="n">k</span><span class="o">=</span><span class="n">order</span><span class="p">,</span> <span class="n">inplace</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span> <span class="c1">#operates in place, quickly and efficiently avoiding copies</span>
    <span class="n">knn</span><span class="o">.</span><span class="n">transform</span> <span class="o">=</span> <span class="s1">&#39;r&#39;</span>
    <span class="n">lag_residual</span> <span class="o">=</span> <span class="n">weights</span><span class="o">.</span><span class="n">spatial_lag</span><span class="o">.</span><span class="n">lag_spatial</span><span class="p">(</span><span class="n">knn</span><span class="p">,</span> <span class="n">m1</span><span class="o">.</span><span class="n">u</span><span class="p">)</span>
    <span class="n">random_residual</span> <span class="o">=</span> <span class="n">m1</span><span class="o">.</span><span class="n">u</span><span class="p">[</span><span class="n">numpy</span><span class="o">.</span><span class="n">random</span><span class="o">.</span><span class="n">permutation</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="n">m1</span><span class="o">.</span><span class="n">u</span><span class="p">))]</span> 
    <span class="n">random_lag_residual</span> <span class="o">=</span> <span class="n">weights</span><span class="o">.</span><span class="n">spatial_lag</span><span class="o">.</span><span class="n">lag_spatial</span><span class="p">(</span><span class="n">knn</span><span class="p">,</span> <span class="n">random_residual</span><span class="p">)</span> <span class="c1"># identical to random neighbors in KNN </span>
    <span class="n">correlations</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">numpy</span><span class="o">.</span><span class="n">corrcoef</span><span class="p">(</span><span class="n">m1</span><span class="o">.</span><span class="n">u</span><span class="o">.</span><span class="n">flatten</span><span class="p">(),</span> <span class="n">lag_residual</span><span class="o">.</span><span class="n">flatten</span><span class="p">())[</span><span class="mi">0</span><span class="p">,</span><span class="mi">1</span><span class="p">])</span>
    <span class="n">nulls</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">numpy</span><span class="o">.</span><span class="n">corrcoef</span><span class="p">(</span><span class="n">m1</span><span class="o">.</span><span class="n">u</span><span class="o">.</span><span class="n">flatten</span><span class="p">(),</span> <span class="n">random_lag_residual</span><span class="o">.</span><span class="n">flatten</span><span class="p">())[</span><span class="mi">0</span><span class="p">,</span><span class="mi">1</span><span class="p">])</span>
</pre></div>
</div>
</div>
<div class="cell_output docutils container">
<div class="output stderr highlight-myst-ansi notranslate"><div class="highlight"><pre><span></span>/opt/conda/lib/python3.8/site-packages/libpysal/weights/weights.py:172: UserWarning: The weights matrix is not fully connected: 
 There are 1849 disconnected components.
  warnings.warn(message)
/opt/conda/lib/python3.8/site-packages/libpysal/weights/weights.py:172: UserWarning: The weights matrix is not fully connected: 
 There are 9 disconnected components.
  warnings.warn(message)
/opt/conda/lib/python3.8/site-packages/libpysal/weights/weights.py:172: UserWarning: The weights matrix is not fully connected: 
 There are 4 disconnected components.
  warnings.warn(message)
/opt/conda/lib/python3.8/site-packages/libpysal/weights/weights.py:172: UserWarning: The weights matrix is not fully connected: 
 There are 3 disconnected components.
  warnings.warn(message)
/opt/conda/lib/python3.8/site-packages/libpysal/weights/weights.py:172: UserWarning: The weights matrix is not fully connected: 
 There are 2 disconnected components.
  warnings.warn(message)
</pre></div>
</div>
</div>
</div>
<div class="cell docutils container">
<div class="cell_input docutils container">
<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">plt</span><span class="o">.</span><span class="n">plot</span><span class="p">(</span><span class="nb">range</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span><span class="mi">51</span><span class="p">,</span><span class="mi">5</span><span class="p">),</span> <span class="n">correlations</span><span class="p">)</span>
<span class="n">plt</span><span class="o">.</span><span class="n">plot</span><span class="p">(</span><span class="nb">range</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span><span class="mi">51</span><span class="p">,</span><span class="mi">5</span><span class="p">),</span> <span class="n">nulls</span><span class="p">,</span> <span class="n">color</span><span class="o">=</span><span class="s1">&#39;orangered&#39;</span><span class="p">)</span>
<span class="n">plt</span><span class="o">.</span><span class="n">hlines</span><span class="p">(</span><span class="n">numpy</span><span class="o">.</span><span class="n">mean</span><span class="p">(</span><span class="n">correlations</span><span class="p">[</span><span class="o">-</span><span class="mi">3</span><span class="p">:]),</span><span class="o">*</span><span class="n">plt</span><span class="o">.</span><span class="n">xlim</span><span class="p">(),</span><span class="n">linestyle</span><span class="o">=</span><span class="s1">&#39;:&#39;</span><span class="p">,</span> <span class="n">color</span><span class="o">=</span><span class="s1">&#39;k&#39;</span><span class="p">)</span>
<span class="n">plt</span><span class="o">.</span><span class="n">hlines</span><span class="p">(</span><span class="n">numpy</span><span class="o">.</span><span class="n">mean</span><span class="p">(</span><span class="n">nulls</span><span class="p">[</span><span class="o">-</span><span class="mi">3</span><span class="p">:]),</span><span class="o">*</span><span class="n">plt</span><span class="o">.</span><span class="n">xlim</span><span class="p">(),</span><span class="n">linestyle</span><span class="o">=</span><span class="s1">&#39;:&#39;</span><span class="p">,</span> <span class="n">color</span><span class="o">=</span><span class="s1">&#39;k&#39;</span><span class="p">)</span>
<span class="n">plt</span><span class="o">.</span><span class="n">text</span><span class="p">(</span><span class="n">s</span><span class="o">=</span><span class="s1">&#39;Long-Run Correlation: $</span><span class="si">{:.2f}</span><span class="s1">$&#39;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">numpy</span><span class="o">.</span><span class="n">mean</span><span class="p">(</span><span class="n">correlations</span><span class="p">[</span><span class="o">-</span><span class="mi">3</span><span class="p">:])),</span> <span class="n">x</span><span class="o">=</span><span class="mi">25</span><span class="p">,</span><span class="n">y</span><span class="o">=.</span><span class="mi">3</span><span class="p">)</span>
<span class="n">plt</span><span class="o">.</span><span class="n">text</span><span class="p">(</span><span class="n">s</span><span class="o">=</span><span class="s1">&#39;Long-Run Null: $</span><span class="si">{:.2f}</span><span class="s1">$&#39;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">numpy</span><span class="o">.</span><span class="n">mean</span><span class="p">(</span><span class="n">nulls</span><span class="p">[</span><span class="o">-</span><span class="mi">3</span><span class="p">:])),</span> <span class="n">x</span><span class="o">=</span><span class="mi">25</span><span class="p">,</span> <span class="n">y</span><span class="o">=.</span><span class="mi">05</span><span class="p">)</span>
<span class="n">plt</span><span class="o">.</span><span class="n">xlabel</span><span class="p">(</span><span class="s1">&#39;$K$: number of nearest neighbors&#39;</span><span class="p">)</span>
<span class="n">plt</span><span class="o">.</span><span class="n">ylabel</span><span class="p">(</span><span class="s2">&quot;Correlation between site </span><span class="se">\n</span><span class="s2"> and neighborhood average of size $K$&quot;</span><span class="p">)</span>
<span class="n">plt</span><span class="o">.</span><span class="n">show</span><span class="p">()</span>
</pre></div>
</div>
</div>
<div class="cell_output docutils container">
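<img alt="Correlation between residuals and the average residual of their K nearest neighbors for K from 1 to 46, compared with a randomized null" src="../_images/11_regression_83_0.png" />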
</div>
</div>
<p>Clearly, the two curves are different. The observed correlation reaches a peak around <span class="math notranslate nohighlight">\(r=.34\)</span> when around 20 nearest listings are used. This means that adding more than 20 nearest neighbors does not significantly change the correlation in the residuals. Further, the lowest correlation is for the single nearest neighbor, and correlation rapidly increases as more neighbors are added close to the listing. Thus, there does appear to be an unmeasured spatial structure in the residuals, since they are more similar to one another when they are near than when they are far apart. Further, while it’s not shown here (since computationally, it becomes intractable), as the number of nearest neighbors gets very large (approaching the number of observations in the dataset), the average of the <span class="math notranslate nohighlight">\(k\)</span> nearest neighbors’ residuals goes to zero, the global average of residuals. This means that the correlation of the residuals and a vector that is nearly constant begins to approach zero.</p>
<p>The null correlations, however, use randomly-chosen neighbors (without replacement).
Thus, since sampling is truly random in this case, each average of <span class="math notranslate nohighlight">\(k\)</span> randomly-chosen neighbors is usually close to zero (the global mean).
So, the correlation between the observed residual and the average of <span class="math notranslate nohighlight">\(k\)</span> randomly-chosen residuals is also usually close to zero.
Thus, increasing the number of randomly-chosen neighbors does not significantly adjust the long-run average of zero.
Taken together, we can conclude that there is distinct positive spatial dependence in the error.
This means that our over- and under-predictions are likely to cluster.</p>
</div>
</div>
</div>
</div>

    <script type="text/x-thebe-config">
    {
        requestKernel: true,
        binderOptions: {
            repo: "binder-examples/jupyter-stacks-datascience",
            ref: "master",
        },
        codeMirrorConfig: {
            theme: "abcdef",
            mode: "python"
        },
        kernelOptions: {
            kernelName: "python3",
            path: "./notebooks"
        },
        predefinedOutput: true
    }
    </script>
    <script>kernelName = 'python3'</script>

              </div>
              
        
        <div class='prev-next-bottom'>
            
    <a class='left-prev' id="prev-link" href="10_clustering_and_regionalization.html" title="previous page">Clustering &amp; Regionalization</a>
    <a class='right-next' id="next-link" href="12_feature_engineering.html" title="next page">Spatial Feature Engineering</a>

        </div>
        
        </div>
    </div>
    <footer class="footer mt-5 mt-md-0">
    <div class="container">
      <p>
        
          By Sergio J. Rey, Dani Arribas-Bel, Levi J. Wolf<br/>
        
            &copy; Copyright 2020.<br/>
          <div class="extra_footer">
            <a rel="license" href="http://creativecommons.org/licenses/by-nc-nd/4.0/"><img alt="Creative Commons License" style="border-width:0" src="https://i.creativecommons.org/l/by-nc-nd/4.0/88x31.png" /></a><br />This work is licensed under a <a rel="license" href="http://creativecommons.org/licenses/by-nc-nd/4.0/">Creative Commons Attribution-NonCommercial-NoDerivatives 4.0 International License</a>.

          </div>
      </p>
    </div>
  </footer>
</main>


      </div>
    </div>

    
  <script src="../_static/js/index.d3f166471bb80abb5163.js"></script>


    <!-- Google Analytics -->
    <script>
      window.ga=window.ga||function(){(ga.q=ga.q||[]).push(arguments)};ga.l=+new Date;
      ga('create', 'UA-146598819-1', 'auto');
      ga('set', 'anonymizeIp', true);
      ga('send', 'pageview');
    </script>
    <script async src='https://www.google-analytics.com/analytics.js'></script>
    <!-- End Google Analytics -->
    
  </body>
</html>