In [1]:
from __future__ import division
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
import pandas as pd
import pysal as ps
import geopandas as gpd
from geopandas import GeoSeries, GeoDataFrame
from shapely.geometry import Point
from sklearn import neighbors

# Plot styling: seaborn "white" style plus a wide default figure size.
sns.set(style="white")
figure_rc = {"figure.figsize": (24, 10)}
sns.set_context(figure_rc)

# Render floats with two decimals when pandas displays them.
pd.set_option('display.float_format', '{:.2f}'.format)

# Input files: the sample CSV and the barrios shapefile.
abb_link = './tfg/dbases/development3.csv'
zc_link = './tfg/mapas/barrios_area.shp'

muestra = pd.read_csv(abb_link)
barrios = gpd.read_file(zc_link)

# Turn every (lon, lat) pair of the sample into a point geometry (EPSG:4326).
crs = {'init': 'epsg:4326'}
geometry = [Point(lon, lat) for lon, lat in zip(muestra['lon'], muestra['lat'])]
geo_df = GeoDataFrame(muestra, crs=crs, geometry=geometry)

# Inner spatial join: keep the points that intersect a polygon of `barrios`,
# together with that polygon's attributes.
db = gpd.sjoin(geo_df, barrios, how="inner", op='intersects')

# Left-join the metro distance table, matching db['id'] against InputID.
metro = pd.read_csv('./tfg/dbases/distance_matrix_metro.csv')
metro_by_id = metro.set_index('InputID')
db = db.join(metro_by_id, on='id', how='left')

# String index labels and friendlier names for the joined columns.
column_names = {
    "DESBDT": "subdistrict_f",
    "Distance": "metro_distance",
    "NUMPOINTS": "metro_number",
}
db = db.rename(index=str, columns=column_names)

# Continue with a plain pandas DataFrame.
db = pd.DataFrame(db)

# Replace the textual floor labels with 0, then convert the column to numbers.
floor_labels = ['Ground floor', 'Mezzanine', 'Semi-basement', 'Basement', 'ground', 'Floor -2', 'Floor -1']
db['floor'] = pd.to_numeric(db['floor'].replace(floor_labels, 0, regex=True))

In [2]:
# Columns selected for the models below; 'pricems' is the one that gets
# log-transformed and used as the dependent variable.
varis = [
    'pricems',
    'rooms',
    'floor',
    'needs_renovating',
    'garden',
    'terrace',
    'new_dev',
    'garage',
]

In [31]:
# Empty per-cluster containers, filled by the loop in the next cell:
# zona -> data subset, mcl -> fitted model, mscl -> mean squared error.
zona, mcl, mscl = {}, {}, {}

In [33]:
for clu in range(0, 8):
    # Observations of cluster `clu` with share_loc == 1, one row per (lat, lon).
    zona[clu] = db[(db["cl"] == clu) & (db["share_loc"] == 1)].drop_duplicates(subset=["lat", "lon"])

    # Regressors are the `varis` columns other than the price. Selecting
    # 'pricems' only once avoids carrying a duplicated column; rows with any
    # missing value in these columns are dropped.
    x_cols = [col for col in varis if col != 'pricems']
    yxs = zona[clu].loc[:, x_cols + ['pricems']].dropna()

    # Take y from the rows that survived dropna() so that y, X and the spatial
    # weights all describe exactly the same observations.
    y = np.log(yxs['pricems'])

    # KNN weights built from the coordinates of those rows, row-standardised.
    w = ps.knnW_from_array(zona[clu].loc[yxs.index, ['lon', 'lat']].values)
    w.transform = 'R'

    # Spatial lag model for this cluster.
    mcl[clu] = ps.spreg.GM_Lag(y.values[:, None], yxs[x_cols].values,
                               w=w, spat_diag=True,
                               name_x=x_cols, name_y='ln(pricems)',
                               name_ds='zona ' + str([clu]))

    # Mean squared error of the model's predictions for ln(pricems), computed
    # with NumPy so the cell does not depend on an `mse` helper that this
    # notebook never imports or defines.
    mscl[clu] = np.mean((y.values - mcl[clu].predy.flatten()) ** 2)

    print(mcl[clu].summary)

REGRESSION
----------
SUMMARY OF OUTPUT: SPATIAL TWO STAGE LEAST SQUARES
--------------------------------------------------
Data set            :    zona [0]
Weights matrix      :     unknown
Dependent Variable  : ln(pricems)                Number of Observations:         715
Mean dependent var  :      7.9761                Number of Variables   :           7
S.D. dependent var  :      0.3347                Degrees of Freedom    :         708
Pseudo R-squared    :      0.4576
Spatial Pseudo R-squared:  0.1492

------------------------------------------------------------------------------------
            Variable     Coefficient       Std.Error     z-Statistic     Probability
------------------------------------------------------------------------------------
            CONSTANT       4.0711675       0.7416391       5.4894190       0.0000000
               floor       0.0070630       0.0031433       2.2470083       0.0246395
    needs_renovating      -0.1143547       0.0279152      -

Endógena

In [48]:
# Whole sample (no cluster filter): share_loc == 1, one row per (lat, lon).
zona = db[(db["share_loc"] == 1)].drop_duplicates(subset=["lat", "lon"])

# Regressors are the `varis` columns other than the price; 'pricems' is added
# once so the frame has no duplicated column. Incomplete rows are dropped.
x_cols = [col for col in varis if col != 'pricems']
yxs = zona.loc[:, x_cols + ['pricems']].dropna()

# y comes from the same rows as X and the weights, so all three stay aligned
# even when dropna() removes observations.
y = np.log(yxs['pricems'])

# KNN weights from the coordinates of the retained rows, row-standardised.
w = ps.knnW_from_array(zona.loc[yxs.index, ['lon', 'lat']].values)
w.transform = 'R'

mreg = ps.spreg.GM_Lag(y.values[:, None], yxs[x_cols].values,
                       w=w, spat_diag=True,
                       name_x=x_cols, name_y='ln(pricems)',
                       name_ds='madrid')

print(mreg.summary)

REGRESSION
----------
SUMMARY OF OUTPUT: SPATIAL TWO STAGE LEAST SQUARES
--------------------------------------------------
Data set            :      madrid
Weights matrix      :     unknown
Dependent Variable  : ln(pricems)                Number of Observations:        5673
Mean dependent var  :      7.9001                Number of Variables   :           7
S.D. dependent var  :      0.4282                Degrees of Freedom    :        5666
Pseudo R-squared    :      0.5786
Spatial Pseudo R-squared:  0.0725

------------------------------------------------------------------------------------
            Variable     Coefficient       Std.Error     z-Statistic     Probability
------------------------------------------------------------------------------------
            CONSTANT       4.1645780       0.3229571      12.8951421       0.0000000
               floor       0.0173748       0.0016446      10.5650279       0.0000000
    needs_renovating      -0.1306761       0.0128406     -1

In [25]:
# Same specification as before but without the 'garden' column.
varis = ['pricems', 'rooms', 'floor', 'needs_renovating', 'terrace', 'new_dev', 'garage']

# Whole sample (no cluster filter): share_loc == 1, one row per (lat, lon).
zona = db[(db["share_loc"] == 1)].drop_duplicates(subset=["lat", "lon"])

# Regressors are the `varis` columns other than the price; 'pricems' is added
# once so the frame has no duplicated column. Incomplete rows are dropped.
x_cols = [col for col in varis if col != 'pricems']
yxs = zona.loc[:, x_cols + ['pricems']].dropna()

# y comes from the same rows as X and the weights, so all three stay aligned
# even when dropna() removes observations.
y = np.log(yxs['pricems'])

# KNN weights from the coordinates of the retained rows, row-standardised.
w = ps.knnW_from_array(zona.loc[yxs.index, ['lon', 'lat']].values)
w.transform = 'R'

mreg = ps.spreg.GM_Lag(y.values[:, None], yxs[x_cols].values,
                       w=w, spat_diag=True,
                       name_x=x_cols, name_y='ln(pricems)',
                       name_ds='madrid')

print(mreg.summary)

REGRESSION
----------
SUMMARY OF OUTPUT: SPATIAL TWO STAGE LEAST SQUARES
--------------------------------------------------
Data set            :      madrid
Weights matrix      :     unknown
Dependent Variable  : ln(pricems)                Number of Observations:        5673
Mean dependent var  :      7.9001                Number of Variables   :           8
S.D. dependent var  :      0.4282                Degrees of Freedom    :        5665
Pseudo R-squared    :      0.5868
Spatial Pseudo R-squared:  0.0665

------------------------------------------------------------------------------------
            Variable     Coefficient       Std.Error     z-Statistic     Probability
------------------------------------------------------------------------------------
            CONSTANT       4.0317772       0.3117614      12.9322510       0.0000000
               rooms      -0.0154038       0.0036961      -4.1675557       0.0000308
               floor       0.0178522       0.0016146      1

Regresores

In [6]:
varis = ['pricems', 'rooms', 'floor', 'needs_renovating', 'garden', 'terrace', 'new_dev', 'garage']

# Whole sample (no cluster filter): share_loc == 1, one row per (lat, lon).
zona = db[(db["share_loc"] == 1)].drop_duplicates(subset=["lat", "lon"])

# Regressors are the `varis` columns other than the price; 'pricems' is added
# once so the frame has no duplicated column. Incomplete rows are dropped.
x_cols = [col for col in varis if col != 'pricems']
yxs = zona.loc[:, x_cols + ['pricems']].dropna()

# y is taken from the retained rows so it matches the design matrix built
# from `yxs_w` row for row.
y = np.log(yxs['pricems'])

# KNN weights on the coordinates of the retained rows; no transform is set on
# this object before it is used for the lag.
w_garden = ps.knnW_from_array(zona.loc[yxs.index, ['lon', 'lat']].values)

# Add the spatial lag of 'garden' as an extra regressor column.
yxs_w = yxs.assign(w_garden=ps.lag_spatial(w_garden, yxs['garden'].values))

In [7]:
# Row-standardised KNN weights for the spatial diagnostics, built from the
# same rows as the design matrix. This replaces a `w` left over from whichever
# earlier cell ran last, whose rows are not guaranteed to match `yxs_w`.
w_diag = ps.knnW_from_array(zona.loc[yxs_w.index, ['lon', 'lat']].values)
w_diag.transform = 'R'

# Design matrix: every column of yxs_w except the dependent variable.
x_w = yxs_w.drop('pricems', axis=1)

# Select y by the design-matrix index so both have the same observations.
m2 = ps.spreg.OLS(y.loc[yxs_w.index].values[:, None],
                  x_w.values,
                  w=w_diag, spat_diag=True,
                  name_x=x_w.columns.tolist(), name_y='ln(pricems)', name_ds='madrid')
print(m2.summary)

REGRESSION
----------
SUMMARY OF OUTPUT: ORDINARY LEAST SQUARES
-----------------------------------------
Data set            :      madrid
Weights matrix      :     unknown
Dependent Variable  : ln(pricems)                Number of Observations:        5673
Mean dependent var  :      7.9001                Number of Variables   :           9
S.D. dependent var  :      0.4282                Degrees of Freedom    :        5664
R-squared           :      0.0795
Adjusted R-squared  :      0.0782
Sum squared residual:     957.433                F-statistic           :     61.1423
Sigma-square        :       0.169                Prob(F-statistic)     :   2.563e-96
S.E. of regression  :       0.411                Log likelihood        :   -3002.887
Sigma-square ML     :       0.169                Akaike info criterion :    6023.773
S.E of regression ML:      0.4108                Schwarz criterion     :    6083.564

-----------------------------------------------------------------------------

In [8]:
# Reduced specification: 'rooms' and 'new_dev' are left out.
varis = ['pricems', 'floor', 'needs_renovating', 'garden', 'terrace',  'garage']

# Whole sample (no cluster filter): share_loc == 1, one row per (lat, lon).
zona = db[(db["share_loc"] == 1)].drop_duplicates(subset=["lat", "lon"])

# Regressors are the `varis` columns other than the price; 'pricems' is added
# once so the frame has no duplicated column. Incomplete rows are dropped.
x_cols = [col for col in varis if col != 'pricems']
yxs = zona.loc[:, x_cols + ['pricems']].dropna()

# y is taken from the retained rows so it matches the design matrix.
y = np.log(yxs['pricems'])

# Coordinates of the retained rows, shared by both weights objects below.
coords = zona.loc[yxs.index, ['lon', 'lat']].values

# Untransformed KNN weights used to build the spatial lag of 'garden'.
w_garden = ps.knnW_from_array(coords)
yxs_w = yxs.assign(w_garden=ps.lag_spatial(w_garden, yxs['garden'].values))

# Separate row-standardised KNN weights for the spatial diagnostics, built in
# this cell instead of reusing a `w` left over from an earlier cell.
w_diag = ps.knnW_from_array(coords)
w_diag.transform = 'R'

# Design matrix: every column of yxs_w except the dependent variable.
x_w = yxs_w.drop('pricems', axis=1)

m2 = ps.spreg.OLS(y.values[:, None],
                  x_w.values,
                  w=w_diag, spat_diag=True,
                  name_x=x_w.columns.tolist(), name_y='ln(pricems)', name_ds='madrid')
print(m2.summary)

REGRESSION
----------
SUMMARY OF OUTPUT: ORDINARY LEAST SQUARES
-----------------------------------------
Data set            :      madrid
Weights matrix      :     unknown
Dependent Variable  : ln(pricems)                Number of Observations:        5673
Mean dependent var  :      7.9001                Number of Variables   :           7
S.D. dependent var  :      0.4282                Degrees of Freedom    :        5666
R-squared           :      0.0789
Adjusted R-squared  :      0.0779
Sum squared residual:     958.038                F-statistic           :     80.9036
Sigma-square        :       0.169                Prob(F-statistic)     :   1.848e-97
S.E. of regression  :       0.411                Log likelihood        :   -3004.679
Sigma-square ML     :       0.169                Akaike info criterion :    6023.359
S.E of regression ML:      0.4109                Schwarz criterion     :    6069.863

-----------------------------------------------------------------------------