In [1]:
from __future__ import division
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
import pandas as pd
import pysal as ps
import geopandas as gpd
from geopandas import GeoSeries, GeoDataFrame
from shapely.geometry import Point
from sklearn import neighbors

# --- Plot and display configuration ----------------------------------------
# White seaborn style with a wide default figure size.
sns.set(style="white")
sns.set_context({"figure.figsize": (24, 10)})

# Render floats with two decimals in every pandas repr.
pd.set_option('display.float_format', '{:.2f}'.format)

# --- Input files -------------------------------------------------------------
abb_link = './tfg/dbases/development3.csv'  # CSV loaded into `muestra`
zc_link = './tfg/mapas/barrios_area.shp'    # shapefile loaded into `barrios`

muestra = pd.read_csv(abb_link)   # tabular sample; must provide 'lon' and 'lat'
barrios = gpd.read_file(zc_link)  # polygon layer used for the spatial join

# One shapely Point per sample row, built from (lon, lat) pairs.
geometry = list(map(Point, zip(muestra['lon'], muestra['lat'])))

# Coordinates are declared as EPSG:4326 (WGS84 longitude/latitude).
crs = {'init': 'epsg:4326'}
geo_df = GeoDataFrame(muestra, geometry=geometry, crs=crs)

# Keep only the points that intersect a polygon of `barrios`; each kept row
# also receives the attribute columns of the polygon it falls in.
db = gpd.sjoin(geo_df, barrios, op='intersects', how="inner")

# Metro distance table, keyed by 'InputID'.
metro = pd.read_csv('./tfg/dbases/distance_matrix_metro.csv')

# Left join: every row of db is kept; metro columns are matched through
# db['id'] == metro['InputID'].
db = db.join(metro.set_index('InputID'), how='left', on='id')

# Cast the index labels to str and give the joined columns readable names.
db = db.rename(
    index=str,
    columns={
        "DESBDT": "subdistrict_f",
        "Distance": "metro_distance",
        "NUMPOINTS": "metro_number",
    }
)

# Continue with a plain pandas DataFrame.
db = pd.DataFrame(db)

# Textual floor labels that are all recoded as floor 0.  With regex=True each
# label is used as a regular-expression pattern by `replace`.
ground_level_labels = [
    'Ground floor',
    'Mezzanine',
    'Semi-basement',
    'Basement',
    'ground',
    'Floor -2',
    'Floor -1',
]
db['floor'] = db['floor'].replace(ground_level_labels, 0, regex=True)

# Whatever remains must parse as a number; to_numeric raises otherwise.
db['floor'] = pd.to_numeric(db['floor'])

In [2]:
# Columns used in the descriptive table and in the regressions.
varis = [
    'pricems',
    'rooms',
    'floor',
    'needs_renovating',
    'garden',
    'terrace',
    'new_dev',
    'garage',
]

In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) of the columns in `varis`.
db[varis].describe()

Unnamed: 0,pricems,rooms,floor,needs_renovating,garden,terrace,new_dev,garage
count,19177.0,19177.0,19177.0,19177.0,19177.0,19177.0,19177.0,19177.0
mean,3288.34,3.01,3.02,0.18,0.27,0.42,0.03,0.34
std,1755.78,1.28,2.65,0.38,0.45,0.49,0.17,0.47
min,468.85,1.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2000.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0
50%,3000.0,3.0,2.0,0.0,0.0,0.0,0.0,0.0
75%,4109.59,4.0,4.0,0.0,1.0,1.0,0.0,1.0
max,23787.53,25.0,60.0,1.0,1.0,1.0,1.0,1.0


In [3]:
# Natural logarithm of the 'pricems' column, computed over every row of db.
y = np.log(db.loc[:, 'pricems'])

In [4]:
# Model frame: the selected columns with every incomplete row removed.
# NOTE(review): when `varis` itself lists 'pricems', the selection below carries
# that label twice; code that needs the regressors removes it with
# drop('pricems', axis=1).
yxs = db[varis + ['pricems']].dropna()

Regresión no espacial sin clúster

In [5]:
# Pooled (non-spatial, no clusters) OLS of ln(pricems) on the regressors in yxs.
#
# The dependent variable is taken from `yxs` itself rather than from a series
# computed on the unfiltered `db`: `yxs` has had incomplete rows dropped, so
# only a y built from the same rows is guaranteed to have the same length and
# row order as the regressor matrix.
regressors = yxs.drop('pricems', axis=1)

# 'pricems' can appear more than once among the columns of yxs, so pick the
# first occurrence by position instead of by label (label access would return
# a two-column frame in that case).
price_col = list(yxs.columns).index('pricems')
log_price = np.log(yxs.iloc[:, price_col].values)

# spreg expects y as an (n, 1) column vector and x as an (n, k) array.
m1 = ps.spreg.ols.OLS(log_price[:, None], regressors.values,
                      name_x=regressors.columns.tolist(), name_y='ln(pricems)')

In [6]:
# Print the text report of the pooled OLS fit stored in m1.
print(m1.summary)

REGRESSION
----------
SUMMARY OF OUTPUT: ORDINARY LEAST SQUARES
-----------------------------------------
Data set            :     unknown
Weights matrix      :        None
Dependent Variable  : ln(pricems)                Number of Observations:       19177
Mean dependent var  :      7.9693                Number of Variables   :           8
S.D. dependent var  :      0.5114                Degrees of Freedom    :       19169
R-squared           :      0.0936
Adjusted R-squared  :      0.0932
Sum squared residual:    4545.057                F-statistic           :    282.6218
Sigma-square        :       0.237                Prob(F-statistic)     :           0
S.E. of regression  :       0.487                Log likelihood        :  -13406.695
Sigma-square ML     :       0.237                Akaike info criterion :   26829.391
S.E of regression ML:      0.4868                Schwarz criterion     :   26892.283

-----------------------------------------------------------------------------

In [7]:
# Per-cluster containers, keyed by cluster id:
#   zona -> subset of rows belonging to the cluster
#   mreg -> regression fitted on that subset
zona, mreg = {}, {}

In [8]:
# Fit and print one non-spatial OLS per cluster id 0..7 (column db["cl"]).
# Results are stored in `zona` (row subsets) and `mreg` (fitted models).
for clu in range(0, 8):
    # Rows of db assigned to this cluster.
    zona[clu] = db[db["cl"] == clu]

    # Drop incomplete rows first, then derive BOTH the regressors and the
    # dependent variable from the filtered frame, so y and X always have the
    # same rows in the same order (previously y came from the unfiltered
    # subset and could disagree with X whenever dropna removed a row).
    yxs = zona[clu].loc[:, varis + ['pricems']].dropna()
    regressors = yxs.drop('pricems', axis=1)

    # 'pricems' can appear more than once among the columns of yxs; select
    # its first occurrence by position so that y is a single series.
    y = np.log(yxs.iloc[:, list(yxs.columns).index('pricems')])

    # The data-set label renders as e.g. 'zona [0]' in the summary header.
    mreg[clu] = ps.spreg.ols.OLS(y.values[:, None], regressors.values,
                                 name_x=regressors.columns.tolist(),
                                 name_y='ln(pricems)',
                                 name_ds='zona ' + str([clu]))
    print(mreg[clu].summary)

REGRESSION
----------
SUMMARY OF OUTPUT: ORDINARY LEAST SQUARES
-----------------------------------------
Data set            :    zona [0]
Weights matrix      :        None
Dependent Variable  : ln(pricems)                Number of Observations:        1927
Mean dependent var  :      7.9418                Number of Variables   :           8
S.D. dependent var  :      0.3581                Degrees of Freedom    :        1919
R-squared           :      0.1440
Adjusted R-squared  :      0.1409
Sum squared residual:     211.443                F-statistic           :     46.1128
Sigma-square        :       0.110                Prob(F-statistic)     :   1.129e-60
S.E. of regression  :       0.332                Log likelihood        :    -605.186
Sigma-square ML     :       0.110                Akaike info criterion :    1226.372
S.E of regression ML:      0.3312                Schwarz criterion     :    1270.882

-----------------------------------------------------------------------------