# California Housing Prices

Los datos están sacados del censo de California en 1990.

Simplemente descargamos un único archivo comprimido, housing.tgz, que contiene un archivo de valores separados por comas (CSV) llamado housing.csv con todos los datos.

In [2]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required.
# Compare the numeric (major, minor) components: a plain string comparison is
# lexicographic, so e.g. "0.9" >= "0.20" would wrongly pass the check.
import re
import sklearn
assert tuple(map(int, re.match(r"(\d+)\.(\d+)", sklearn.__version__).groups())) >= (0, 20)

# Common imports
import numpy as np
import os

# to make this notebook's output stable across runs
# (fixes the seed of NumPy's global random number generator)
np.random.seed(42)

# To plot pretty figures
# Select the backend Jupyter should use: "inline" renders figures in the notebook
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
# Default font sizes for axis labels and tick labels
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

import seaborn as sns

# Where to save the figures: ./images/end_to_end_project
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "end_to_end_project"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
# Create the directory (and parents) now; no error if it already exists
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Save the current matplotlib figure under IMAGES_PATH.

    Args:
        fig_id: Base file name (without extension) for the saved figure.
        tight_layout: When true, call ``plt.tight_layout()`` before saving.
        fig_extension: File extension, also passed to ``savefig`` as the format.
        resolution: Value passed to ``savefig`` as ``dpi``.
    """
    file_name = ".".join((fig_id, fig_extension))
    target = os.path.join(IMAGES_PATH, file_name)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(target, format=fig_extension, dpi=resolution)

# Ignore useless warnings (see SciPy issue #5998):
# suppress any warning whose message starts with "internal gelsd"
import warnings
warnings.filterwarnings(action="ignore", message="^internal gelsd")

In [3]:
import os
import tarfile
from six.moves import urllib #paquete que reúne diversos módulos para trabajar con URLs

# Base URL of the GitHub repository that hosts the dataset files
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
# Local directory where the archive is downloaded and unpacked
HOUSING_PATH = os.path.join("datasets", "housing")
# Full URL of the compressed dataset inside the repository
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"

def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    """Download the housing archive and unpack it into ``housing_path``.

    The archive is saved as ``housing.tgz`` inside ``housing_path`` and all of
    its members are extracted into that same directory.

    Args:
        housing_url: URL of the ``.tgz`` archive to download.
        housing_path: Destination directory; created if it does not exist.
    """
    # Create the destination directory (and parents) if it is missing
    os.makedirs(housing_path, exist_ok=True)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    # Copy the network object denoted by the URL to a local file
    urllib.request.urlretrieve(housing_url, tgz_path)
    # The context manager closes the archive even if extraction fails
    with tarfile.open(tgz_path) as housing_tgz:
        if hasattr(tarfile, "data_filter"):
            # The archive comes from the network: on interpreters that support
            # extraction filters, refuse members that would escape housing_path
            housing_tgz.extractall(path=housing_path, filter="data")
        else:
            housing_tgz.extractall(path=housing_path)

In [4]:
#Llamamos a la función que busca los datos


In [5]:
# Import pandas (load_housing_data below needs it as `pd`)
import pandas as pd

def load_housing_data(housing_path=HOUSING_PATH):
    """Read ``housing.csv`` from ``housing_path`` with ``pd.read_csv``.

    Args:
        housing_path: Directory that contains ``housing.csv``.

    Returns:
        The result of ``pd.read_csv`` on that file.
    """
    csv_path = os.path.join(housing_path, "housing.csv")
    return pd.read_csv(csv_path)

In [1]:
#Nos quedamos con las 10 primeras filas


In [2]:
#Sacar filas del dataframe


In [3]:
# General info


In [4]:
#Obtener número de casas en cada rango de distancia


### Para sacar la media podemos hacerlo con Numpy o con Pandas

In [5]:
#Numpy - sacamos array


In [6]:
#Aplicamos los métodos de numpy al array


### Aplicar método mean() a la columna del df de pandas

In [7]:
#Valor medio de una columna


In [8]:
#Estadísticas agregadas de múltiples columnas (valores numéricos)


In [9]:
# Se pueden definir estadísticas agregadas en vez de las predefinidas por el describe()



In [10]:
#Podemos sacar los percentiles



### Podemos ayudarnos de las visualizaciones para entender los datos

Discover and visualize the data to gain insights

In [15]:
import matplotlib.image as mpimg
# NOTE: the plotting code below is disabled by wrapping it in a string literal,
# so nothing is drawn; the string is a bare expression with no effect other
# than being echoed as the cell's output.
'''
california_img=mpimg.imread(PROJECT_ROOT_DIR + '/images/end_to_end_project/california.png')
ax = housing.plot(kind="scatter", x="longitude", y="latitude", figsize=(10,7),
                       s=housing['population']/100, label="Population",
                       c="median_house_value", cmap=plt.get_cmap("jet"),
                       colorbar=False, alpha=0.4,
                      )
plt.imshow(california_img, extent=[-124.55, -113.80, 32.45, 42.05], alpha=0.5,
           cmap=plt.get_cmap("jet"))
plt.ylabel("Latitude", fontsize=14)
plt.xlabel("Longitude", fontsize=14)

prices = housing["median_house_value"]
tick_values = np.linspace(prices.min(), prices.max(), 11)
cbar = plt.colorbar()
cbar.ax.set_yticklabels(["$%dk"%(round(v/1000)) for v in tick_values], fontsize=14)
cbar.set_label('Median House Value', fontsize=16)

plt.legend(fontsize=16)
#save_fig("california_housing_prices_plot")
plt.show()
'''

'\ncalifornia_img=mpimg.imread(PROJECT_ROOT_DIR + \'/images/end_to_end_project/california.png\')\nax = housing.plot(kind="scatter", x="longitude", y="latitude", figsize=(10,7),\n                       s=housing[\'population\']/100, label="Population",\n                       c="median_house_value", cmap=plt.get_cmap("jet"),\n                       colorbar=False, alpha=0.4,\n                      )\nplt.imshow(california_img, extent=[-124.55, -113.80, 32.45, 42.05], alpha=0.5,\n           cmap=plt.get_cmap("jet"))\nplt.ylabel("Latitude", fontsize=14)\nplt.xlabel("Longitude", fontsize=14)\n\nprices = housing["median_house_value"]\ntick_values = np.linspace(prices.min(), prices.max(), 11)\ncbar = plt.colorbar()\ncbar.ax.set_yticklabels(["$%dk"%(round(v/1000)) for v in tick_values], fontsize=14)\ncbar.set_label(\'Median House Value\', fontsize=16)\n\nplt.legend(fontsize=16)\n#save_fig("california_housing_prices_plot")\nplt.show()\n'

### En muchas ocasiones, podemos crear datos nuevos a partir de datos ya existentes, por ejemplo: