In [99]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [100]:
# Cargar datos
datos = pd.read_csv('melb_data.csv')
datos.head()

Unnamed: 0.1,Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,1,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,...,1.0,1.0,202.0,,,Yarra,-37.7996,144.9984,Northern Metropolitan,4019.0
1,2,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,...,1.0,0.0,156.0,79.0,1900.0,Yarra,-37.8079,144.9934,Northern Metropolitan,4019.0
2,4,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,...,2.0,0.0,134.0,150.0,1900.0,Yarra,-37.8093,144.9944,Northern Metropolitan,4019.0
3,5,Abbotsford,40 Federation La,3,h,850000.0,PI,Biggin,4/03/2017,2.5,...,2.0,1.0,94.0,,,Yarra,-37.7969,144.9969,Northern Metropolitan,4019.0
4,6,Abbotsford,55a Park St,4,h,1600000.0,VB,Nelson,4/06/2016,2.5,...,1.0,2.0,120.0,142.0,2014.0,Yarra,-37.8072,144.9941,Northern Metropolitan,4019.0


In [101]:
datos.columns

Index(['Unnamed: 0', 'Suburb', 'Address', 'Rooms', 'Type', 'Price', 'Method',
       'SellerG', 'Date', 'Distance', 'Postcode', 'Bedroom2', 'Bathroom',
       'Car', 'Landsize', 'BuildingArea', 'YearBuilt', 'CouncilArea',
       'Lattitude', 'Longtitude', 'Regionname', 'Propertycount'],
      dtype='object')

In [102]:
# Renombrar indices
datos.rename(columns={'Unnamed: 0': 'index'}, inplace=True)
datos.set_index('index', inplace=True)
datos.columns

Index(['Suburb', 'Address', 'Rooms', 'Type', 'Price', 'Method', 'SellerG',
       'Date', 'Distance', 'Postcode', 'Bedroom2', 'Bathroom', 'Car',
       'Landsize', 'BuildingArea', 'YearBuilt', 'CouncilArea', 'Lattitude',
       'Longtitude', 'Regionname', 'Propertycount'],
      dtype='object')

In [103]:
# Librería modelo de arbol de decisión
from sklearn.tree import DecisionTreeRegressor

# Librería de entrenamiento y pruebas
from sklearn.model_selection import train_test_split

In [104]:
# Seleccionar la variable objetivo o dependiente (y)
y = datos.Price

# Seleccionar las variables independientes (X)
columnas_seleccionadas = ['Rooms', 'Bathroom', 'Landsize', 'Lattitude', 'Longtitude']

X = datos[columnas_seleccionadas]

X.describe()

Unnamed: 0,Rooms,Bathroom,Landsize,Lattitude,Longtitude
count,18396.0,14925.0,13603.0,15064.0,15064.0
mean,2.93504,1.538492,558.116371,-37.809849,144.996338
std,0.958202,0.689311,3987.326586,0.081152,0.106375
min,1.0,0.0,0.0,-38.18255,144.43181
25%,2.0,1.0,176.5,-37.8581,144.931193
50%,3.0,1.0,440.0,-37.803625,145.00092
75%,3.0,2.0,651.0,-37.75627,145.06
max,12.0,8.0,433014.0,-37.40853,145.52635


In [105]:
# Borramos los valores nulos o faltantes
#datos = datos.dropna(axis=0)

# Libreria para imputar valores nulos o faltantes
from sklearn.impute import SimpleImputer

# Imputar valores nulos o faltantes
imputer = SimpleImputer()
X = pd.DataFrame(imputer.fit_transform(X))

# ** Se pueden borrar el nombre de las columnas **
X.columns = columnas_seleccionadas

X.describe()

Unnamed: 0,Rooms,Bathroom,Landsize,Lattitude,Longtitude
count,18396.0,18396.0,18396.0,18396.0,18396.0
mean,2.93504,1.538492,558.116371,-37.809849,144.996338
std,0.958202,0.62088,3428.730081,0.073436,0.09626
min,1.0,0.0,0.0,-38.18255,144.43181
25%,2.0,1.0,250.75,-37.8454,144.950952
50%,3.0,1.538492,558.116371,-37.809849,144.996338
75%,3.0,2.0,596.0,-37.7674,145.0469
max,12.0,8.0,433014.0,-37.40853,145.52635


In [106]:
#Obtener variables categoricas
object = (X.dtypes == 'object')
object

Rooms         False
Bathroom      False
Landsize      False
Lattitude     False
Longtitude    False
dtype: bool

In [107]:
# Separar datos en entrenamiento y pruebas
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [108]:
# Definimos el modelo
modelo = DecisionTreeRegressor(random_state=0)

In [109]:
# Entrenamos el modelo
modelo.fit(X_train, y_train)

In [110]:
# Predicción
print("Predicciones para las siguientes 10 casas: ")
print(X_test)
print("Las predicciones son: ")
prediccion = modelo.predict(X_test)
print(prediccion)

Predicciones para las siguientes 10 casas: 
       Rooms  Bathroom    Landsize  Lattitude  Longtitude
3008     1.0  1.000000    0.000000 -37.878400  144.986400
9189     2.0  1.000000  308.000000 -37.790800  144.871700
5053     5.0  3.000000   98.000000 -37.784300  144.893900
8264     3.0  3.000000  506.000000 -37.805000  144.944800
15373    4.0  2.000000  785.000000 -37.852970  145.238150
...      ...       ...         ...        ...         ...
17645    3.0  1.000000  718.000000 -37.880610  145.124490
16641    4.0  2.000000  870.000000 -37.726400  145.127490
11574    2.0  1.000000  158.000000 -37.892670  145.054740
3932     2.0  1.000000    0.000000 -37.939200  145.018600
12128    3.0  1.538492  558.116371 -37.809849  144.996338

[4599 rows x 5 columns]
Las predicciones son: 
[ 520000.          495000.         1350000.         ...  737000.
  600000.          968205.97912886]


In [111]:
# Agregar la columna de predicciones y real al dataframe
#X_test['Price'] = prediccion
#X_test['Real Price'] = y_test

In [112]:
# Mostrar las predicciones y los precios reales
#print(X_test.head(10))

In [113]:
# Validacion del modelo
from sklearn.metrics import mean_absolute_error

prediccion = modelo.predict(X_test)
mean_absolute_error(y_test, prediccion)


275744.00334899296

In [114]:
# Funcion obtener error medio

def obtener_error_medio(max_leaf_nodes, X_train, X_test, y_train, y_test):
    modelo = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=1)
    modelo.fit(X_train, y_train)
    prediccion = modelo.predict(X_test)
    mae = mean_absolute_error(y_test, prediccion)
    return mae

In [115]:
# Comparar el error medio con diferentes valores de max_leaf_nodes (Profundidad del arbol)
for max_leaf_nodes in [10, 15, 30, 100, 200, 1000, 5000]:
    mi_mae = obtener_error_medio(max_leaf_nodes, X_train, X_test, y_train, y_test)
    print("Max leaf nodes: %d  \t\t Mean Absolute Error:  %d" %(max_leaf_nodes, mi_mae))

Max leaf nodes: 10  		 Mean Absolute Error:  339359
Max leaf nodes: 15  		 Mean Absolute Error:  321898
Max leaf nodes: 30  		 Mean Absolute Error:  302865
Max leaf nodes: 100  		 Mean Absolute Error:  275307
Max leaf nodes: 200  		 Mean Absolute Error:  266082
Max leaf nodes: 1000  		 Mean Absolute Error:  260961
Max leaf nodes: 5000  		 Mean Absolute Error:  276117
