In [None]:
import pandas as pd
import numpy as np
from scipy.stats import zscore
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.cluster import KMeans
from xgboost import XGBRegressor
import matplotlib.pyplot as plt
import plotly.express as px
import dash
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input, Output

# Build the DataFrame from scikit-learn's California housing dataset.

data = datasets.fetch_california_housing()

df = pd.DataFrame(data["data"], columns=data["feature_names"])
# Store the target scaled by 100000 (scikit-learn documents the raw target
# as being expressed in units of $100,000).
df["MedHouseVal"] = data["target"] * 100000

# Work on a copy so that the original DataFrame is left untouched.
df_model = df.copy()

# Remove outliers so they do not distort model optimisation.

def remove_outliers(df, columns, threshold=3):
    """Replace outliers in the selected columns with NaN.

    A value counts as an outlier when the absolute z-score computed over its
    own column is greater than ``threshold``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is modified in place.
    columns : iterable
        Labels of the columns to scan for outliers.
    threshold : float, default 3
        Absolute z-score above which a value is replaced with NaN.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in, with outliers set to NaN.
    """
    for c in columns:
        # Depending on the SciPy release, zscore hands back either a pandas
        # Series or a plain ndarray; ndarrays have no .abs() method, so take
        # the absolute value with NumPy on an explicit array instead.
        is_outlier = np.abs(np.asarray(zscore(df[c]))) > threshold
        df[c] = df[c].mask(is_outlier, np.nan)

    return df

# Turn the outliers of every column into NaN, then discard the rows that
# ended up with at least one missing value.
df_model = remove_outliers(df_model, df.columns).dropna()

df_model

In [None]:
# Hold out 25% of the outlier-free rows for evaluation. The fixed seed keeps
# the split, and therefore the score printed below, reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    df_model[data["feature_names"]],
    df_model["MedHouseVal"],
    test_size=0.25,
    random_state=42,
)

# Hyperparameter tuning with a randomized search.

xgbr_test = XGBRegressor(random_state=42)

turned_parameters = {
    "n_estimators": [100, 200, 300],
    "max_depth": [3, 4, 5],
    "learning_rate": [0.3, 0.4, 0.5],
    "min_child_weight": [1, 2, 3],
}

# random_state fixes which parameter combinations get sampled.
grid_search = RandomizedSearchCV(xgbr_test, turned_parameters, cv=5, random_state=42)
grid_search.fit(x_train, y_train)

# Build the final model from the best-scoring parameter combination.

xgbr = XGBRegressor(random_state=42, **grid_search.best_params_)

model = xgbr.fit(x_train, y_train)

# Check the model against the held-out test data.

print(f"\nCoeficiente de determinación: {model.score(x_test, y_test)}")

### Características en las viviendas

In [None]:
app = dash.Dash(__name__)

# Page layout: a title, a dropdown to choose the feature, and the graph that
# the callback fills in. Every dropdown "value" must be an exact column name
# of `df`, because the callback passes it to px.scatter as the colour column.
app.layout = html.Div(
    id="body",
    className="e5_body",
    children=[
        html.H1("Viviendas en California ", id="title", className="e5_title"),
        html.Div(
            id="div",
            className="e5_div",
            children=[
                dcc.Dropdown(
                    id="dropdown",
                    className="e5_dropdown",
                    options=[
                        {"label": "Ingreso medio", "value": "MedInc"},
                        {"label": "Edad media", "value": "HouseAge"},
                        {"label": "Promedio de habitaciones", "value": "AveRooms"},
                        {"label": "Promedio de dormitorios", "value": "AveBedrms"},
                        {"label": "Población", "value": "Population"},
                        # The dataset column is "AveOccup"; the previous value
                        # "AveOccuption" matched no column.
                        {"label": "Promedio de ocupación", "value": "AveOccup"},
                    ],
                    value="MedInc",
                    multi=False,
                    clearable=False,
                )
            ],
        ),
        dcc.Graph(id="graph", className="e5_graph", figure={}),
    ],
)

@app.callback(
    Output(component_id="graph", component_property="figure"),
    [Input(component_id="dropdown", component_property="value")],
)
def update_graph(slct_var):
    """Return a longitude/latitude scatter of `df` coloured by `slct_var`.

    `slct_var` is the value currently selected in the "dropdown" component;
    the returned figure is written to the "graph" component.
    """
    return px.scatter(df, x="Longitude", y="Latitude", color=slct_var)


# Start the Dash server when this code runs as the main module.
if __name__ == "__main__":
    app.run_server(debug=False)

### Método del codo

In [None]:
# Elbow method: fit KMeans on the house values for 3 to 11 clusters and plot
# the inertia of each fit.

# The input does not change between fits, so reshape it to a single-feature
# 2-D array once instead of on every iteration.
house_values = df["MedHouseVal"].values.reshape((-1, 1))

clusters = list(range(3, 12))
# random_state makes the curve reproducible; n_init is set explicitly because
# its default differs between scikit-learn versions.
inertias = [
    KMeans(n_clusters=c, n_init=10, random_state=42).fit(house_values).inertia_
    for c in clusters
]

plt.plot(clusters, inertias, marker="o")
plt.xlabel("Número de clusters")
plt.ylabel("Inercia")
plt.title("Método del codo")
plt.grid(True)
plt.show()

### Clustering

In [None]:
# Group the houses into price clusters and plot them, labelling each cluster
# with the minimum and maximum house value it contains.

# random_state keeps the cluster ids stable between runs; n_init is set
# explicitly because its default differs between scikit-learn versions.
kmeans = KMeans(n_clusters=5, n_init=10, random_state=42).fit(
    df["MedHouseVal"].values.reshape((-1, 1))
)

clusters = kmeans.labels_

df["clusters"] = clusters
df["index"] = df.index

# One row per cluster id (sorted ascending) with its min and max house value.
range_values = df.groupby("clusters")["MedHouseVal"].agg(["min", "max"])

# Build one label per cluster that is actually present, so the labelling keeps
# working if the number of clusters changes. The bounds are formatted as
# numbers (rounded to one decimal) rather than by slicing their string form,
# which truncated values such as 452599.99999999994 to "452599.9".
cluster_labels = {
    cluster_id: f"{cluster_id} ({low:.1f}$-{high:.1f}$)"
    for cluster_id, low, high in range_values.itertuples()
}

df["clusters"] = df["clusters"].map(cluster_labels)


fig = px.scatter(df, x="index", y="MedHouseVal", color="clusters")
fig.update_layout(xaxis_title="Houses", yaxis_title="Values", title="Range of Houses'values")
fig

#### Con este método generamos grupos que clasifican las casas por rangos de precio.