#### Objetivo

 * Aplicar los conceptos de regresión lineal para resolver un problema práctico utilizando Python.

In [None]:
# Se cargan las bibliotecas
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
import pandas as pd


In [None]:
# Colab-only helper that exposes Google Drive files to the notebook.
from google.colab import drive

# Mount Google Drive at the path the data-loading cell reads from.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Location of the house-prices CSV inside the mounted Google Drive.
path = "/content/drive/MyDrive/CLASE MACHINE LEARNING/DATA_CORES/house-prices.csv"

# Read the CSV into the DataFrame used by the rest of the notebook.
df = pd.read_csv(filepath_or_buffer=path)

#### Exploración de datos

In [None]:
# Preview the first five rows of the raw data.
df.head(5)

Unnamed: 0,Home,Price,SqFt,Bedrooms,Bathrooms,Offers,Brick,Neighborhood
0,1,114300,1790,2,2,2,No,East
1,2,114200,2030,4,2,3,No,East
2,3,114800,1740,3,2,1,No,East
3,4,94700,1980,3,2,3,No,East
4,5,119800,2130,3,3,3,No,East


In [None]:
# Normalize column labels: lowercase, then strip surrounding whitespace.
normalized_names = df.columns.str.lower().str.strip()
df.columns = normalized_names

# Show the resulting labels.
df.columns

Index(['home', 'price', 'sqft', 'bedrooms', 'bathrooms', 'offers', 'brick',
       'neighborhood'],
      dtype='object')

In [None]:
# Print the frame's structure: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 128 entries, 0 to 127
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   home          128 non-null    int64 
 1   price         128 non-null    int64 
 2   sqft          128 non-null    int64 
 3   bedrooms      128 non-null    int64 
 4   bathrooms     128 non-null    int64 
 5   offers        128 non-null    int64 
 6   brick         128 non-null    object
 7   neighborhood  128 non-null    object
dtypes: int64(6), object(2)
memory usage: 8.1+ KB


#### Se deduce de la información que no hay valores faltantes.

In [None]:
# Summary statistics of the numeric columns, used to look for outliers.
numeric_summary = df.describe()
numeric_summary.round(2)

Unnamed: 0,home,price,sqft,bedrooms,bathrooms,offers
count,128.0,128.0,128.0,128.0,128.0,128.0
mean,64.5,130427.34,2000.94,3.02,2.45,2.58
std,37.09,26868.77,211.57,0.73,0.51,1.07
min,1.0,69100.0,1450.0,2.0,2.0,1.0
25%,32.75,111325.0,1880.0,3.0,2.0,2.0
50%,64.5,125950.0,2000.0,3.0,2.0,3.0
75%,96.25,148250.0,2140.0,3.0,3.0,3.0
max,128.0,211200.0,2590.0,5.0,4.0,6.0


#### No se encuentran datos que se disparen de la mayoría.

In [None]:
# Count missing values in each column.
missing_per_column = df.isna().sum()
missing_per_column

Unnamed: 0,0
home,0
price,0
sqft,0
bedrooms,0
bathrooms,0
offers,0
brick,0
neighborhood,0


#### No hay valores nulos

In [None]:
# Distinct values found in the `brick` column.
brick_values = df["brick"].unique()
brick_values

array(['No', 'Yes'], dtype=object)

In [None]:
# Distinct values found in the `neighborhood` column.
neighborhood_values = df["neighborhood"].unique()
neighborhood_values

array(['East', 'North', 'West'], dtype=object)

In [None]:
# Frequency of each distinct value in `price`.
price_counts = df["price"].value_counts()
price_counts

Unnamed: 0_level_0,count
price,Unnamed: 1_level_1
103200,2
105600,2
117800,2
129800,2
125700,2
...,...
119700,1
147900,1
113500,1
149900,1


In [None]:
# Frequency of each distinct value in `sqft`.
sqft_counts = df["sqft"].value_counts()
sqft_counts

Unnamed: 0_level_0,count
sqft,Unnamed: 1_level_1
2000,5
1920,5
1930,5
1990,4
2150,4
...,...
2530,1
1890,1
1710,1
2070,1


In [None]:
# Frequency of each distinct value in `bedrooms`.
bedrooms_counts = df["bedrooms"].value_counts()
bedrooms_counts

Unnamed: 0_level_0,count
bedrooms,Unnamed: 1_level_1
3,67
2,30
4,29
5,2


In [None]:
# Frequency of each distinct value in `bathrooms`.
bathrooms_counts = df["bathrooms"].value_counts()
bathrooms_counts

Unnamed: 0_level_0,count
bathrooms,Unnamed: 1_level_1
2,72
3,55
4,1


#### Preparación de datos

In [None]:
# Display the column labels again before selecting features and target.
df.columns

Index(['home', 'price', 'sqft', 'bedrooms', 'bathrooms', 'offers', 'brick',
       'neighborhood'],
      dtype='object')

In [None]:
# Columns left out of the feature matrix: the target (`price`), `home`, and
# the variables this model does not use.
excluded_cols = ['price', 'home', 'offers', 'brick', 'neighborhood']
X = df.drop(columns=excluded_cols)

# Target vector: the `price` column.
y = df["price"]


In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Numeric columns that go through the scaling step.
num_cols = [
    "sqft",
    "bedrooms",
    "bathrooms",
]


In [None]:
# Preprocessing for numeric columns: a single standardization step.
scaling_step = ('scaler', StandardScaler())
num_transformer = Pipeline(steps=[scaling_step])


In [None]:
# Route the numeric columns through the numeric transformer.
numeric_branch = ('num', num_transformer, num_cols)
preprocessor = ColumnTransformer(transformers=[numeric_branch])

In [None]:
# Full pipeline: preprocessing followed by an ordinary linear regression.
pipeline_steps = [
    ('preprocessor', preprocessor),
    ('model', LinearRegression()),
]
pipeline = Pipeline(steps=pipeline_steps)

In [None]:
# Fit the preprocessing steps and the regression on the training split.
pipeline.fit(X=X_train, y=y_train)

In [None]:
# Predict prices for the held-out test split.
y_pred = pipeline.predict(X=X_test)

In [None]:
# Evaluate the fitted model on the held-out test split.
mse = mean_squared_error(y_test, y_pred)
# MSE is expressed in squared price units, so its magnitude is hard to judge
# on its own; its square root (RMSE) is in the same units as `price` and can
# be compared directly with the prices in the data.
rmse = mse ** 0.5
r2 = r2_score(y_test, y_pred)
print(f'Error Cuadrático Medio: {mse}')
print(f'Raíz del Error Cuadrático Medio: {rmse}')
print(f'R-cuadrado: {r2}')


Error Cuadrático Medio: 320149938.23026806
R-cuadrado: 0.456819577754888


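#### El MSE es muy alto en valor absoluto porque está expresado en unidades al cuadrado; su raíz cuadrada (RMSE) es de aproximadamente $17.893, frente a un precio medio de $130.427. El R-cuadrado indica que el modelo solo explica el 45,68 % de la variabilidad de los precios reales.
#### El modelo no es eficiente: con un R-cuadrado de apenas 0,457, su capacidad predictiva se debe catalogar como baja y poco confiable.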

### Predicción del precio de una casa:

In [None]:
# Single-row DataFrame with the same feature columns the model was trained on.
nueva_casa_features = {'sqft': 2000, 'bedrooms': 3, 'bathrooms': 2}
casa_nueva = pd.DataFrame([nueva_casa_features])


In [None]:
# Predict the price for the single-row DataFrame `casa_nueva`.
precio_pred = pipeline.predict(casa_nueva)
# Read the area from the input row instead of hard-coding it, so the message
# always matches the house that was actually scored.
sqft_casa = casa_nueva['sqft'].iloc[0]
print(f"Precio estimado casa de {sqft_casa} pies2: $ {precio_pred[0]:.2f}")

Precio estimado casa de 2000 pies2: $ 123537.00


In [None]:
# Rows in the data with the same area, bedrooms and bathrooms as the scored house.
same_area = df["sqft"] == 2000
same_bedrooms = df["bedrooms"] == 3
same_bathrooms = df["bathrooms"] == 2
df[same_area & same_bedrooms & same_bathrooms]

Unnamed: 0,home,price,sqft,bedrooms,bathrooms,offers,brick,neighborhood
21,22,113800,2000,3,2,4,No,North
113,114,115700,2000,3,2,3,Yes,North


#### Se concluye que el precio de $123.537,00 estimado por el modelo es mayor (aproximadamente entre un 7 % y un 9 %) que los precios observados en la muestra de datos, donde se encuentran dos registros con precios de $113.800 y $115.700 para la misma cantidad de pies², número de dormitorios y baños.
