In [1]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import StratifiedShuffleSplit
import pandas as pd
import numpy as np

df_pipe = pd.read_csv("../linear-regression/CURSO_IA_ML/Aula 2/housing.csv")

In [139]:
target = "median_house_value"
cat_cols = ['ocean_proximity']
num_cols = [
    'longitude',
    'latitude',
    'housing_median_age',
    'total_rooms',
    'total_bedrooms',
    'population',
    'households',
    'median_income'
]

In [146]:
def prepare_train(df: pd.DataFrame):
    prepared_data = preprocessor.fit_transform(df)

    # Pega os nomes das colunas transformadas
    cat_feature_names = preprocessor.named_transformers_['cat'].named_steps['onehot'].get_feature_names_out(cat_cols)

    # Junta tudo
    all_feature_names = list(num_cols) + list(cat_feature_names)

    df_transformed = pd.DataFrame(
        prepared_data,
        columns=all_feature_names
    )

    return df_transformed, all_feature_names

def prepare_predict(df: pd.DataFrame, all_feature_names: list):
    prepared_data = preprocessor.transform(df)
    df_transformed = pd.DataFrame(prepared_data, columns=all_feature_names, index=df.index)

    return df_transformed

def createTarget(df: pd.DataFrame, col_name: str):
    n = len(df[col_name])
    bins = int(np.ceil(np.log2(n) + 1))
    # bins = 5
    labels = list(range(1, bins + 1))  # Ex: [1, 2, 3, ..., k]
    df['target_class'] = pd.cut(df[target], bins=bins, labels=labels)

def stratify(df: pd.DataFrame, col_name: str):
    createTarget(df, col_name)

    print(df.head())

    train_set: pd.DataFrame = pd.DataFrame()
    test_set: pd.DataFrame = pd.DataFrame()

    splitter = StratifiedShuffleSplit(
        n_splits=1,
        test_size=0.2,
        random_state=42 # seed
    )

    split_data = splitter.split(
        df,
        df["target_class"],
    )

    for train_index, test_index in split_data:
        train_set = df.loc[train_index]
        test_set = df.loc[test_index]

    return train_set, test_set

In [125]:
num_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('imputer', SimpleImputer(strategy='median')),
])

cat_pipeline = Pipeline([
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

preprocessor = ColumnTransformer([
    ('cat', cat_pipeline, cat_cols),
    ('num', num_pipeline, num_cols),
])

In [141]:
train_set, test_set = stratify(df_pipe, 'median_income')
target_label = train_set[target].copy() #armazenando a target (nosso y)

   longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
0    -122.23     37.88                41.0        880.0           129.0   
1    -122.22     37.86                21.0       7099.0          1106.0   
2    -122.24     37.85                52.0       1467.0           190.0   
3    -122.25     37.85                52.0       1274.0           235.0   
4    -122.25     37.85                52.0       1627.0           280.0   

   population  households  median_income  median_house_value ocean_proximity  \
0       322.0       126.0         8.3252            452600.0        NEAR BAY   
1      2401.0      1138.0         8.3014            358500.0        NEAR BAY   
2       496.0       177.0         7.2574            352100.0        NEAR BAY   
3       558.0       219.0         5.6431            341300.0        NEAR BAY   
4       565.0       259.0         3.8462            342200.0        NEAR BAY   

  target_class  
0            5  
1            4  
2            4  


In [142]:
train_prepared, all_feature_names = prepare_train(train_set)

## Avaliando o modelo

In [75]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score

In [86]:
# Mean Absolute Percentage Error
def mape(labels, predictions):
    errors = np.abs(labels - predictions)
    relative_errors = errors / np.abs(labels)
    mape = np.mean(relative_errors) * 100
    print(f"MAPE: {mape:.2f}%")

def mae(real, pred, show = True):
    lin_mae = mean_absolute_error(real, pred)
    if show is True:
        print(f"MAE: {lin_mae:.2f}")
    return lin_mae

# MEAN SQUARED ERROR. Penaliza muito mais valores distantes da média.
def mse(real, pred):
    lin_mse = mae(real, pred, show=False)
    lin_rmse = np.sqrt(lin_mse) # raiz quadrada aqui
    # print(lin_rmse)
    print(f"MSE: {lin_rmse:.2f}")

def r2(real, pred):
    r2 = r2_score(real, pred)
    print(f"R2: {r2:.2f}")

def evaluate(real, pred):
    mse(real, pred)
    mae(real, pred)
    r2(real, pred)
    mape(real, pred)

## Modelo - Linear Regression

In [144]:
from sklearn.linear_model import LinearRegression

def linearRegressionModel():
    lin_reg = LinearRegression()
    lin_reg.fit(train_prepared, target_label)

    test_prepared = prepare_predict(test_set, all_feature_names)
    test_real = test_set[target]

    lin_reg_pred = lin_reg.predict(test_prepared)

    evaluate(test_real, lin_reg_pred)

linearRegressionModel()

MSE: 222.13
MAE: 49343.69
R2: 0.65
MAPE: 28.34%


## Modelo - Árvore de Decisão

In [145]:
from sklearn.tree import DecisionTreeRegressor

def DecisionTreeRegressorModel():
    model_dtr = DecisionTreeRegressor(max_depth=10)
    model_dtr.fit(train_prepared, target_label)

    test_prepared = prepare_predict(test_set, all_feature_names)
    test_real = test_set[target]

    dtr_pred = model_dtr.predict(test_prepared)

    evaluate(test_real, dtr_pred)

DecisionTreeRegressorModel()

MSE: 201.20
MAE: 40483.35
R2: 0.73
MAPE: 22.39%
