<a href="https://colab.research.google.com/github/daycardoso/PredictCost/blob/main/regress%C3%A3o_linear.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
import pandas as pd
import cudf
import cupy as cp
from cuml.linear_model import LinearRegression as cuLR
from sklearn.linear_model import LinearRegression as skLR
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# =====================
# 1. Carregamento dos Dados
# =====================
try:
    df_cudf = cudf.read_csv('/content/drive/MyDrive/Mestrado/Trabalho ML Mestrado 01-2025/df_unificado.csv')
except:
    df_pd = pd.read_csv('/content/drive/MyDrive/Mestrado/Trabalho ML Mestrado 01-2025/df_unificado.csv')
    df_cudf = cudf.from_pandas(df_pd)

# Conversão para float32
for col in df_cudf.select_dtypes(include=['float64']).columns:
    df_cudf[col] = df_cudf[col].astype(cp.float32)

X = df_cudf.iloc[:, :-1]
y = df_cudf.iloc[:, -1]

# =====================
# 2. Divisão treino/teste
# =====================
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# =====================
# 3. Treinamento do modelo
# =====================
try:
    linear_regression_model = cuLR()
    linear_regression_model.fit(X_train, y_train)
    y_pred = linear_regression_model.predict(X_test)
except Exception as e:
    print("cuML falhou, usando scikit-learn:", e)
    X_train_pd = X_train.to_pandas() if hasattr(X_train, "to_pandas") else X_train
    y_train_pd = y_train.to_pandas() if hasattr(y_train, "to_pandas") else y_train
    X_test_pd = X_test.to_pandas() if hasattr(X_test, "to_pandas") else X_test

    linear_regression_model = skLR()
    linear_regression_model.fit(X_train_pd, y_train_pd)
    y_pred = linear_regression_model.predict(X_test_pd)

# =====================
# 4. Avaliação
# =====================
y_test_cpu = y_test.to_pandas() if hasattr(y_test, "to_pandas") else y_test
y_pred_cpu = y_pred.to_pandas() if hasattr(y_pred, "to_pandas") else y_pred

print("R²:", r2_score(y_test_cpu, y_pred_cpu))
print("MSE:", mean_squared_error(y_test_cpu, y_pred_cpu))
print("MAE:", mean_absolute_error(y_test_cpu, y_pred_cpu))




R²: 0.712709550510821
MSE: 0.001094289967591171
MAE: 0.02773324448692172


In [8]:
import numpy as np
# Import garbage collector for memory cleanup
import gc

def nested_cv_linear_evaluation(X, y, outer_folds=5, random_state=42):
    from sklearn.model_selection import KFold
    # Import cuML and scikit-learn LinearRegression classes here as well
    from cuml.linear_model import LinearRegression as cuLR
    from sklearn.linear_model import LinearRegression as skLR

    outer_cv = KFold(n_splits=outer_folds, shuffle=True, random_state=random_state)
    outer_scores = []

    # Determine which linear regression class to use (cuML or scikit-learn)
    # Based on the previous success or failure
    # We assume if the first model run failed, subsequent ones will too,
    # so we check the type of the existing linear_regression_model
    if isinstance(linear_regression_model, cuLR):
        ModelClass = cuLR
        print("Using cuML Linear Regression in Nested CV")
    else:
        ModelClass = skLR
        print("Using scikit-learn Linear Regression in Nested CV")


    for fold_idx, (train_idx, test_idx) in enumerate(outer_cv.split(X), 1):
        # Ensure data is in the correct format for the chosen model library
        if ModelClass is skLR and hasattr(X, 'to_pandas'):
            X_tr, X_te = X.iloc[train_idx].to_pandas(), X.iloc[test_idx].to_pandas()
            y_tr, y_te = y.iloc[train_idx].to_pandas(), y.iloc[test_idx].to_pandas()
        else: # cuML or data is already not a cuDF object
            X_tr, X_te = X.iloc[train_idx], X.iloc[test_idx]
            y_tr, y_te = y.iloc[train_idx], y.iloc[test_idx]

        # Create a NEW instance of the model class for this fold
        model = ModelClass()
        model.fit(X_tr, y_tr)
        y_pred = model.predict(X_te)

        # Ensure predictions and test targets are on CPU for evaluation
        y_te_cpu = y_te.to_pandas() if hasattr(y_te, "to_pandas") else y_te
        y_pred_cpu = y_pred.to_pandas() if hasattr(y_pred, "to_pandas") else y_pred


        mse = float(mean_squared_error(y_te_cpu, y_pred_cpu))
        outer_scores.append(mse)

        print(f"[Linear] Fold {fold_idx}/{outer_folds} - MSE: {mse:.4f}")

        # Libera memória
        del model, X_tr, X_te, y_tr, y_te, y_pred, y_te_cpu, y_pred_cpu
        gc.collect()
        cp.get_default_memory_pool().free_all_blocks()

    return float(np.mean(outer_scores))

In [9]:
# === Dados já carregados como X_cudf, y_cudf ===

# Avaliação do modelo linear com Nested CV (sem tuning)
linear_mse = nested_cv_linear_evaluation(X, y)
print(f"\nMSE médio Linear Regression (Nested CV): {linear_mse:.4f}")

# Definição do Random Forest com melhores hiperparâmetros encontrados antes
# rf_cls = lambda: cuRF(random_state=42, n_streams=1, **chosen_rf)

# Comparação estatística: Linear vs RF
# t_stat, p_val = five_two_cv_ttest_gpu(linear_regression_model, rf_cls, X_cudf, y_cudf)
# print(f"\n5×2-cv t-test (Linear vs RF): t = {t_stat:.4f}, p = {p_val:.4f}")


Using cuML Linear Regression in Nested CV




[Linear] Fold 1/5 - MSE: 0.0011




[Linear] Fold 2/5 - MSE: 0.0011




[Linear] Fold 3/5 - MSE: 0.0011




[Linear] Fold 4/5 - MSE: 0.0011




[Linear] Fold 5/5 - MSE: 0.0011

MSE médio Linear Regression (Nested CV): 0.0011


In [10]:
import time

def measure_inference_time(model, X_test, n_repeats=5):
    """
    Mede o tempo médio de inferência por amostra.
    Repete a medição 'n_repeats' vezes para reduzir ruído.
    """
    total_times = []
    for _ in range(n_repeats):
        start = time.perf_counter()
        y_pred = model.predict(X_test)
        end = time.perf_counter()
        total_times.append(end - start)

    avg_total_time = sum(total_times) / len(total_times)
    time_per_sample = avg_total_time / len(X_test)
    return time_per_sample


In [12]:
# Após modelo.fit(...)
X_test_eval = X_test.to_pandas() if hasattr(X_test, "to_pandas") else X_test
inference_time = measure_inference_time(linear_regression_model, X_test_eval)
print(f"Tempo médio de inferência por amostra (Linear): {inference_time * 1000:.3f} ms")


Tempo médio de inferência por amostra (Linear): 0.000 ms
