<a href="https://colab.research.google.com/github/fernandagmarcal/bigmacprices/blob/main/BigMac26_11.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
!pip install --upgrade scikit-learn scikit-optimize joblib category_encoders pandas



In [5]:
import pandas as pd
import numpy as np
import os
import joblib
import time
from google.colab import files
from google.colab import drive

# Imports Scikit-Learn
from sklearn.ensemble import RandomForestRegressor, VotingRegressor, HistGradientBoostingRegressor
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import RobustScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error, r2_score

# Otimização
from skopt import gp_minimize
from skopt.space import Integer, Real
from skopt.utils import use_named_args

In [6]:
# === 2. DATA LOADING ===
# Mount Google Drive only when the mount point is not already present.
print("Montando Drive...")
if not os.path.exists('/content/drive'):
    drive.mount('/content/drive')

# Location where the dataset is expected on Drive.
caminho_arquivo = '/content/drive/MyDrive/dataset/bigmac/BigmacPrice.csv'

try:
    if os.path.exists(caminho_arquivo):
        df = pd.read_csv(caminho_arquivo)
    else:
        # Fall back to a manual upload when the file is not on Drive.
        print("Arquivo não encontrado no Drive. Faça upload manual:")
        uploaded = files.upload()
        if not uploaded:
            # Nothing was uploaded: fail with an explicit message instead of
            # an opaque "list index out of range" from indexing an empty list.
            raise FileNotFoundError("Nenhum arquivo foi enviado.")
        nome_arquivo = next(iter(uploaded))  # first uploaded file name
        df = pd.read_csv(nome_arquivo)
    # Reached only when one of the two read paths succeeded.
    print("✅ Base carregada com sucesso!")
except Exception as e:
    # Best-effort: report the problem and leave an empty frame, which the
    # cleaning step checks for via `df.empty`.
    print(f"Erro: {e}")
    df = pd.DataFrame()

Montando Drive...
✅ Base carregada com sucesso!


In [12]:
# DataFrame.info() writes its summary to stdout and returns None, so wrapping
# it in print() only appends a stray "None" line to the output.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1946 entries, 0 to 1945
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   date           1946 non-null   datetime64[ns]
 1   currency_code  1946 non-null   object        
 2   name           1946 non-null   object        
 3   local_price    1946 non-null   float64       
 4   dollar_ex      1946 non-null   int64         
 5   dollar_price   1946 non-null   float64       
 6   year           1946 non-null   int32         
 7   month          1946 non-null   int32         
dtypes: datetime64[ns](1), float64(2), int32(2), int64(1), object(2)
memory usage: 106.6+ KB
None


In [7]:
# === 3. CLEANING AND FEATURE ENGINEERING ===
if not df.empty:
    # Parse the date column, derive year/month from it, then discard rows
    # that have no target value. Each step returns a new frame.
    df = (
        df.assign(date=lambda frame: pd.to_datetime(frame['date']))
          .assign(
              year=lambda frame: frame['date'].dt.year,
              month=lambda frame: frame['date'].dt.month,
          )
          .dropna(subset=['dollar_price'])
    )

    # Target and feature columns used by the model.
    y = df['dollar_price']
    X = df[['name', 'year', 'month']]

    # Train/test split: 20% held out, fixed seed for reproducibility.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=42, test_size=0.2
    )

In [17]:
# info() prints the summary itself and returns None; calling it directly
# avoids the stray "None" line that print() would add.
X_test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 390 entries, 1611 to 1740
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   name    390 non-null    object
 1   year    390 non-null    int32 
 2   month   390 non-null    int32 
dtypes: int32(2), object(1)
memory usage: 9.1+ KB
None


In [18]:
# info() prints the summary itself and returns None; calling it directly
# avoids the stray "None" line that print() would add.
y_test.info()

<class 'pandas.core.series.Series'>
Index: 390 entries, 1611 to 1740
Series name: dollar_price
Non-Null Count  Dtype  
--------------  -----  
390 non-null    float64
dtypes: float64(1)
memory usage: 6.1 KB
None


In [8]:
    # === 4. PREPROCESSING (PIPELINE) ===
    numerical_features = ['year', 'month']
    categorical_features = ['name']

    # Scale the numeric columns and one-hot encode the categorical one;
    # categories unseen during fit are ignored at transform time.
    preprocessor = ColumnTransformer(transformers=[
        (
            'num',
            RobustScaler(),
            numerical_features,
        ),
        (
            'cat',
            OneHotEncoder(sparse_output=False, handle_unknown='ignore'),
            categorical_features,
        ),
    ])

    # The Bayesian optimisation step works on already-transformed data, so
    # X_train is transformed here up front. (The final voting model uses the
    # complete pipeline instead.)
    X_train_processed = preprocessor.fit_transform(X_train)

In [9]:
    # === 5. MODELLING ===

    # A. Linear model (Ridge)
    model_lr = Ridge(alpha=1.0)

    # B. HistGradientBoosting (tuned): search space for the optimiser.
    space_hgb = [
        Real(0.01, 0.3, name='learning_rate'),
        Integer(50, 200, name='max_iter'),
    ]

    @use_named_args(space_hgb)
    def objective_hgb(**params):
        """Return the mean 3-fold cross-validated MAE for the given hyperparameters."""
        candidate = HistGradientBoostingRegressor(random_state=42, **params)
        fold_scores = cross_val_score(
            candidate,
            X_train_processed,
            y_train,
            scoring="neg_mean_absolute_error",
            cv=3,
        )
        # The scorer yields negated MAE; flip the sign so that lower is better.
        return -fold_scores.mean()

    print(">>> Otimizando HistGradientBoosting...")
    res_hgb = gp_minimize(objective_hgb, space_hgb, n_calls=15, random_state=42)
    # Pair each dimension name with the best value found and build the estimator.
    best_hgb = HistGradientBoostingRegressor(
        random_state=42,
        **dict(zip([dim.name for dim in space_hgb], res_hgb.x)),
    )

    # C. Random Forest (fixed hyperparameters)
    print(">>> Treinando Random Forest...")
    best_rf = RandomForestRegressor(
        n_estimators=100,
        max_depth=15,
        n_jobs=-1,
        random_state=42,
    )

>>> Otimizando HistGradientBoosting...
>>> Treinando Random Forest...


In [10]:
    # === 6. VOTING REGRESSOR ===
    print(">>> Treinando Voting Final...")
    # Weighted ensemble of the three regressors; the linear model has weight 2,
    # the other two weight 1 each.
    voting = VotingRegressor(
        weights=[2, 1, 1],
        estimators=[
            ('linear', model_lr),
            ('hgb', best_hgb),
            ('rf', best_rf),
        ],
    )

    # Full pipeline: preprocessing followed by the ensemble, fit on raw features.
    modelo_final = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', voting),
    ])
    modelo_final.fit(X_train, y_train)

    # Evaluation: R² of the pipeline's predictions on the held-out split.
    score = r2_score(y_test, modelo_final.predict(X_test))
    print(f"R² Score no Teste: {score:.2%}")

>>> Treinando Voting Final...
R² Score no Teste: 76.83%


In [11]:
    # === 7. SAVING (COMPATIBLE) ===
    # Bundle the fitted regressor, the fitted preprocessor and the feature
    # column names into a single dictionary.
    pacote_final = dict(
        modelo=modelo_final.named_steps.regressor,
        preprocessor=modelo_final.named_steps.preprocessor,
        colunas=X.columns.tolist(),
    )

    print("Salvando arquivo...")
    joblib.dump(pacote_final, 'modelo_bigmac.pkl')

    # Trigger a browser download of the saved file from the Colab runtime.
    files.download('modelo_bigmac.pkl')
    print("Download iniciado!")

Salvando arquivo...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Download iniciado!
