# PASSO 00 IMPORTS

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
from sklearn.metrics import mean_absolute_error
import boto3
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


import requests
from datetime import datetime
import boto3

import mlflow

# Point the MLflow client at the local tracking server and select the
# experiment that the runs in this notebook are recorded under.
MLFLOW_TRACKING_URI = "http://127.0.0.1:5000"
MLFLOW_EXPERIMENT_NAME = "mlflow bitcoin"

mlflow.set_tracking_uri(uri=MLFLOW_TRACKING_URI)
mlflow.set_experiment(MLFLOW_EXPERIMENT_NAME)


<Experiment: artifact_location='mlflow-artifacts:/783824972131025485', creation_time=1737229390822, experiment_id='783824972131025485', last_update_time=1737229390822, lifecycle_stage='active', name='mlflow bitcoin', tags={}>

# PASSO 01 DATA DESCRIPTION

In [22]:
def get_historical_data(coin, days):
    """Fetch historical USD prices for a coin from the CoinGecko market-chart API.

    Args:
        coin: CoinGecko coin id placed in the URL path (e.g. ``'bitcoin'``).
        days: Look-back period sent as the ``days`` query parameter.

    Returns:
        DataFrame with a ``Timestamp`` column (parsed from epoch milliseconds)
        and a ``Price (USD)`` column. An empty DataFrame is returned when the
        request or the parsing of the response fails; the error is logged.
    """
    import logging  # local import so the function does not depend on another cell

    url = f'https://api.coingecko.com/api/v3/coins/{coin}/market_chart?vs_currency=usd&days={days}'
    try:
        # Without a timeout a stalled connection would block the kernel forever.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        data = response.json()
        prices = data['prices']
        df = pd.DataFrame(prices, columns=["Timestamp", "Price (USD)"])
        df['Timestamp'] = pd.to_datetime(df['Timestamp'], unit='ms')
        return df
    except Exception as e:
        # The previous handler called `st.error`, but `st` is never imported in
        # this notebook, so every failure surfaced as a NameError that hid the
        # real cause. Log the error instead and keep the best-effort contract.
        logging.getLogger(__name__).error("Erro ao buscar dados históricos: %s", e)
        return pd.DataFrame()

In [60]:
# Download the bitcoin price history and cache it as the raw dataset.
historical_data = get_historical_data('bitcoin',days=120)

# get_historical_data signals a failed download with an empty frame; refuse to
# overwrite the cached raw file with it.
if historical_data.empty:
    raise RuntimeError("No historical data was downloaded; the cached raw file was left untouched.")

historical_data.to_parquet('../data/raw/historical_data.parquet')

In [24]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load the cached raw price history (assumes the download cell has already
# written the parquet file).
raw_data_path = '../data/raw/historical_data.parquet'
df_principal = pd.read_parquet(raw_data_path)

# FEATURE ENGINEERING

In [25]:
# Work on an independent copy so the loaded raw frame stays unmodified.
df = df_principal.copy(deep=True)
df.head(n=5)

Unnamed: 0,Timestamp,Price (USD)
0,2024-11-09 00:21:53.106,76610.794662
1,2024-11-09 01:05:36.731,76471.113084
2,2024-11-09 02:16:27.972,76319.475094
3,2024-11-09 03:04:47.155,76474.132328
4,2024-11-09 04:10:55.085,76433.619276


In [26]:
# Lower-case the column names used throughout the rest of the notebook.
column_names = {
    "Timestamp": "timestamp",
    "Price (USD)": "price (usd)",
}
df = df.rename(columns=column_names)
df

Unnamed: 0,timestamp,price (usd)
0,2024-11-09 00:21:53.106,76610.794662
1,2024-11-09 01:05:36.731,76471.113084
2,2024-11-09 02:16:27.972,76319.475094
3,2024-11-09 03:04:47.155,76474.132328
4,2024-11-09 04:10:55.085,76433.619276
...,...,...
2153,2025-02-06 20:04:01.843,95967.150040
2154,2025-02-06 21:03:56.060,96900.668886
2155,2025-02-06 22:01:12.092,96784.585045
2156,2025-02-06 23:04:09.795,96928.851576


In [27]:
# Calendar features taken from each sample's timestamp.
timestamps = df['timestamp']
df['day_of_week'] = timestamps.dt.weekday  # alias of .dt.dayofweek: Monday=0 ... Sunday=6
df['day_of_month'] = timestamps.dt.day     # day number within the month

In [28]:
# Short-, medium- and long-term simple moving averages of the price.
# `window` counts rows, not calendar days: with the roughly hourly samples
# shown in the previews above, these span about 7, 30 and 90 hours.
price = df['price (usd)']
for n_rows in (7, 30, 90):
    df[f'moving_avg_{n_rows}'] = price.rolling(window=n_rows).mean()

In [29]:
# Row-to-row price changes. The column names say "daily", but each value
# compares a sample with the one immediately before it.
price_series = df['price (usd)']

# Relative change (return) versus the previous sample.
df['daily_return'] = price_series.pct_change()

# Absolute change versus the previous sample.
df['daily_return_abs'] = price_series.diff()

In [30]:
# Volatility: standard deviation of the returns over the last 30 samples.
returns = df['daily_return']
df['volatility'] = returns.rolling(30).std()

In [31]:
def calculate_rsi(data, window):
    """Compute a simple-moving-average Relative Strength Index (RSI).

    Args:
        data: pandas Series of prices in chronological order.
        window: Number of consecutive price changes averaged per RSI value.

    Returns:
        Series aligned with ``data``. The first ``window`` entries are NaN,
        because ``window`` real price changes are needed for one value. An
        entry is also NaN when its window contains neither gains nor losses.
    """
    delta = data.diff()
    # clip() preserves the NaN that diff() produces for the first row. The
    # previous where(..., 0) turned it into a fake "no change" observation, so
    # the first RSI value was built from only window-1 real price changes.
    gain = delta.clip(lower=0).rolling(window=window).mean()
    loss = delta.clip(upper=0).abs().rolling(window=window).mean()
    # A window with gains and no losses gives rs = inf, which maps to RSI = 100.
    rs = gain / loss
    return 100 - (100 / (1 + rs))

# RSI over a 14-sample window (the window counts rows, not calendar days).
df['RSI_14'] = calculate_rsi(data=df['price (usd)'], window=14)

In [32]:
# Exponential moving averages of the price with spans of 7 and 30 samples
# (recursive form, adjust=False).
close = df['price (usd)']
for span in (7, 30):
    df[f'ema_{span}'] = close.ewm(span=span, adjust=False).mean()

# Variable Filtering

In [33]:
# Drop every row with at least one missing value; this removes the leading
# rows where the rolling windows are not yet full.
df = df.dropna(axis=0, how='any')

In [34]:
# Names of the engineered columns used as model inputs.
derived_columns = [
    'day_of_week',
    'day_of_month',
    'moving_avg_7',
    'moving_avg_30',
    'moving_avg_90',
]

# Keep only those columns.
df_derived = df.loc[:, derived_columns]

# Preview the first rows of the selected features.
print(df_derived.head())

    day_of_week  day_of_month  moving_avg_7  moving_avg_30  moving_avg_90
89            1            12  86846.315863   86625.368402   81193.817851
90            1            12  86988.100807   86794.615540   81312.837614
91            1            12  87164.572402   87036.036285   81452.299710
92            1            12  87493.347694   87255.061419   81590.430469
93            1            12  88018.413476   87477.746320   81736.776979


# Machine Learning Modelling

In [35]:
# Independent variables: only the derived feature columns.
X_derived = df_derived

# Dependent variable: the price itself.
# NOTE(review): the moving-average features are rolling means of this same
# price column and include the current row, so the target leaks into the
# features. Confirm whether lagged features (e.g. shift(1)) were intended.
y = df['price (usd)']

# Chronological split (shuffle=False): the first 70% of rows train the model,
# the last 30% are held out for testing.
X_train, X_test, y_train, y_test = train_test_split(
    X_derived, y, test_size=0.3, shuffle=False
)

# Standardise the features using statistics from the training rows only.
# NOTE(review): the training cell fits on the unscaled X_train; the scaled
# arrays below are not used by any cell visible in this notebook.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [42]:
import numpy as np
def treinar_modelo(X_train, y_train):
    """Fit a LinearRegression on the given features and targets and return it."""
    return LinearRegression().fit(X_train, y_train)

def avaliar_modelo(modelo, X_test, y_test):
    """Score a fitted model on held-out data.

    Returns:
        Tuple ``(mse, rmse, mae, r2)`` computed from ``modelo.predict(X_test)``
        against ``y_test``; ``rmse`` is the square root of ``mse``.
    """
    predictions = modelo.predict(X_test)
    squared_error = mean_squared_error(y_test, predictions)
    return (
        squared_error,
        np.sqrt(squared_error),
        mean_absolute_error(y_test, predictions),
        r2_score(y_test, predictions),
    )

# Enable MLflow autologging before the run is opened.
mlflow.autolog()
with mlflow.start_run(log_system_metrics=True):
    # Fit on the unscaled training split, then score on the held-out split.
    modelo = treinar_modelo(X_train, y_train)
    mse, rmse, mae, r2 = avaliar_modelo(modelo, X_test, y_test)
    
    # Report the evaluation metrics while the run is still open.
    print(f'Mean Squared Error (Linear Regression): {mse}')
    print(f'Root Mean Squared Error (RMSE) (Linear Regression): {rmse}')
    print(f'Mean Absolute Error (MAE) (Linear Regression): {mae}')
    print(f'R² (Linear Regression): {r2}')


2025/02/06 20:57:39 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2025/02/06 20:57:39 INFO mlflow.tracking.fluent: Autologging successfully enabled for xgboost.
2025/02/06 20:57:39 INFO mlflow.system_metrics.system_metrics_monitor: Started monitoring system metrics.
2025/02/06 20:57:41 INFO mlflow.system_metrics.system_metrics_monitor: Stopping system metrics monitoring...
2025/02/06 20:57:41 INFO mlflow.system_metrics.system_metrics_monitor: Successfully terminated system metrics monitoring!


Mean Squared Error (Linear Regression): 790461.2210352609
Root Mean Squared Error (RMSE) (Linear Regression): 889.0788609764945
Mean Absolute Error (MAE) (Linear Regression): 609.1081115820318
R² (Linear Regression): 0.9439018722223753
🏃 View run intelligent-rook-501 at: http://127.0.0.1:5000/#/experiments/783824972131025485/runs/cab2b7bd29ec49bca8a3861d65456dce
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/783824972131025485


In [44]:
# Show the feature names (and their order) that the model was trained on.
X_train.columns

Index(['day_of_week', 'day_of_month', 'moving_avg_7', 'moving_avg_30',
       'moving_avg_90'],
      dtype='object')

In [58]:
# To serve the logged model, run the following command in a terminal (outside the notebook):
# mlflow models serve   -m mlflow-artifacts:/783824972131025485/cab2b7bd29ec49bca8a3861d65456dce/artifacts/model   --no-conda   --port 5001

In [59]:
import requests

# Build the request body under the "dataframe_split" key: the column names
# plus one row of feature values, in the same order the model was trained on.
payload = {
    "dataframe_split": {
        "columns": ['day_of_week', 'day_of_month', 'moving_avg_7', 'moving_avg_30', 'moving_avg_90'],
        "data": [[5, 6, 94472.382736, 94472.382736, 94378.144222]]
    }
}

# Send the POST request to the locally served model's /invocations endpoint.
# The timeout keeps the cell from hanging if nothing answers on port 5001.
response = requests.post("http://localhost:5001/invocations", json=payload, timeout=30)

# Print the response (the predictions returned by the model).
print(response.json())


{'predictions': [96383.70687179628]}
