# Futebol Time Series Prediction

Este notebook desenvolve um modelo de predição para uma série temporal com tema de futebol.

## Dataset

Como referência, consultamos um conjunto de dados de futebol no Kaggle que compila resultados de jogos internacionais desde 1872 até 2025 ("International football results from 1872 to 2025")【179248037889072†L21-L55】.  
Para simplificar a tarefa e evitar dependências externas, construímos um subconjunto sintético inspirado nesses dados, contendo o número de gols marcados pela seleção do Brasil por ano de 2000 a 2024. O link para o dataset do Kaggle é:

- [International football results from 1872 to 2025 – Kaggle](https://www.kaggle.com/datasets/martj42/international-football-results-from-1872-to-2025)

A seguir importamos as bibliotecas necessárias, construímos o dataset sintético e visualizamos a série temporal.


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from prophet import Prophet
from sklearn.metrics import mean_squared_error
from math import sqrt
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from sklearn.preprocessing import MinMaxScaler

# Criar dataset sintético de gols por ano para a seleção brasileira (2000-2024)
years = np.arange(2000, 2025)
# Lista de gols anuais (sintético). Os valores foram escolhidos manualmente para criar uma série com variações.
goals = [39, 45, 44, 41, 52, 48, 46, 52, 54, 43, 45, 41, 39, 47, 40, 50, 44, 51, 48, 43, 46, 44, 49, 45, 43]

# Construir DataFrame
futebol_df = pd.DataFrame({'ds': pd.to_datetime(years, format='%Y'), 'y': goals})

# Exibir primeiras linhas
display(futebol_df.head())

# Plotar a série temporal
plt.figure(figsize=(10,5))
plt.plot(futebol_df['ds'], futebol_df['y'], marker='o')
plt.title('Gols anuais da seleção brasileira (sintético)')
plt.xlabel('Ano')
plt.ylabel('Gols')
plt.grid(True)
plt.show()

# -----------------------------
# Modelo Prophet (ou Sktime)
# -----------------------------
# Separar em treino e teste (5 últimos anos como teste)
train_prophet = futebol_df.iloc[:-5]
test_prophet = futebol_df.iloc[-5:]

# Instanciar e ajustar o modelo Prophet
m = Prophet()
m.fit(train_prophet)

# Criar DataFrame futuro para previsão nos anos de teste
future = m.make_future_dataframe(periods=len(test_prophet), freq='Y')
forecast = m.predict(future)

# Extrair previsões correspondentes ao período de teste
pred_prophet = forecast['yhat'][-len(test_prophet):].values

# Calcular RMSE para Prophet
rmse_prophet = np.sqrt(mean_squared_error(test_prophet['y'], pred_prophet))
print(f"RMSE do modelo Prophet: {rmse_prophet:.2f}")

# -----------------------------
# Modelo LSTM
# -----------------------------
# Escalar os dados entre 0 e 1
data_values = futebol_df[['y']].values
scaler = MinMaxScaler(feature_range=(0, 1))
scaled_values = scaler.fit_transform(data_values)

# Função para criar sequências
def create_sequences(dataset, look_back=3):
    X, y = [], []
    for i in range(len(dataset) - look_back):
        X.append(dataset[i:(i + look_back), 0])
        y.append(dataset[i + look_back, 0])
    return np.array(X), np.array(y)

look_back = 3
X_all, y_all = create_sequences(scaled_values, look_back)

# Definir tamanho do conjunto de teste (5 últimos anos)
train_size = len(futebol_df) - 5 - look_back + 1  # ajustar para sequências
X_train, y_train = X_all[:train_size], y_all[:train_size]
X_test, y_test = X_all[train_size:], y_all[train_size:]

# Ajustar formato para LSTM [samples, timesteps, features]
X_train = X_train.reshape((X_train.shape[0], look_back, 1))
X_test = X_test.reshape((X_test.shape[0], look_back, 1))

# Construir modelo LSTM
model = Sequential()
model.add(LSTM(50, activation='relu', input_shape=(look_back, 1)))
model.add(Dense(1))
model.compile(optimizer='adam', loss='mse')

# Treinar o modelo
model.fit(X_train, y_train, epochs=200, verbose=0)

# Prever
y_pred_scaled = model.predict(X_test, verbose=0)

# Reverter escala para valores originais
y_pred = scaler.inverse_transform(y_pred_scaled)
y_test_orig = scaler.inverse_transform(y_test.reshape(-1, 1))

# Calcular RMSE para LSTM
rmse_lstm = np.sqrt(mean_squared_error(y_test_orig, y_pred))
print(f"RMSE do modelo LSTM: {rmse_lstm:.2f}")

ModuleNotFoundError: No module named 'prophet'

In [2]:
!pip install prophet -q

[0m[31mERROR: Could not find a version that satisfies the requirement prophet (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for prophet[0m[31m
[0m

In [3]:
try:
    import sktime
    print('sktime available')
except ImportError as e:
    print('sktime not available', e)

try:
    import statsmodels.api as sm
    print('statsmodels available')
except ImportError as e:
    print('statsmodels not available', e)

sktime not available No module named 'sktime'
statsmodels available


In [5]:
try:
    import fbprophet
    print('fbprophet available')
except ImportError as e:
    print('fbprophet not available', e)import importlib.util
print('sktime spec:', importlib.util.find_spec('sktime'))
print('statsmodels spec:', importlib.util.find_spec('statsmodels'))

SyntaxError: invalid syntax (208536087.py, line 5)

In [6]:
# -----------------------------
# Modelo clássico (Exponential Smoothing) usando statsmodels
# -----------------------------
from statsmodels.tsa.holtwinters import ExponentialSmoothing

# Separar em treino e teste
train_classic = futebol_df.iloc[:-5]
test_classic = futebol_df.iloc[-5:]

# Ajustar modelo de suavização exponencial com tendência aditiva
es_model = ExponentialSmoothing(train_classic['y'], trend='add', seasonal=None)
es_fit = es_model.fit()
pred_es = es_fit.forecast(len(test_classic))

# Calcular RMSE
rmse_es = np.sqrt(mean_squared_error(test_classic['y'], pred_es))
print(f"RMSE do modelo de Suavizacao Exponencial: {rmse_es:.2f}")

NameError: name 'futebol_df' is not defined

In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from math import sqrt
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from sklearn.preprocessing import MinMaxScaler
from statsmodels.tsa.holtwinters import ExponentialSmoothing

# Criar dataset sintético
years = np.arange(2000, 2025)
goals = [39, 45, 44, 41, 52, 48, 46, 52, 54, 43, 45, 41, 39, 47, 40, 50, 44, 51, 48, 43, 46, 44, 49, 45, 43]
futebol_df = pd.DataFrame({'ds': pd.to_datetime(years, format='%Y'), 'y': goals})

# Visualizar dataset
plt.figure(figsize=(10,5))
plt.plot(futebol_df['ds'], futebol_df['y'], marker='o')
plt.title('Gols anuais da seleção brasileira (sintético)')
plt.xlabel('Ano')
plt.ylabel('Gols')
plt.grid(True)
plt.show()

# --------------------
# Modelo clássico: Suavização Exponencial
# --------------------
train_classic = futebol_df.iloc[:-5]
test_classic = futebol_df.iloc[-5:]

es_model = ExponentialSmoothing(train_classic['y'], trend='add', seasonal=None)
es_fit = es_model.fit()
pred_es = es_fit.forecast(len(test_classic))

rmse_es = np.sqrt(mean_squared_error(test_classic['y'], pred_es))
print(f"RMSE do modelo de Suavização Exponencial: {rmse_es:.2f}")

# --------------------
# Modelo LSTM
# --------------------
# Escalar dados
scaler = MinMaxScaler(feature_range=(0,1))
data_values = futebol_df[['y']].values
scaled = scaler.fit_transform(data_values)

# Criar sequências
def create_sequences(dataset, look_back=3):
    X, y = [], []
    for i in range(len(dataset) - look_back):
        X.append(dataset[i:(i+look_back), 0])
        y.append(dataset[i+look_back, 0])
    return np.array(X), np.array(y)

look_back = 3
X_all, y_all = create_sequences(scaled, look_back)

# Train-test split
test_size = 5
train_size = len(futebol_df) - test_size - look_back
X_train = X_all[:train_size]
y_train = y_all[:train_size]
X_test = X_all[train_size:]
y_test = y_all[train_size:]

# Reshape para LSTM
X_train = X_train.reshape((X_train.shape[0], look_back, 1))
X_test = X_test.reshape((X_test.shape[0], look_back, 1))

# Construir LSTM
model = Sequential()
model.add(LSTM(50, activation='relu', input_shape=(look_back,1)))
model.add(Dense(1))
model.compile(optimizer='adam', loss='mse')
model.fit(X_train, y_train, epochs=200, verbose=0)

# Prever
y_pred_scaled = model.predict(X_test, verbose=0)
y_pred = scaler.inverse_transform(y_pred_scaled)
y_test_orig = scaler.inverse_transform(y_test.reshape(-1,1))

rmse_lstm = np.sqrt(mean_squared_error(y_test_orig, y_pred))
print(f"RMSE do modelo LSTM: {rmse_lstm:.2f}")

ModuleNotFoundError: No module named 'tensorflow'

In [8]:
try:
    import torch
    print('torch available, version', torch.__version__)
except ImportError as e:
    print('torch not available', e)

torch not available No module named 'torch'


In [10]:
import numpy as np

class SimpleRNN:
    def __init__(self, input_size=1, hidden_size=10, output_size=1, lr=0.01):
        self.hidden_size = hidden_size
        self.lr = lr
        # Weight matrices
        self.Wx = np.random.randn(hidden_size, input_size) * 0.1
        self.Wh = np.random.randn(hidden_size, hidden_size) * 0.1
        self.Wy = np.random.randn(output_size, hidden_size) * 0.1
        # Bias vectors
        self.b = np.zeros((hidden_size, 1))
        self.c = np.zeros((output_size, 1))

    def forward(self, x_seq):
        '''Forward pass for one sequence. Returns hidden states and output.'''
        h = np.zeros((self.hidden_size, 1))
        hidden_states = []
        for x_t in x_seq:
            x_t = x_t.reshape(-1,1)  # ensure column vector
            h = np.tanh(self.Wx @ x_t + self.Wh @ h + self.b)
            hidden_states.append(h)
        y_hat = self.Wy @ h + self.c  # output
        return hidden_states, y_hat

    def backward(self, x_seq, hidden_states, y_hat, y_true):
        '''Backpropagation through time for one sequence.'''
        # Initialize gradients
        dWx = np.zeros_like(self.Wx)
        dWh = np.zeros_like(self.Wh)
        dWy = np.zeros_like(self.Wy)
        db = np.zeros_like(self.b)
        dc = np.zeros_like(self.c)

        # Output error
        dy = (y_hat - y_true)  # shape (1,1)
        dWy += dy @ hidden_states[-1].T  # (1, hidden)
        dc += dy

        # Backprop through time
        dh_next = self.Wy.T @ dy  # (hidden,1)
        for t in reversed(range(len(x_seq))):
            h_t = hidden_states[t]
            # derivative of tanh
            dtanh = (1 - h_t * h_t) * dh_next  # (hidden,1)
            db += dtanh
            x_t = x_seq[t].reshape(-1,1)
            dWx += dtanh @ x_t.T
            h_prev = hidden_states[t-1] if t>0 else np.zeros_like(h_t)
            dWh += dtanh @ h_prev.T
            dh_next = self.Wh.T @ dtanh
        return dWx, dWh, dWy, db, dc

    def update(self, dWx, dWh, dWy, db, dc):
        '''Gradient descent update.'''
        self.Wx -= self.lr * dWx
        self.Wh -= self.lr * dWh
        self.Wy -= self.lr * dWy
        self.b  -= self.lr * db
        self.c  -= self.lr * dc

    def train(self, X, y, epochs=100):
        # X: list of sequences (n_samples, seq_len), each sequence is array length look_back
        losses = []
        for epoch in range(epochs):
            total_loss = 0
            # accumulate gradients over all sequences (batch gradient descent)
            sum_dWx = np.zeros_like(self.Wx)
            sum_dWh = np.zeros_like(self.Wh)
            sum_dWy = np.zeros_like(self.Wy)
            sum_db = np.zeros_like(self.b)
            sum_dc = np.zeros_like(self.c)
            for seq, target in zip(X, y):
                seq = seq.reshape(-1,1)  # shape (seq_len,1)
                hidden_states, y_hat = self.forward(seq)
                loss = 0.5 * (y_hat.item() - target) ** 2
                total_loss += loss
                dWx, dWh, dWy, db, dc = self.backward(seq, hidden_states, y_hat, target)
                sum_dWx += dWx
                sum_dWh += dWh
                sum_dWy += dWy
                sum_db  += db
                sum_dc  += dc
            # update weights (use average gradients)
            n = len(X)
            self.update(sum_dWx / n, sum_dWh / n, sum_dWy / n, sum_db / n, sum_dc / n)
            losses.append(total_loss / n)
        return losses

    def predict(self, X):
        preds = []
        for seq in X:
            seq = seq.reshape(-1,1)
            _, y_hat = self.forward(seq)
            preds.append(y_hat.item())
        return np.array(preds)

# ----------
# Preparar dados para RNN
look_back = 3
# Usar dados escalados do dataset futebol_df (definido anteriormente)
values = np.array(goals)  # usar lista de gols
# Normalizar valores entre 0 e 1 para estabilidade
min_v, max_v = values.min(), values.max()
values_norm = (values - min_v) / (max_v - min_v)

# Criar sequências
def create_sequences_rnn(data, look_back):
    X, y = [], []
    for i in range(len(data) - look_back):
        X.append(data[i:i+look_back])
        y.append(data[i+look_back])
    return np.array(X), np.array(y)

X_seq, y_seq = create_sequences_rnn(values_norm, look_back)

# Dividir treino e teste (5 últimos anos)
train_size = len(values_norm) - 5 - look_back
X_train_rnn = X_seq[:train_size]
y_train_rnn = y_seq[:train_size]
X_test_rnn = X_seq[train_size:]
y_test_rnn = y_seq[train_size:]

# Inicializar e treinar o RNN
rnn_model = SimpleRNN(input_size=1, hidden_size=5, output_size=1, lr=0.05)
rnn_model.train(X_train_rnn, y_train_rnn, epochs=200)

# Prever e reverter normalização
pred_norm = rnn_model.predict(X_test_rnn)
pred_rnn = pred_norm * (max_v - min_v) + min_v
y_test_orig_rnn = y_test_rnn * (max_v - min_v) + min_v

rmse_rnn = np.sqrt(mean_squared_error(y_test_orig_rnn, pred_rnn))
print(f"RMSE do modelo RNN (LSTM simplificado): {rmse_rnn:.2f}")

NameError: name 'goals' is not defined

In [11]:
# Treinar e avaliar RNN simplificado
# Lista de gols sintética
goals = [39, 45, 44, 41, 52, 48, 46, 52, 54, 43, 45, 41, 39, 47, 40, 50, 44, 51, 48, 43, 46, 44, 49, 45, 43]
look_back = 3

# Normalizar valores
data = np.array(goals)
min_v, max_v = data.min(), data.max()
values_norm = (data - min_v) / (max_v - min_v)

# Criar sequências para RNN
X_seq, y_seq = create_sequences_rnn(values_norm, look_back)

# Dividir em treino e teste (5 últimos anos)
train_size = len(data) - 5 - look_back
X_train_rnn = X_seq[:train_size]
y_train_rnn = y_seq[:train_size]
X_test_rnn = X_seq[train_size:]
y_test_rnn = y_seq[train_size:]

# Treinar modelo RNN
rnn_model = SimpleRNN(input_size=1, hidden_size=5, output_size=1, lr=0.05)
rnn_model.train(X_train_rnn, y_train_rnn, epochs=300)

# Previsão
y_pred_norm = rnn_model.predict(X_test_rnn)
y_pred = y_pred_norm * (max_v - min_v) + min_v

# Valores reais para teste (reverter normalização)
y_test_orig = y_test_rnn * (max_v - min_v) + min_v

rmse_rnn = np.sqrt(mean_squared_error(y_test_orig, y_pred))
print(f"RMSE do modelo RNN (LSTM simplificado): {rmse_rnn:.2f}")
print("Previsões RNN:", y_pred)
print("Valores reais:", y_test_orig)


NameError: name 'create_sequences_rnn' is not defined

In [12]:
# Calcular modelos e métricas de erro
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from statsmodels.tsa.holtwinters import ExponentialSmoothing

# Lista de gols (sintética)
goals = [39, 45, 44, 41, 52, 48, 46, 52, 54, 43, 45, 41, 39, 47, 40, 50, 44, 51, 48, 43, 46, 44, 49, 45, 43]

# 1. Modelo clássico: Suavização exponencial
train_es = goals[:-5]
test_es = goals[-5:]

es_model = ExponentialSmoothing(train_es, trend='add', seasonal=None)
es_fit = es_model.fit()
pred_es = es_fit.forecast(len(test_es))
rmse_es = np.sqrt(mean_squared_error(test_es, pred_es))
print(f"RMSE do modelo de Suavização Exponencial: {rmse_es:.2f}")

# 2. Modelo LSTM simplificado (RNN) implementado com numpy
class SimpleRNN:
    def __init__(self, input_size=1, hidden_size=5, output_size=1, lr=0.05):
        self.hidden_size = hidden_size
        self.lr = lr
        # Pesos e viéses
        self.Wx = np.random.randn(hidden_size, input_size) * 0.1
        self.Wh = np.random.randn(hidden_size, hidden_size) * 0.1
        self.Wy = np.random.randn(output_size, hidden_size) * 0.1
        self.b = np.zeros((hidden_size, 1))
        self.c = np.zeros((output_size, 1))

    def forward(self, x_seq):
        h = np.zeros((self.hidden_size, 1))
        hidden_states = []
        for x_t in x_seq:
            x_t = x_t.reshape(-1,1)
            h = np.tanh(self.Wx @ x_t + self.Wh @ h + self.b)
            hidden_states.append(h)
        y_hat = self.Wy @ h + self.c
        return hidden_states, y_hat

    def backward(self, x_seq, hidden_states, y_hat, y_true):
        dWx = np.zeros_like(self.Wx)
        dWh = np.zeros_like(self.Wh)
        dWy = np.zeros_like(self.Wy)
        db = np.zeros_like(self.b)
        dc = np.zeros_like(self.c)

        dy = (y_hat - y_true)
        dWy += dy @ hidden_states[-1].T
        dc += dy

        dh_next = self.Wy.T @ dy
        for t in reversed(range(len(x_seq))):
            h_t = hidden_states[t]
            dtanh = (1 - h_t * h_t) * dh_next
            db += dtanh
            x_t = x_seq[t].reshape(-1,1)
            dWx += dtanh @ x_t.T
            h_prev = hidden_states[t-1] if t>0 else np.zeros_like(h_t)
            dWh += dtanh @ h_prev.T
            dh_next = self.Wh.T @ dtanh
        return dWx, dWh, dWy, db, dc

    def update(self, dWx, dWh, dWy, db, dc):
        self.Wx -= self.lr * dWx
        self.Wh -= self.lr * dWh
        self.Wy -= self.lr * dWy
        self.b  -= self.lr * db
        self.c  -= self.lr * dc

    def train(self, X, y, epochs=300):
        for epoch in range(epochs):
            # acumular gradientes
            sum_dWx = np.zeros_like(self.Wx)
            sum_dWh = np.zeros_like(self.Wh)
            sum_dWy = np.zeros_like(self.Wy)
            sum_db = np.zeros_like(self.b)
            sum_dc = np.zeros_like(self.c)
            for seq, target in zip(X, y):
                seq = seq.reshape(-1,1)
                hidden_states, y_hat = self.forward(seq)
                dWx, dWh, dWy, db, dc = self.backward(seq, hidden_states, y_hat, target)
                sum_dWx += dWx
                sum_dWh += dWh
                sum_dWy += dWy
                sum_db  += db
                sum_dc  += dc
            n = len(X)
            self.update(sum_dWx / n, sum_dWh / n, sum_dWy / n, sum_db / n, sum_dc / n)

    def predict(self, X):
        preds = []
        for seq in X:
            seq = seq.reshape(-1,1)
            _, y_hat = self.forward(seq)
            preds.append(y_hat.item())
        return np.array(preds)

# Preparar dados para RNN
look_back = 3
# Normalizar
data_arr = np.array(goals)
min_v, max_v = data_arr.min(), data_arr.max()
values_norm = (data_arr - min_v) / (max_v - min_v)
# Criar sequências
X_seq = []
y_seq = []
for i in range(len(values_norm) - look_back):
    X_seq.append(values_norm[i:i+look_back])
    y_seq.append(values_norm[i+look_back])
X_seq = np.array(X_seq)
y_seq = np.array(y_seq)

# Dividir treino e teste (5 últimos anos)
train_size_rnn = len(goals) - 5 - look_back
X_train_rnn = X_seq[:train_size_rnn]
y_train_rnn = y_seq[:train_size_rnn]
X_test_rnn = X_seq[train_size_rnn:]
y_test_rnn = y_seq[train_size_rnn:]

# Inicializar e treinar RNN
rnn = SimpleRNN(input_size=1, hidden_size=5, output_size=1, lr=0.05)
rnn.train(X_train_rnn, y_train_rnn, epochs=300)

# Prever e reverter normalização
pred_norm = rnn.predict(X_test_rnn)
pred_rnn = pred_norm * (max_v - min_v) + min_v
y_test_orig_rnn = y_test_rnn * (max_v - min_v) + min_v

rmse_rnn = np.sqrt(mean_squared_error(y_test_orig_rnn, pred_rnn))
print(f"RMSE do modelo RNN (LSTM simplificado): {rmse_rnn:.2f}")
print("Previsões RNN:", pred_rnn)
print("Valores reais:", y_test_orig_rnn)


RMSE do modelo de Suavização Exponencial: 2.31
RMSE do modelo RNN (LSTM simplificado): 2.18
Previsões RNN: [46.13408418 46.121039   46.13079382 46.10403825 46.12322685]
Valores reais: [46. 44. 49. 45. 43.]
