<a href="https://colab.research.google.com/github/cutemfc/retail_demand_forecast/blob/main/Week4_streamlit.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Run Streamlit in Colab

In [1]:
# Mount Google Drive so the project files under /content/drive/MyDrive
# (data, model and app sources used by the cells below) are reachable.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Step 1: Install the streamlit and cloudflared packages

In [2]:
!pip install streamlit -q
!pip install cloudflared



# Step 2: Download the cloudflared binary

In [3]:
# Download the latest Linux amd64 cloudflared release and mark it executable.
# The output path is relative, so the binary is saved in the directory that
# is current when this cell runs.
!wget -q https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-linux-amd64 -O cloudflared
!chmod +x cloudflared

# Step 3: Write the Streamlit app source files

In [4]:
# Create data/config.py
%%writefile /content/drive/MyDrive/retail_kaggle_data/data/config.py
DATA_PATH = "/content/drive/MyDrive/retail_kaggle_data/data/"
MODEL_PATH = "/content/drive/MyDrive/retail_kaggle_data/models/xgboost_model_revised.pkl"

Overwriting /content/drive/MyDrive/retail_kaggle_data/data/config.py


In [5]:
# Create data/utils
%%writefile /content/drive/MyDrive/retail_kaggle_data/data/data_utils.py
import os
import pandas as pd
import numpy as np
from scipy.stats import zscore
def load_data(DATA_PATH):
    """Read the three CSV inputs found in ``DATA_PATH``.

    Args:
        DATA_PATH: Directory containing ``stores.csv``, ``items.csv`` and
            ``df_train_revised.csv``.

    Returns:
        Tuple ``(df_store, df_item, df_train)`` of DataFrames, in that order.
    """
    # df_train_revised.csv: only selected stores in Guayas, top 3 families.
    csv_names = ('stores.csv', 'items.csv', 'df_train_revised.csv')
    stores, items, train = (
        pd.read_csv(os.path.join(DATA_PATH, csv_name)) for csv_name in csv_names
    )
    return stores, items, train

def prepare_training_data(df_train):
    """Add lag, rolling and calendar features to the sales history.

    NOTE: ``df_train`` is modified in place. After the call it is indexed by
    ``date``, carries the new feature columns, and has lost the rows whose
    lag/rolling windows are incomplete. The returned frame is a copy with
    ``date`` restored as an ordinary column.

    Args:
        df_train: DataFrame with at least ``date`` and ``unit_sales`` columns.

    Returns:
        DataFrame with ``date`` as a column plus ``lag_1``, ``lag_7``,
        ``lag_30``, ``rolling_mean_7``, ``rolling_std_7``, ``year``,
        ``month``, ``day_of_week`` and ``is_weekend``.
    """
    df_train['date'] = pd.to_datetime(df_train['date'])
    df_train.set_index('date', inplace=True)

    # Lag features
    # NOTE(review): shifts and rolling windows run over the whole frame in row
    # order, not per (store, item) series -- confirm this matches how the
    # model's training features were built before changing it.
    sales = df_train['unit_sales']
    for lag in (1, 7, 30):
        df_train[f'lag_{lag}'] = sales.shift(lag)

    # Rolling features
    weekly_window = sales.rolling(window=7)
    df_train['rolling_mean_7'] = weekly_window.mean()
    df_train['rolling_std_7'] = weekly_window.std()
    df_train.dropna(inplace=True)

    # Time-based features
    df_train['year'] = df_train.index.year
    df_train['month'] = df_train.index.month
    df_train['day_of_week'] = df_train.index.dayofweek
    df_train['is_weekend'] = (df_train['day_of_week'] >= 5).astype(int)

    # Outlier detection and replacement.
    # A positional boolean mask is required here: the date index is not unique
    # when several rows share a date, so selecting by the outliers' index
    # labels would also overwrite every other row on the same date.
    is_outlier = np.asarray(zscore(df_train['unit_sales'])) > 5
    df_train.loc[is_outlier, 'unit_sales'] = (
        df_train.loc[is_outlier, 'rolling_mean_7'].to_numpy()
    )
    return df_train.reset_index()  # Restore 'date' as a column

def create_input_row(store_id, item_id, date, df_train):
    """Build the model feature row for one store/item on a forecast date.

    The most recent historical row strictly before ``date`` is reused (so its
    lag/rolling values are carried over unchanged) and only the calendar
    features are overwritten to match ``date``.

    Args:
        store_id: Value matched against the ``store_nbr`` column.
        item_id: Value matched against the ``item_nbr`` column.
        date: Forecast date; anything ``pd.to_datetime`` accepts.
        df_train: Processed history with ``date`` as a datetime column.

    Returns:
        pandas Series of features, with ``unit_sales`` and ``date`` removed.

    Raises:
        ValueError: If the store/item pair has no rows at all, or none dated
            before ``date``.
    """
    date = pd.to_datetime(date)
    pair_mask = (df_train['store_nbr'] == store_id) & (df_train['item_nbr'] == item_id)
    relevant_data = df_train[pair_mask]

    if relevant_data.empty:
        raise ValueError("No historical data for this item/store combination.")

    # Without this guard an early forecast date fell through to .iloc[-1] on an
    # empty frame and surfaced as an opaque IndexError.
    history = relevant_data[relevant_data['date'] < date]
    if history.empty:
        raise ValueError(
            f"No historical data before {date.date()} for this item/store combination."
        )

    latest_row = history.sort_values(by='date').iloc[-1]

    input_data = latest_row.copy()
    input_data['date'] = date
    input_data['year'] = date.year
    input_data['month'] = date.month
    input_data['day_of_week'] = date.dayofweek
    input_data['is_weekend'] = 1 if date.dayofweek >= 5 else 0

    return input_data.drop(['unit_sales', 'date'])  # X features only

Overwriting /content/drive/MyDrive/retail_kaggle_data/data/data_utils.py


In [6]:
%%writefile /content/drive/MyDrive/retail_kaggle_data/models/model_utils.py
import pickle

def load_model(MODEL_PATH):
    """Unpickle and return the object stored at ``MODEL_PATH``.

    Args:
        MODEL_PATH: Path of the pickle file to read.

    Returns:
        Whatever object the pickle file contains.
    """
    # Unpickling can execute arbitrary code: only load files you trust.
    with open(MODEL_PATH, 'rb') as model_file:
        return pickle.load(model_file)

def predict(model, input_df):
    """Return ``model.predict(input_df)``.

    Args:
        model: Any object exposing a ``predict`` method.
        input_df: Features handed to ``model.predict`` unchanged.

    Returns:
        The value produced by ``model.predict``.
    """
    predictions = model.predict(input_df)
    return predictions

Overwriting /content/drive/MyDrive/retail_kaggle_data/models/model_utils.py


In [7]:
# Create utils for data visualization and prediction evaluation
%%writefile /content/drive/MyDrive/retail_kaggle_data/utils.py
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import streamlit as st
from sklearn.metrics import mean_squared_error, r2_score

def _forecast_metrics(y_true, y_pred):
    """Return ``(bias, mae, rmse, mape, r2)`` for one actual/predicted pair.

    MAPE is averaged only over rows whose actual value is non-zero, because a
    single zero actual would otherwise turn the figure into inf/NaN; it is NaN
    when every actual is zero.
    """
    actual = np.asarray(y_true, dtype=float)
    forecast = np.asarray(y_pred, dtype=float)
    error = forecast - actual
    nonzero = actual != 0
    if nonzero.any():
        mape = np.mean(np.abs(error[nonzero] / actual[nonzero])) * 100
    else:
        mape = np.nan
    return (
        np.mean(error),
        np.mean(np.abs(error)),
        np.sqrt(mean_squared_error(actual, forecast)),
        mape,
        r2_score(actual, forecast),
    )


def run_visualization(df_train, model, split_date='2014-01-01', max_plots=5):
    """Plot and score the model per (item, store) series in a Streamlit page.

    For every series with more than five rows on or after ``split_date`` the
    model predicts those rows, the metrics are written to the page, and for
    the first ``max_plots`` series a train/actual/predicted chart is drawn.

    Args:
        df_train: Feature frame with ``item_nbr``, ``store_nbr`` and
            ``unit_sales`` columns; ``date`` may be a column or the index.
        model: Object with a ``predict`` method accepting the feature columns
            (everything except ``unit_sales`` and ``date``).
        split_date: First date of the evaluation window.
        max_plots: Maximum number of charts to render.

    Returns:
        DataFrame with one row per evaluated series and the columns
        ``store_nbr``, ``item_nbr``, ``bias``, ``rmad``, ``rmse``, ``mape``
        and ``r2``. Callers that ignore the return value are unaffected.
    """
    split_date = pd.to_datetime(split_date)  # Ensure datetime
    metric_rows = []
    plot_count = 0

    for (item_nbr, store_nbr), group in df_train.groupby(['item_nbr', 'store_nbr']):
        # Only lift the index into a column when 'date' lives in the index;
        # resetting a plain integer index would add a stray 'index' feature.
        group = group.reset_index() if 'date' not in group.columns else group.copy()
        group['date'] = pd.to_datetime(group['date'])  # Ensure datetime
        test_series = group[group['date'] >= split_date]

        if len(test_series) <= 5:
            continue

        X_test = test_series.drop(['unit_sales', 'date'], axis=1)
        y_test = test_series['unit_sales']
        y_pred = model.predict(X_test)

        if plot_count < max_plots:
            train_series = group[group['date'] < split_date]

            fig, ax = plt.subplots(figsize=(12, 6))
            ax.plot(train_series['date'], train_series['unit_sales'], label='Train Sales', color='black')
            ax.plot(test_series['date'], y_test, label='Actual Sales', color='blue')
            ax.plot(test_series['date'], y_pred, label='Predicted Sales', color='red')
            ax.set_title(f'Store {store_nbr}, Item {item_nbr}', fontsize=16)
            ax.set_xlabel('Date')
            ax.set_ylabel('Unit Sales')
            ax.tick_params(axis='x', labelrotation=45)
            ax.legend()
            st.pyplot(fig)
            plt.close(fig)  # Free the figure; many series can be plotted per rerun
            plot_count += 1

        # Calculate the metrics ("RMAD" here is the mean absolute error)
        bias, rmad, rmse, mape, r2 = _forecast_metrics(y_test, y_pred)

        # Collect the data
        metric_rows.append({
            'store_nbr': store_nbr,
            'item_nbr': item_nbr,
            'bias': bias,
            'rmad': rmad,
            'rmse': rmse,
            'mape': mape,
            'r2': r2,
        })

        # Show the data
        st.markdown(f"### Store {store_nbr}, Item {item_nbr}")
        st.write(
            f"**Bias:** {bias:.2f} &nbsp;&nbsp; "
            f"**RMAD:** {rmad:.2f} &nbsp;&nbsp; "
            f"**RMSE:** {rmse:.2f} &nbsp;&nbsp; "
            f"**MAPE:** {mape:.2f}% &nbsp;&nbsp; "
            f"**R²:** {r2:.2f}"
        )

    return pd.DataFrame(
        metric_rows,
        columns=['store_nbr', 'item_nbr', 'bias', 'rmad', 'rmse', 'mape', 'r2'],
    )




Overwriting /content/drive/MyDrive/retail_kaggle_data/utils.py


In [8]:
%%writefile /content/drive/MyDrive/retail_kaggle_data/app.py
import sys
import os
import streamlit as st
from data.config import DATA_PATH, MODEL_PATH
from data.data_utils import load_data, prepare_training_data, create_input_row
from models.model_utils import load_model, predict
import datetime  # Used for handling date inputs
import pandas as pd
import numpy as np
import pickle
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error, r2_score
from utils import run_visualization


@st.cache_resource  # Cached: the CSVs are read and processed once, then reused on reruns
def cache_load_data():
    """Load the CSV inputs and build the processed training frame.

    Returns:
        Tuple ``(df_store, df_item, df_train, df_train_processed)``, where
        ``df_train`` is the same object that was passed to
        ``prepare_training_data``.
    """
    stores, items, sales = load_data(DATA_PATH)
    return stores, items, sales, prepare_training_data(sales)
@st.cache_resource
def cache_load_model():
    """Load the model from ``MODEL_PATH``; the result is cached by Streamlit."""
    model = load_model(MODEL_PATH)
    return model

def run_app():
    """Render the forecasting page: inputs, single-date forecast, evaluation."""
    st.title("Corporación Favorita Sales Forecasting")
    # Load data and model from cache (the store/item lookup tables are not used here)
    _, _, df_train, df_train_processed = cache_load_data()
    model = cache_load_model()

    # Store selection
    store_id = st.selectbox("Store", [1])  # For testing limit to one store
    item_id = st.selectbox("Item", [564533, 838216, 582865, 364606])  # For testing limit to a few items

    # Set default and allowed date range for forecasting
    default_date = datetime.date(2014, 1, 1)  # Default date is Jan 1, 2014
    min_date = datetime.date(2013, 1, 1)  # Minimum date allowed is January 1, 2013
    max_date = datetime.date(2014, 4, 1)  # Maximum date allowed is April 1, 2014

    # Date input for selecting forecast date, within the range [min_date, max_date]
    date = st.date_input("Forecast Date", value=default_date, min_value=min_date, max_value=max_date)

    # When the user clicks the "Get Forecast" button
    if st.button("Get Forecast"):
        try:
            input_data = create_input_row(store_id, item_id, date, df_train_processed)
        except (ValueError, IndexError) as exc:
            # E.g. no history for the pair, or none before the chosen date:
            # report it on the page instead of crashing with a traceback.
            st.error(f"Cannot build a forecast for {date}: {exc}")
        else:
            # The row comes from a mixed-type Series, so the one-row frame is
            # object-dtype; restore numeric dtypes before handing it to the model.
            features = input_data.to_frame().T.infer_objects()
            # Use the model to predict sales based on the input data
            prediction = predict(model, features)
            # Display the predicted sales for the selected date
            st.write(f"Predicted Sales for {date}: {prediction[0]}")

    # Visualization
    # df_train (not df_train_processed) is passed on purpose: it was modified
    # in place by prepare_training_data, so it is indexed by date and already
    # carries the feature columns run_visualization feeds to the model.
    split_date = pd.to_datetime(date)
    run_visualization(df_train, model, split_date)

# Entry point: build the page only when this file is executed as the main
# script (not when it is imported as a module).
if __name__ == "__main__":
    run_app()


Overwriting /content/drive/MyDrive/retail_kaggle_data/app.py


# Step 4: Run streamlit in the background

In [9]:
# Switch to the project folder so app.py and the data/ and models/ packages
# it imports can be found.
%cd /content/drive/MyDrive/retail_kaggle_data/
# Start Streamlit in the background; stdout and stderr both go to logs.txt.
!streamlit run app.py &> logs.txt &

/content/drive/MyDrive/retail_kaggle_data


# Step 5: Create a Cloudflare Tunnel

In [10]:
# The %cd in Step 4 moved the working directory to the Drive project folder,
# so this relative path refers to a cloudflared file inside that folder.
!chmod +x cloudflared

In [None]:
!./cloudflared tunnel --url http://localhost:8501

[90m2025-05-18T21:14:57Z[0m [32mINF[0m Thank you for trying Cloudflare Tunnel. Doing so, without a Cloudflare account, is a quick way to experiment and try it out. However, be aware that these account-less Tunnels have no uptime guarantee, are subject to the Cloudflare Online Services Terms of Use (https://www.cloudflare.com/website-terms/), and Cloudflare reserves the right to investigate your use of Tunnels for violations of such terms. If you intend to use Tunnels in production you should use a pre-created named tunnel by following: https://developers.cloudflare.com/cloudflare-one/connections/connect-apps
[90m2025-05-18T21:14:57Z[0m [32mINF[0m Requesting new quick Tunnel on trycloudflare.com...
[90m2025-05-18T21:15:01Z[0m [32mINF[0m +--------------------------------------------------------------------------------------------+
[90m2025-05-18T21:15:01Z[0m [32mINF[0m |  Your quick Tunnel has been created! Visit it at (it may take some time to be reachable):  |
[90m2025