# Robyn: Marketing Mix Modeling Application

This notebook demonstrates the usage of Robyn, a Marketing Mix Modeling (MMM) application.
We'll go through the main steps: preparing the inputs (robyn_inputs), feature engineering (robyn_engineering), calibration, and model training.



## 1. Import Required Libraries. Define Paths.

First, be sure to set up your virtual environment, and switch over to the new environment in this notebook.

- ```cd {root_folder}```

- ```python3 -m venv yourvenv```

- ```source yourvenv/bin/activate```

- ```cd Robyn/python```

- ```pip install -r requirements.txt```


Then import the necessary libraries. Make sure to define your paths below.



In [None]:
import sys

# Directory that holds the sample .RData files; load_data() joins file names onto it.
# NOTE: machine-specific absolute path — replace it with the location on your system.
base_path = "/Users/yijuilee/project_robyn/robynpy_interfaces/Robyn/R/data"
# Presumably the source root of the `robyn` Python package imported in the next cell.
# NOTE: machine-specific absolute path — replace it with the location on your system.
python_path = "/Users/yijuilee/robynpy_release_reviews/Robyn/python/src"
# Append both directories to the module search path.
# NOTE(review): base_path is a data directory; confirm it really needs to be on sys.path.
sys.path.append(base_path)
sys.path.append(python_path)

In [None]:
import os
import numpy as np
import pandas as pd
import pyreadr
from typing import Dict, Any
from robyn.data.entities.mmmdata import MMMData
from robyn.data.entities.enums import AdstockType
from robyn.data.entities.holidays_data import HolidaysData
from robyn.data.entities.hyperparameters import Hyperparameters, ChannelHyperparameters
from robyn.data.entities.calibration_input import CalibrationInput
from robyn.modeling.entities.modelrun_trials_config import TrialsConfig
from robyn.modeling.model_executor import ModelExecutor
from robyn.modeling.ridge_model_builder import RidgeModelBuilder
from robyn.modeling.entities.enums import NevergradAlgorithm, Models
from robyn.modeling.feature_engineering import FeaturizedMMMData, FeatureEngineering
from robyn.calibration.media_effect_calibration import MediaEffectCalibrator

## 2.1 Load Mock R data

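We need to set the base path for the data directory.
Create a .env file in the same directory as your notebook and define the path to the data directory in it,
for example: `ROBYN_BASE_PATH=.../Robyn/R/data`. Note that the loading cell below reads the location from the `base_path` variable defined in the first code cell, so make sure it points to the same directory.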

In [None]:
def load_data() -> Dict[str, pd.DataFrame]:
    """Read the two sample R datasets found under ``base_path``.

    Returns:
        A dict with the keys "dt_simulated_weekly" and "dt_prophet_holidays";
        each value is the object of that name read from the matching
        ``.RData`` file.

    Raises:
        EnvironmentError: If ``base_path`` is empty or otherwise falsy.
    """
    if not base_path:
        raise EnvironmentError("Please set the ROBYN_BASE_PATH environment variable")

    # Object name inside the R file -> file name under base_path.
    sources = {
        "dt_simulated_weekly": "dt_simulated_weekly.RData",
        "dt_prophet_holidays": "dt_prophet_holidays.RData",
    }
    return {
        object_name: pyreadr.read_r(os.path.join(base_path, file_name))[object_name]
        for object_name, file_name in sources.items()
    }


# Load both sample datasets and preview the first rows of each.
# NOTE(review): a notebook cell normally renders only its last bare expression, so the
# first .head() preview is presumably not shown — confirm, and wrap it in display() if wanted.
data = load_data()
data["dt_simulated_weekly"].head()
data["dt_prophet_holidays"].head()

## Setup MMM Data

We will now set up the MMM data specification which includes defining the dependent variable, independent variables, and the time window for analysis.

In [None]:
def setup_mmm_data(data: Dict[str, pd.DataFrame]) -> MMMData:
    """Wrap the simulated weekly dataset in an ``MMMData`` object.

    Args:
        data: Mapping that must contain the key "dt_simulated_weekly".

    Returns:
        An ``MMMData`` built from that frame and a fixed ``MMMDataSpec``
        (revenue as dependent variable, five paid media channels, one organic
        variable, window 2016-01-01 to 2018-12-31).
    """
    weekly_frame = data["dt_simulated_weekly"]

    # Keyword arguments for the spec, gathered in one place for readability.
    spec_options = {
        "dep_var": "revenue",
        "dep_var_type": "revenue",
        "date_var": "DATE",
        "context_vars": ["competitor_sales_B", "events"],
        "paid_media_spends": ["tv_S", "ooh_S", "print_S", "facebook_S", "search_S"],
        "paid_media_vars": ["tv_S", "ooh_S", "print_S", "facebook_I", "search_clicks_P"],
        "organic_vars": ["newsletter"],
        "window_start": "2016-01-01",
        "window_end": "2018-12-31",
    }
    spec = MMMData.MMMDataSpec(**spec_options)

    return MMMData(data=weekly_frame, mmmdata_spec=spec)


# Build the MMMData object from the loaded datasets and preview its first rows.
mmm_data = setup_mmm_data(data)
mmm_data.data.head()

## Feature Preprocessing

We will perform feature engineering to prepare the data for modeling. This includes transformations like adstock and other preprocessing steps.

In [None]:
# Per-channel hyperparameters: one ChannelHyperparameters entry for each paid media spend
# column plus the organic "newsletter" variable.
# NOTE(review): each alphas/gammas/thetas pair is presumably a [lower, upper] search range,
# and train_size likewise — confirm against the Hyperparameters documentation.
hyperparameters = Hyperparameters(
    {
        "facebook_S": ChannelHyperparameters(
            alphas=[0.5, 3],
            gammas=[0.3, 1],
            thetas=[0, 0.3],
        ),
        "print_S": ChannelHyperparameters(
            alphas=[0.5, 3],
            gammas=[0.3, 1],
            thetas=[0.1, 0.4],
        ),
        "tv_S": ChannelHyperparameters(
            alphas=[0.5, 3],
            gammas=[0.3, 1],
            thetas=[0.3, 0.8],
        ),
        "search_S": ChannelHyperparameters(
            alphas=[0.5, 3],
            gammas=[0.3, 1],
            thetas=[0, 0.3],
        ),
        "ooh_S": ChannelHyperparameters(
            alphas=[0.5, 3],
            gammas=[0.3, 1],
            thetas=[0.1, 0.4],
        ),
        "newsletter": ChannelHyperparameters(
            alphas=[0.5, 3],
            gammas=[0.3, 1],
            thetas=[0.1, 0.4],
        ),
    },
    # Settings shared by all channels.
    adstock=AdstockType.GEOMETRIC,
    lambda_=0.0,
    train_size=[0.5, 0.8],
)

print("Hyperparameters setup complete.")

In [None]:
# Create the HolidaysData object from the holidays dataset loaded earlier.
# prophet_signs carries one entry per name in prophet_vars.
holidays_data = HolidaysData(
    dt_holidays=data["dt_prophet_holidays"],
    prophet_vars=["trend", "season", "holiday"],
    prophet_country="DE",
    prophet_signs=["default", "default", "default"],
)
# Create the FeatureEngineering helper; the featurized data itself is produced later by
# calling perform_feature_engineering() on it.
feature_engineering = FeatureEngineering(mmm_data, hyperparameters, holidays_data)

## Add Calibration Step

In [None]:
from robyn.data.entities.enums import CalibrationScope, DependentVarType
from robyn.data.entities.calibration_input import CalibrationInput, ChannelCalibrationData
from robyn.calibration.media_effect_calibration import MediaEffectCalibrator
import pandas as pd

# Create sample calibration data. Every key is a tuple of channel names.
# A single-channel key needs the trailing comma: ("facebook_S") is just the string
# "facebook_S", whereas ("facebook_S",) is a one-element tuple. With string keys,
# "+".join(key) would join the individual characters instead of channel names.
channel_calibration_data = {
    ("facebook_S",): ChannelCalibrationData(
        lift_start_date=pd.Timestamp("2018-05-01"),
        lift_end_date=pd.Timestamp("2018-06-10"),
        lift_abs=400000.0,
        spend=421000.0,
        confidence=0.85,
        metric=DependentVarType.REVENUE,
        calibration_scope=CalibrationScope.IMMEDIATE,
    ),
    ("tv_S",): ChannelCalibrationData(
        lift_start_date=pd.Timestamp("2018-04-03"),
        lift_end_date=pd.Timestamp("2018-06-03"),
        lift_abs=300000.0,
        spend=7100.0,
        confidence=0.8,
        metric=DependentVarType.REVENUE,
        calibration_scope=CalibrationScope.IMMEDIATE,
    ),
    ("facebook_S", "search_S"): ChannelCalibrationData(  # Tuple for combined channels
        lift_start_date=pd.Timestamp("2018-07-01"),
        lift_end_date=pd.Timestamp("2018-07-20"),
        lift_abs=700000.0,
        spend=350000.0,
        confidence=0.99,
        metric=DependentVarType.REVENUE,
        calibration_scope=CalibrationScope.IMMEDIATE,
    ),
    ("newsletter",): ChannelCalibrationData(
        lift_start_date=pd.Timestamp("2017-12-01"),
        lift_end_date=pd.Timestamp("2017-12-31"),
        lift_abs=200.0,
        spend=0.0,
        confidence=0.95,
        metric=DependentVarType.REVENUE,
        calibration_scope=CalibrationScope.IMMEDIATE,
    ),
}

# Create the CalibrationInput object directly since keys are already tuples
calibration_input = CalibrationInput(channel_data=channel_calibration_data)

# Convert to a DataFrame for display: one row per calibration entry, values pre-formatted
# as strings. The loop variable is deliberately NOT called `data`, so the notebook-global
# `data` dict of loaded datasets is not overwritten and earlier cells can be re-run.
df_data = [
    {
        "channel": "+".join(channels),
        "lift_start_date": lift_data.lift_start_date.strftime("%Y-%m-%d"),
        "lift_end_date": lift_data.lift_end_date.strftime("%Y-%m-%d"),
        "lift_abs": f"{lift_data.lift_abs:,.0f}",
        "spend": f"{lift_data.spend:,.0f}",
        "confidence": f"{lift_data.confidence:.2f}",
        "metric": lift_data.metric.value,
        "calibration_scope": lift_data.calibration_scope.value,
    }
    for channels, lift_data in calibration_input.channel_data.items()
]

df_calibration_input = pd.DataFrame(df_data)
display(df_calibration_input)

In [None]:
import logging
from typing import Dict


# Define the coefficient function locally in the notebook
def get_model_coefficients(mmm_data: MMMData) -> Dict[str, float]:
    """Approximate a coefficient per channel from its correlation with the target.

    Args:
        mmm_data: Object exposing ``data`` (a DataFrame) and ``mmmdata_spec``
            with ``dep_var``, ``paid_media_spends`` and ``organic_vars``.

    Returns:
        A dict mapping every paid media spend column and organic variable, in
        that order, to the absolute value of its correlation with the
        dependent-variable column.
    """
    spec = mmm_data.mmmdata_spec
    frame = mmm_data.data
    target_column = spec.dep_var
    channel_names = spec.paid_media_spends + spec.organic_vars

    # The absolute correlation stands in for a real model coefficient.
    return {name: abs(frame[name].corr(frame[target_column])) for name in channel_names}


# Configure logging to show debug messages
logging.basicConfig(
    level=logging.DEBUG, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S"
)

# Get model coefficients (absolute correlations, see get_model_coefficients)
model_coefficients = get_model_coefficients(mmm_data)
print("\nEstimated model coefficients:")
for channel, coef in model_coefficients.items():
    print(f"{channel}: {coef:.4f}")

# Initialize calibration engine with coefficients
calibration_engine = MediaEffectCalibrator(
    mmm_data=mmm_data,
    hyperparameters=hyperparameters,
    calibration_input=calibration_input,
    model_coefficients=model_coefficients,
)

# Perform calibration
calibration_results = calibration_engine.calibrate()

print("\nCalibration Results:")
# Access channel scores specifically; keys are indexed and joined like tuples of channel names
for channel, score in calibration_results.channel_scores.items():
    if len(channel) == 1:
        print(f"{channel[0]}: MAPE = {score:.2%}")
    else:
        print(f"{' + '.join(channel)}: MAPE = {score:.2%}")

print(f"\nOverall Model Calibrated: {calibration_results.is_model_calibrated()}")

# Print detailed channel information
print("\nDetailed Channel Information:")
# The date column is the same for every entry, so look it up once.
date_col = mmm_data.mmmdata_spec.date_var
# The loop variable is deliberately NOT called `data`, so the notebook-global `data` dict
# of loaded datasets is not overwritten and earlier cells can still be re-run.
for channel_tuple, lift_data in calibration_input.channel_data.items():
    # Show a bare channel name for single-channel entries, the full tuple otherwise.
    channels = channel_tuple if len(channel_tuple) > 1 else channel_tuple[0]
    print(f"\nChannel: {channels}")
    print(f"Lift Period: {lift_data.lift_start_date} to {lift_data.lift_end_date}")
    print(f"Expected Lift: {lift_data.lift_abs:,.2f}")
    print(f"Spend: {lift_data.spend:,.2f}")
    print(f"Confidence: {lift_data.confidence:.2%}")

    # Get actual values from data for this period (both bounds inclusive)
    mask = (mmm_data.data[date_col] >= lift_data.lift_start_date) & (
        mmm_data.data[date_col] <= lift_data.lift_end_date
    )

    if isinstance(channels, tuple):
        # Combined entry: add up the column totals of every channel in the tuple.
        actual_values = sum(mmm_data.data.loc[mask, ch].sum() for ch in channels)
    else:
        actual_values = mmm_data.data.loc[mask, channels].sum()

    print(f"Actual Values Sum: {actual_values:,.2f}")

In [None]:
# Run feature engineering; the result is passed to the plotting and model-run cells below.
featurized_mmm_data = feature_engineering.perform_feature_engineering()

In [None]:
from robyn.visualization.feature_visualization import FeaturePlotter
import matplotlib.pyplot as plt

# Create a FeaturePlotter instance
feature_plotter = FeaturePlotter(mmm_data, hyperparameters)

# Plot spend-exposure relationship for each channel.
# A ValueError raised while plotting or showing a channel is reported and that channel is
# skipped, so one failing channel does not stop the rest.
for channel in mmm_data.mmmdata_spec.paid_media_spends:
    try:
        fig = feature_plotter.plot_spend_exposure(featurized_mmm_data, channel)
        plt.show()
    except ValueError as e:
        print(f"Skipping {channel}: {str(e)}")

In [None]:
# Number of CPU cores to use for the model run. Defined once so that the progress
# message and the model_run() call below always agree.
n_cores = 8

# Setup ModelExecutor
model_executor = ModelExecutor(
    mmmdata=mmm_data,
    holidays_data=holidays_data,
    hyperparameters=hyperparameters,
    calibration_input=None,  # Add calibration input if available
    featurized_mmm_data=featurized_mmm_data,
)

# Setup TrialsConfig: number of trials and iterations per trial
trials_config = TrialsConfig(iterations=2000, trials=5)

print(
    f">>> Starting {trials_config.trials} trials with {trials_config.iterations} iterations each using {NevergradAlgorithm.TWO_POINTS_DE.value} nevergrad algorithm on {n_cores} cores..."
)

# Run the model

output_models = model_executor.model_run(
    trials_config=trials_config,
    ts_validation=False,  # changed from True to False -> deactivated
    add_penalty_factor=False,
    rssd_zero_penalty=True,
    cores=n_cores,  # Set to the number of cores you want to use
    nevergrad_algo=NevergradAlgorithm.TWO_POINTS_DE,
    intercept=True,
    intercept_sign="non_negative",
    model_name=Models.RIDGE,
)
print("Model training complete.")

# TODO fix graph outputs

In [None]:
# Inspect the first trial of the model run (`output_models.trials[0]`).
trial = output_models.trials[0]


# Function to check if an object has a 'shape' attribute
def has_shape(obj):
    """Return True when ``obj`` exposes a ``shape`` attribute, otherwise False."""
    try:
        obj.shape
    except AttributeError:
        return False
    return True


# Collect the names of the trial's non-callable, non-dunder attributes, then report each
# one's shape, or its type name when it has no shape.
attribute_names = list(
    filter(
        lambda name: not (callable(getattr(trial, name)) or name.startswith("__")),
        dir(trial),
    )
)
for attribute_name in attribute_names:
    attribute_value = getattr(trial, attribute_name)
    if not has_shape(attribute_value):
        print(f"{attribute_name}: No shape attribute, Type = {type(attribute_value).__name__}")
        continue
    print(f"{attribute_name}: Shape = {attribute_value.shape}")

In [None]:
# Inspect the first trial of the model run (`output_models.trials[0]`).
trial = output_models.trials[0]


# Function to check if an object has a 'shape' attribute
def has_shape(obj):
    """Tell whether ``obj`` has a ``shape`` attribute (True) or not (False)."""
    try:
        obj.shape
    except AttributeError:
        return False
    return True


# Get all non-callable, non-dunder attribute names of the trial. For each attribute print
# its shape (or its type when it has none) and, for 2-D values with more than one column,
# the column names when they exist.
attribute_names = [attr for attr in dir(trial) if not callable(getattr(trial, attr)) and not attr.startswith("__")]
for attribute_name in attribute_names:
    attribute_value = getattr(trial, attribute_name)
    if not has_shape(attribute_value):
        print(f"{attribute_name}: No shape attribute, Type = {type(attribute_value).__name__}")
        continue

    print(f"{attribute_name}: Shape = {attribute_value.shape}")
    # Only multi-dimensional values with more than one column can have column names.
    if len(attribute_value.shape) > 1 and attribute_value.shape[1] > 1:
        if hasattr(attribute_value, "columns"):
            # DataFrame-like object
            columns = attribute_value.columns
        else:
            # Structured arrays expose field names via dtype.names. For unstructured
            # arrays dtype.names is None, and objects without a dtype yield None here too.
            columns = getattr(getattr(attribute_value, "dtype", None), "names", None)

        if columns is None:
            print("  No column names available.")
        else:
            print(f"  Columns: {columns}")

In [None]:
# Read the selected model id from the model run output and print it.
best_model_id = output_models.select_id
print(f"Best model ID: {best_model_id}")

In [None]:
from IPython.display import Image, display
import base64
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd


# 1. Display the MOO Distribution Plot
# Shown only when the convergence output contains the key; the stored value is
# base64-decoded into bytes before being handed to IPython's Image.
if "moo_distrb_plot" in output_models.convergence:
    moo_distrb_plot = output_models.convergence["moo_distrb_plot"]
    display(Image(data=base64.b64decode(moo_distrb_plot)))

In [None]:
# 2. Display the MOO Cloud Plot
# Shown only when the convergence output contains the key; the stored value is
# base64-decoded into bytes before being handed to IPython's Image.
if "moo_cloud_plot" in output_models.convergence:
    moo_cloud_plot = output_models.convergence["moo_cloud_plot"]
    display(Image(data=base64.b64decode(moo_cloud_plot)))

In [None]:
# 3. Print convergence messages
# Printed one per line, and only when the convergence output contains "conv_msg".
if "conv_msg" in output_models.convergence:
    for msg in output_models.convergence["conv_msg"]:
        print(msg)

In [None]:
# 4. Display time series validation and convergence plots
# Shown only when the convergence output contains the key.
# NOTE(review): the model run above passes ts_validation=False, so this plot is presumably
# absent and the cell prints nothing — confirm.
if "ts_validation_plot" in output_models.convergence:
    ts_validation_plot = output_models.convergence["ts_validation_plot"]
    display(Image(data=base64.b64decode(ts_validation_plot)))

In [None]:
# Read the selected model id from the model run output and print it.
# NOTE(review): this repeats an identical cell earlier in the notebook — confirm it is still needed.
best_model_id = output_models.select_id
print(f"Best model ID: {best_model_id}")