# Robyn: Marketing Mix Modeling Application

This notebook demonstrates the usage of Robyn, a Marketing Mix Modeling (MMM) application. 
We'll go through the main steps of performing robyn_inputs and robyn_engineering.



## 1. Import Required Libraries. Define Paths.

First, be sure to set up your virtual environment. Be sure to switch over to your new environment in this notebook. 

-```cd {root_folder}```

-```python3 -m venv yourvenv```

-```source yourvenv/bin/activate```

-```cd Robyn/python```

-```pip install -r requirements.txt```


Then import the necessary libraries. Make sure to define your paths below.



In [1]:
import os
import sys

# Add Robyn to path.
# The location can be overridden through the ROBYN_SRC_PATH environment variable so
# the notebook is not tied to a single machine; the original location stays the default.
ROBYN_SRC_PATH = os.environ.get(
    "ROBYN_SRC_PATH", "/Users/yijuilee/robynpy_release_reviews/Robyn/python/src"
)
# Guard the append so re-running this cell does not pile up duplicate entries.
if ROBYN_SRC_PATH not in sys.path:
    sys.path.append(ROBYN_SRC_PATH)

In [2]:
import os
import pandas as pd
import pyreadr
from typing import Dict
from robyn.data.entities.mmmdata import MMMData
from robyn.data.entities.enums import AdstockType
from robyn.data.entities.holidays_data import HolidaysData
from robyn.data.entities.hyperparameters import Hyperparameters, ChannelHyperparameters
from robyn.modeling.entities.modelrun_trials_config import TrialsConfig
from robyn.modeling.model_executor import ModelExecutor
from robyn.modeling.entities.enums import NevergradAlgorithm, Models
from robyn.modeling.feature_engineering import FeatureEngineering

## 2.1 Load Mock R data

We need to set the base path for the data directory.
Create a .env file in the same directory as your notebook and define the path to the data directory in it,
for example: `ROBYN_BASE_PATH=.../Robyn/R/data`

In [None]:
# Load both CSV inputs in one pass: the simulated weekly data first, then the holidays data
dt_simulated_weekly, dt_prophet_holidays = map(
    pd.read_csv,
    ("resources/dt_simulated_weekly.csv", "resources/dt_prophet_holidays.csv"),
)

## Setup MMM Data

We will now set up the MMM data specification which includes defining the dependent variable, independent variables, and the time window for analysis.

In [None]:
def setup_mmm_data(dt_simulated_weekly) -> MMMData:
    """Wrap the input data in an ``MMMData`` object together with its specification.

    Args:
        dt_simulated_weekly: Data passed through unchanged as ``MMMData``'s ``data``.

    Returns:
        An ``MMMData`` built from the input data and an ``MMMData.MMMDataSpec``
        carrying the variable names and the modeling window listed below.
    """
    # Keyword arguments for MMMData.MMMDataSpec, collected in one place
    spec_settings = dict(
        dep_var="revenue",
        dep_var_type="revenue",
        date_var="DATE",
        context_vars=["competitor_sales_B", "events"],
        paid_media_spends=["tv_S", "ooh_S", "print_S", "facebook_S", "search_S"],
        paid_media_vars=["tv_S", "ooh_S", "print_S", "facebook_I", "search_clicks_P"],
        organic_vars=["newsletter"],
        window_start="2016-01-01",
        window_end="2018-12-31",
        factor_vars=["events"],
    )

    return MMMData(
        data=dt_simulated_weekly,
        mmmdata_spec=MMMData.MMMDataSpec(**spec_settings),
    )


# Build the MMMData object from the loaded data, then preview the first rows it holds
mmm_data = setup_mmm_data(dt_simulated_weekly=dt_simulated_weekly)
mmm_data.data.head()

## Feature Preprocessing

We will perform feature engineering to prepare the data for modeling. This includes transformations like adstock and other preprocessing steps.

In [None]:
# Every channel uses the same alphas and gammas ranges; only the thetas range differs,
# so the per-channel thetas are listed once and the entries are generated from them.
channel_thetas = {
    "facebook_S": [0, 0.3],
    "print_S": [0.1, 0.4],
    "tv_S": [0.3, 0.8],
    "search_S": [0, 0.3],
    "ooh_S": [0.1, 0.4],
    "newsletter": [0.1, 0.4],
}

hyperparameters = Hyperparameters(
    {
        channel: ChannelHyperparameters(
            alphas=[0.5, 3],
            gammas=[0.3, 1],
            thetas=thetas,
        )
        for channel, thetas in channel_thetas.items()
    },
    adstock=AdstockType.GEOMETRIC,
    lambda_=0.0,
    train_size=[0.5, 0.8],
)

print("Hyperparameters setup complete.")

In [None]:
# Bundle the holidays table with the Prophet settings (components, country, signs);
# one "default" sign is supplied per Prophet component
prophet_components = ["trend", "season", "holiday"]
holidays_data = HolidaysData(
    dt_holidays=dt_prophet_holidays,
    prophet_vars=prophet_components,
    prophet_country="DE",
    prophet_signs=["default"] * len(prophet_components),
)
# Create the FeatureEngineering object from the MMM data, hyperparameters and holidays data
feature_engineering = FeatureEngineering(mmm_data, hyperparameters, holidays_data)

In [None]:
# Run the feature engineering step and keep its result as featurized_mmm_data,
# which the later cells read (modNLS, dt_mod, dt_modRollWind)
featurized_mmm_data = feature_engineering.perform_feature_engineering()

In [None]:
from robyn.visualization.feature_visualization import FeaturePlotter
import matplotlib.pyplot as plt
%matplotlib inline

# Plotter built from the MMM data and the hyperparameters
feature_plotter = FeaturePlotter(mmm_data, hyperparameters)
# Per-channel entries stored under modNLS["results"]
results_list = featurized_mmm_data.modNLS["results"]
# Draw one spend-exposure figure for every channel named in the results
for channel in (entry["channel"] for entry in results_list):
    try:
        fig = feature_plotter.plot_spend_exposure(featurized_mmm_data, channel)
        plt.show()
    except ValueError as e:
        # A ValueError only skips this channel; the loop carries on with the next one
        print(f"Skipping {channel}: {str(e)}")

In [None]:
from utils.data_mapper import load_data_from_json, import_input_collect

# Location of the JSON file exported from R
r_export_path = "/Users/yijuilee/project_robyn/original/Robyn_original_2/Robyn/robyn_api/data/Feature_InputCollect.json"
raw_input_collect = load_data_from_json(r_export_path)

# Turn the raw R export into Python objects
r_input_collect = import_input_collect(raw_input_collect)

# Pull the individual components out of the converted collection, in a fixed order
r_mmm_data, r_featurized_mmm_data, r_holidays_data, r_hyperparameters = (
    r_input_collect[component]
    for component in ("mmm_data", "featurized_mmm_data", "holidays_data", "hyperparameters")
)

In [None]:
# Distinct "events" values in dt_mod: the Python result first, then the R-derived one
for featurized in (featurized_mmm_data, r_featurized_mmm_data):
    print(featurized.dt_mod["events"].unique())

In [None]:
import pandas as pd

# featurized_mmm_data and r_featurized_mmm_data must already be defined; each exposes
# a dt_mod DataFrame. Summarise the same four columns of both with describe().
compared_columns = ["trend", "season", "holiday", "events"]
python_stats = featurized_mmm_data.dt_mod[compared_columns].describe()
r_stats = r_featurized_mmm_data.dt_mod[compared_columns].describe()
print(python_stats, r_stats, sep="\n")
# Absolute tolerance used when the two summaries are compared
tolerance = 1000


# Function to compare two DataFrames
def compare_stats(python_stats: pd.DataFrame, r_stats: pd.DataFrame, tolerance: float):
    """Assert that two summary-statistic tables agree within an absolute tolerance.

    Every (statistic, column) cell of ``python_stats`` is compared with the cell
    stored under the same labels in ``r_stats``.

    Args:
        python_stats: Table whose index and columns decide which cells are checked.
        r_stats: Table looked up with the same index/column labels.
        tolerance: Largest allowed absolute difference between paired values.

    Raises:
        AssertionError: If a pair differs by more than ``tolerance``, or if exactly
            one value of a pair is NaN.
        KeyError: If ``r_stats`` lacks a label that ``python_stats`` has.
    """
    # Iterate over each column and statistic
    for column in python_stats.columns:
        for stat in python_stats.index:
            python_value = python_stats.loc[stat, column]
            r_value = r_stats.loc[stat, column]
            # A statistic that is NaN on both sides counts as agreement. Without this
            # guard `abs(nan - nan) <= tolerance` is always False and the check would
            # fail even though both tables say the same thing.
            if pd.isna(python_value) and pd.isna(r_value):
                continue
            # Assert that the values are within the specified tolerance
            assert abs(python_value - r_value) <= tolerance, (
                f"Difference in {stat} for {column} exceeds tolerance: "
                f"Python value = {python_value}, R value = {r_value}"
            )


# Run the comparison; the confirmation below is only reached when no assertion fails
compare_stats(python_stats=python_stats, r_stats=r_stats, tolerance=tolerance)
print("All statistics are within the specified tolerance.")

In [None]:
# Distinct "events" values in dt_modRollWind: the Python result first, then the R-derived one
for featurized in (featurized_mmm_data, r_featurized_mmm_data):
    print(featurized.dt_modRollWind["events"].unique())

In [None]:
# Build one DataFrame per 'yhat' entry so the two outputs can be inspected side by side
yhat1_df, yhat2_df = (
    pd.DataFrame(nls["yhat"])
    for nls in (featurized_mmm_data.modNLS, r_featurized_mmm_data.modNLS)
)

# Shape and first rows of each frame
print("Shape of 'yhat' from modNLS1:", yhat1_df.shape)
print("Preview of 'yhat' from modNLS1:", yhat1_df.head(), sep="\n")

print("\nShape of 'yhat' from modNLS2:", yhat2_df.shape)
print("Preview of 'yhat' from modNLS2:", yhat2_df.head(), sep="\n")

# Summary statistics of the numeric columns, to compare the distributions
print("\nDescription of 'yhat' from modNLS1:", yhat1_df.describe(), sep="\n")

print("\nDescription of 'yhat' from modNLS2:", yhat2_df.describe(), sep="\n")

In [None]:
import numpy as np
import pandas as pd


def compare_modNLS(modNLS1, modNLS2, tolerance=1e-1, percent_tolerance=5):
    """Assert that two modNLS outputs agree on per-channel fits and on 'yhat' statistics.

    Args:
        modNLS1: Mapping with a "results" list (one dict per channel holding
            "channel", "rsq_nls", "rsq_lm", "Vmax", "Km" and "coef_lm") and a
            "yhat" entry convertible to a DataFrame with "ds" and "channel" columns.
        modNLS2: Second mapping with the same layout, compared against ``modNLS1``.
        tolerance: Absolute tolerance for the R-squared values and coefficients.
        percent_tolerance: Relative tolerance, expressed in percent (5 means 5%),
            for the ``describe()`` statistics of the numeric 'yhat' columns.

    Raises:
        AssertionError: If a channel of ``modNLS1`` is missing from ``modNLS2``, a
            coefficient is missing, any compared value is outside its tolerance,
            or the two 'yhat' frames differ in shape.
    """
    # Print available channels for debugging
    channels1 = [result["channel"] for result in modNLS1["results"]]
    channels2 = [result["channel"] for result in modNLS2["results"]]
    print("Channels in modNLS1:", channels1)
    print("Channels in modNLS2:", channels2)

    # Print the structure of 'results' for debugging
    print("\nStructure of 'results' in modNLS1:")
    for result in modNLS1["results"]:
        print(result)

    print("\nStructure of 'results' in modNLS2:")
    for result in modNLS2["results"]:
        print(result)

    # Compare 'results' section, channel by channel (driven by modNLS1's channels)
    for result1 in modNLS1["results"]:
        channel = result1["channel"]
        result2 = next((r for r in modNLS2["results"] if r["channel"] == channel), None)
        assert result2 is not None, f"Channel {channel} not found in second modNLS"

        # Compare R-squared values separately
        assert np.isclose(
            result1["rsq_nls"], result2["rsq_nls"], atol=tolerance
        ), f"R-squared (NLS) mismatch for {channel}: {result1['rsq_nls']} vs {result2['rsq_nls']}"

        assert np.isclose(
            result1["rsq_lm"], result2["rsq_lm"], atol=tolerance
        ), f"R-squared (LM) mismatch for {channel}: {result1['rsq_lm']} vs {result2['rsq_lm']}"

        # Compare coefficients
        for coef_key in ["Vmax", "Km", "coef_lm"]:
            coef_value1 = result1.get(coef_key)
            coef_value2 = result2.get(coef_key)
            # Fail with a readable message instead of letting np.isclose choke on None
            assert coef_value1 is not None, f"Coefficient {coef_key} not found for {channel} in first modNLS"
            assert coef_value2 is not None, f"Coefficient {coef_key} not found for {channel}"
            assert np.isclose(
                coef_value1, coef_value2, atol=tolerance
            ), f"Coefficient {coef_key} mismatch for {channel}: {coef_value1} vs {coef_value2}"

    # Convert 'yhat' lists to DataFrames for comparison; sorting aligns the rows of both
    yhat1_df = pd.DataFrame(modNLS1["yhat"]).sort_values(by=["ds", "channel"]).reset_index(drop=True)
    yhat2_df = pd.DataFrame(modNLS2["yhat"]).sort_values(by=["ds", "channel"]).reset_index(drop=True)

    # Print preview of 'yhat' DataFrames
    print("Preview of 'yhat' from modNLS1:")
    print(yhat1_df.head())
    print("\nPreview of 'yhat' from modNLS2:")
    print(yhat2_df.head())

    # Compare 'yhat' DataFrame
    assert yhat1_df.shape == yhat2_df.shape, "Shape mismatch in 'yhat' DataFrame"

    # Select only numeric columns for comparison
    numeric_cols = yhat1_df.select_dtypes(include=[np.number]).columns

    # Use describe to get summary statistics
    desc1 = yhat1_df[numeric_cols].describe()
    desc2 = yhat2_df[numeric_cols].describe()

    # np.isclose expects rtol as a fraction; passing the percent figure directly
    # (5 -> 500%) would let almost any pair of values through.
    relative_tolerance = percent_tolerance / 100

    # Compare each statistic separately
    for stat in desc1.index:
        for col in numeric_cols:
            val1 = desc1.at[stat, col]
            val2 = desc2.at[stat, col]
            # equal_nan: a statistic undefined on both sides is not a mismatch
            if not np.isclose(val1, val2, rtol=relative_tolerance, equal_nan=True):
                print(f"Mismatch in {stat} of column '{col}': {val1} vs {val2}")
                raise AssertionError(f"Mismatch in {stat} of column '{col}'")

    print("All comparisons passed within the specified tolerance.")


# Compare the Python modNLS output against the R-derived one using the default tolerances
compare_modNLS(
    modNLS1=featurized_mmm_data.modNLS,
    modNLS2=r_featurized_mmm_data.modNLS,
)

In [None]:
# Walk through modNLS and report, for every entry, its type plus a small sample
nls_output = featurized_mmm_data.modNLS
print(nls_output.keys())

for key, value in nls_output.items():
    print(f"\nKey: {key}")
    print(f"Type: {type(value)}")
    if isinstance(value, list):
        # Lists: size, and the first element when there is one
        print(f"Number of items: {len(value)}")
        if value:
            print("Sample item:", value[0])
        continue
    if isinstance(value, dict):
        # Dicts: the keys, and the value stored under the first key when there is one
        print("Sample keys:", list(value.keys()))
        if value:
            first_key = next(iter(value))
            print("Sample item:", value[first_key])
        continue
    # Anything else is printed as-is
    print("Value:", value)

In [None]:
channel_name = "facebook_S"  # Example channel name
results = featurized_mmm_data.modNLS.get("results", [])
# Keep the first entry whose "channel" matches; stays None when nothing matches
channel_data = None
for item in results:
    if item["channel"] == channel_name:
        channel_data = item
        break
if not channel_data:
    print(f"No data found for channel {channel_name}.")
else:
    print(f"\nData for channel {channel_name}:")
    print(channel_data)