In [46]:
import pandas as pd
import numpy as np
import random
import os ### Kristen adding os file pathing too just for the sake of it


# Seed both generators so a fresh run reproduces the same data:
# generate_users draws from the stdlib `random` module, while the User
# simulation methods draw from `np.random`.
random.seed(1)
np.random.seed(1)
# ----------------------------
# Decay Functions (Factor Form)
# ----------------------------

def exponential_decay_factor(t, decay_rate):
    """
    Multiplicative exponential decay factor, exp(-decay_rate * t).

    The factor equals 1 at t = 0 and shrinks toward 0 as t grows
    (for a positive decay_rate).

    Args:
        t (int): Time index (e.g., day count).
        decay_rate (float): Decay rate applied per unit of t.

    Returns:
        float: Decay factor.
    """
    exponent = -decay_rate * t
    return np.exp(exponent)

def sigmoid_decay_factor(t, midpoint, steepness):
    """
    Reversed logistic (sigmoid) decay factor: 1 / (1 + exp(steepness * (t - midpoint))).

    The factor is exactly 0.5 at t == midpoint and falls toward 0 afterwards.
    At t = 0 it equals 1 / (1 + exp(-steepness * midpoint)), which is close to
    1 only when steepness * midpoint is large; the curve is not rescaled to
    start at exactly 1.

    Args:
        t (int): Time index (e.g., day count).
        midpoint (float): The time at which the factor is 0.5.
        steepness (float): Controls how sharply the factor drops around the midpoint.

    Returns:
        float: Decay factor.
    """
    # Signed, scaled distance from the midpoint; positive once t has passed it.
    scaled_offset = steepness * (t - midpoint)
    return 1 / (1 + np.exp(scaled_offset))

# ----------------------------
# User Class Definition
# ----------------------------

class User:
    """
    A simulated user whose daily Engagement and CTR are drawn from fixed
    baseline parameters, optionally scaled over time by a decay/growth factor.
    """

    # Column order of every frame returned by the generators. Passing it
    # explicitly keeps the schema stable even when the date range is empty.
    _COLUMNS = ["Date", "UserID", "Group", "Engagement", "CTR"]

    def __init__(self, user_id, group, base_params):
        """
        Initializes a user with a unique ID, group, and baseline parameters.

        Args:
            user_id (str): Unique identifier for the user.
            group (str): User group (e.g., 'Baseline', 'Drifters', 'Power Users').
            base_params (dict): Baseline parameters for each metric.
                For example:
                {
                  "engagement": {"mode": 50, "std": 7},
                  "ctr": {"low": 0.01, "high": 0.05}
                }
                Note: log("mode") is used as the mean of the underlying normal,
                so "mode" is the median of the resulting log-normal draw, and
                "std" / 100 is used as its sigma.
        """
        self.user_id = user_id
        self.group = group
        self.base_params = base_params  # These remain unchanged (Time A)

    def _simulate_days(self, dates, factor_for_day=None):
        """
        Draw one Engagement/CTR row per date.

        Args:
            dates (iterable): Dates to simulate, in output order.
            factor_for_day (callable | None): Maps the 0-based day index to a
                multiplier applied to the baseline engagement mode. None means
                the baseline mode is used unchanged.

        Returns:
            DataFrame: One row per date with the columns in `_COLUMNS`.
        """
        engagement_cfg = self.base_params["engagement"]
        ctr_cfg = self.base_params["ctr"]
        sigma = engagement_cfg["std"] / 100

        rows = []
        for day_index, day in enumerate(dates):
            mode = engagement_cfg["mode"]
            if factor_for_day is not None:
                # A non-positive factor makes np.log return -inf/nan, so the
                # factor function is expected to stay above zero.
                mode = mode * factor_for_day(day_index)

            # Draw order (engagement first, then CTR, day by day) is kept fixed
            # so results under a given np.random seed do not change.
            engagement = np.random.lognormal(mean=np.log(mode), sigma=sigma)
            ctr = np.random.uniform(ctr_cfg["low"], ctr_cfg["high"])
            rows.append({
                "Date": day,
                "UserID": self.user_id,
                "Group": self.group,
                "Engagement": engagement,
                "CTR": ctr
            })
        return pd.DataFrame(rows, columns=self._COLUMNS)

    def generate_data(self, start_date, end_date):
        """
        Generate daily data using the base (Time A) parameters.

        Args:
            start_date (str): Start date in 'YYYY-MM-DD'.
            end_date (str): End date in 'YYYY-MM-DD'.

        Returns:
            DataFrame: Simulated data for the user (empty, but with the
            standard columns, when end_date precedes start_date).
        """
        dates = pd.date_range(start=start_date, end=end_date)
        return self._simulate_days(dates)

    def generate_data_with_decay(self, start_date, end_date, decay_func, decay_params):
        """
        Generate daily data for Time B using a decay function that gradually changes the baseline.

        For each day, the engagement mode is multiplied by
        decay_func(t=<day index>, **decay_params), so a factor near 1 on early
        days looks like Time A and later days reflect the decay.

        Args:
            start_date (str): Start date in 'YYYY-MM-DD' for Time B.
            end_date (str): End date in 'YYYY-MM-DD' for Time B.
            decay_func (function): Decay factor function (e.g., exponential_decay_factor).
            decay_params (dict | None): Additional keyword arguments for the
                decay function; None is treated as no extra arguments.

        Returns:
            DataFrame: Simulated Time B data.
        """
        extra_kwargs = decay_params if decay_params is not None else {}
        dates = pd.date_range(start=start_date, end=end_date)
        return self._simulate_days(
            dates,
            lambda day_index: decay_func(t=day_index, **extra_kwargs),
        )

# ----------------------------
# Helper Functions for Data Generation
# ----------------------------

# def generate_users(n_users):
#     """
#     Generates a list of users with randomized baseline parameters.
    
#     Args:
#         n_users (int): Number of users.
    
#     Returns:
#         list: List of User objects.
#     """
#     users = []
#     for i in range(n_users):
#         user_id = f"U{i:04d}"
#         group = random.choice(["Baseline", "Drifters", "Power Users"])
#         base_params = {
#             "engagement": {"mode": random.uniform(30, 65), "std": random.uniform(5, 10)},
#             "ctr": {"low": random.uniform(0.01, 0.03), "high": random.uniform(0.03, 0.05)}
#         }
#         users.append(User(user_id, group, base_params))
#     return users




def generate_users(n_users):
    """
    Build `n_users` User objects with group-dependent, randomized baselines.

    Each user is assigned a group by weighted random choice. The bounds of the
    engagement mode/std ranges are themselves drawn at random ("double-layered
    randomness") before the final mode/std values are sampled between them.

    Args:
        n_users (int): Number of users.

    Returns:
        list: List of User objects.
    """
    group_names = ["Baseline", "Drifters", "Power Users"]
    group_weights = [0.63, 0.21, 0.16]  # Baseline 63%, Drifters 21%, Power Users 16%

    # Sampling ranges per group. Every tuple is a (low, high) pair passed to
    # random.uniform. "noisy_std" marks the groups whose std bounds receive an
    # additional Gaussian perturbation (Power Users do not).
    profiles = {
        "Baseline": {
            "mode_floor": (25, 40),
            "mode_ceiling": (55, 70),
            "std_floor": (3, 5),
            "std_ceiling": (7, 10),
            "noisy_std": True,
            "ctr_low": (0.01, 0.03),
            "ctr_high": (0.03, 0.05),
        },
        "Drifters": {
            "mode_floor": (15, 35),
            "mode_ceiling": (45, 80),
            "std_floor": (4, 6),
            "std_ceiling": (8, 12),
            "noisy_std": True,
            "ctr_low": (0.01, 0.04),
            "ctr_high": (0.07, 0.1),
        },
        "Power Users": {
            "mode_floor": (60, 80),
            "mode_ceiling": (90, 110),
            "std_floor": (6, 8),
            "std_ceiling": (10, 15),
            "noisy_std": False,
            "ctr_low": (0.05, 0.1),
            "ctr_high": (0.1, 0.2),
        },
    }

    def draw_std_bound(bounds, noisy):
        # Uniform draw first, then (optionally) the Gaussian jitter.
        bound = random.uniform(*bounds)
        if noisy:
            bound = bound + random.gauss(0, .1)
        return bound

    roster = []
    for index in range(n_users):
        (chosen_group,) = random.choices(group_names, weights=group_weights, k=1)
        profile = profiles[chosen_group]

        # First layer: randomize the bounds themselves.
        mode_floor = random.uniform(*profile["mode_floor"])
        mode_ceiling = random.uniform(*profile["mode_ceiling"])
        std_floor = draw_std_bound(profile["std_floor"], profile["noisy_std"])
        std_ceiling = draw_std_bound(profile["std_ceiling"], profile["noisy_std"])

        # Second layer: sample the user's actual parameters within those bounds.
        engagement = {
            "mode": random.uniform(mode_floor, mode_ceiling),
            "std": random.uniform(std_floor, std_ceiling),
        }
        ctr = {
            "low": random.uniform(*profile["ctr_low"]),
            "high": random.uniform(*profile["ctr_high"]),
        }

        roster.append(
            User(f"U{index:07d}", chosen_group, {"engagement": engagement, "ctr": ctr})
        )

    return roster


def generate_time_series(users, start_date, end_date, use_decay=False, decay_func=None, decay_params=None):
    """
    Combines individual user data into one DataFrame.

    Args:
        users (list): List of User objects.
        start_date (str): Start date for the data.
        end_date (str): End date for the data.
        use_decay (bool): If True, generate data with decay (Time B).
        decay_func (function): Decay factor function to use.
        decay_params (dict): Additional parameters for the decay function.

    Returns:
        DataFrame: Combined data with a fresh 0..n-1 index. An empty `users`
        collection yields an empty frame with the standard columns.

    Warns:
        UserWarning: If use_decay is True but decay_func or decay_params is
            None. The data is then generated without decay (the pre-existing
            fallback), but no longer silently.
    """
    import warnings  # local import keeps this function self-contained

    decay_ready = decay_func is not None and decay_params is not None
    if use_decay and not decay_ready:
        warnings.warn(
            "use_decay=True but decay_func or decay_params is None; "
            "generating data WITHOUT decay.",
            UserWarning,
            stacklevel=2,
        )
    apply_decay = use_decay and decay_ready

    frames = [
        user.generate_data_with_decay(start_date, end_date, decay_func, decay_params)
        if apply_decay
        else user.generate_data(start_date, end_date)
        for user in users
    ]

    if not frames:
        # pd.concat raises on an empty list; return an empty frame with the
        # columns produced by User.generate_data instead.
        return pd.DataFrame(columns=["Date", "UserID", "Group", "Engagement", "CTR"])
    return pd.concat(frames, ignore_index=True)

# ----------------------------
# Example Workflow: Generating Time A and Time B
# ----------------------------

# --- Time A: baseline window (first half of 2024) ---
start_date_a, end_date_a = "2024-01-01", "2024-06-30"
users = generate_users(100)
time_a_df = generate_time_series(users=users, start_date=start_date_a, end_date=end_date_a)

# --- Time B: same users, engagement decaying gradually ---
# exponential_decay_factor is exp(-decay_rate * t), so the factor is exactly 1
# on the first day of Time B; raise decay_rate to make the decay faster.
decay_rate = 1e-4
decay_params = dict(decay_rate=decay_rate)

# Time B covers the second half of 2024.
start_date_b, end_date_b = "2024-07-01", "2024-12-31"
time_b_df_decay = generate_time_series(
    users=users,
    start_date=start_date_b,
    end_date=end_date_b,
    use_decay=True,
    decay_func=exponential_decay_factor,
    decay_params=decay_params,
)


### Kristen adding growth time B

def exponential_growth_factor(t, growth_rate):
    """
    Exponential growth factor, exp(growth_rate * t); the counterpart of
    exponential_decay_factor.

    The factor equals 1 at t = 0 and increases over time for a positive
    growth_rate. It is computed directly rather than as 1 + (exp(x) - 1):
    that round trip adds nothing and cancels to 0.0 when exp(x) is tiny.

    Args:
        t (int): Time index (e.g., day count).
        growth_rate (float): Growth rate applied per unit of t.

    Returns:
        float: Growth factor.
    """
    return np.exp(growth_rate * t)

# Time B, growth variant: reuses the decay code path (use_decay=True), but the
# factor function grows over time instead of shrinking.
growth_rate = 1e-4
growth_params = dict(growth_rate=growth_rate)

time_b_df_growth = generate_time_series(
    users=users,
    start_date=start_date_b,
    end_date=end_date_b,
    use_decay=True,
    decay_func=exponential_growth_factor,
    decay_params=growth_params,
)

### Kristen's addition: a "normal" Time B with no decay or growth applied,
### i.e. the same users over the Time B dates with baseline parameters.
time_b_df_normal = generate_time_series(users, start_date_b, end_date_b)



In [47]:
# Preview the baseline (Time A) frame; as the cell's last expression it is
# rendered as the cell output (one row per user per day).
time_a_df

Unnamed: 0,Date,UserID,Group,Engagement,CTR
0,2024-01-01,U0000000,Baseline,64.355985,0.010571
1,2024-01-02,U0000000,Baseline,58.951726,0.021496
2,2024-01-03,U0000000,Baseline,59.145322,0.024909
3,2024-01-04,U0000000,Baseline,57.894817,0.030044
4,2024-01-05,U0000000,Baseline,64.660806,0.017958
...,...,...,...,...,...
18195,2024-06-26,U0000099,Power Users,68.653407,0.094981
18196,2024-06-27,U0000099,Power Users,76.590397,0.136392
18197,2024-06-28,U0000099,Power Users,74.694200,0.115126
18198,2024-06-29,U0000099,Power Users,80.612177,0.132142


In [48]:
# Preview the decayed Time B frame for the same users, for comparison with
# the Time A preview.
time_b_df_decay

Unnamed: 0,Date,UserID,Group,Engagement,CTR
0,2024-07-01,U0000000,Baseline,62.681661,0.040497
1,2024-07-02,U0000000,Baseline,61.307001,0.020694
2,2024-07-03,U0000000,Baseline,58.487852,0.017164
3,2024-07-04,U0000000,Baseline,59.353556,0.044251
4,2024-07-05,U0000000,Baseline,58.780718,0.024733
...,...,...,...,...,...
18395,2024-12-27,U0000099,Power Users,76.321563,0.101382
18396,2024-12-28,U0000099,Power Users,75.207846,0.094901
18397,2024-12-29,U0000099,Power Users,76.158377,0.078148
18398,2024-12-30,U0000099,Power Users,73.329913,0.174590


In [49]:
# Make sure the output folder exists before writing. The message is only
# informational; exist_ok=True makes the creation itself safe even if the
# folder appears between the check and the call.
if not os.path.exists('Data'):
    print("Path doesn't exist. Adding Data folder :)")
os.makedirs('Data', exist_ok=True)

# One CSV per generated scenario (the DataFrame index is written as the first column).
time_a_df.to_csv('Data/time_a.csv')
time_b_df_decay.to_csv('Data/time_b_decay.csv')
time_b_df_growth.to_csv('Data/time_b_growth.csv')
time_b_df_normal.to_csv('Data/time_b_normal.csv')

# Confirm only after every file has actually been written.
print("Saved :D")

Saved :D
