In [1]:
import pandas as pd
pd.options.mode.chained_assignment = None 
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Read the full user dataset from disk and preview its first rows
df = pd.read_csv(filepath_or_buffer='Dataset/all_user_data.csv')
df.head()

Unnamed: 0,id,age,gender,country,latitude,longitude,traffic_source,days_since_lo,cancellations,returns,no_orders,no_items,items_per_order,total_spend,avg_order_value,income_category,children_at_home
0,457,65,M,Brazil,-9.945568,-67.83561,Search,85.0,0.0,0.0,1,1,1.0,99.989998,99.989998,high,0.0
1,64231,25,F,Brazil,-9.945568,-67.83561,Search,280.0,0.0,0.0,1,1,1.0,39.950001,39.950001,medium,0.0
2,72187,47,F,Brazil,-9.945568,-67.83561,Search,88.0,0.0,0.0,1,1,1.0,21.0,21.0,low,1.0
3,95069,48,M,Brazil,-9.857324,-69.437057,Search,645.0,1.0,1.0,1,1,1.0,39.5,39.5,low,1.0
4,21246,62,M,Brazil,-8.325245,-71.55303,Email,77.0,0.0,0.0,2,3,1.5,145.139999,72.57,high,1.0


In [3]:
# Select columns used for mock data creation.
# .copy() gives mock_df its own data, so the columns added to it in later
# cells are written to mock_df itself and not to a slice of df.
mock_df = df[['id', 'age', 'gender']].copy()
mock_df.head()

Unnamed: 0,id,age,gender
0,457,65,M
1,64231,25,F
2,72187,47,F
3,95069,48,M
4,21246,62,M


# Income groups

In [4]:
# Function to assign income categories based on age
def assign_income_category(age, rng=None):
    """Randomly assign an income category ('low', 'medium' or 'high') from an age.

    Ages are split into contiguous bands. In the first three bands a threshold
    is drawn from a normal distribution, scaled by a band-specific divisor, and
    a uniform draw in [0, 1) is compared against it to pick one of two
    categories. The scaled threshold is not clipped: a value above 1 always
    selects the first category of the band, a value at or below 0 never does.

    Bands:
        age < 16        -> NaN (no group assigned)
        16 <= age <= 50 -> 'low' or 'medium'   (threshold ~ N(18, 8) / 20)
        50 <  age <= 60 -> 'medium' or 'high'  (threshold ~ N(34, 8) / 60)
        60 <  age <= 65 -> 'high' or 'medium'  (threshold ~ N(50, 8) / 65)
        age > 65        -> uniform choice among 'low', 'medium', 'high'

    The previous version spelled the middle bands as ``24 <= age <= 60`` and
    ``32 <= age <= 65``; those lower bounds could never decide anything because
    earlier branches had already captured those ages. The bands above are the
    ones that were actually in effect.

    Parameters
    ----------
    age : int or float
        Age in years. A missing age (NaN) yields NaN.
    rng : optional
        Random source exposing ``normal``, ``random`` and ``choice`` (for
        example ``np.random.default_rng(seed)``). When omitted, NumPy's global
        ``np.random`` functions are used, so ``np.random.seed`` still controls
        the result.

    Returns
    -------
    str or float
        'low', 'medium' or 'high'; ``np.nan`` when no category is assigned.
    """
    source = np.random if rng is None else rng

    # A NaN age fails every comparison below; return NaN explicitly instead of
    # falling off the end of the function and returning None.
    if age != age:
        return np.nan

    if age < 16:
        return np.nan  # No group assigned for people under 16

    if age <= 50:
        threshold = source.normal(loc=18, scale=8) / 20
        return 'low' if source.random() < threshold else 'medium'

    if age <= 60:
        threshold = source.normal(loc=34, scale=8) / 60
        return 'medium' if source.random() < threshold else 'high'

    if age <= 65:
        threshold = source.normal(loc=50, scale=8) / 65
        return 'high' if source.random() < threshold else 'medium'

    # Over 65: randomly assign either 'low', 'medium', or 'high'.
    # str() turns NumPy's string scalar into a plain str like the other branches.
    return str(source.choice(['low', 'medium', 'high']))

# Seed NumPy's global random generator so the random assignments are
# repeatable on Restart & Run All; change the value to draw a different sample.
np.random.seed(42)

# Apply the function to the 'age' column
mock_df['income_category'] = mock_df['age'].apply(assign_income_category)
mock_df.head()

Unnamed: 0,id,age,gender,income_category
0,457,65,M,high
1,64231,25,F,low
2,72187,47,F,low
3,95069,48,M,low
4,21246,62,M,medium


In [5]:
# Row counts per income category for ages 35 to 65 (both bounds inclusive);
# dropna=False keeps a group for rows without a category
mock_df.loc[mock_df['age'].between(35, 65)].groupby('income_category', dropna=False).count()

Unnamed: 0_level_0,id,age,gender
income_category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
high,5229,5229,5229
low,7906,7906,7906
medium,6489,6489,6489


# Children at home flag

In [7]:
# Logistic probability curve used for the children-at-home flag
def probability_of_children(age, a=0.2, b=35):
    """Evaluate the logistic function 1 / (1 + exp(-a * (age - b))).

    `a` sets the steepness of the curve and `b` is the age at which the
    result equals 0.5. The value rises towards 1 as `age` grows past `b`.
    """
    exponent = -a * (age - b)
    denominator = 1 + np.exp(exponent)
    return 1 / denominator

# Draw a random 0/1 children-at-home flag for a given age
def generate_children_flag(age):
    """Return 1 with probability `probability_of_children(age)`, otherwise 0.

    A uniform value in [0, 1) is drawn from NumPy's global random generator
    and compared against that probability.
    """
    threshold = probability_of_children(age)
    draw = np.random.random()
    if draw < threshold:
        return 1
    return 0

# NOTE(review): `children_df` is not created in any cell visible in this
# notebook (execution counts skip In [6] and In [8]); the cell that builds it
# must be restored for a fresh-kernel run to work — TODO confirm how it was
# derived from mock_df.
# Draw a children-at-home flag for every row from its 'age' value
children_df['children_at_home'] = children_df['age'].apply(generate_children_flag)

children_df.head()

Unnamed: 0,id,age,children_at_home
1,64231,25,0
2,72187,47,0
3,95069,48,1
4,21246,62,1
6,34745,32,1


In [9]:
# Left-join the children flag onto mock_df by 'id'; rows of mock_df with no
# matching id in children_df get NaN in 'children_at_home'
mock_df = mock_df.merge(
    children_df[['id', 'children_at_home']],
    how="left",
    on='id',
)
mock_df.head()

Unnamed: 0,id,age,gender,income_category,children_at_home
0,457,65,M,high,
1,64231,25,F,low,0.0
2,72187,47,F,low,0.0
3,95069,48,M,low,1.0
4,21246,62,M,medium,1.0


In [10]:
# Replace NA values with 0 (rows left without a flag by the left merge).
# The filled column is assigned back to the frame instead of writing through
# chained indexing (`df[col][mask] = value`), which may modify a temporary
# copy and leaves mock_df unchanged when pandas Copy-on-Write is active.
mock_df['children_at_home'] = mock_df['children_at_home'].fillna(0)

In [218]:
# Write the mock demographic data to CSV: header row included, index omitted
mock_df.to_csv(
    'Dataset/mock_data.csv',
    index=False,
    header=True,
)