In [57]:
# interesting articles:
# http://leananalyticsbook.com/one-metric-that-matters/
# https://v4-alpha.getbootstrap.com/content/code/
# https://fizzle.co/sparkline/vanity-vs-actionable-metrics

import pandas as pd
import numpy as np
import uuid
from datetime import datetime
from dateutil.relativedelta import relativedelta
# set the settingWithCopyWarning in Pandas to None
pd.options.mode.chained_assignment = None  # default='warn'
%matplotlib inline

The aim of this notebook is to generate product usage data from scratch. We generate data for 10.000 users and simulate their behavior which can then we used to create cohort analysis like retention curves. More here: http://josolnik.com/generating_product_usage_data.html

### 1. Defining the main parameters

In [59]:
# number of devices to generate data for
num_device_uuids = 1000

# number of months generated for each device
num_months = 20

# starting month of product usage data
start_month = '2016-01-01'

### 2. Generating device uuids and applying them to datetime data

In [61]:
# generating unique identifiers for each device
uuids = pd.Series([str(uuid.uuid4()) for i in range(1,num_device_uuids)])
user_data = pd.DataFrame()
user_data['device_uuid'] = pd.Series(uuids).repeat(num_months)
user_data =user_data.reset_index().drop('index', 1)

# example of two different device uuids
user_data[19:21]

Unnamed: 0,device_uuid
19,da5e3464-d356-4572-b962-f2b57e730732
20,b7dbd47e-2739-4f7d-8a59-d1ec0f08d6d9


In [62]:
# defining starting and ending month for generating data
start_month_ts = pd.to_datetime(start_month)
end_month_ts = start_month_ts + relativedelta(months=+num_months - 1)
str(start_month_ts), str(end_month_ts)

('2016-01-01 00:00:00', '2017-08-01 00:00:00')

In [63]:
# months for which we will be generating data
months = pd.Series(pd.date_range(start_month_ts, end_month_ts, freq='MS'))
months

0    2016-01-01
1    2016-02-01
2    2016-03-01
3    2016-04-01
4    2016-05-01
5    2016-06-01
6    2016-07-01
7    2016-08-01
8    2016-09-01
9    2016-10-01
10   2016-11-01
11   2016-12-01
12   2017-01-01
13   2017-02-01
14   2017-03-01
15   2017-04-01
16   2017-05-01
17   2017-06-01
18   2017-07-01
19   2017-08-01
dtype: datetime64[ns]

In [81]:
# adding date column to the user data dataframe
user_data['date'] = pd.concat([months] * num_device_uuids, axis=0).reset_index().drop('index', 1)
user_data.head()

Unnamed: 0,device_uuid,date
0,da5e3464-d356-4572-b962-f2b57e730732,2016-01-01
1,da5e3464-d356-4572-b962-f2b57e730732,2016-02-01
2,da5e3464-d356-4572-b962-f2b57e730732,2016-03-01
3,da5e3464-d356-4572-b962-f2b57e730732,2016-04-01
4,da5e3464-d356-4572-b962-f2b57e730732,2016-05-01


In [84]:
# checking number of months per user
len(user_data) / len(device_uuids)

20.0

### 3. Generating usage features

In [129]:
# defining parameters for usage features (used later)
num_usage_features = 3
feature_usage_ratio = 0.8

### 3.1. Calculating feature ratios

In [130]:
# extracting all the unique device uuids from the main df
device_uuids = pd.Series(user_data['device_uuid'].unique())
user_data_with_usage = pd.DataFrame()
device_uuids[0:5]

0    da5e3464-d356-4572-b962-f2b57e730732
1    b7dbd47e-2739-4f7d-8a59-d1ec0f08d6d9
2    72f6ec5c-a6ba-48b3-b0ab-eb80c618fba8
3    81e004fa-f59f-4019-b4c1-cebe5e779b3c
4    220417c0-e032-4340-8fe0-a87045ddceff
dtype: object

In [131]:
# generate random usage data for each feature and simulate falling usage per feature
# (each feature 80% of the previous one)
# example: F1 = 1, F2 = F1*0.8 = 0.8, F3 = F2*0.8 = 0.64

# extract the names for each feature based on the pre-set number of feature
features = ['feature' + str(i) for i in range(1,num_usage_features+1)]

usage_features_ratio = {}

# set the feature_ratio for the first feature (100%)
cur_feature = 1.0

# calculate the ratio for each feature
for feature in features:
    usage_features_ratio[feature] = float("{0:.1f}".format(cur_feature))
    cur_feature*= 0.8

usage_features_ratio

{'feature1': 1.0, 'feature2': 0.8, 'feature3': 0.6}

### 3.2. Assigning cohort groups to devices

In [132]:
# set the number of cohorts to be half of number of months of data (20 months = 10 cohorts)
num_cohorts = int(num_months / 2) 
num_cohorts

10

In [133]:
# assign cohorts to users randomly (when did the user first used the app?)
cohorts = pd.DataFrame()
cohorts['device_uuid'] = device_uuids
cohorts['cohort_group'] = np.random.randint(low=1, high=num_cohorts, size=num_device_uuids-1)
cohorts.head()

Unnamed: 0,device_uuid,cohort_group
0,da5e3464-d356-4572-b962-f2b57e730732,4
1,b7dbd47e-2739-4f7d-8a59-d1ec0f08d6d9,3
2,72f6ec5c-a6ba-48b3-b0ab-eb80c618fba8,4
3,81e004fa-f59f-4019-b4c1-cebe5e779b3c,9
4,220417c0-e032-4340-8fe0-a87045ddceff,7


### 3.3. Adding cohort data

In [138]:
# finding cohort groups for each device
user_data_with_cohort_groups = pd.DataFrame()

for device_uuid in device_uuids:
    
    # 1. slice user_data, create a device data for each user
    device_uuid_data = user_data[user_data['device_uuid'] == device_uuid]
    
    # 2. find cohort group of the device and delete all unnecessary months
    device_uuid_cohort_group = int(cohorts[cohorts['device_uuid'] == device_uuid]['cohort_group'])
    device_uuid_data = device_uuid_data[device_uuid_cohort_group:]

    user_data_with_cohort_groups = user_data_with_cohort_groups.append(device_uuid_data)

In [139]:
# after assigning cohorts we lose 25% of the months (20 -> 15)
round(len(user_data_with_cohort_groups) / len(device_uuids))

15

### 3.4. Randomly generate product usage data

In [140]:
# reseting the index
user_data_with_cohort_groups = user_data_with_cohort_groups.reset_index().drop('index', axis=1)

# generate data for feature1
user_data_with_cohort_groups[features[0]] = pd.Series((
        np.random.randint(low=0, high=14, 
                          size=len(user_data_with_cohort_groups))))


# generate data for the remaining usage features (feature2 and feature3)
for feature in features[1:]:
        user_data_with_cohort_groups[feature] = user_data_with_cohort_groups[features[0]] * usage_features_ratio[feature]
        
user_data_with_cohort_groups.head()

Unnamed: 0,device_uuid,date,feature1,feature2,feature3
0,da5e3464-d356-4572-b962-f2b57e730732,2016-05-01,13,10.4,7.8
1,da5e3464-d356-4572-b962-f2b57e730732,2016-06-01,10,8.0,6.0
2,da5e3464-d356-4572-b962-f2b57e730732,2016-07-01,5,4.0,3.0
3,da5e3464-d356-4572-b962-f2b57e730732,2016-08-01,0,0.0,0.0
4,da5e3464-d356-4572-b962-f2b57e730732,2016-09-01,10,8.0,6.0


### 3.5. Simulate churn behavior

#### 2 principles:
    1. Churner stays a churner (after a month of no usage the users doesn't use the product again)
    2. 50 % of users churn in month 1 (about 50% of users have 1 month of data, about 50% have more)

In [144]:
user_data_with_usage_features_and_churn = pd.DataFrame()

for device_uuid in device_uuids:
    
    # set the churner to false
    churner = False
    # 50 % chance that the user churns in the first month
    churner_in_m1 = True if int(np.random.randint(low=0, high=2, size=1)) == 0 else False
            
    # slice device_uuid data
    device_uuid_data = user_data_with_cohort_groups[user_data_with_cohort_groups['device_uuid'] == device_uuid]
    
    # if the user churns in month 1 then assign all of the months to 0
    # (except in month 0 when the user first used the product)
    if churner_in_m1:
        device_uuid_data.loc[1, features[0:]] = 0
        
    # if the user doesn't churn in month 1 then if the user stops using the product he/she doesn't use it again
    # (churner remains a churner)
    else:
        for index, row in device_uuid_data.iterrows():
        
            # extracting the data for feature1
            if row[features[0]] == 0:
                churner = True   

            # checking if user is a churner
            if churner == True:
                # if a churner, set the value of all usage features to 0 for the remaining months
                device_uuid_data.loc[index, features[0:]] = 0
    
    # print(device_uuid_data)
            
    user_data_with_usage_features_and_churn = user_data_with_usage_features_and_churn.append(device_uuid_data)
    
# delete the months when with churned device_uuid months
user_data_with_usage_features_and_churn = user_data_with_usage_features_and_churn[user_data_with_usage_features_and_churn[features[0]] != 0]

In [145]:
user_data_with_usage_features_and_churn.head()

Unnamed: 0,device_uuid,date,feature1,feature2,feature3
0,da5e3464-d356-4572-b962-f2b57e730732,2016-05-01,13.0,10.4,7.8
1,da5e3464-d356-4572-b962-f2b57e730732,2016-06-01,10.0,8.0,6.0
2,da5e3464-d356-4572-b962-f2b57e730732,2016-07-01,5.0,4.0,3.0
16,b7dbd47e-2739-4f7d-8a59-d1ec0f08d6d9,2016-04-01,6.0,4.8,3.6
17,b7dbd47e-2739-4f7d-8a59-d1ec0f08d6d9,2016-05-01,13.0,10.4,7.8


In [146]:
# average number of months of usage per device_uuid
len(user_data_with_usage_features_and_churn) / len(device_uuids)

11.23123123123123

In [147]:
num_months

20

### 4. Generating categorical features

In [171]:
# defining categorical parameters
num_segmentation_featues = 3
platforms = ['iOS', 'Android']
countries = ['NL', 'AU', 'FR']
user_registered = [False, True]

In [172]:
segmentation_features = {'platform': platforms,
                'user_registered': user_registered,
                'country': countries}
segmentation_features

{'country': ['NL', 'AU', 'FR'],
 'platform': ['iOS', 'Android'],
 'user_registered': [False, True]}

### 4.1. Generating the feature weights

In [173]:
# Defining weights for the likelihood of a categorical feature appearing

weights_2 = [0.7, 0.3]
weights_3 = [0.6, 0.3, 0.1]

# If there are two possible variants in a categorical variable (e.g. 'Android' and 'iOS') then the data generated 
# will contain 70% of the first variant and 30% with the second variant

# If there are three possible variants in a categorical variable (e.g. 'NL', 'AU', 'FR') then the data generated will
# contain 60% of the first variant and 30% with the second variant and 10% with the third variant

### 4.1. Applying the feature weights

In [174]:
segmentation_features_df = pd.DataFrame()


for device_uuid in device_uuids[0:device_uuids_limit]:
    
    device_uuid_feature_values_dict = {}
    
    for feature_name, feature_values in segmentation_features.items():

        # generate the random feature value
        # add code if you want to add features with > 3 values
        if len(feature_values) == 2:
            feature_weights = weights_2
        elif len(feature_values) == 3:
            feature_weights = weights_3
        
        # generate the feature value from assigned weights
        feature_value = np.random.choice(feature_values, p=feature_weights)
        
        
        # device_uuid_feature_values_list.append(feature_value)
        
        # make a dict of feature_name
        device_uuid_feature_values_dict[feature_name] = feature_value
        
        
        
    # make a df with segmentation features for the device_uuid
    device_segm_features = pd.DataFrame(list(device_uuid_feature_values_dict.items())).T
    device_segm_features.columns = device_segm_features.iloc[0]
    device_segm_features['device_uuid'] = device_uuid
    device_segm_features = device_segm_features.reindex(device_segm_features.index.drop(0))
    device_segm_features['device_uuid'] = device_uuid
    # device_segm_features
        
    # append to the main df
        
    segmentation_features_df = segmentation_features_df.append(device_segm_features)
    
    
# segmentation_features_df = segmentation_features_df.reset_index().drop('index', axis=1)
segmentation_features_df = segmentation_features_df.set_index('device_uuid').reset_index()

In [175]:
segmentation_features_df.head()

Unnamed: 0,device_uuid,user_registered,country,platform
0,da5e3464-d356-4572-b962-f2b57e730732,False,NL,iOS
1,b7dbd47e-2739-4f7d-8a59-d1ec0f08d6d9,True,NL,iOS
2,72f6ec5c-a6ba-48b3-b0ab-eb80c618fba8,True,NL,iOS
3,81e004fa-f59f-4019-b4c1-cebe5e779b3c,False,AU,iOS
4,220417c0-e032-4340-8fe0-a87045ddceff,True,NL,iOS


In [176]:
final_df = pd.DataFrame()

for device_uuid in device_uuids[0:device_uuids_limit]:
    
    # extract device data from the main df
    device_uuid_df = user_data_with_usage_features_and_churn[user_data_with_usage_features_and_churn['device_uuid'] == device_uuid]
    # extract device data for segmentation features data
    device_seg_feature_values_df = segmentation_features_df[segmentation_features_df['device_uuid'] == device_uuid]

    
    # print(device_seg_feature_values_df[0:10])
    
    # assign the values of segmentation features to the main df
    for seg_feature_name in device_seg_feature_values_df.columns[1:]:
        # device_uuid_data[seg_feature_name] = device_seg_feature_values_df[seg_feature_name]
        device_uuid_df[seg_feature_name] = device_seg_feature_values_df[seg_feature_name][int(device_seg_feature_values_df[seg_feature_name].index.values)]
        # None
        # print(device_seg_feature_values_df[seg_feature_name][int(device_seg_feature_values_df[seg_feature_name].index.values)])
        # [device_seg_feature_values_df[seg_feature_name].index.values])

    # append the device data to the main df
    final_df = final_df.append(device_uuid_df)

final_df.reset_index(inplace=True, drop=True)

In [177]:
len(final_df)

11220

In [178]:
final_df

Unnamed: 0,device_uuid,date,feature1,feature2,feature3,user_registered,country,platform
0,da5e3464-d356-4572-b962-f2b57e730732,2016-05-01,13.0,10.4,7.8,False,NL,iOS
1,da5e3464-d356-4572-b962-f2b57e730732,2016-06-01,10.0,8.0,6.0,False,NL,iOS
2,da5e3464-d356-4572-b962-f2b57e730732,2016-07-01,5.0,4.0,3.0,False,NL,iOS
3,b7dbd47e-2739-4f7d-8a59-d1ec0f08d6d9,2016-04-01,6.0,4.8,3.6,True,NL,iOS
4,b7dbd47e-2739-4f7d-8a59-d1ec0f08d6d9,2016-05-01,13.0,10.4,7.8,True,NL,iOS
5,b7dbd47e-2739-4f7d-8a59-d1ec0f08d6d9,2016-06-01,7.0,5.6,4.2,True,NL,iOS
6,b7dbd47e-2739-4f7d-8a59-d1ec0f08d6d9,2016-07-01,11.0,8.8,6.6,True,NL,iOS
7,b7dbd47e-2739-4f7d-8a59-d1ec0f08d6d9,2016-08-01,12.0,9.6,7.2,True,NL,iOS
8,72f6ec5c-a6ba-48b3-b0ab-eb80c618fba8,2016-05-01,13.0,10.4,7.8,True,NL,iOS
9,72f6ec5c-a6ba-48b3-b0ab-eb80c618fba8,2016-06-01,8.0,6.4,4.8,True,NL,iOS


In [189]:
# checking proportion of usage for usage features
# (each feature 80% of the previous one)
final_df.groupby(['user_registered']).mean()

Unnamed: 0_level_0,feature1,feature2,feature3
user_registered,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
False,7.076971,5.661577,4.246183
True,7.008292,5.606634,4.204975


In [190]:
# checking the proportions of categorical features
# we have set weights for features with 2 possible variants as [0.7, 0.3].
# we have set user_registered = [False, True]
# that's why about 30% of users have True in their user_registered categorical variable
final_df['user_registered'].count()

11220

In [191]:
len(final_df['device_uuid'].unique())

962

In [161]:
# exporting the final dataframe for later usage
final_df.to_csv('generating_user_behavioral_data.csv')