Having conducted an initial analysis, I have gained a comprehensive understanding of the dataset, including insights into the client characteristics.
In this notebook, I will focus on analyzing the results of our A/B testing.

# Importing libraries

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches # used for the legend at step duration
import seaborn as sns
import numpy as np
import datetime

import scipy.stats as st
from scipy.stats import chi2_contingency
from scipy.stats.contingency import association
from scipy.stats import norm
import scipy.stats as stats

from functools import reduce

# Loading datasets

Importing the cleaned datasets. For the trace dataset, I make sure that the `date_time` column is parsed as datetime.

In [3]:
df_clients = pd.read_csv('../data/df_clients_CLEANED.csv')
df_trace = pd.read_csv('../data/df_trace_CLEANED.csv', parse_dates=['date_time'])
df_roster = pd.read_csv('../data/df_roster_CLEANED.csv')

# Data preparation, Splitting dataset for test and control group

In [10]:
# Split the roster by experiment arm ('Test' / 'Control') ...
df_roster_test = df_roster.loc[df_roster['variation'].eq('Test')]
df_roster_control = df_roster.loc[df_roster['variation'].eq('Control')]

# ... and keep, for each arm, only the trace rows whose client_id appears in that arm's roster
df_test = df_trace.loc[df_trace['client_id'].isin(df_roster_test['client_id'].unique())]
df_control = df_trace.loc[df_trace['client_id'].isin(df_roster_control['client_id'].unique())]

In [11]:
def df_prep(df):
    ''' Prepare a trace dataframe for the session analysis.

    The same preparation is needed for the test and the control group, hence the function.

    Expects the columns 'client_id', 'visitor_id', 'visit_id', 'process_step' and
    'date_time' (datetime dtype). The input frame is not modified.

    Returns a new dataframe, sorted by client_id and date_time in DESCENDING order,
    with 'process_step' renamed to 'step' and encoded as 1..5, the visitor/visit ids
    dropped, and these helper columns added:
    - prev_id / next_id: client_id of the chronologically previous / next row
    - prev_step / next_step: step of the previous / next row
    - prev_time / next_time: date_time of the previous / next row
    - time_diff_prev: date_time - prev_time
    - time_diff_next: next_time - date_time (the step duration)
    - subtract_step: step - prev_step when the previous row belongs to the same client, else 0

    Rows at the edges of the frame have no neighbour; their id/step helpers are
    filled with 0 and their time helpers with 1970-01-01 00:00:00.'''

    # Step names are replaced by numbers so that steps can be subtracted later
    step_codes = {'start': 1, 'step_1': 2, 'step_2': 3, 'step_3': 4, 'confirm': 5}

    # Sentinel for "no neighbouring row". A Timestamp (instead of a string) keeps the
    # shifted column as datetime and avoids the string fill_value that pandas 1.x
    # deprecates for datetime shifts.
    epoch = pd.Timestamp('1970-01-01 00:00:00')

    df = (
        df.rename(columns={'process_step': 'step'})
        .assign(step=lambda frame: frame['step'].map(step_codes))
        .sort_values(by=['client_id', 'date_time'], ascending=False)
        .drop(columns=['visitor_id', 'visit_id'])
        .reset_index(drop=True)
    )

    # The frame is in descending time order, so shift(-1) looks at the chronologically
    # PREVIOUS row and shift(1) at the chronologically NEXT row.
    df['prev_id'] = df['client_id'].shift(-1, fill_value=0)
    df['next_id'] = df['client_id'].shift(1, fill_value=0)

    df['prev_step'] = df['step'].shift(-1, fill_value=0)
    df['next_step'] = df['step'].shift(1, fill_value=0)

    df['prev_time'] = df['date_time'].shift(-1, fill_value=epoch)
    df['next_time'] = df['date_time'].shift(1, fill_value=epoch)

    # Time elapsed since the previous step
    df['time_diff_prev'] = df['date_time'] - df['prev_time']

    # Step duration: time until the following step
    df['time_diff_next'] = df['next_time'] - df['date_time']

    # Step movement (negative = went back), only when the previous row is the same client
    df['subtract_step'] = np.where(df['client_id'] == df['prev_id'], (df['step'] - df['prev_step']), 0)

    return df

In [12]:
# Run the same preparation on both experiment arms
df_test, df_control = map(df_prep, (df_test, df_control))

## Define valid sessions

**Conditions for Defining Valid Sessions:**

The conditions outlined below aim to accurately identify the start of a new session within a sequence of actions:
- **current action is 'step' 1**

   **AND**
- **the previous step is more than 6 minutes and 4 seconds ago OR the previous entry is from a different client**



In [13]:
# Here I add a column to the test and control dataframes that flags the start of a new session.
# I wrap this in a function because the same logic is applied to both dataframes (control and test).

def new_session(df, threshold=datetime.timedelta(seconds=364)):
    ''' Flag the rows of the digital footprint that start a new session.

    A row starts a new session when:
    - the current action is 'step' 1
    AND
    - the previous step is more than `threshold` ago OR the previous entry is from a different client

    Parameters:
    - df: dataframe with the columns 'step', 'time_diff_prev', 'client_id',
      'prev_id' and 'date_time'.
    - threshold: maximum pause between two steps of the same session
      (default: 6 min and 4 s).

    NOTE: the dataframe is modified IN PLACE (new column + re-sorting) and also returned.

    Returns the dataframe with an extra boolean column 'new_session', sorted by
    client_id and date_time in ascending order.'''

    starts_flow = df['step'] == 1
    long_pause = df['time_diff_prev'] > threshold
    other_client = df['client_id'] != df['prev_id']

    df['new_session'] = starts_flow & (long_pause | other_client)

    # Sorting everything to ascending order, to help with following steps
    df.sort_values(by=['client_id', 'date_time'], ascending=True, inplace=True)

    return df

In [14]:
# Flag the session starts in both experiment arms
df_test, df_control = map(new_session, (df_test, df_control))

In [15]:
# Numbering separate sessions
def number_sessions(new_session_flags):
    ''' Number the sessions of one dataframe.

    Receives the boolean 'new_session' column, in the row order of the dataframe.
    Returns an integer Series with the same index: a running count of the True
    flags, so every row carries the number of the latest session start at or
    above it (0 for rows that come before the first session start).

    A cumulative sum gives the same numbers as incrementing a counter row by row,
    without relying on a global variable that has to be reset between dataframes.'''
    return new_session_flags.astype(bool).cumsum()


# Numbering starts again at 1 for each dataframe
df_test['session_id'] = number_sessions(df_test['new_session'])
df_control['session_id'] = number_sessions(df_control['new_session'])

## Session metrics

Conditions:

**Valid Session**:
A valid session is characterized by consecutive steps occurring within a duration of no more than 6 minutes and 4 seconds.

**Invalid Session**:
An invalid session is identified when the time interval between consecutive steps exceeds 6 minutes and 4 seconds.

**Successful Session**:
A successful session is a valid session that reaches the 'confirm' stage after traversing all required steps ('start', 'step_1', 'step_2', 'step_3'). In cases of multiple 'confirm' steps, the last one is regarded as the official confirm. Metrics for successful sessions include the total number of steps taken, the total number of steps taken backward, and the duration from the session start to the final valid 'confirm' step.

**Unsuccessful Session**:
An unsuccessful session is one that either never reaches the 'confirm' stage or reaches 'confirm' without passing through all the necessary steps. These sessions may be flagged for further investigation to understand the reasons behind the incomplete or irregular user behavior.

In [16]:
# Running state shared by consecutive `session_info` calls. The function is applied
# row by row (in ascending client_id / date_time order) and these variables carry the
# state of the session currently being walked through.

# Variable to count steps until 'confirm'
steps_count = 1
# Variable to count how many steps back there were per session
steps_back = 0
# Variable to count how many valid 'confirms' have been per session
confirm_count = 0
# Variable to check if there are more than 1 'confirm' in the same session
confirmed = False
# Variable to check if the session is valid
valid = True
# Variables to see if per session steps 1,2,3 were completed before 'confirm'
check_1 = False
check_2 = False
check_3 = False
# Variable to measure how long a session that reached 'confirm' lasted
session_start = '1970-01-01 00:00:00'


def session_info(df):
    ''' Compute the running session metrics for one row of the footprint.

    Meant to be used with `DataFrame.apply(session_info, axis=1)` on a frame that is
    sorted by client_id and date_time (ascending); the result depends on the rows
    seen before, through the module-level state variables.

    Receives one row (Series) with 'new_session', 'date_time', 'time_diff_prev',
    'time_diff_next', 'step', 'subtract_step', 'client_id' and (optionally) 'prev_id'.

    Returns a list of six values:
    [confirm count, steps count, steps back count, step duration,
     session duration up to 'confirm', 'valid' / 'invalid'].

    Rules:
    - a row flagged as new session resets all counters;
    - a row of a different client than the previous row, or with a negative time
      difference, or whose gap to the previous step is not strictly below 6 min 4 s,
      invalidates the session; every later row of that session stays 'invalid';
    - a 'confirm' only counts when step_1, step_2 and step_3 were all visited
      in the current session.'''
    # To access the variable from outside the function
    global steps_count
    global steps_back
    global confirm_count
    global confirmed
    global valid
    global check_1
    global check_2
    global check_3
    global session_start

    threshold = datetime.timedelta(seconds=364)

    if df['new_session'] == True:
        # Resetting the counts for the new session
        steps_count = 1
        steps_back = 0
        confirm_count = 0
        confirmed = False
        valid = True
        session_start = pd.to_datetime(df['date_time'], format='%Y-%m-%d %H:%M:%S')
        check_1 = False
        check_2 = False
        check_3 = False  # was misspelled, so the flag leaked into the following sessions
        return [confirm_count, steps_count, steps_back, df['time_diff_next'], pd.NaT, 'valid']

    # Rows that share the session id with the rows above them but belong to another
    # client: the new client did not begin with 'start', so no new session was opened.
    # A negative time difference alone misses the cases where the other client's last
    # step happens to be earlier, so the client ids are compared as well.
    other_client = 'prev_id' in df and df['client_id'] != df['prev_id']
    if other_client or df['time_diff_prev'] < datetime.timedelta(seconds=0):
        valid = False
        return [confirm_count, steps_count, steps_back, pd.NaT, pd.NaT, 'invalid']

    # Session already invalid, or the pause is not strictly below the threshold
    # (this also covers a gap of exactly the threshold and a missing time difference).
    # The session is latched as invalid so that later rows cannot turn valid again.
    if not valid or not (df['time_diff_prev'] < threshold):
        valid = False
        return [confirm_count, steps_count, steps_back, pd.NaT, pd.NaT, 'invalid']

    step = df['step']

    if step in (1, 2, 3, 4):
        # Remember which of step_1 / step_2 / step_3 were visited.
        # A repeated 'start' (1) must not tick any of them.
        if step == 2:
            check_1 = True
        elif step == 3:
            check_2 = True
        elif step == 4:
            check_3 = True

        # Counting the steps back
        if df['subtract_step'] < 0:
            steps_back += abs(df['subtract_step'])

        # Counting the total steps
        steps_count += 1

        return [confirm_count, steps_count, steps_back, df['time_diff_next'], pd.NaT, 'valid']

    if step == 5:
        # Every 'confirm' counts as a step
        steps_count += 1

        if check_1 and check_2 and check_3:
            # First or repeated valid 'confirm': report the duration up to this one,
            # so the last 'confirm' of the session carries the final values
            confirm_count += 1
            confirmed = True
            session_dur = df['date_time'] - session_start
            return [confirm_count, steps_count, steps_back, pd.NaT, session_dur, 'valid']

        # 'confirm' reached without passing through all the necessary steps:
        # a valid session, but not a successful one
        return [confirm_count, steps_count, steps_back, df['time_diff_next'], pd.NaT, 'valid']

    # Unknown / missing step code: report the current counters unchanged instead of
    # returning None, so the result always has six values
    return [confirm_count, steps_count, steps_back, pd.NaT, pd.NaT, 'valid']

In [17]:
# Applying the above function (row order matters: session_info keeps state between rows)
df_test['session_info'] = df_test.apply(session_info, axis=1)
df_control['session_info'] = df_control.apply(session_info, axis=1)


def build_session_results(df):
    ''' Turn a dataframe carrying the 'session_info' lists into the results dataframe.

    The same steps are needed for the test and the control group, hence the function.
    The input dataframe is not modified.

    Returns a new dataframe without the helper columns, with the six values of
    'session_info' split into 'confirm_cnt', 'steps_cnt', 'steps_back_cnt', 'step_dur',
    'sess_dur_to_confirm' and 'validity', and with 'step_dur' expressed in whole
    seconds (nullable integer, missing where no valid duration exists).'''
    helper_cols = ['time_diff_next', 'time_diff_prev', 'new_session',
                   'prev_id', 'next_id', 'prev_step', 'next_step', 'prev_time', 'next_time']
    info_cols = ['confirm_cnt', 'steps_cnt', 'steps_back_cnt', 'step_dur',
                 'sess_dur_to_confirm', 'validity']

    # Dropping columns that will not be needed further
    rslt = df.drop(columns=helper_cols)

    # Splitting the session info in more columns, then dropping the packed column
    rslt[info_cols] = rslt['session_info'].apply(pd.Series)
    rslt = rslt.drop(columns='session_info')

    # Correcting invalid step_dur: durations above 363 s (6 min and 3 s; with
    # whole-second timestamps that means 364 s or more) or below zero are the result
    # of subtracting with the next session's / next client's time
    out_of_range = (
        (rslt['step_dur'] > datetime.timedelta(seconds=363))
        | (rslt['step_dur'] < datetime.timedelta(seconds=0))
    )
    rslt.loc[out_of_range, 'step_dur'] = pd.NaT

    # Transforming the duration to whole seconds. Kept numeric (nullable integer)
    # rather than formatted strings, so missing durations stay missing instead of
    # becoming the text 'nan' and the column can be aggregated directly.
    rslt['step_dur'] = rslt['step_dur'].dt.total_seconds().round().astype('Int64')

    return rslt


# Saving the results to new dataframes
df_test_rslt = build_session_results(df_test)
df_control_rslt = build_session_results(df_control)

In [18]:
# Preview of the first five rows of the test-group results
df_test_rslt.head(n=5)

Unnamed: 0,client_id,step,date_time,subtract_step,session_id,confirm_cnt,steps_cnt,steps_back_cnt,step_dur,sess_dur_to_confirm,validity
176698,1001101,1,2017-04-05 16:55:20,0,1,0,1,0,13,NaT,valid
176697,1001101,2,2017-04-05 16:55:33,1,1,0,2,0,29,NaT,valid
176696,1001101,3,2017-04-05 16:56:02,1,1,0,3,0,32,NaT,valid
176695,1001101,2,2017-04-05 16:56:34,-1,1,0,4,1,26,NaT,valid
176694,1001101,3,2017-04-05 16:57:00,1,1,0,5,1,20,NaT,valid


In [19]:
# Preview of the first five rows of the control-group results
df_control_rslt.head(n=5)

Unnamed: 0,client_id,step,date_time,subtract_step,session_id,confirm_cnt,steps_cnt,steps_back_cnt,step_dur,sess_dur_to_confirm,validity
140535,1000062,1,2017-04-06 15:38:49,0,1,0,1,0,182.0,NaT,valid
140534,1000062,2,2017-04-06 15:41:51,1,1,0,2,0,10.0,NaT,valid
140533,1000062,3,2017-04-06 15:42:01,1,1,0,3,0,49.0,NaT,valid
140532,1000062,4,2017-04-06 15:42:50,1,1,0,4,0,,NaT,valid
140531,1000062,5,2017-04-06 15:54:49,1,1,0,4,0,,NaT,invalid


# Saving files

In [20]:
# df_test_rslt.to_csv('../Data/df_test_rslt.txt')
# df_control_rslt.to_csv('../Data/df_control_rslt.txt')