#### Load necessary libraries

In [None]:
import pandas as pd
import numpy as np
from collections import defaultdict
from datetime import datetime
import time

Here we compute four KPIs (key performance indicators) for both the control group and the test group, and compare them to see how the new design performed.

#### Let's compute the four KPIs for the test group: "Completion rate, Average time spent, Time spent on each step, Error rate".

In [None]:
# Load the test-group event log; later cells read its 'client_id',
# 'process_step', 'time' and 'date_time' columns.
test_sorted = pd.read_csv("test_sorted.csv")
test_sorted.head(2)

#### Completion rate

In [None]:
# Number of test-group event rows whose step is 'confirm'
is_confirm_row = test_sorted['process_step'].eq('confirm')
process_step_confirm = int(is_confirm_row.sum())
process_step_confirm

In [None]:
# Total number of event rows of any step (not only 'start' rows); this is the
# denominator of the completion rate computed below.
# NOTE(review): confirm that dividing by all event rows, rather than by
# 'start' rows or by unique clients, is the intended definition.
process_step_total = len(test_sorted['process_step'])
process_step_total

In [None]:
# Completion rate = number of 'confirm' rows divided by the number of all event rows
completion_rate = (process_step_confirm/process_step_total)
completion_rate

#### Average time spent

In [None]:
# Collect the parsed event times of every client, so that the gaps between
# consecutive events can be computed in the next cell.
test_sorted2 = test_sorted[['client_id', 'time']]

time_diffs = defaultdict(list)

# Parse timestamps and group by client ID.
# The loop variable is deliberately not called `time`: that name would shadow
# the `time` module imported at the top of the notebook.
# NOTE(review): the 'time' text carries no date, so strptime places every
# value on the same default day (1900-01-01); events of one client that
# happened on different days are compared as if they were on the same day.
for client_id, time_text in test_sorted2.itertuples(index=False, name=None):
    time_diffs[client_id].append(datetime.strptime(time_text, '%H, %M, %S'))

In [None]:
# Gaps between consecutive events of the same client, kept as two parallel
# lists: time_diffrence1[i] is the client whose gap is time_diffrence2[i].
time_diffrence1 = []
time_diffrence2 = []
# The per-client list is not called `time`: that name would shadow the `time`
# module imported at the top of the notebook.
for client_id, stamps in time_diffs.items():
    stamps.sort()  # Ensure timestamps are sorted (in place) before differencing
    # Pair every event with its successor; a client with one event yields no gap.
    for earlier, later in zip(stamps, stamps[1:]):
        time_diffrence1.append(client_id)
        time_diffrence2.append(later - earlier)

In [None]:
# Wrap each of the two parallel lists in its own single-column DataFrame
time_differ1, time_differ2 = (
    pd.DataFrame(values) for values in (time_diffrence1, time_diffrence2)
)

In [None]:
# Name the single column of each frame: client ids first, then the gaps
time_differ1.columns = ['client_id']
time_differ2.columns = ['time_diffrence']

In [None]:
# Put the two single-column frames side by side: one row per gap, with the
# columns 'client_id' and 'time_diffrence'.
# NOTE: time_differ1 is reassigned to the combined frame, so re-running this
# cell appends a further 'time_diffrence' column.
time_differ1 = pd.concat([time_differ1, time_differ2], axis = 1)

In [None]:
# Preview: every row is one gap between two consecutive events of a client;
# these gaps are what the cells below treat as time spent.
time_differ1.head()

In [None]:
# Save the per-gap table of the test group
time_differ1.to_csv('time_differ1_test.csv')

In [None]:
# Total time spent: sum of all gaps between consecutive events, over all clients
total_time = time_differ1['time_diffrence'].sum()
total_time

In [None]:
# Average time spent: mean gap between two consecutive events
# (an average per gap, not per client)
avg_time = time_differ1['time_diffrence'].mean()
avg_time

In [None]:
# Attach the per-client gaps from time_differ1 to the event rows of
# test_sorted, matching on client_id.
# NOTE(review): time_differ1 holds one row per gap, so a client with several
# gaps matches several times and each of its event rows is repeated once per
# gap; clients without any gap are dropped by the default inner join.
test_diffre = pd.merge(test_sorted, time_differ1, on = 'client_id')

In [None]:
# Preview the first two rows of the merged frame
test_diffre.head(2)

#### Time spent on each step

In [None]:
# The merge that produced test_diffre repeats every event row once per gap of
# its client. The copies share one timestamp, so differencing them yields
# zero-second gaps that drag the per-step mean down. Keep each event once.
step_events = (
    test_diffre
    .drop(columns=['time_diffrence'], errors='ignore')
    .drop_duplicates()
)

# Convert timestamp column to datetime in a separate column, so that
# test_diffre itself is left untouched and the cell can be re-run
step_events = step_events.assign(event_time=pd.to_datetime(step_events['date_time']))

# Sort DataFrame by client ID and timestamp
step_events = step_events.sort_values(by=['client_id', 'event_time'])

# Calculate time spent on each step for each client: seconds since the
# client's previous event (NaN for the client's first event). The result is
# stored under 'date_time' because the following cells read that column name.
step_events['date_time'] = step_events.groupby('client_id')['event_time'].diff().dt.total_seconds()

# Aggregate time spent on each step for each client
time_spent_per_step = step_events.groupby(['client_id', 'process_step'])['date_time'].sum().reset_index()
avg_time_spent_per_step = step_events.groupby(['client_id', 'process_step'])['date_time'].mean().reset_index()

print(avg_time_spent_per_step)

##### mean

In [None]:
# Mean of the per-client, per-step average times (in seconds)
avg_time_spent_per_step['date_time'].mean()

##### median

In [None]:
# Median of the per-client, per-step average times (in seconds)
avg_time_spent_per_step['date_time'].median()

##### mode

In [None]:
# Most frequent value(s) among the per-client, per-step average times (in seconds)
avg_time_spent_per_step['date_time'].mode()

##### save the dataframe

In [None]:
# Save the per-client, per-step average times of the test group
avg_time_spent_per_step.to_csv('test_avg_time_spent_per_step.csv')

####  Error rate

In [None]:
# Get the unique client_ids of the test group (passed to att_conf below)
unique_client_ids = test_sorted['client_id'].unique()
# Display them. No per-client filtering is done here: att_conf does the
# per-client counting, so filtering the whole frame once per client in this
# cell would only cost time.
unique_client_ids

In [None]:
# To get the number of attempts a client took and number of confirmed events
def att_conf(unique_client_ids, data=None):
    """Print, for each client, its attempt count and its number of confirmations.

    One line is printed per client in the form
    ``client_id , num_attempt , num_confirm``, where ``num_attempt`` is the
    client's number of event rows divided by 5 (a full process has five
    steps) and ``num_confirm`` is the number of its rows whose
    ``process_step`` is ``'confirm'``. A client without rows prints
    ``0.0`` and ``0``.

    Parameters
    ----------
    unique_client_ids : iterable
        Client ids to report, in the order they should be printed.
    data : pandas.DataFrame, optional
        Event log with ``client_id`` and ``process_step`` columns. Defaults to
        the notebook-level ``test_sorted`` frame.

    Returns
    -------
    None
    """
    if data is None:
        data = test_sorted
    # Count once per client instead of re-filtering the whole frame for every id.
    rows_per_client = data.groupby('client_id').size()
    confirms_per_client = data[data['process_step'] == 'confirm'].groupby('client_id').size()
    for client_id in unique_client_ids:
        len_attempt = int(rows_per_client.get(client_id, 0))  # rows of this client
        num_attempt = len_attempt / 5  # how many full five-step processes the rows amount to
        num_confirm = int(confirms_per_client.get(client_id, 0))  # 'confirm' rows of this client
        print(client_id, ',', num_attempt, ',', num_confirm)

In [None]:
# Print one "client_id , attempts , confirmations" line per test-group client
att_conf(unique_client_ids)

In [None]:
# Load the per-client table of attempts and confirmations; the cells below
# read its 'is_confirmation' column and merge it on 'client_id'.
# NOTE(review): no cell shown in this notebook writes this file — presumably
# it was prepared outside the notebook from the att_conf output; confirm.
test_unique_id = pd.read_csv('test_unique_client_id.csv')
test_unique_id.head(2)

In [None]:
# Check the size (rows, columns) of the per-client table of the test group
test_unique_id.shape

In [None]:
# Count how many rows carry each 'is_confirmation' value (confirmed vs. failed)
test_unique_id['is_confirmation'].value_counts()

In [None]:
# Keep every 5th row of (client_id, date_time); the stride runs over the whole
# frame, not separately per client.
# NOTE(review): this yields one row per attempt only if every attempt
# occupies exactly five consecutive rows — confirm against the data.
df_error = test_sorted[['client_id','date_time']]
extracted_rows = df_error.iloc[::5]
extracted_rows.head()

In [None]:
# Join the sampled (client_id, date_time) rows with the per-client table on client_id
df_err_rate = pd.merge(extracted_rows, test_unique_id, on = "client_id")
df_err_rate.head(2)

In [None]:
# Check the shape of the merged dataset
df_err_rate.shape

In [None]:
# Display how many merged rows carry each 'is_confirmation' value
df_err_rate['is_confirmation'].value_counts()

First, let us find the error rate and the confirmation rate based on the merged data. After that, we repeat the calculation with the dataset that contains only the unique client_ids.

In [None]:
# Test group, merged dataset: counts entered by hand, rates as whole percentages
total_submissions = 35332
confirmed_submissions = 27067
error_submissions = 8265
error_rate, confirmation_rate = (
    round((count / total_submissions) * 100)
    for count in (error_submissions, confirmed_submissions)
)
print(error_rate, "||", confirmation_rate )

In [None]:
# Test group, unique client_id table: counts entered by hand, rates as whole percentages
total_submissions = 26963
confirmed_submissions = 18683
error_submissions = 8280
error_rate, confirmation_rate = (
    round((count / total_submissions) * 100)
    for count in (error_submissions, confirmed_submissions)
)
print(error_rate, "||", confirmation_rate )

#### Now let's move on to the control group: "Completion rate, Average time spent, Time spent on each step, Error rate".

In [None]:
# Load the control-group event log; later cells read its 'client_id',
# 'process_step', 'time' and 'date_time' columns.
control_sorted = pd.read_csv("control_sorted.csv")

#### Completion rate

In [None]:
# Number of control-group event rows whose step is 'confirm'
is_confirm_row = control_sorted['process_step'].eq('confirm')
process_step_confirm = int(is_confirm_row.sum())
process_step_confirm

In [None]:
# Total number of event rows of any step; this is the denominator of the
# completion rate computed below.
# NOTE(review): confirm that dividing by all event rows, rather than by
# 'start' rows or by unique clients, is the intended definition.
process_step_total = len(control_sorted['process_step'])
process_step_total

In [None]:
# Completion rate = number of 'confirm' rows divided by the number of all event rows
completion_rate = (process_step_confirm/process_step_total)
completion_rate

#### Average time spent

In [None]:
# Collect the parsed event times of every client, so that the gaps between
# consecutive events can be computed in the next cell.
control_sorted2 = control_sorted[['client_id', 'time']]

time_diffs = defaultdict(list)

# Parse timestamps and group by client ID.
# The loop variable is deliberately not called `time`: that name would shadow
# the `time` module imported at the top of the notebook.
# NOTE(review): the 'time' text carries no date, so strptime places every
# value on the same default day (1900-01-01); events of one client that
# happened on different days are compared as if they were on the same day.
for client_id, time_text in control_sorted2.itertuples(index=False, name=None):
    time_diffs[client_id].append(datetime.strptime(time_text, '%H, %M, %S'))

In [None]:
# Gaps between consecutive events of the same client, kept as two parallel
# lists: time_diffrence1[i] is the client whose gap is time_diffrence2[i].
time_diffrence1 = []
time_diffrence2 = []
# The per-client list is not called `time`: that name would shadow the `time`
# module imported at the top of the notebook.
for client_id, stamps in time_diffs.items():
    stamps.sort()  # Ensure timestamps are sorted (in place) before differencing
    # Pair every event with its successor; a client with one event yields no gap.
    for earlier, later in zip(stamps, stamps[1:]):
        time_diffrence1.append(client_id)
        time_diffrence2.append(later - earlier)

In [None]:
# Wrap each of the two parallel lists in its own single-column DataFrame
time_differ1, time_differ2 = (
    pd.DataFrame(values) for values in (time_diffrence1, time_diffrence2)
)

In [None]:
# Name the single column of each frame: client ids first, then the gaps
time_differ1.columns = ['client_id']
time_differ2.columns = ['time_diffrence']

In [None]:
# Put the two single-column frames side by side: one row per gap, with the
# columns 'client_id' and 'time_diffrence'.
# NOTE: time_differ1 is reassigned to the combined frame, so re-running this
# cell appends a further 'time_diffrence' column.
time_differ1 = pd.concat([time_differ1, time_differ2], axis = 1)

In [None]:
# Preview: every row is one gap between two consecutive events of a client;
# these gaps are what the cells below treat as time spent.
time_differ1.head()

In [None]:
# Save the per-gap table of the control group
time_differ1.to_csv('time_differ1_control.csv')

In [None]:
# Total time spent: sum of all gaps between consecutive events, over all clients
total_time = time_differ1['time_diffrence'].sum()
total_time

In [None]:
# Average time spent: mean gap between two consecutive events
# (an average per gap, not per client)
avg_time = time_differ1['time_diffrence'].mean()
avg_time

In [None]:
# Attach the per-client gaps from time_differ1 to the event rows of
# control_sorted, matching on client_id.
# NOTE(review): time_differ1 holds one row per gap, so a client with several
# gaps matches several times and each of its event rows is repeated once per
# gap; clients without any gap are dropped by the default inner join.
control_diffre = pd.merge(control_sorted, time_differ1, on = 'client_id')

In [None]:
# Display the first two rows of the merged frame
control_diffre.head(2)

#### Time spent on each step

In [None]:
# The merge that produced control_diffre repeats every event row once per gap
# of its client. The copies share one timestamp, so differencing them yields
# zero-second gaps that drag the per-step mean down. Keep each event once.
step_events = (
    control_diffre
    .drop(columns=['time_diffrence'], errors='ignore')
    .drop_duplicates()
)

# Convert timestamp column to datetime in a separate column, so that
# control_diffre itself is left untouched and the cell can be re-run
step_events = step_events.assign(event_time=pd.to_datetime(step_events['date_time']))

# Sort DataFrame by client ID and timestamp
step_events = step_events.sort_values(by=['client_id', 'event_time'])

# Calculate time spent on each step for each client: seconds since the
# client's previous event (NaN for the client's first event). The result is
# stored under 'date_time' because the following cells read that column name.
step_events['date_time'] = step_events.groupby('client_id')['event_time'].diff().dt.total_seconds()

# Aggregate time spent on each step for each client
time_spent_per_step = step_events.groupby(['client_id', 'process_step'])['date_time'].sum().reset_index()
avg_time_spent_per_step = step_events.groupby(['client_id', 'process_step'])['date_time'].mean().reset_index()

print(avg_time_spent_per_step)

##### mean

In [None]:
# Mean of the per-client, per-step average times (in seconds)
avg_time_spent_per_step['date_time'].mean()

##### median

In [None]:
# Median of the per-client, per-step average times (in seconds)
avg_time_spent_per_step['date_time'].median()

##### mode

In [None]:
# Most frequent value(s) among the per-client, per-step average times (in seconds)
avg_time_spent_per_step['date_time'].mode()

In [None]:
# Save the per-client, per-step average times of the control group
avg_time_spent_per_step.to_csv("control_avg_time_spent_per_step.csv")

#### Error rate

#### Calculation process for whether a submission was successful or not
First, we find how many unique client IDs there are.

Then we calculate how many times each client attempted the registration.
    
E.g. for client 1028: total steps taken / 5. We divide by five because there are five steps a client must take to reach the confirm step.

We expect one confirmation in every five steps, which tells us the process was successful. Otherwise, the process was not successful.

The formula "client 1028, total steps taken / 5" tells us how many attempts the client has made. Whether or not the total number of steps taken is divisible by five,

if we do not find one confirm in any five consecutive steps taken by a client, this indicates that there was an error, so we count that attempt as 'not successful/False'.

#### submission_successful
num_attempt = len(Unique client ID) divided by 5 ## To know how many attempts have been made

num_confirm = len(confirm) 

Example: num_attempt = 50  

         num_confirm = 50; if it is less than 50, some processes ended in an error, and we assign "True" to each confirmed five-step process and

         "False" to the remaining processes of five or fewer steps.

In [None]:
# store unique client_id
# One list entry per distinct client id. (Appending the array returned by
# unique() to an empty list would give a one-element list holding the array.)
client_id_unique = control_sorted['client_id'].unique().tolist()

In [None]:
# Get the unique client_ids of the control group (the input att_conf expects)
unique_client_ids = control_sorted['client_id'].unique()
# Display them. No per-client filtering is done here: att_conf does the
# per-client counting, so filtering the whole frame once per client in this
# cell would only cost time.
unique_client_ids

In [None]:
def att_conf(unique_client_ids, data=None):
    """Print, for each client, its attempt count and its number of confirmations.

    One line is printed per client in the form
    ``client_id , num_attempt , num_confirm``, where ``num_attempt`` is the
    client's number of event rows divided by 5 (a full process has five
    steps) and ``num_confirm`` is the number of its rows whose
    ``process_step`` is ``'confirm'``. A client without rows prints
    ``0.0`` and ``0``.

    Parameters
    ----------
    unique_client_ids : iterable
        Client ids to report, in the order they should be printed.
    data : pandas.DataFrame, optional
        Event log with ``client_id`` and ``process_step`` columns. Defaults to
        the notebook-level ``control_sorted`` frame.

    Returns
    -------
    None
    """
    if data is None:
        data = control_sorted
    # Count once per client instead of re-filtering the whole frame for every id.
    rows_per_client = data.groupby('client_id').size()
    confirms_per_client = data[data['process_step'] == 'confirm'].groupby('client_id').size()
    for client_id in unique_client_ids:
        len_attempt = int(rows_per_client.get(client_id, 0))  # rows of this client
        num_attempt = len_attempt / 5  # how many full five-step processes the rows amount to
        num_confirm = int(confirms_per_client.get(client_id, 0))  # 'confirm' rows of this client
        print(client_id, ',', num_attempt, ',', num_confirm)

Now we take the number of confirm events and analyse it. If it shows that there was no success, then we consider that there were 9 attempts
that were unsuccessful, and we add 2 False values to the 'SubmissionSuccessful' column. We do the same for each client_id.

We then determine how many True and False values to include in the submission_successful column. The number of confirms tells us how many True events there are; to get the number of False events, we subtract the number of confirms from the number of attempts. Finally, we collect the True and False values in the submission_successful list.

In [None]:
# Keep every 5th row of (client_id, date_time); the stride runs over the whole
# frame, not separately per client.
# NOTE(review): this yields one row per attempt only if every attempt
# occupies exactly five consecutive rows — confirm against the data.
df_error = control_sorted[['client_id','date_time']]
extracted_rows = df_error.iloc[::5]
extracted_rows.head() 

In [None]:
# Load the per-client table of attempts and confirmations; the cells below
# read its 'is_confirmation' column and merge it on 'client_id'.
# NOTE(review): the file name is spelled 'cotrol_...' — check that it matches
# the file on disk. No cell shown in this notebook writes this file;
# presumably it was prepared outside the notebook — confirm.
control_unique_id = pd.read_csv('cotrol_unique_client_id.csv')
control_unique_id.head() 

In [None]:
# Check the size (rows, columns) of the per-client table of the control group
control_unique_id.shape

In [None]:
# Count how many rows carry each 'is_confirmation' value (confirmed vs. failed)
control_unique_id['is_confirmation'].value_counts()

In [None]:
# Join the sampled (client_id, date_time) rows with the per-client table on client_id
df_err_rate = pd.merge(extracted_rows, control_unique_id, on = "client_id")
df_err_rate.head()

In [None]:
# Check the shape of the merged dataset
df_err_rate.shape

In [None]:
# Display how many merged rows carry each 'is_confirmation' value
df_err_rate['is_confirmation'].value_counts()

In [None]:
# Control group, merged dataset: counts entered by hand, rates as whole percentages
total_submissions = 28097
confirmed_submissions = 21610
error_submissions = 6487
error_rate, confirmation_rate = (
    round((count / total_submissions) * 100)
    for count in (error_submissions, confirmed_submissions)
)
print(error_rate, "||", confirmation_rate )

Let us check and verify with the unique client_id.

In [None]:
# Control group, unique client_id table: counts entered by hand.
# The names match the ones used in the rate formulas below, so the rates are
# computed from these figures and not from the merged-data figures that an
# earlier cell left in total_submissions / confirmed_submissions /
# error_submissions.
total_submissions = 23526
confirmed_submissions = 15428
error_submissions = 8098
# Rates as whole percentages of all submissions
error_rate = round((error_submissions / total_submissions) * 100)
confirmation_rate = round((confirmed_submissions / total_submissions) * 100)

In [None]:
# Print the error rate and the confirmation rate computed in the previous cell
print(error_rate, "||", confirmation_rate )