In [15]:
import random
import numpy as np
from datetime import datetime, timedelta
import pandas as pd

In [16]:
random.seed(42)
np.random.seed(42)

In [17]:
#user_id	session_id	timestamp	event	device	region
def generate_fake_customer_sessions(n=600):
    events = ['search','view_flights','select_flight','add_bags','checkout','confirmation']
    devices = ['mobile','desktop']
    regions = ['US', 'UK', 'CA', 'AU', 'DE', 'FR', 'IT', 'ES', 'JP', 'CN',
    'KR', 'IN', 'BR', 'MX', 'ZA', 'RU', 'SG', 'NZ', 'AE', 'NL',
    'SE', 'CH', 'BE', 'IE', 'NO', 'FI', 'DK', 'PL', 'PT', 'GR',
    'TH', 'MY', 'PH', 'ID', 'TR', 'EG', 'SA', 'IL', 'AR', 'CL']

    base_time = datetime(2024, 1, 1) #earliest possible time (year,month,day)
    data = []

    for i in range(n):
        user_id = f"{1000 + i}"
        session_id = f"S{1000 + i}"
        device = random.choice(devices)
        region = random.choice(regions)
        
        num_events = random.randint(2, len(events)) #pick random number of events from 2 to the number of events in the list which is 6
        session_events = events[:num_events] #will choose event in order based on number of events

        #start time, random
        start_time = base_time +timedelta(days=random.randint(0,365),hours=random.randint(0,23),minutes=random.randint(0,59))

        current_time = start_time #to later ad 1-5 min between event
        for event in session_events:
            data.append({
                 'user_id': user_id,
                 'session_id': session_id,
                 'timestamp': current_time,
                 'event': event,
                 'device': device,
                 'region': region
        })
        current_time += timedelta(minutes=random.randint(1,5)) #random time 1 to 5 min
    return pd.DataFrame(data)

In [18]:
df = generate_fake_customer_sessions()

In [19]:
df.to_csv('Fake_Customer_Flight_Sessions.csv',index=False)

In [20]:
df

Unnamed: 0,user_id,session_id,timestamp,event,device,region
0,1000,S1000,2024-05-05 07:08:00,search,mobile,UK
1,1000,S1000,2024-05-05 07:08:00,view_flights,mobile,UK
2,1000,S1000,2024-05-05 07:08:00,select_flight,mobile,UK
3,1000,S1000,2024-05-05 07:08:00,add_bags,mobile,UK
4,1001,S1001,2024-01-17 00:05:00,search,mobile,IL
...,...,...,...,...,...,...
2422,1598,S1598,2024-08-25 16:25:00,add_bags,mobile,NZ
2423,1599,S1599,2024-09-07 19:28:00,search,mobile,RU
2424,1599,S1599,2024-09-07 19:28:00,view_flights,mobile,RU
2425,1599,S1599,2024-09-07 19:28:00,select_flight,mobile,RU


In [21]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2427 entries, 0 to 2426
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   user_id     2427 non-null   object        
 1   session_id  2427 non-null   object        
 2   timestamp   2427 non-null   datetime64[ns]
 3   event       2427 non-null   object        
 4   device      2427 non-null   object        
 5   region      2427 non-null   object        
dtypes: datetime64[ns](1), object(5)
memory usage: 113.9+ KB


In [22]:
df.isnull().sum()

user_id       0
session_id    0
timestamp     0
event         0
device        0
region        0
dtype: int64

In [23]:
df['device'].unique()

array(['mobile', 'desktop'], dtype=object)

In [24]:
df['event'].value_counts()

event
search           600
view_flights     600
select_flight    478
add_bags         377
checkout         255
confirmation     117
Name: count, dtype: int64

In [25]:
total_sessions = df['session_id'].nunique()
print("total sessions:", total_sessions)
completed_sessions = df[df['event'] =='confirmation']['session_id'].nunique() #how many unique session IDs had the confirmation event
print('total completed sessions', completed_sessions)
funnel_completion_rate = completed_sessions / total_sessions
print(f"Funnel completion rate: {funnel_completion_rate:.2%}") #percentage of people who completed all the events from begining to end

total sessions: 600
total completed sessions 117
Funnel completion rate: 19.50%


In [26]:
df.groupby('region')['session_id'].nunique() #number of sessions per region

region
AE    11
AR    16
AU    15
BE    17
BR    12
CA    12
CH    12
CL    15
CN    20
DE    17
DK    17
EG    19
ES    20
FI    12
FR    17
GR    14
ID    10
IE    18
IL    13
IN     9
IT    17
JP    14
KR    17
MX     6
MY    11
NL    18
NO     8
NZ    17
PH    12
PL    15
PT    18
RU    23
SA    21
SE    13
SG    22
TH    10
TR    21
UK    13
US    13
ZA    15
Name: session_id, dtype: int64

In [27]:
completed = df[df['event'] == 'confirmation']['session_id'].nunique()
total_sessions = df['session_id'].nunique()
completion_rate = completed / total_sessions

In [28]:
print(completion_rate)

0.195
