In [4]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
pd.set_option('display.max_rows', 500)  # show up to 500 rows (not columns) before pandas truncates displayed output

%matplotlib inline

In [5]:
# All four raw files sit in the same GitHub folder, so build each URL from one base.
DATA_URL = "https://raw.githubusercontent.com/AncaAl/Project-2-EDA/main/data/"

df_demo = pd.read_csv(DATA_URL + "df_final_demo.txt")
# The web data is shipped in two parts that are loaded separately here.
df_pt1 = pd.read_csv(DATA_URL + "df_final_web_data_pt_1.txt")
df_pt2 = pd.read_csv(DATA_URL + "df_final_web_data_pt_2.txt")
df_ex_cl = pd.read_csv(DATA_URL + "df_final_experiment_clients.txt")

In [6]:
# (rows, columns) of each web-data part, shown as a pair before the parts are combined
tuple(part.shape for part in (df_pt1, df_pt2))

((343141, 5), (412264, 5))

In [7]:
# Stack df_pt1 on top of df_pt2 (row-wise is concat's default axis) and
# renumber the index from 0 so the combined frame has no repeated labels.
df_web_data = pd.concat((df_pt1, df_pt2), ignore_index=True)
df_web_data

Unnamed: 0,client_id,visitor_id,visit_id,process_step,date_time
0,9988021,580560515_7732621733,781255054_21935453173_531117,step_3,2017-04-17 15:27:07
1,9988021,580560515_7732621733,781255054_21935453173_531117,step_2,2017-04-17 15:26:51
2,9988021,580560515_7732621733,781255054_21935453173_531117,step_3,2017-04-17 15:19:22
3,9988021,580560515_7732621733,781255054_21935453173_531117,step_2,2017-04-17 15:19:13
4,9988021,580560515_7732621733,781255054_21935453173_531117,step_3,2017-04-17 15:18:04
...,...,...,...,...,...
755400,9668240,388766751_9038881013,922267647_3096648104_968866,start,2017-05-24 18:46:10
755401,9668240,388766751_9038881013,922267647_3096648104_968866,start,2017-05-24 18:45:29
755402,9668240,388766751_9038881013,922267647_3096648104_968866,step_1,2017-05-24 18:44:51
755403,9668240,388766751_9038881013,922267647_3096648104_968866,start,2017-05-24 18:44:34


In [8]:
# Column dtypes of the combined web data.
# NOTE(review): date_time is reported as object in the output below, i.e. it has not been parsed to datetime.
df_web_data.dtypes

client_id        int64
visitor_id      object
visit_id        object
process_step    object
date_time       object
dtype: object

In [9]:
# Distinct-value count for every numeric column, largest first
# (client_id is the only numeric column, so this is the number of clients).
numeric_columns = df_web_data.select_dtypes(include="number")
numeric_columns.nunique().sort_values(ascending=False)

client_id    120157
dtype: int64

In [10]:
# Distinct-value count for every object (string) column, largest first
object_columns = df_web_data.select_dtypes(include="object")
object_columns.nunique().sort_values(ascending=False)

date_time       629363
visit_id        158095
visitor_id      130236
process_step         5
dtype: int64

In [11]:
# Number of missing values per column, columns with the most gaps first
missing_per_column = df_web_data.isna().sum()
missing_per_column.sort_values(ascending=False)

client_id       0
visitor_id      0
visit_id        0
process_step    0
date_time       0
dtype: int64

### Day 1 & 2 (Week 5) - Client behavior analysis 
Answer the following questions about demographics:
1. Who are the primary clients using this online process?
2. Are the primary clients younger or older, new or long-standing?
3. Next, carry out a client behavior analysis to answer any additional relevant questions you think are important.

In [12]:
# Preview the first 100 rows of the combined web data
df_web_data.head(100)

Unnamed: 0,client_id,visitor_id,visit_id,process_step,date_time
0,9988021,580560515_7732621733,781255054_21935453173_531117,step_3,2017-04-17 15:27:07
1,9988021,580560515_7732621733,781255054_21935453173_531117,step_2,2017-04-17 15:26:51
2,9988021,580560515_7732621733,781255054_21935453173_531117,step_3,2017-04-17 15:19:22
3,9988021,580560515_7732621733,781255054_21935453173_531117,step_2,2017-04-17 15:19:13
4,9988021,580560515_7732621733,781255054_21935453173_531117,step_3,2017-04-17 15:18:04
5,9988021,580560515_7732621733,781255054_21935453173_531117,step_2,2017-04-17 15:17:15
6,9988021,580560515_7732621733,781255054_21935453173_531117,step_1,2017-04-17 15:17:01
7,9988021,580560515_7732621733,781255054_21935453173_531117,start,2017-04-17 15:16:22
8,8320017,39393514_33118319366,960651974_70596002104_312201,confirm,2017-04-05 13:10:05
9,8320017,39393514_33118319366,960651974_70596002104_312201,step_3,2017-04-05 13:09:43


In [22]:
# 3) For each client_id, count the distinct visitor_id values in the web data.
# NOTE(review): the earlier note here spoke of unique visit_id ("how often this
# client used the product"), but the code counts visitor_id — confirm which
# column is intended before drawing conclusions about visit frequency.
visitor_ids_by_client = df_web_data.groupby('client_id')[["visitor_id"]]
unique_visitors = visitor_ids_by_client.nunique()

# Clients with the most distinct visitor_ids first
unique_visitors.sort_values(by='visitor_id', ascending=False)

Unnamed: 0_level_0,visitor_id
client_id,Unnamed: 1_level_1
9008485,14
8167818,13
323753,11
1771611,11
6305830,11
...,...
3507622,1
3507472,1
3507471,1
3507329,1


In [23]:
# Spot-check: pull every logged event for one specific client
client_to_inspect = 3507622
is_selected_client = df_web_data['client_id'].eq(client_to_inspect)
filtered_df = df_web_data.loc[is_selected_client]
filtered_df

Unnamed: 0,client_id,visitor_id,visit_id,process_step,date_time
330475,3507622,148416425_77477829360,673812863_1698414711_886709,start,2017-04-13 17:47:17
330514,3507622,148416425_77477829360,942460915_48201821382_330022,confirm,2017-04-13 22:33:09
330515,3507622,148416425_77477829360,942460915_48201821382_330022,step_3,2017-04-13 22:31:59
330516,3507622,148416425_77477829360,942460915_48201821382_330022,step_2,2017-04-13 22:31:10
330517,3507622,148416425_77477829360,942460915_48201821382_330022,step_1,2017-04-13 22:31:06
330518,3507622,148416425_77477829360,942460915_48201821382_330022,start,2017-04-13 22:31:01


In [26]:
# 4) How many times does each client reach the "confirm" step?
# Cross-tabulate client_id (rows) against process_step (columns);
# each cell is the number of logged events for that client and step.
df_web_data_crosstab = pd.crosstab(
    index=df_web_data['client_id'],
    columns=df_web_data['process_step'],
)

# Show the 15 clients with the most "confirm" events
confirm_ranked = df_web_data_crosstab.sort_values(by='confirm', ascending=False)
confirm_ranked.head(15)

process_step,confirm,start,step_1,step_2,step_3
client_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
7347761,24,5,5,1,1
8167818,23,9,3,1,1
4127054,20,3,1,1,1
8374531,20,10,3,1,1
244312,19,1,1,1,1
465007,19,21,12,11,9
8047921,17,6,5,5,7
1637369,17,0,0,0,0
8290360,17,1,1,0,0
5836024,17,2,2,1,1


In [None]:
# 4)How many times this client go to confirm prosess 