<a href="https://colab.research.google.com/github/ganiosh92/DA-ecommerce-clickstream-analytics/blob/main/clickstream_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Mount ***Google Drive***

In [1]:
# Mount Google Drive inside the Colab runtime at /content/drive.
# The next cell changes into a folder under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Print the current working directory, then switch into the dataset folder on the
# mounted Drive so that the relative CSV file names used later resolve against it.
!pwd
%cd /content/drive/My Drive/Sample-Data-From-Web/ecommerce_customer_behavior

/content
/content/drive/My Drive/Sample-Data-From-Web/ecommerce_customer_behavior


# Import all ***Libraries***

In [3]:
import pandas as pd
import numpy as np
import json
from pandas import json_normalize

In [4]:
# Load the clickstream event log from the current working directory.
# The customer, transactions and product loads are left disabled; none of the
# cells visible in this part of the notebook reads those tables.
#customer = pd.read_csv('customer.csv')
clickstream = pd.read_csv('click_stream.csv')
#transactions = pd.read_csv('transactions.csv')
#product = pd.read_csv('product.csv', on_bad_lines='skip')

# ***Traffic Source***

In [None]:
def _traffic_share(source_name):
    """Return the percentage (rounded to 2 d.p.) of clickstream rows whose
    traffic_source equals ``source_name``."""
    matching_rows = int((clickstream['traffic_source'] == source_name).sum())
    return round((matching_rows / len(clickstream)) * 100, 2)


mobile_traffic = _traffic_share('MOBILE')
web_traffic = _traffic_share('WEB')

print(f"Percentage of Mobile Traffic = {mobile_traffic}%")
print(f"Percentage of Web Traffic = {web_traffic}%")

Percentage of Mobile Traffic = 89.98%
Percentage of Web Traffic = 10.02%


# Preprocessing ***Clickstream*** Data

In [5]:
#PREPROCESSING
# Parse the raw event_time values into datetimes once, then derive the
# day / month / year columns from the parsed series.
event_timestamps = pd.to_datetime(clickstream['event_time'])
clickstream['event_time'] = event_timestamps
for part in ('day', 'month', 'year'):
    clickstream['event_' + part] = getattr(event_timestamps.dt, part)

# The same session_id shows up on several different dates, which makes
# per-session analysis ambiguous. Appending the event date (YYYY-MM-DD) to the
# session id gives one identifier per (session, day).
session_date = event_timestamps.dt.strftime('%Y-%m-%d')
clickstream['unique_session_id'] = clickstream['session_id'] + session_date

clickstream.head(5)

Unnamed: 0,session_id,event_name,event_time,event_id,traffic_source,event_metadata,event_day,event_month,event_year,unique_session_id
0,fb0abf9e-fd1a-44dd-b5c0-2834d5a4b81c,HOMEPAGE,2019-09-06 15:54:32.821085+00:00,9c4388c4-c95b-4678-b5ca-e9cbc0734109,MOBILE,,6,9,2019,fb0abf9e-fd1a-44dd-b5c0-2834d5a4b81c2019-09-06
1,fb0abf9e-fd1a-44dd-b5c0-2834d5a4b81c,SCROLL,2019-09-06 16:03:57.821085+00:00,4690e1f5-3f99-42d3-84a5-22c4c4d8500a,MOBILE,,6,9,2019,fb0abf9e-fd1a-44dd-b5c0-2834d5a4b81c2019-09-06
2,7d440441-e67a-4d36-b324-80ffd636d166,HOMEPAGE,2019-09-01 12:05:10.322763+00:00,88aeaeb5-ec98-4859-852c-8abb483faf31,MOBILE,,1,9,2019,7d440441-e67a-4d36-b324-80ffd636d1662019-09-01
3,7d440441-e67a-4d36-b324-80ffd636d166,ADD_TO_CART,2019-09-01 12:06:33.322763+00:00,934e306e-ecc6-472f-9ccb-12c8536910a2,MOBILE,"{'product_id': 15315, 'quantity': 4, 'item_pri...",1,9,2019,7d440441-e67a-4d36-b324-80ffd636d1662019-09-01
4,7d440441-e67a-4d36-b324-80ffd636d166,BOOKING,2019-09-01 12:15:29.425431+00:00,9f4767a1-40fa-4c9c-9524-dfad18634d56,MOBILE,{'payment_status': 'Success'},1,9,2019,7d440441-e67a-4d36-b324-80ffd636d1662019-09-01


# **Bounce Rate**
Bounce rate is the percentage of visitors who arrive at a digital storefront and “bounce,” leaving before ever progressing to a second page. It is calculated by dividing the number of one-page visits by the total number of visits to the site.

In [6]:
#Bounce rate in 2019
#Here we define bounce rate as when user arrives at the HOMEPAGE or PROMO_PAGE but doesn't perform the following actions:
#1. CLICK i.e clicks on elements on the webpage
#2. SCROLL i.e. scrolls the webpage
#3. BOOKING i.e. pays or makes a transaction
#4. SEARCH i.e. searches for an item
#5. ADD_TO_CART i.e adds items to cart
#6. ITEM_DETAIL i.e. views an item
#7. ADD_PROMO i.e. adds promo coupon etc
def monthly_bounce_rate(events, year, month, landing_events=('HOMEPAGE', 'PROMO_PAGE')):
    """Return the bounce rate for one calendar month, in percent (2 d.p.).

    A session counts as bounced when it contains exactly one event and that
    event is one of ``landing_events``.

    Parameters
    ----------
    events : pandas.DataFrame
        Must provide the columns 'event_year', 'event_month',
        'unique_session_id' and 'event_name'.
    year, month : int
        Calendar year and month (1-12) to evaluate.
    landing_events : iterable of str
        Event names that qualify a single-event session as a bounce.

    Returns
    -------
    float or None
        The percentage of bounced sessions, or None when the month contains
        no sessions (avoids dividing by zero).
    """
    in_month = events[(events['event_year'] == year) & (events['event_month'] == month)]
    if in_month.empty:
        return None

    # One row per session: how many events it has and what its first event is.
    # Only single-event sessions are inspected, so row order does not matter
    # and no sort by event_time is needed.
    per_session = in_month.groupby('unique_session_id')['event_name'].agg(['size', 'first'])
    if per_session.empty:
        return None

    bounced = (per_session['size'] == 1) & per_session['first'].isin(list(landing_events))
    return round((int(bounced.sum()) / len(per_session)) * 100, 2)


for i in range(1, 13):
    bounce_rate = monthly_bounce_rate(clickstream, 2019, i)
    if bounce_rate is None:
        print(str(i) + ": " + "Bounce Rate = n/a (no sessions)")
    else:
        print(str(i) + ": " + "Bounce Rate = " + str(bounce_rate) + "%")

1: Bounce Rate = 17.36%
2: Bounce Rate = 17.71%
3: Bounce Rate = 17.6%
4: Bounce Rate = 17.37%
5: Bounce Rate = 17.32%
6: Bounce Rate = 16.88%
7: Bounce Rate = 17.13%
8: Bounce Rate = 17.43%
9: Bounce Rate = 17.43%
10: Bounce Rate = 17.1%
11: Bounce Rate = 17.45%
12: Bounce Rate = 17.41%


# **Conversion Rate**

In [18]:
#Conversion rate in 2019
#Here we define a user to be converted if they made a purchase
def monthly_conversion_rate(events, year, month, conversion_event='BOOKING'):
    """Return the conversion rate for one calendar month, in percent (2 d.p.).

    A session counts as converted when at least one of its events is named
    ``conversion_event``.

    Parameters
    ----------
    events : pandas.DataFrame
        Must provide the columns 'event_year', 'event_month',
        'unique_session_id' and 'event_name'.
    year, month : int
        Calendar year and month (1-12) to evaluate.
    conversion_event : str
        Event name that marks a session as converted.

    Returns
    -------
    float or None
        The percentage of converted sessions, or None when the month contains
        no sessions (avoids dividing by zero).
    """
    in_month = events[(events['event_year'] == year) & (events['event_month'] == month)]

    total_sessions = int(in_month['unique_session_id'].nunique())
    if total_sessions == 0:
        return None

    # NOTE(review): every BOOKING event counts, whatever its payment_status in
    # event_metadata - confirm that unsuccessful payments should be included.
    converting_rows = in_month[in_month['event_name'] == conversion_event]
    converted_sessions = int(converting_rows['unique_session_id'].nunique())

    return round((converted_sessions / total_sessions) * 100, 2)


for i in range(1, 13):
    conversion_rate = monthly_conversion_rate(clickstream, 2019, i)
    if conversion_rate is None:
        print(str(i) + ": " + "Conversion Rate = n/a (no sessions)")
    else:
        print(str(i) + ": " + "Conversion Rate = " + str(conversion_rate) + "%")

1: Conversion Rate = 17.85%
2: Conversion Rate = 18.03%
3: Conversion Rate = 18.07%
4: Conversion Rate = 18.14%
5: Conversion Rate = 17.85%
6: Conversion Rate = 17.59%
7: Conversion Rate = 18.54%
8: Conversion Rate = 17.89%
9: Conversion Rate = 18.07%
10: Conversion Rate = 17.75%
11: Conversion Rate = 18.1%
12: Conversion Rate = 17.64%


# Testing

In [None]:
#flatten the event_metadata column

def _parse_event_metadata(raw):
    """Convert one event_metadata cell into a dict.

    The values displayed earlier in the notebook are Python-literal dict
    strings (single-quoted keys), which ``json.loads`` rejects, so
    ``ast.literal_eval`` is tried first and ``json.loads`` is the fallback for
    genuinely JSON-formatted text. Values that are already dicts pass through.
    """
    import ast  # local import keeps this cell runnable on its own

    if isinstance(raw, dict):
        return raw
    try:
        return ast.literal_eval(raw)
    except (ValueError, SyntaxError):
        return json.loads(raw)


#drop rows with na values
clickstream_no_na = clickstream.dropna().copy()

#parse each event_metadata string into a dict
clickstream_no_na['test_col'] = clickstream_no_na['event_metadata'].apply(_parse_event_metadata)

#flatten the parsed dicts into one column per key
metadata_flat = json_normalize(clickstream_no_na['test_col'].tolist())

#dropna() keeps the original row labels, whereas the flattened frame built from
#a plain list gets a fresh 0..n-1 index; concat(axis=1) aligns on the index, so
#the labels must be copied over first or the rows will be misaligned
metadata_flat.index = clickstream_no_na.index

#attach the flattened columns to the filtered dataset
clickstream_no_na_flattened = pd.concat([clickstream_no_na, metadata_flat], axis=1)