In [None]:
import os
from google.cloud import bigquery

os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = r"C:\Users\ga-key.json"

client = bigquery.Client()

The dataset 'bigquery-public-data.google_analytics_sample.ga_sessions_*' from **Google Analytics Sample** as a google cloud public data has the following columns:

- visitorId
- visitNumber
- visitId
- visitStartTime
- date
- totals
- trafficSource
- device
- geoNetwork
- customDimensions
- hits
- fullVisitorId
- userId
- socialEngagement


In [6]:
# Extract count of visits in each channel grouping in 2017 July
query = """
SELECT
  channelGrouping,
  COUNT(visitId) AS number_of_visits
FROM `bigquery-public-data.google_analytics_sample.ga_sessions_*`
WHERE _TABLE_SUFFIX BETWEEN '20170701' AND '20170731'
GROUP BY channelGrouping
HAVING number_of_visits > 0
ORDER BY number_of_visits DESC;
"""

# Execute the query and store the results in a Pandas DataFrame
df = client.query(query).to_dataframe()

# Display the first few rows
df.head()

Unnamed: 0,channelGrouping,number_of_visits
0,Organic Search,37655
1,Direct,12306
2,Referral,9518
3,Social,7749
4,Paid Search,2105


# Data Structure

In [7]:
import pandas as pd
import numpy as np

In [8]:
# Extract count of visits in each channel grouping from a sample date
query = """
SELECT *
FROM `bigquery-public-data.google_analytics_sample.ga_sessions_20170715`;
"""

# Execute the query and store the results in a Pandas DataFrame
ex = client.query(query).to_dataframe()

# Display the first few rows
ex.loc[ex['fullVisitorId'] == '5951590635558066441']

Unnamed: 0,visitorId,visitNumber,visitId,visitStartTime,date,totals,trafficSource,device,geoNetwork,customDimensions,hits,fullVisitorId,userId,clientId,channelGrouping,socialEngagementType
5,,3,1500162035,1500162035,20170715,"{'visits': 1, 'hits': 16, 'pageviews': 11, 'ti...","{'referralPath': '/offer/2145', 'campaign': '(...","{'browser': 'Chrome', 'browserVersion': 'not a...","{'continent': 'Americas', 'subContinent': 'Nor...","[{'index': 4, 'value': 'North America'}]","[{'hitNumber': 1, 'time': 0, 'hour': 16, 'minu...",5951590635558066441,,,Referral,Not Socially Engaged


In [9]:
for x in ex.loc[0,'totals'].keys():
    print(x)

visits
hits
pageviews
timeOnSite
bounces
transactions
transactionRevenue
newVisits
screenviews
uniqueScreenviews
timeOnScreen
totalTransactionRevenue
sessionQualityDim


In [10]:
ex.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1721 entries, 0 to 1720
Data columns (total 16 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   visitorId             0 non-null      Int64 
 1   visitNumber           1721 non-null   Int64 
 2   visitId               1721 non-null   Int64 
 3   visitStartTime        1721 non-null   Int64 
 4   date                  1721 non-null   object
 5   totals                1721 non-null   object
 6   trafficSource         1721 non-null   object
 7   device                1721 non-null   object
 8   geoNetwork            1721 non-null   object
 9   customDimensions      1721 non-null   object
 10  hits                  1721 non-null   object
 11  fullVisitorId         1721 non-null   object
 12  userId                0 non-null      object
 13  clientId              0 non-null      object
 14  channelGrouping       1721 non-null   object
 15  socialEngagementType  1721 non-null   

In [11]:
ex.loc[2, 'totals']

{'visits': 1,
 'hits': 15,
 'pageviews': 14,
 'timeOnSite': 189,
 'bounces': None,
 'transactions': None,
 'transactionRevenue': None,
 'newVisits': 1,
 'screenviews': None,
 'uniqueScreenviews': None,
 'timeOnScreen': None,
 'totalTransactionRevenue': None,
 'sessionQualityDim': 13}

In [12]:
# Display all the hits during the session
for id in [2]:
    sub_df = pd.DataFrame(pd.DataFrame(ex.loc[id, 'hits'].tolist()))
    columns = sub_df.columns
    print(sub_df[columns[:15]].info())
    print(sub_df[columns[15:]].info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15 entries, 0 to 14
Data columns (total 15 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   hitNumber      15 non-null     int64 
 1   time           15 non-null     int64 
 2   hour           15 non-null     int64 
 3   minute         15 non-null     int64 
 4   isSecure       0 non-null      object
 5   isInteraction  15 non-null     bool  
 6   isEntrance     1 non-null      object
 7   isExit         1 non-null      object
 8   referer        5 non-null      object
 9   page           15 non-null     object
 10  transaction    10 non-null     object
 11  item           10 non-null     object
 12  contentInfo    0 non-null      object
 13  appInfo        15 non-null     object
 14  exceptionInfo  15 non-null     object
dtypes: bool(1), int64(4), object(10)
memory usage: 1.8+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15 entries, 0 to 14
Data columns (total 18 columns)

In [13]:
# Display the first few rows of the new DataFrame
sub_df.iloc[:,9:].tail(2)

Unnamed: 0,page,transaction,item,contentInfo,appInfo,exceptionInfo,eventInfo,product,promotion,promotionActionInfo,...,customVariables,customDimensions,customMetrics,type,social,latencyTracking,sourcePropertyInfo,contentGroup,dataSource,publisher_infos
13,"{'pagePath': '/google+redesign/apparel', 'host...","{'transactionId': None, 'transactionRevenue': ...","{'transactionId': None, 'productName': None, '...",,"{'name': None, 'version': None, 'id': None, 'i...","{'description': None, 'isFatal': True, 'except...",,"[{'productSKU': 'GGOEAHPA004110', 'v2ProductNa...",[],,...,[],[],[],PAGE,"{'socialInteractionNetwork': None, 'socialInte...",,,"{'contentGroup1': '(not set)', 'contentGroup2'...",web,[]
14,"{'pagePath': '/home', 'hostname': 'shop.google...",,,,"{'name': None, 'version': None, 'id': None, 'i...","{'description': None, 'isFatal': True, 'except...",,[],"[{'promoId': 'Apparel Row 1', 'promoName': 'Ap...","{'promoIsView': True, 'promoIsClick': None}",...,[],[],[],PAGE,"{'socialInteractionNetwork': None, 'socialInte...",,,"{'contentGroup1': '(not set)', 'contentGroup2'...",web,[]


In [14]:
# Look at the page columns
page_df = pd.DataFrame(sub_df['page'].tolist())
page_df

Unnamed: 0,pagePath,hostname,pageTitle,searchKeyword,searchCategory,pagePathLevel1,pagePathLevel2,pagePathLevel3,pagePathLevel4
0,/google+redesign/shop+by+brand/youtube,shop.googlemerchandisestore.com,YouTube | Shop by Brand | Google Merchandise S...,,,/google+redesign/,/shop+by+brand/,/youtube,
1,/google+redesign/apparel,shop.googlemerchandisestore.com,Apparel | Google Merchandise Store,,,/google+redesign/,/apparel,,
2,/google+redesign/apparel,shop.googlemerchandisestore.com,Apparel | Google Merchandise Store,,,/google+redesign/,/apparel,,
3,/google+redesign/apparel/quickview,shop.googlemerchandisestore.com,Apparel | Google Merchandise Store,,,/google+redesign/,/apparel/,/quickview,
4,/google+redesign/apparel,shop.googlemerchandisestore.com,Apparel | Google Merchandise Store,,,/google+redesign/,/apparel,,
5,/google+redesign/apparel/mens,shop.googlemerchandisestore.com,Men's Apparel | Google Merchandise Store,,,/google+redesign/,/apparel/,/mens,
6,/google+redesign/shop+by+brand/youtube,shop.googlemerchandisestore.com,YouTube | Shop by Brand | Google Merchandise S...,,,/google+redesign/,/shop+by+brand/,/youtube,
7,/google+redesign/apparel,shop.googlemerchandisestore.com,Apparel | Google Merchandise Store,,,/google+redesign/,/apparel,,
8,/home,shop.googlemerchandisestore.com,Home,,,/home,,,
9,/google+redesign/shop+by+brand/youtube,shop.googlemerchandisestore.com,YouTube | Shop by Brand | Google Merchandise S...,,,/google+redesign/,/shop+by+brand/,/youtube,


Notice that time is the time of the hit since the entry of the website in milliseconds, where hour and minute represents the time of the hit interaction. Page contains all the page the visitor has hit.

In [15]:
ex['channelGrouping'].unique()

array(['Affiliates', 'Organic Search', 'Referral', 'Social', 'Direct',
       'Paid Search', 'Display'], dtype=object)

visitorId, userId, and clientId provide no useful information.

In [16]:
for col in ex.select_dtypes(object):
    try:
        ex[f'{col}_info'] = [x.keys() for x in ex[col]]
        print(ex[f'{col}_info'].value_counts())
        print("\n")
    except AttributeError:
        continue

totals_info
(visits, hits, pageviews, timeOnSite, bounces, transactions, transactionRevenue, newVisits, screenviews, uniqueScreenviews, timeOnScreen, totalTransactionRevenue, sessionQualityDim)    1721
Name: count, dtype: int64


trafficSource_info
(referralPath, campaign, source, medium, keyword, adContent, adwordsClickInfo, isTrueDirect, campaignCode)    1721
Name: count, dtype: int64


device_info
(browser, browserVersion, browserSize, operatingSystem, operatingSystemVersion, isMobile, mobileDeviceBranding, mobileDeviceModel, mobileInputSelector, mobileDeviceInfo, mobileDeviceMarketingName, flashVersion, javaEnabled, language, screenColors, screenResolution, deviceCategory)    1721
Name: count, dtype: int64


geoNetwork_info
(continent, subContinent, country, region, metro, city, cityId, networkDomain, latitude, longitude, networkLocation)    1721
Name: count, dtype: int64




In [17]:
pd.DataFrame(ex['totals'].tolist()).info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1721 entries, 0 to 1720
Data columns (total 13 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   visits                   1721 non-null   int64  
 1   hits                     1721 non-null   int64  
 2   pageviews                1721 non-null   int64  
 3   timeOnSite               752 non-null    float64
 4   bounces                  966 non-null    float64
 5   transactions             16 non-null     float64
 6   transactionRevenue       16 non-null     float64
 7   newVisits                1407 non-null   float64
 8   screenviews              0 non-null      object 
 9   uniqueScreenviews        0 non-null      object 
 10  timeOnScreen             0 non-null      object 
 11  totalTransactionRevenue  16 non-null     float64
 12  sessionQualityDim        1721 non-null   int64  
dtypes: float64(6), int64(4), object(3)
memory usage: 174.9+ KB


In [18]:
sub_df = pd.DataFrame(ex['trafficSource'].tolist())
sub_df.head(5)

Unnamed: 0,referralPath,campaign,source,medium,keyword,adContent,adwordsClickInfo,isTrueDirect,campaignCode
0,,Data Share Promo,Partners,affiliate,,,"{'campaignId': None, 'adGroupId': None, 'creat...",,
1,,(not set),google,organic,(not provided),,"{'campaignId': None, 'adGroupId': None, 'creat...",,
2,,(not set),google,organic,(not provided),,"{'campaignId': None, 'adGroupId': None, 'creat...",,
3,,(not set),google,organic,Youtube,,"{'campaignId': None, 'adGroupId': None, 'creat...",,
4,,(not set),google,organic,(not provided),,"{'campaignId': None, 'adGroupId': None, 'creat...",,


In [19]:
pd.DataFrame(ex['device'].tolist()).head()

Unnamed: 0,browser,browserVersion,browserSize,operatingSystem,operatingSystemVersion,isMobile,mobileDeviceBranding,mobileDeviceModel,mobileInputSelector,mobileDeviceInfo,mobileDeviceMarketingName,flashVersion,javaEnabled,language,screenColors,screenResolution,deviceCategory
0,Chrome,not available in demo dataset,not available in demo dataset,Android,not available in demo dataset,True,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,,not available in demo dataset,not available in demo dataset,not available in demo dataset,mobile
1,Chrome,not available in demo dataset,not available in demo dataset,Android,not available in demo dataset,True,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,,not available in demo dataset,not available in demo dataset,not available in demo dataset,mobile
2,Safari,not available in demo dataset,not available in demo dataset,iOS,not available in demo dataset,True,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,,not available in demo dataset,not available in demo dataset,not available in demo dataset,mobile
3,Opera Mini,not available in demo dataset,not available in demo dataset,(not set),not available in demo dataset,True,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,,not available in demo dataset,not available in demo dataset,not available in demo dataset,mobile
4,Chrome,not available in demo dataset,not available in demo dataset,Macintosh,not available in demo dataset,False,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,,not available in demo dataset,not available in demo dataset,not available in demo dataset,desktop


In [20]:
pd.DataFrame(ex['geoNetwork'].tolist()).head()

Unnamed: 0,continent,subContinent,country,region,metro,city,cityId,networkDomain,latitude,longitude,networkLocation
0,Europe,Eastern Europe,Czechia,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,iol.cz,not available in demo dataset,not available in demo dataset,not available in demo dataset
1,Americas,Northern America,United States,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,cox.net,not available in demo dataset,not available in demo dataset,not available in demo dataset
2,Americas,Northern America,Canada,Quebec,(not set),Montreal,not available in demo dataset,unknown.unknown,not available in demo dataset,not available in demo dataset,not available in demo dataset
3,Asia,Southern Asia,India,Karnataka,(not set),Bengaluru,not available in demo dataset,unknown.unknown,not available in demo dataset,not available in demo dataset,not available in demo dataset
4,Americas,Northern America,United States,California,San Francisco-Oakland-San Jose CA,Palo Alto,not available in demo dataset,(not set),not available in demo dataset,not available in demo dataset,not available in demo dataset


# Traffic Analysis and Funnel Analysis

Focus on visitors that has converted once or none from 2016-August to 2017-July

In [26]:
# Funnel analysis on the different pages clicked
hits_query = """
WITH hits_table AS (
    SELECT 
        fullVisitorId,
        visitNumber,
        MAX(h.time)/1000 AS secondsSpent, -- Total time spent on the site
        MIN(h.hour) AS hourOfDay, -- Hour the hit occurred
        SUM(CASE WHEN h.type = 'EVENT' THEN 1 ELSE 0 END) AS eventCount, -- Total number of events
        STRING_AGG(DISTINCT 
            CASE 
                WHEN CONTAINS_SUBSTR(h.page.pagePathLevel2, 'apparel') AND h.page.pageTitle IS NOT NULL 
                THEN SPLIT(h.page.pageTitle, '|')[SAFE_OFFSET(0)] 
            END, ', ') AS productsBrowsed
    FROM `bigquery-public-data.google_analytics_sample.ga_sessions_*`,
    UNNEST(hits) AS h
    WHERE _TABLE_SUFFIX BETWEEN '20160801' AND '20170801'
    GROUP BY fullVisitorId, visitNumber
)

SELECT
    g.fullVisitorId,
    g.date,
    g.visitNumber,
    FORMAT_DATE('%A', PARSE_DATE('%Y%m%d', g.date)) AS weekday,
    COALESCE(g.totals.transactions), -- Total number of transactions
    COALESCE(g.totals.transactionRevenue), -- Total spent
    g.totals.hits, -- Total number of hits
    g.totals.pageViews, -- Total number of page views
    g.trafficSource.source AS source,
    g.trafficSource.medium AS medium,
    g.geoNetwork.continent,
    g.geoNetwork.country,
    g.device.browser,
    g.device.operatingSystem,
    g.device.deviceCategory,
    h.secondsSpent,
    h.hourOfDay,
    h.eventCount,
    h.productsBrowsed
FROM `bigquery-public-data.google_analytics_sample.ga_sessions_*` AS g
JOIN hits_table AS h
    ON g.fullVisitorId = h.fullVisitorId AND g.visitNumber = h.visitNumber
WHERE g._TABLE_SUFFIX BETWEEN '20160801' AND '20170801'
ORDER BY g.fullVisitorId, h.visitNumber;
"""

In [27]:
# Execute the query and store the results in a Pandas DataFrame
hits_df = client.query(hits_query).to_dataframe()

# Display the first few rows
hits_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 903653 entries, 0 to 903652
Data columns (total 19 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   fullVisitorId       903653 non-null  object 
 1   date                903653 non-null  object 
 2   visitNumber         903653 non-null  Int64  
 3   weekday             903653 non-null  object 
 4   transactions        11552 non-null   Int64  
 5   transactionRevenue  11515 non-null   Int64  
 6   hits                903653 non-null  Int64  
 7   pageViews           903553 non-null  Int64  
 8   source              903653 non-null  object 
 9   medium              903653 non-null  object 
 10  continent           903653 non-null  object 
 11  country             903653 non-null  object 
 12  browser             903653 non-null  object 
 13  operatingSystem     903653 non-null  object 
 14  deviceCategory      903653 non-null  object 
 15  secondsSpent        903653 non-nul

In [28]:
# Turn 'date' to datetime object
hits_df['date'] = pd.to_datetime(hits_df['date'], format = "%Y%m%d")

# Turn 'fullVisitorId' to string
hits_df['fullVisitorId'] = hits_df['fullVisitorId'].astype(str)

In [29]:
# Save to CSV in a specific directory
hits_df.to_csv("agg_data/hits.csv", index=False)

In [24]:
# Save to Excel in a specific directory
hits_df.to_excel("agg_data/hits.xlsx", index=False)