Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    https://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.


# Important
This content is intended for educational and informational purposes only.

## Conversion Blockers Analysis
<br>
In this analysis we will look into the main user characteristics captured by Google Analytics that can affect website UX, and how they impact the e-commerce transaction rate.
<br>
**Key notes / assumptions**
<br>
For the following analysis, we will call a specific data property (e.g. Browser version) a FEATURE, and each value of a feature (e.g. <i>Chrome V10.1</i>) a LABEL.


## Step 1: Setup 

### Install all dependencies and authorize BigQuery access

---



In [None]:
# Import all necessary libs
from google.colab import auth
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from IPython.display import display, HTML

# Authenticate the user so the notebook can query datasets in Google BigQuery
auth.authenticate_user()
# Render matplotlib charts inline in the notebook output
%matplotlib inline


### Define analysis parameters

In [None]:
#@title Define the data source in BigQuery:
# Location of the Google Analytics sessions table(s) to analyze
project_id = 'bigquery-public-data' #@param
dataset_name = 'google_analytics_sample' #@param
table_name = 'ga_sessions_*'#@param
# Analysis date range; the dashes are stripped before the dates are used in the query
start_date = '2014-10-01'#@param {type:"date"}
end_date = '2019-12-12'#@param{type:"date"}
# Project passed to read_gbq as project_id when the query is run
billing_project_id = 'my-project' #@param

## Step 2: Create analysis building blocks

In the following code cells, we will create functions that allow us to easily run the analysis multiple times, once for each feature.

### Create query builder function based on template

In [None]:
# Dynamic content used to fill in the query template placeholders.
# Dates are converted from 'YYYY-MM-DD' to 'YYYYMMDD' by dropping the dashes.
dc = {
  'project_id': project_id,
  'dataset_name': dataset_name,
  'table_name': table_name,
  'start_date': start_date.replace('-',''),
  'end_date': end_date.replace('-',''),
}

#render final query function
def render_final_query(dc, display = False):
  """Render the BigQuery Standard SQL for the analysis of one feature.

  The query template is filled in with str.format(**dc), so dc must provide
  every placeholder used by the template: 'feature', 'project_id',
  'dataset_name', 'table_name', 'start_date' and 'end_date'. The values are
  interpolated verbatim into the SQL text (no escaping), so they should only
  come from trusted notebook parameters.

  The rendered query returns one row per feature value that accounts for more
  than 1% of all sessions, ordered by ascending transaction rate, with a
  'bellow_limit' flag set when the value's transaction rate is lower than 20%
  of the average of the per-value transaction rates.

  Args:
    dc: dict mapping placeholder names to the strings to interpolate.
      Dates are compared as string literals against the `date` column
      (presumably 'YYYYMMDD' - confirm against the table schema).
    display: when truthy, print the rendered SQL before returning it.
      Inside this function the name shadows IPython's display(); it is kept
      unchanged because callers pass it by keyword.

  Returns:
    The rendered SQL query string.

  Raises:
    KeyError: if dc lacks one of the placeholders used by the template.
  """
  # The ORDER BY clause is attached to the outermost SELECT: ordering applied
  # inside a CTE is not guaranteed to carry over to the final result.
  q1 = '''
  #fetch # of transactions, sessions and transaction rate for each feature value
  WITH t0 AS 
  (SELECT
    {feature} AS feature,
    SUM(IFNULL(sessions.totals.transactions, 0)) AS transactions,
    COUNT(sessions.visitStartTime) AS count_sessions,
    SUM(IFNULL(sessions.totals.transactions, 0))/COUNT(sessions.visitStartTime) AS transaction_rate
  FROM
  `{project_id}.{dataset_name}.{table_name}` as sessions,
    UNNEST(hits) AS hits
  WHERE
      hits.hitNumber = 1 AND
      date BETWEEN '{start_date}'
      AND '{end_date}'
  GROUP BY 1
  ),

  #calculate % of total sessions of each feature value and the average of the per-value transaction rates
  t1 AS
  (
    SELECT 
    *,
    SUM(count_sessions) OVER() AS total_sessions,
    SUM(transactions) OVER() AS total_transaction,
    AVG(transaction_rate) OVER() AS average_transaction_rate,
    count_sessions/SUM(count_sessions) OVER() AS sessions_percentage
    FROM t0
  )

  #limit results to only values that represent over 1% of all sessions
  #and, for remaining lines, evaluate if they are below 20% of the average transaction rate
  SELECT *,
  IF(transaction_rate < average_transaction_rate * 0.2, true, false) AS bellow_limit
  from t1
  WHERE sessions_percentage > 0.01
  ORDER BY transaction_rate
    '''.format(**dc)
  if display:
    print('Final BigQuery SQL:')
    print(q1)
  return q1

In [None]:
#run bigQuery query function
def run_big_query(q):
  """Run a Standard SQL query in BigQuery and return the result.

  The query is run with the module-level billing_project_id passed as the
  project_id argument of read_gbq.

  Args:
    q: SQL query string to execute.

  Returns:
    The DataFrame returned by pandas' read_gbq.
  """
  # The deprecated `verbose` keyword is intentionally not passed: it is no
  # longer part of the read_gbq signature in current pandas releases, where
  # supplying it raises TypeError.
  return pd.io.gbq.read_gbq(q, project_id=billing_project_id, dialect='standard')

### Create function to Display Query results in bar chart 


In [None]:
def plot_graph(df, title):
  """Plot the transaction rate of each feature value as a bar chart.

  Bars whose 'bellow_limit' value is truthy are colored red, all others blue.

  Args:
    df: DataFrame with 'feature', 'transaction_rate' and 'bellow_limit'
      columns.
    title: title of the chart.
  """
  # One color per row, in row order. Iterating the column directly avoids the
  # label-based lookup per row, which yields a Series (and breaks the truth
  # test) when the index contains duplicate labels.
  colors = ['r' if flagged else 'b' for flagged in df['bellow_limit']]

  # Specify this list of colors as the `color` option to `plot`.
  # yticks=[] leaves the y axis without tick marks.
  df.plot(x='feature', y='transaction_rate', kind='bar', stacked=False, color = colors, title = title, yticks=[])

## Step 3: Run entire pipeline for each feature and plot results


In [None]:
# Features to analyze, as (chart title, BigQuery SQL expression) tuples.
# The SQL expression is what gets substituted for the {feature} placeholder
# of the query template.
# Uncomment a line to enable that analysis.
features = [
("Operating System","CONCAT(sessions.device.operatingSystem, ' ', sessions.device.operatingSystemVersion)"),
("Browser","CONCAT( sessions.device.browser, ' ', sessions.device.browserversion)"),
("Language","sessions.device.language"),
#("Device Type","sessions.device.deviceCategory"),
#("Country","sessions.geoNetwork.country"),
#("Region","sessions.geoNetwork.region"),
#("City","sessions.geoNetwork.city"),
#("Landing Page","CONCAT(hits.page.hostname, hits.page.pagePath)"),
#("Screen Pixels (e5)","IF(ARRAY_LENGTH(SPLIT(sessions.device.screenResolution,'x')) = 2,ROUND(CAST(SPLIT(sessions.device.screenResolution,'x')[OFFSET(0)] AS INT64) * CAST(SPLIT(sessions.device.screenResolution,'x')[OFFSET(1)] AS INT64)/100000), Null)")
]

In [None]:
# Run the whole pipeline once per enabled (title, SQL expression) feature
for item in features:
  title, expression = item

  # plug this feature's SQL expression into the query template values
  dc['feature'] = expression

  # build the SQL text (and echo it to the cell output)
  q = render_final_query(dc, display=True)

  # NOTE: the next statement executes the query in BigQuery,
  # which might result in BigQuery costs
  df = run_big_query(q)

  # show the raw result table
  print("Results for " + title)
  display(df)
  print(" ")

  # draw the bar chart for this feature
  plot_graph(df, title)