# Basic product analysis

In this notebook, we briefly demonstrate how you can easily do basic product analysis on your data.

## Getting started

In [None]:
# import Bach
from bach_open_taxonomy import ObjectivFrame 
from bach import display_sql_as_markdown

In [None]:
# load the Objectiv data from the SQL database into an ObjectivFrame
connection_settings = {
    'db_url': 'postgresql://@localhost:5433/objectiv',
    'table_name': 'data',
    'time_aggregation': 'YYYY-MM-DD',
    'start_date': '2022-03-10',
}
of = ObjectivFrame.from_objectiv_data(**connection_settings)

In [None]:
# enrich the frame with values taken from the global contexts and the location stack
of['application'] = of.global_contexts.gc.get_from_context_with_type_series(
    type='ApplicationContext', key='id')
of['root_location'] = of.location_stack.ls.get_from_context_with_type_series(
    type='RootLocationContext', key='id')
of['referrer'] = of.global_contexts.gc.get_from_context_with_type_series(
    type='HttpContext', key='referrer')
of['feature_nice_name'] = of.location_stack.ls.nice_name

# one utm_* column per MarketingContext key, created in this order
for marketing_key in ('source', 'medium', 'campaign', 'content', 'term'):
    of['utm_' + marketing_key] = of.global_contexts.gc.get_from_context_with_type_series(
        type='MarketingContext', key=marketing_key)

In [None]:
# peek at the data, ordered by session_id from high to low
events_by_session = of.sort_values('session_id', ascending=False)
events_by_session.head()

In [None]:
# explore the data: describe() output with include='all'
data_summary = of.describe(include='all')
data_summary.head()

## How many users do we have?

In [None]:
# model hub: unique users, monthly
# Pass the monthly format explicitly: without it the call is identical to the
# daily cell and falls back to the frame's own time_aggregation ('YYYY-MM-DD').
# The result gets its own name so that `users`, which the conversion-rate cell
# relies on, only ever refers to the daily series.
users_monthly = of.model_hub.aggregate.unique_users(time_aggregation='YYYY-MM')
users_monthly.to_frame().sort_index(ascending=False).head(20)

In [None]:
# model hub: unique users per day (uses the frame's own time aggregation)
users = of.model_hub.aggregate.unique_users()
daily_users_table = users.to_frame()
daily_users_table.sort_index(ascending=False).head(10)

In [None]:
# TODO: switch to the model hub once it supports grouping
# unique users by main product area (called RootLocation)
root_grouping = ['application', 'root_location']
users_root = of.groupby(root_grouping).agg({'user_id': 'nunique'})
users_root.sort_values('user_id_nunique', ascending=False).head(10)

## How much time do users spend?

In [None]:
# model hub: session duration, aggregated per month
duration = of.model_hub.aggregate.session_duration(time_aggregation='YYYY-MM')
monthly_duration_table = duration.to_frame()
monthly_duration_table.sort_index(ascending=False).head()

In [None]:
# model hub: session duration per day (uses the frame's own time aggregation)
duration = of.model_hub.aggregate.session_duration()
daily_duration_table = duration.to_frame()
daily_duration_table.sort_index(ascending=False).head()

In [None]:
# monthly session duration, restricted to the 'home' RootLocation
home_location_data = of[of.root_location == 'home']
home_duration = home_location_data.model_hub.aggregate.session_duration(time_aggregation='YYYY-MM')
home_duration.head()

In [None]:
# monthly session duration, restricted to the 'blog' RootLocation
blog_location_data = of[of.root_location == 'blog']
blog_duration = blog_location_data.model_hub.aggregate.session_duration(time_aggregation='YYYY-MM')
blog_duration.head()

In [None]:
# monthly session duration for the whole docs application
docs_application_data = of[of.application == 'objectiv-docs']
docs_duration = docs_application_data.model_hub.aggregate.session_duration(time_aggregation='YYYY-MM')
docs_duration.head()

In [None]:
# how is the time spent distributed over sessions?

# per session: first and last moment, and the difference between them
session_duration = of.groupby(['session_id']).agg({'moment': ['min', 'max']})
session_start = session_duration['moment_min']
session_end = session_duration['moment_max']
session_duration['duration'] = session_end - session_start

# materialize the df
session_duration.materialize(inplace=True)

# quartiles to report
quantiles = [0.25, 0.50, 0.75]

# distribution of time spent
duration_quantiles = session_duration['duration'].quantile(q=quantiles)
duration_quantiles.to_frame().head()

## What are the top user interactions?

In [None]:
# focus on user actions: keep PressEvent and MediaStartEvent only
is_press = of.event_type == 'PressEvent'
is_media_start = of.event_type == 'MediaStartEvent'
users_feature = of[is_press | is_media_start]

# unique users per application, feature and event type
feature_grouping = ['application', 'feature_nice_name', 'event_type']
users_feature = users_feature.groupby(feature_grouping).agg({'user_id': 'nunique'})
users_feature.sort_values('user_id_nunique', ascending=False).head()

## What do users do in each of the main product areas?

In [None]:
# TODO: replace with the model hub once it supports grouping

# step 1: events on the homepage RootLocation
home_location_events = of[of.root_location == 'home']

# step 2: website only, which leaves out the docs
home_website_events = home_location_events[home_location_events.application == 'objectiv-website']

# step 3: user actions only (PressEvent and MediaStartEvent)
home_is_press = home_website_events.event_type == 'PressEvent'
home_is_media_start = home_website_events.event_type == 'MediaStartEvent'
home_actions = home_website_events[home_is_press | home_is_media_start]

# unique home users per feature and event type
home_users = home_actions.groupby(['feature_nice_name', 'event_type']).agg({'user_id': 'nunique'})
home_users.sort_values('user_id_nunique', ascending=False).head()

## What do users do most in the docs?

In [None]:
# step 1: events from the docs application (filter is on `application`)
docs_events = of[of.application == 'objectiv-docs']

# step 2: user actions only (PressEvent and MediaStartEvent)
docs_is_press = docs_events.event_type == 'PressEvent'
docs_is_media_start = docs_events.event_type == 'MediaStartEvent'
docs_actions = docs_events[docs_is_press | docs_is_media_start]

# unique docs users per feature and event type
docs_users = docs_actions.groupby(['feature_nice_name', 'event_type']).agg({'user_id': 'nunique'})
docs_users.sort_values('user_id_nunique', ascending=False).head()

## Where are users coming from?

In [None]:
# unique users per referrer, largest first
referrer_grouping = ['referrer']
referrer_users = of.groupby(referrer_grouping).agg({'user_id': 'nunique'})
referrer_users.sort_values('user_id_nunique', ascending=False).head()

## How are marketing campaigns performing?

In [None]:
# unique users per combination of marketing campaign (utm) values
utm_columns = ['utm_source', 'utm_medium', 'utm_campaign', 'utm_content', 'utm_term']
campaign_users = of.groupby(utm_columns).agg({'user_id': 'nunique'})

# turn the group keys back into columns and drop rows without a utm_source
campaign_users = campaign_users.reset_index()
campaign_users = campaign_users.dropna(axis=0, how='any', subset='utm_source')

campaign_users.sort_values('utm_source', ascending=True).head(20)

In [None]:
# focus on user actions: keep PressEvent and MediaStartEvent only
campaign_is_press = of.event_type == 'PressEvent'
campaign_is_media_start = of.event_type == 'MediaStartEvent'
users_feature_campaign = of[campaign_is_press | campaign_is_media_start]

# unique users per feature, split by campaign source & term
campaign_feature_grouping = ['utm_source', 'utm_term', 'feature_nice_name', 'event_type']
users_feature_campaign = users_feature_campaign.groupby(campaign_feature_grouping).agg({'user_id': 'nunique'})

# turn the group keys back into columns and drop rows without a utm_source
users_feature_campaign = users_feature_campaign.reset_index()
users_feature_campaign = users_feature_campaign.dropna(axis=0, how='any', subset='utm_source')

users_feature_campaign.sort_values(
    ['utm_source', 'utm_term', 'user_id_nunique'],
    ascending=[True, True, False],
).head(50)

## How are conversions doing?

In [None]:
# define which events to use as conversion events:
# a PressEvent on any of three GitHub links counts as 'github_press'
objectiv_on_github_link = of.location_stack.json[{'id': 'objectiv-on-github', '_type': 'LinkContext'}:]
github_link = of.location_stack.json[{'id': 'github', '_type': 'LinkContext'}:]
star_us_link = of.location_stack.json[{'id': 'star-us-on-github', '_type': 'LinkContext'}:]

# take the first link slice; where it is missing fall back to the second, then the third
github_location_stack = objectiv_on_github_link.fillna(github_link).fillna(star_us_link)

of.add_conversion_event(location_stack=github_location_stack,
                        event_type='PressEvent',
                        name='github_press')

In [None]:
# model hub: unique users among the 'github_press' conversion events
is_github_press = of.model_hub.map.is_conversion_event('github_press')
conversion_events = of.model_hub.filter(is_github_press)
conversions = conversion_events.model_hub.aggregate.unique_users()

conversions.to_frame().sort_index(ascending=False).head(10)

In [None]:
# use earlier model hub outputs (`users` and `conversions`) to calculate conversion rate
# both series are named `unique_users`, so the suffixes tell total and converted apart
conversion_rate = users.to_frame().merge(conversions.to_frame(), how='inner', on='time_aggregation', suffixes=['_total', '_converted'])
conversion_rate['conversion_rate'] = conversion_rate['unique_users_converted'] / conversion_rate['unique_users_total']

# drop() is not in-place here: assign its result, otherwise the two helper
# columns are never removed and the statement has no effect
conversion_rate = conversion_rate.drop(columns=['unique_users_converted', 'unique_users_total'])

conversion_rate.sort_index(ascending=False).head(10)

In [None]:
# from where do users convert most?
# unique users per application, feature and event type, over conversion events only
conversion_event_filter = of.model_hub.map.is_conversion_event('github_press')
conversion_location_grouping = ['application', 'feature_nice_name', 'event_type']
conversion_locations = of.model_hub.filter(conversion_event_filter)
conversion_locations = conversion_locations.groupby(conversion_location_grouping).agg({'user_id': 'nunique'})

conversion_locations.sort_values('user_id_nunique', ascending=False).head()

In [None]:
# what are users doing before they convert?

# select sessions with a conversion
converted_users = of.model_hub.filter(of.model_hub.map.conversions_counter(name='github_press') >= 1)

# from those, select hits where number of conversions was still 0
# (uses the `model_hub` accessor, like every other cell, instead of the `mh` shorthand)
converted_users = converted_users.model_hub.filter(
    converted_users.model_hub.map.conversions_in_time('github_press') == 0)

# select PressEvent and MediaStartEvent, to focus on user actions
converted_users = converted_users[(converted_users.event_type == 'PressEvent') |
                                  (converted_users.event_type == 'MediaStartEvent')]

# unique users per application, feature and event type, largest first
converted_users.groupby(['application', 'feature_nice_name', 'event_type']).agg({'user_id': 'nunique'})\
    .sort_values('user_id_nunique', ascending=False).head(10)

In [None]:
# how much time do users spend before they convert?
# monthly session duration over the pre-conversion hits in `converted_users`
pre_conversion_duration = converted_users.model_hub.aggregate.session_duration(time_aggregation='YYYY-MM')
pre_conversion_duration.to_frame().head()

## Get the SQL for any analysis

In [None]:
# Show the SQL behind the `conversions` series computed earlier, rendered as markdown.
# This is just one analysis as an example; it works for anything you do with Objectiv Bach.
display_sql_as_markdown(conversions)