## Purpose: This notebook visualizes trends and correlations in the bank's event-based data.
### Input: Two tables
* df: Full event table, Row = financial event, Columns = Information about the event, including the performing customer, date, event type, etc
* df_cust_info: Customer starting info table, Row = one customer, Columns = Information about the customer, including the age, gender, start date, and location
### Output: Interactive plots for data visualization
### Guidelines: For each plot, use the dropdowns to select the specific data you wish to visualize
### Assumptions: The EVENT_DATE_COL column holds datetime values. Plot #5 will yield more valuable insights with real data, because the current dataset was constructed so that each customer averages exactly 10 events/month.

Import Statements

In [1]:
import snowflake.connector
from jupyter_dash import JupyterDash
from dash import dcc
from dash import html
from dash.dependencies import Input, Output
import plotly.graph_objects as go
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import date, datetime, timedelta
import seaborn as sns
import os

Snowflake connection details

In [2]:
# Snowflake connection settings used by this notebook.
account = 'or10868.uae-north.azure'
username = 'insait'
# Secrets must not be stored in the notebook: the password is read from the
# SNOWFLAKE_PASSWORD environment variable (export it before starting Jupyter).
# When the variable is unset this is an empty string and any connection made
# with it is expected to be rejected.
password = os.environ.get('SNOWFLAKE_PASSWORD', '')
database = 'FAKEDB'
schema = 'EDA116'
warehouse = 'COMPUTE_WH'
role = 'ACCOUNTADMIN'

# Names of the two source tables, looked up inside `schema`.
table = 'event_logs_full_with_dep'

customer_info = 'customer_info'

Constants

In [3]:
# Column names of the event table. Set a name to "" when the table has no
# such column.
CUST_ID_COL = 'customer_id'
EVENT_ID_COL = 'event_id'
EVENT_TYPE_COL = 'event_type'
EVENT_DESC_COL = 'event_description'
EVENT_DATE_COL = 'event_date'

# Column names of the customer info table. Set a name to "" when the table
# has no such column.
START_D_COL = 'start_date'
AGE_COL = 'age'
GENDER_COL = 'gender'
LOCATION_COL = 'location'
SALARY_COL = 'salary'

Retrieve the table from snowflake and load it into a pandas dataframe

In [4]:
# Snowflake connection. Both tables are fetched inside try/finally blocks so
# the cursor and the connection are always released, even if a query fails.
conn = snowflake.connector.connect(
    user=username,
    password=password,
    account=account,
    warehouse=warehouse,
    database=database,
    schema=schema
)

# Pull data, want 1,000,000 random rows
limit_num = 1000000

try:
    # Create a cursor object
    cursor = conn.cursor()
    try:
        # Event log rows (random sample of at most `limit_num` rows)
        cursor.execute(f"SELECT * FROM {schema}.{table} order by RANDOM(42) LIMIT {limit_num}")
        event_rows = cursor.fetchall()

        # Customer info rows
        cursor.execute(f"SELECT * FROM {schema}.{customer_info}")
        data = cursor.fetchall()
    finally:
        cursor.close()
finally:
    conn.close()

# CLIENT TO DO: order the columns list properly to fit table structure
df = pd.DataFrame(data=event_rows, columns=[CUST_ID_COL, EVENT_TYPE_COL, EVENT_ID_COL, EVENT_DATE_COL,
                                            EVENT_DESC_COL])

# The `.dt` accessors below and the date filters in the callbacks need a real
# datetime column; this is a no-op when the column is already datetime.
df[EVENT_DATE_COL] = pd.to_datetime(df[EVENT_DATE_COL])
df.sort_values(by=EVENT_DATE_COL, inplace=True)

# Without a description column, fall back to the event type as description
if EVENT_DESC_COL == '':
    df[EVENT_DESC_COL] = df[EVENT_TYPE_COL]

# Add row cat column to get only event rows later on
df['row_category'] = 'event'

# create more accessible date columns given the datetime object
event_dates = df[EVENT_DATE_COL].dt
df['year'] = event_dates.year
df['quarter'] = event_dates.quarter
df['month'] = event_dates.month
df['week'] = event_dates.isocalendar().week.astype(int)
df['day'] = event_dates.day
df['hour'] = event_dates.hour
df['day_of_week'] = event_dates.dayofweek

# CLIENT TO DO: order the columns list properly to fit table structure - add more as needed
df_cust_info = pd.DataFrame(data=data, columns=[CUST_ID_COL, AGE_COL, GENDER_COL, START_D_COL, LOCATION_COL, SALARY_COL])

# Columns offered in the dropdowns: numeric ones as scatter x-axis,
# categorical ones for splitting the event frequency plot.
splitting_cols_numeric = [AGE_COL, SALARY_COL, START_D_COL]
splitting_cols_cat = [AGE_COL, GENDER_COL, LOCATION_COL]

# Add row cat column to tell customer rows apart from event rows later on
df_cust_info['row_category'] = 'cust_info'

# One combined frame: event rows first, customer info rows appended after.
df = pd.concat([df, df_cust_info])

Initialize Dash app and create the layout using relevant components

In [5]:
app = JupyterDash(__name__)

# Granularities offered for the event frequency plot.
time_intervals = ['quarter','month','week','day','day_of_week']

# Distinct event categories (empty when the table has no event type column).
if EVENT_TYPE_COL:
    event_types = list(df[df['row_category'] == 'event'][EVENT_TYPE_COL].unique())
else:
    event_types = []

# Calculate number of months covered by the event rows:
dfcopy = df[df['row_category'] == 'event']

# min()/max() do not depend on the frame's sort order and skip missing dates.
start_date = dfcopy[EVENT_DATE_COL].min().date() # assumption that EVENT_DATE_COL exists
end_date = dfcopy[EVENT_DATE_COL].max().date()

# Extract year and month components from both date objects
start_year, start_month = start_date.year, start_date.month
end_year, end_month = end_date.year, end_date.month

# Calculate the difference in years and months
year_diff = end_year - start_year
month_diff = end_month - start_month

# Total number of months spanned. The callbacks divide by this value, so it is
# kept at 1 or more: data confined to a single calendar month counts as one
# month instead of zero.
total_months_spanned = max(year_diff * 12 + month_diff, 1)

customers = list(df[CUST_ID_COL].unique()) # assumption that CUST_ID_COL exists

# Styles shared by the headings and labels of the page.
_CENTERED = {'textAlign': 'center'}
_LABEL_STYLE = {'fontSize': 20}

# Every date picker offers the same window, so scan the date column once
# rather than once per picker property.
_first_event_day = df[EVENT_DATE_COL].min().date()
_last_event_day = df[EVENT_DATE_COL].max().date()


def _heading(text):
    """Return a centred H3 line of explanatory text."""
    return html.H3(text, style=_CENTERED)


def _label(text):
    """Return a form label in the page's standard font size."""
    return html.Label(text, style=_LABEL_STYLE)


def _options(values):
    """Return dropdown options whose label and value are both the given item."""
    return [{'label': v, 'value': v} for v in values]


def _date_range_picker(component_id):
    """Return a date range picker limited to the dates present in the data.

    Only the end date is preselected; the start date is left for the user.
    """
    return dcc.DatePickerRange(
        id=component_id,
        min_date_allowed=_first_event_day,
        max_date_allowed=_last_event_day,
        initial_visible_month=_first_event_day,
        end_date=_last_event_day
    )


def _stepped_slider(component_id, upper):
    """Return a slider restricted to the integer marks 1..upper, starting at 1."""
    return dcc.Slider(
        id=component_id,
        min = 1,
        max = upper,
        value = 1,
        marks={i: {'label': str(i),}
               for i in range(1, upper + 1)},
        step = None
    )


def _toggle_button(component_id):
    """Return a button with a zero click count; callbacks set its caption."""
    return html.Button(id=component_id, n_clicks=0)


app.layout = html.Div([
    html.H1("Exploratory Data Analysis of Fake Dataset", style=_CENTERED),
    html.H2("Data comes from table with structure: each row is an event, columns: customer_id, event_type, event_id, event_date, event_description", style=_CENTERED),

    ## Time series
    _heading("This plot shows event frequency over differing periods of time. It helps answer questions similar to the following:"),
    _heading("How do event occurrences vary over time? Are there any noticeable trends or seasonality patterns?"),
    _heading("Which events are more popular than others? Are there specific days or times when events are more likely to occur?"),
    _label('Select date range:'),
    _date_range_picker('freq-range'),
    html.Br(),
    _label('Select Time Interval:'),
    dcc.Dropdown(
        id='time-interval-select',
        options=_options(time_intervals),
        value=time_intervals[0]
    ),

    _label('Select event category:'),
    dcc.Dropdown(
        id='eventtype-select',
        options=_options(['All events'] + event_types),
        value='All events'
    ),

    _label('Select event:'),
    dcc.Dropdown(id='event-select'),

    _label('Divide distribution by:'),
    dcc.Dropdown(
        id='distribution-select',
        options=_options(splitting_cols_cat),
        value=splitting_cols_cat[0]
    ),

    html.Div([
        _label('Size of age bucket:'),
        _stepped_slider('agehist-select', 15),
    ], id = 'efreqage-select'),

    _toggle_button('showgender-button'),

    _toggle_button('load-efreqplot-button'),

    html.Div([dcc.Graph(id='bar-plot')],id = "barplot_div"),

    ## Heatmap
    _heading("This heatmap shows the correlation between events. It helps answer questions similar to the following:"),
    _heading("Are there events that commonly occur together? If a client performs one event, are they likely to perform another?"),
    _label('Select x-axis event category:'),
    dcc.Dropdown(
        id='eventtype-select-heatx',
        options=_options(event_types),
        value=event_types[0] if event_types else None
    ),

    _label('Select y-axis event category:'),
    dcc.Dropdown(
        id='eventtype-select-heaty',
        options=_options(event_types),
        value=event_types[0] if event_types else None
    ),

    _label('Select date range:'),
    _date_range_picker('heat-range'),

    _toggle_button('load-heatmap-button'),

    html.Div([dcc.Graph(id='heatmap-plot')],id = "heatmap_div"),

    ## Events by customer
    _heading("This bar graph shows the event distribution by customer. It helps answer questions similar to the following:"),
    _heading("How does the distribution of events vary across different customers?"),
    _label('Select customer:'),
    dcc.Dropdown(
        id='customer-select',
        options=_options(customers),
        # A multi-select dropdown carries a list of values. Passing a bare id
        # made the first callback run index into the characters of that id.
        value=customers[:1],
        multi = True
    ),

    _label('Select event category:'),
    dcc.Dropdown(
        id='eventtype-select-byc',
        options=_options(['All events'] + event_types),
        value='All events'
    ),

    _label('Select event:'),
    dcc.Dropdown(id='event-select-byc'),

    _label('Select date range:'),
    _date_range_picker('eventbyc-range'),

    _toggle_button('load-edist-button'),

    html.Div([dcc.Graph(id='bar-plot-byc')],id = "barplotbyc_div"),

    ## Time between events
    _heading("This bar plot shows the number of times a client did a specific event."),

    _label('Select event category'),
    dcc.Dropdown(
        id='eventtype-select-inter-start',
        options=_options(['All events'] + event_types),
        value='All events'
    ),

    _label('Select event:'),
    dcc.Dropdown(id='event-select-inter-start'),

    _label('Select date range:'),
    _date_range_picker('timeinter-range'),
    html.Br(),
    _toggle_button('load-timeinterval-button'),
    _toggle_button('timeintervalbreakdown-button'),

    html.Div([dcc.Graph(id='scatter-plot-inter')],id = "timeinterval_div"),

    # Event frequency scatter
    _heading("This scatter plot correlates numeric customer details, like age or account balance, with event frequency."),
    _heading(" It helps answer questions similar to the following: How does event frequency vary between different customer age groups?"),
    _heading("What is the average event frequency for customers with different levels of account balances?"),

    # X axis: customer age, account balance, account age
    _label('Select x-axis value:'),
    dcc.Dropdown(
        id='efreq-scatter-xaxis',
        options=_options(splitting_cols_numeric),
        value=splitting_cols_numeric[0]
    ),

    _label('Select date range:'),
    _date_range_picker('efreq-range'),

    html.Br(),
    _label('Select event category'),
    dcc.Dropdown(
        id='efreq-cat',
        options=_options(['All events'] + event_types),
        value='All events'
    ),

    _label('Select event:'),
    dcc.Dropdown(id='efreq-event'),

    html.Div([
        _label('Number of bins:'),
        _stepped_slider('bin-select', 10),
    ], id = 'efreqbin-select'),

    _toggle_button('efreq-type-button'),

    _toggle_button('load-efreq-button'),

    html.Div([dcc.Graph(id='scatter-plot-efreq')],id = "efreq_div"),

])

Define the callback functions for the various interactive dash components

In [6]:
## Timeseries
@app.callback(
    Output('event-select', 'options'),
    Input('eventtype-select', 'value'))
def update_event_select(event_type):
    """Build the options of the event dropdown for the chosen category.

    Args:
        event_type: Value of the category dropdown; 'All events', one event
            type, or None when the dropdown has been cleared.

    Returns:
        A list of Dash option dicts: a single 'All events' entry, or an
        'All events in <type>' entry (whose value is the type itself)
        followed by one entry per event description of that type. Empty
        when no category is selected.
    """
    if event_type is None:
        # Cleared dropdown: building the 'All events in ...' label from None
        # would raise a TypeError, so offer nothing instead.
        return []

    if event_type == 'All events':
        return [{'label': 'All events', 'value': 'All events'}]

    # consider if event_type_col or event_desc_col don't exist
    descriptions = df[df[EVENT_TYPE_COL] == event_type][EVENT_DESC_COL].unique()
    options = [{'label': 'All events in ' + event_type, 'value': event_type}]
    options += [{'label': desc, 'value': desc} for desc in descriptions]
    return options

@app.callback(
    Output('efreqage-select', 'style'),
    Input('distribution-select', 'value'))
def update_age_dist_select(dist):
    """Show the age bucket slider only while the split column is the age column."""
    split_by_age = dist == AGE_COL
    return {'display': 'block' if split_by_age else 'none'}


@app.callback(
    Output('showgender-button', 'children'),
    [Input('showgender-button', 'n_clicks'), Input('distribution-select', 'value')])
def update_gender_button(n_clicks, dist):
    """Caption the distribution toggle: 'Show ...' on even click counts, 'Hide ...' on odd."""
    verb = "Show " if n_clicks % 2 == 0 else "Hide "
    return verb + dist + " distribution"


@app.callback(
    Output('load-efreqplot-button', 'children'),
    Input('load-efreqplot-button', 'n_clicks'))
def update_efreqplot_button(n_clicks):
    """Caption the plot toggle: 'Load ...' on even click counts, 'Hide ...' on odd."""
    plot_hidden = n_clicks % 2 == 0
    return "Load event frequency plot" if plot_hidden else "Hide event frequency plot"

# Browser-side visibility toggle for the event frequency plot container:
# an odd click count on the load button displays 'barplot_div', an even
# count (including the initial zero) hides it.
app.clientside_callback(
    """
    function(n_clicks) {
        if(n_clicks % 2 != 0){
            return {'display': 'block'}
        } else {
            return {'display': 'none'}
        }
    }
    """,
    Output('barplot_div', 'style'),
    Input('load-efreqplot-button', 'n_clicks')
)

@app.callback(
    Output('bar-plot', 'figure'),
    [Input('time-interval-select', 'value'), Input('event-select', 'value'),
     Input('showgender-button', 'n_clicks'), Input('distribution-select', 'value'),
    Input('agehist-select', 'value'),Input('freq-range', 'start_date'), Input('freq-range', 'end_date')])
def update_barplot(time_interval, event, gender_n_clicks, dist,agehist, start, end):
    """Draw event counts per time interval, optionally split by a customer attribute.

    Args:
        time_interval: One of 'quarter', 'month', 'week', 'day', 'day_of_week'.
            Except for 'day_of_week', intervals are numbered relative to the
            earliest event in the selected date range (0 = first interval).
        event: 'All events', an event type, or an event description.
        gender_n_clicks: Click count of the distribution toggle; an odd count
            splits the bars by `dist`.
        dist: Customer column to split by (age is bucketed first).
        agehist: Width of one age bucket, used when `dist` is the age column.
        start, end: Date range as 'YYYY-MM-DD' strings; the filter is applied
            only when both are set.

    Returns:
        A grouped bar figure with one trace, or one trace per group when the
        distribution is shown.
    """
    layout = go.Layout(title='Events by Time Interval', xaxis=dict(title=str(time_interval)), yaxis=dict(title="Event Count"), barmode="group")

    if time_interval is None:
        # Interval dropdown cleared: there is no column to count by.
        return go.Figure(layout=layout)

    events = df[df['row_category'] == 'event']

    if start and end:
        start = datetime.strptime(start, '%Y-%m-%d')
        end = datetime.strptime(end, '%Y-%m-%d')
        events = events[(events[EVENT_DATE_COL] >= start) & (events[EVENT_DATE_COL] <= end)]

    # Work on an explicit copy: the interval column is (re)assigned below and
    # must not be written into a slice of the shared frame.
    events = events.copy()

    if time_interval != 'day_of_week':
        dates = events[EVENT_DATE_COL]
        origin = dates.min()

        if time_interval == 'quarter':
            events[time_interval] = (dates.dt.year - origin.year) * 4 + (dates.dt.quarter - origin.quarter)
        elif time_interval == 'month':
            events[time_interval] = (dates.dt.year - origin.year) * 12 + (dates.dt.month - origin.month)
        elif time_interval == 'week':
            events[time_interval] = (dates - origin).dt.days // 7
        elif time_interval == 'day':
            events[time_interval] = (dates - origin).dt.days

    # X axis: every interval present in the date range, in ascending order
    x_data = sorted(events[time_interval].unique().tolist())

    # Filter the data according to the choice
    if event == 'All events':
        selected = events
    elif event in event_types:
        selected = events[events[EVENT_TYPE_COL] == event]
    else:
        selected = events[events[EVENT_DESC_COL] == event]

    if gender_n_clicks % 2 != 0 and dist is not None: # show distribution
        dfcust = df[df['row_category'] == 'cust_info'].copy()

        if dist == AGE_COL:
            # Create an additional column of age ranges
            bin_size = int(agehist)
            ages = dfcust[AGE_COL].astype(int)
            youngest, oldest = int(ages.min()), int(ages.max())

            # pd.cut intervals are open on the left, so the lowest edge has to
            # lie strictly below the youngest customer.
            if youngest % bin_size != 0:
                min_range = (youngest // bin_size) * bin_size
            else:
                min_range = ((youngest // bin_size) - 1) * bin_size
            if oldest % bin_size != 0:
                max_range = ((oldest // bin_size) + 1) * bin_size
            else:
                max_range = oldest
            age_ranges = list(range(min_range, max_range + 1, bin_size))

            # Label each bucket with its lower edge
            dfcust['age_group'] = pd.cut(ages, bins=age_ranges, labels=age_ranges[:-1]).astype(int)

            dist = 'age_group'

        if dist != 'age_group':
            # The combined frame already has this column (empty for event
            # rows); drop it so the merge does not produce suffixed duplicates.
            selected = selected.drop(columns=[dist], errors='ignore')

        selected = pd.merge(selected, dfcust[[CUST_ID_COL, dist]], on=CUST_ID_COL, how='left')

        trace = []
        for g in sorted(dfcust[dist].dropna().unique()):
            if dist == 'age_group':
                name = str(g) + " - " + str(g + bin_size)
            else:
                name = str(g)

            # Reindex to the shared x axis: intervals without events for this
            # group count as 0, so every bar sits over its own interval.
            y_data = selected.loc[selected[dist] == g, time_interval].value_counts().reindex(x_data, fill_value=0)
            trace.append(go.Bar(x=x_data, y=y_data, name=name))

    else:
        # Same alignment for the unsplit plot
        y_data = selected[time_interval].value_counts().reindex(x_data, fill_value=0)
        trace = [go.Bar(x=x_data, y=y_data)]

    # Create the figure
    fig = go.Figure(data=trace, layout=layout)

    return fig

## Heatmap
@app.callback(
    Output('load-heatmap-button', 'children'),
    Input('load-heatmap-button', 'n_clicks'))
def update_heatmap_button(n_clicks):
    """Caption the heatmap toggle: 'Load heatmap' on even click counts, 'Hide heatmap' on odd."""
    heatmap_hidden = n_clicks % 2 == 0
    return "Load heatmap" if heatmap_hidden else "Hide heatmap"
    
# Browser-side visibility toggle for the heatmap container: an odd click
# count on the load button displays 'heatmap_div', an even count (including
# the initial zero) hides it.
app.clientside_callback(
    """
    function(n_clicks) {
        if(n_clicks % 2 != 0){
            return {'display': 'block'}
        } else {
            return {'display': 'none'}
        }
    }
    """,
    Output('heatmap_div', 'style'),
    Input('load-heatmap-button', 'n_clicks')
)

@app.callback(
    Output('heatmap-plot', 'figure'),
    [Input('eventtype-select-heatx', 'value'), Input('eventtype-select-heaty', 'value'),
     Input('heat-range', 'start_date'), Input('heat-range', 'end_date')])
def update_heatmap(event_x, event_y,start, end):
    """Draw the correlation of per-customer event counts between two categories.

    Args:
        event_x: Event type whose descriptions form the x axis.
        event_y: Event type whose descriptions form the y axis.
        start, end: Date range as 'YYYY-MM-DD' strings; the filter is applied
            only when both are set.

    Returns:
        A heatmap figure; an empty figure with the same layout when no event
        matches the selection.
    """
    layout = go.Layout(title='Event Co-occurrence Heatmap', xaxis=dict(title=event_x), yaxis=dict(title=event_y))

    # Boolean indexing already returns a new frame, so no copy of the full
    # table is needed.
    dfc = df.loc[df['row_category'] == 'event', [CUST_ID_COL, EVENT_TYPE_COL, EVENT_DESC_COL, EVENT_DATE_COL]]

    # Filter the data according to the two event types chosen
    dfc = dfc[(dfc[EVENT_TYPE_COL] == event_x) | (dfc[EVENT_TYPE_COL] == event_y)]

    if start and end:
        start = datetime.strptime(start, '%Y-%m-%d')
        end = datetime.strptime(end, '%Y-%m-%d')
        dfc = dfc[(dfc[EVENT_DATE_COL] >= start) & (dfc[EVENT_DATE_COL] <= end)]

    if dfc.empty:
        # Nothing to pivot or correlate
        return go.Figure(layout=layout)

    # Pivot to a count matrix: rows are customers, columns are event
    # descriptions, values are how many times the customer performed the event.
    pivot_df = dfc[[CUST_ID_COL, EVENT_DESC_COL]].pivot_table(index=CUST_ID_COL, columns=EVENT_DESC_COL, aggfunc='size', fill_value=0)

    # Correlation between the per-customer counts of each pair of events
    correlation_matrix = pivot_df.corr()

    if event_x == event_y:
        # An event always correlates perfectly with itself, so blank the
        # diagonal. This is done on an explicit copy because `.values` is not
        # guaranteed to be a writable view of the frame.
        blanked = correlation_matrix.to_numpy(dtype=float, copy=True)
        np.fill_diagonal(blanked, np.nan)
        correlation_matrix = pd.DataFrame(blanked, index=correlation_matrix.index, columns=correlation_matrix.columns)

    else:
        # Keep x-category events as columns and y-category events as rows
        x_vals = list(dfc[dfc[EVENT_TYPE_COL] == event_x][EVENT_DESC_COL].unique())
        y_vals = list(dfc[dfc[EVENT_TYPE_COL] == event_y][EVENT_DESC_COL].unique())
        correlation_matrix = correlation_matrix[x_vals]
        correlation_matrix = correlation_matrix.filter(items = y_vals, axis=0)

    # Create the heatmap trace
    trace = go.Heatmap(
        z=correlation_matrix.values,
        x=correlation_matrix.columns,
        y=correlation_matrix.index,
        colorscale='Viridis',)

    # Create the figure
    fig = go.Figure(data=[trace], layout=layout)

    return fig

    
## Event distribution by customer
@app.callback(
    Output('event-select-byc', 'options'),
    Input('eventtype-select-byc', 'value'))
def update_event_select_byc(event_type):
    """Build the options of the per-customer event dropdown for the chosen category.

    Args:
        event_type: Value of the category dropdown; 'All events', one event
            type, or None when the dropdown has been cleared.

    Returns:
        A list of Dash option dicts: a single 'All events' entry, or an
        'All events in <type>' entry (whose value is that same text) followed
        by one entry per event description of that type. Empty when no
        category is selected.
    """
    if event_type is None:
        # Cleared dropdown: building the 'All events in ...' label from None
        # would raise a TypeError, so offer nothing instead.
        return []

    if event_type == 'All events':
        return [{'label': 'All events', 'value': 'All events'}]

    descriptions = df[df[EVENT_TYPE_COL] == event_type][EVENT_DESC_COL].unique()
    whole_category = 'All events in ' + event_type
    options = [{'label': whole_category, 'value': whole_category}]
    options += [{'label': desc, 'value': desc} for desc in descriptions]
    return options

@app.callback(
    Output('load-edist-button', 'children'),
    Input('load-edist-button', 'n_clicks'))
def update_edist_button(n_clicks):
    """Caption the plot toggle: 'Load ...' on even click counts, 'Hide ...' on odd."""
    plot_hidden = n_clicks % 2 == 0
    return "Load event distribution plot" if plot_hidden else "Hide event distribution plot"

# Browser-side visibility toggle for the per-customer plot container: an odd
# click count on the load button displays 'barplotbyc_div', an even count
# (including the initial zero) hides it.
app.clientside_callback(
    """
    function(n_clicks) {
        if(n_clicks % 2 != 0){
            return {'display': 'block'}
        } else {
            return {'display': 'none'}
        }
    }
    """,
    Output('barplotbyc_div', 'style'),
    Input('load-edist-button', 'n_clicks')
)

@app.callback(
    Output('bar-plot-byc', 'figure'),
    [Input('customer-select', 'value'), Input('eventtype-select-byc', 'value'), 
     Input('event-select-byc', 'value'), Input('eventbyc-range', 'start_date'), Input('eventbyc-range', 'end_date')])
def update_barplot_byc(custs, event_type, event, start, end):
    """Draw, for each selected customer, how often they performed each event.

    Args:
        custs: Selected customer ids; a list, a single id, or None/empty when
            nothing is selected.
        event_type: 'All events' or one event type.
        event: 'All events in <type>' or one event description; ignored when
            `event_type` is 'All events'.
        start, end: Date range as 'YYYY-MM-DD' strings; the filter is applied
            only when both are set.

    Returns:
        A grouped bar figure with one trace per customer. Each trace name
        also reports the customer's overall events per month in the range,
        counted over all of their events regardless of the event filter.
    """
    layout = go.Layout(title='Event Distribution by Customer', xaxis=dict(title="Events"), yaxis=dict(title="Event Count"), barmode="group", showlegend = True)

    # The dropdown may deliver a bare id instead of a list, or nothing at all.
    if custs is None:
        custs = []
    elif not isinstance(custs, (list, tuple)):
        custs = [custs]
    if not custs:
        return go.Figure(layout=layout)

    all_events = df[df['row_category'] == 'event']

    # isin() works for any id type and needs no quoting, unlike a query string
    dfc = all_events[all_events[CUST_ID_COL].isin(custs)]

    # Filter the data according to the user's choice of event
    if event_type != "All events":
        if event == 'All events in ' + event_type:
            dfc = dfc[dfc[EVENT_TYPE_COL] == event_type]
        else:
            dfc = dfc[dfc[EVENT_DESC_COL] == event]

    if start and end:
        start = datetime.strptime(start, '%Y-%m-%d')
        end = datetime.strptime(end, '%Y-%m-%d')
        dfc = dfc[(dfc[EVENT_DATE_COL] >= start) & (dfc[EVENT_DATE_COL] <= end)]

        # Length of the range in months, with days counted as 1/30 of a month
        span = (end.year - start.year) * 12 + (end.month - start.month) + (end.day - start.day) / 30
        # The formula yields 0 for ranges such as Jan 31 -> Feb 1 or a single
        # day; never divide by less than one day.
        span = max(span, 1 / 30)

    else:
        span = total_months_spanned if total_months_spanned > 0 else 1
        start = all_events[EVENT_DATE_COL].min()
        end = all_events[EVENT_DATE_COL].max()

    x_data = list(dfc[EVENT_DESC_COL].unique())
    in_range = (all_events[EVENT_DATE_COL] >= start) & (all_events[EVENT_DATE_COL] <= end)

    # Create the bar plot
    trace = []
    for cust in custs:
        # For each customer, retrieve the counts of the given events. Must reindex by the x axis for value alignment
        y_data = list(dfc[dfc[CUST_ID_COL] == cust][EVENT_DESC_COL].value_counts().reindex(x_data, fill_value=0))

        events_in_range = int(((all_events[CUST_ID_COL] == cust) & in_range).sum())
        rate = round(events_in_range / span, 2)
        trace.append(go.Bar(x=x_data, y=y_data, name=f"{cust}: {rate} events per month in given range"))

    # Create the figure
    fig = go.Figure(data=trace, layout=layout)

    return fig

    
# Time interval between events
@app.callback(
    Output('event-select-inter-start', 'options'),
    Input('eventtype-select-inter-start', 'value'))
def update_event_select_inter_start(event_type):
    """Build the options of the repeat-count event dropdown for the chosen category.

    Args:
        event_type: Value of the category dropdown; 'All events', one event
            type, or None when the dropdown has been cleared.

    Returns:
        A list of Dash option dicts: a single 'All events' entry, or an
        'All events in <type>' entry (whose value is the type itself)
        followed by one entry per event description of that type. Empty
        when no category is selected.
    """
    if event_type is None:
        # Cleared dropdown: building the 'All events in ...' label from None
        # would raise a TypeError, so offer nothing instead.
        return []

    if event_type == 'All events':
        return [{'label': 'All events', 'value': 'All events'}]

    descriptions = df[df[EVENT_TYPE_COL] == event_type][EVENT_DESC_COL].unique()
    options = [{'label': 'All events in ' + event_type, 'value': event_type}]
    options += [{'label': desc, 'value': desc} for desc in descriptions]
    return options


@app.callback(
    Output('load-timeinterval-button', 'children'),
    Input('load-timeinterval-button', 'n_clicks'))
def update_timeinterval_button(n_clicks):
    """Caption the plot toggle: 'Load ...' on even click counts, 'Hide ...' on odd."""
    plot_hidden = n_clicks % 2 == 0
    return "Load time interval plot" if plot_hidden else "Hide time interval plot"
    
@app.callback(
    Output('timeintervalbreakdown-button', 'children'),
    Input('timeintervalbreakdown-button', 'n_clicks'))
def update_timeintervalbreakdown_button(n_clicks):
    """Caption the breakdown toggle: 'Hide ...' on even click counts, 'Show ...' on odd."""
    verb = "Hide" if n_clicks % 2 == 0 else "Show"
    return verb + " event description breakdown"

# Browser-side visibility toggle for the repeat-count plot container: an odd
# click count on the load button displays 'timeinterval_div', an even count
# (including the initial zero) hides it.
app.clientside_callback(
    """
    function(n_clicks) {
        if(n_clicks % 2 != 0){
            return {'display': 'block'}
        } else {
            return {'display': 'none'}
        }
    }
    """,
    Output('timeinterval_div', 'style'),
    Input('load-timeinterval-button', 'n_clicks')
)

@app.callback(
    Output('scatter-plot-inter', 'figure'),
    [Input('event-select-inter-start', 'value'),Input('timeinter-range', 'start_date'), 
     Input('timeinter-range', 'end_date'),Input('timeintervalbreakdown-button', 'n_clicks')])
def update_scatterplot_inter(event, start, end, n_clicks):
    """Draw how many customers performed an event once, twice, three times, ...

    Args:
        event: 'All events', an event type, or an event description.
        start, end: Date range as 'YYYY-MM-DD' strings; the filter is applied
            only when both are set.
        n_clicks: Click count of the breakdown toggle. An even count draws one
            stacked trace per event description; an odd count draws a single
            trace over all selected events together.

    Returns:
        A stacked bar figure: x is the number of times the event was
        completed, y is the number of customers with that count.
    """
    # Boolean indexing already returns a new frame and nothing below mutates
    # it, so the full table is not copied on every call.
    dfc = df.loc[df['row_category'] == 'event', [CUST_ID_COL, EVENT_TYPE_COL, EVENT_DESC_COL, EVENT_DATE_COL]]

    if start and end:
        start = datetime.strptime(start, '%Y-%m-%d')
        end = datetime.strptime(end, '%Y-%m-%d')
        dfc = dfc[(dfc[EVENT_DATE_COL] >= start) & (dfc[EVENT_DATE_COL] <= end)]

    if event in event_types:
        dfc = dfc[dfc[EVENT_TYPE_COL] == event]
    elif event != 'All events':
        dfc = dfc[dfc[EVENT_DESC_COL] == event]

    def _repeat_count_bar(rows, label):
        """Return a bar trace of customer counts per number of occurrences in `rows`."""
        # Count the occurrences of each unique customer
        customer_counts = rows[CUST_ID_COL].value_counts()
        # Count the number of customers who appear once, twice, three times, and so on
        summary = customer_counts.value_counts().sort_index()
        return go.Bar(x=list(summary.index), y=list(summary), name=label)

    if n_clicks % 2 != 0:
        trace = [_repeat_count_bar(dfc, event)]
    else:
        trace = [
            _repeat_count_bar(dfc[dfc[EVENT_DESC_COL] == description], description)
            for description in dfc[EVENT_DESC_COL].unique()
        ]

    # Create the layout
    layout = go.Layout(title='Customer Count by event', xaxis=dict(title="Number of times event was completed"), yaxis=dict(title="Customer count"), barmode="stack")

    # Create the figure
    fig = go.Figure(data=trace, layout=layout)

    return fig


# Event frequency scatter
@app.callback(
    Output('efreq-event', 'options'),
    Input('efreq-cat', 'value'))
def update_efreq_event(event_type):
    """Build the options of the frequency-plot event dropdown for the chosen category.

    Args:
        event_type: Value of the category dropdown; 'All events', one event
            type, or None when the dropdown has been cleared.

    Returns:
        A list of Dash option dicts: a single 'All events' entry, or an
        'All events in <type>' entry (whose value is the type itself)
        followed by one entry per event description of that type. Empty
        when no category is selected.
    """
    if event_type is None:
        # Cleared dropdown: building the 'All events in ...' label from None
        # would raise a TypeError, so offer nothing instead.
        return []

    if event_type == 'All events':
        return [{'label': 'All events', 'value': 'All events'}]

    descriptions = df[df[EVENT_TYPE_COL] == event_type][EVENT_DESC_COL].unique()
    options = [{'label': 'All events in ' + event_type, 'value': event_type}]
    options += [{'label': desc, 'value': desc} for desc in descriptions]
    return options

@app.callback(
    Output('efreqbin-select', 'style'),
    Input('efreq-type-button', 'n_clicks'))
def update_efreqbin_select(n_clicks):
    """Show the bin-count slider on odd click counts of the plot-type button, hide it otherwise."""
    slider_visible = n_clicks % 2 != 0
    return {'display': 'block' if slider_visible else 'none'}

@app.callback(
    Output('efreq-type-button', 'children'),
    Input('efreq-type-button', 'n_clicks'))
def update_efreqtype_button(n_clicks):
    """Caption the plot-type button: 'Histogram' on even click counts, 'Scatter' on odd."""
    return "Histogram" if n_clicks % 2 == 0 else "Scatter"

@app.callback(
    Output('load-efreq-button', 'children'),
    Input('load-efreq-button', 'n_clicks'))
def update_efreq_button(n_clicks):
    """Caption the plot toggle: 'Load ...' on even click counts, 'Hide ...' on odd."""
    plot_hidden = n_clicks % 2 == 0
    return "Load event frequency plot" if plot_hidden else "Hide event frequency plot"

# Browser-side visibility toggle for the frequency plot container: an odd
# click count on the load button displays 'efreq_div', an even count
# (including the initial zero) hides it.
app.clientside_callback(
    """
    function(n_clicks) {
        if(n_clicks % 2 != 0){
            return {'display': 'block'}
        } else {
            return {'display': 'none'}
        }
    }
    """,
    Output('efreq_div', 'style'),
    Input('load-efreq-button', 'n_clicks')
)

@app.callback(
    Output('scatter-plot-efreq', 'figure'),
    [Input('efreq-scatter-xaxis', 'value'),Input('efreq-range', 'start_date'), 
     Input('efreq-range', 'end_date'),Input('efreq-event', 'value'), Input('efreq-type-button', 'n_clicks'),
     Input('bin-select', 'value')])
def update_scatterplot_efreq(x_axis, start, end, event, n_clicks, num_bins):
    """Relate a numeric customer attribute to monthly event frequency.

    Args:
        x_axis: Customer column for the x axis. The start date column is
            converted to "days ago" relative to the current time.
        start, end: Date range as 'YYYY-MM-DD' strings; the filter is applied
            only when both are set.
        event: 'All events', an event type, or an event description.
        n_clicks: Click count of the plot-type button. An even count draws a
            scatter with one point per customer; an odd count draws a bar
            chart of the mean frequency per x-axis bin.
        num_bins: Number of equal-width bins for the bar chart.

    Returns:
        The scatter or bar figure; an empty figure when no x axis is selected.
        Customers without matching events count as frequency 0 in both modes.
    """
    if x_axis is None:
        # X-axis dropdown cleared: nothing to plot against.
        return go.Figure()

    dfc = df[df['row_category'] == 'event']

    if start and end:
        start = datetime.strptime(start, '%Y-%m-%d')
        end = datetime.strptime(end, '%Y-%m-%d')
        dfc = dfc[(dfc[EVENT_DATE_COL] >= start) & (dfc[EVENT_DATE_COL] <= end)]

        # Length of the range in months, with days counted as 1/30 of a month
        span = (end.year - start.year) * 12 + (end.month - start.month) + (end.day - start.day) / 30
        # The formula yields 0 for ranges such as Jan 31 -> Feb 1 or a single
        # day; never divide by less than one day.
        span = max(span, 1 / 30)
    else:
        span = total_months_spanned if total_months_spanned > 0 else 1

    if event in event_types:
        dfc = dfc[dfc[EVENT_TYPE_COL] == event]
    elif event != 'All events':
        dfc = dfc[dfc[EVENT_DESC_COL] == event]

    # Customer rows with the attributes that can serve as x axis
    dfcust = df.loc[df['row_category'] == 'cust_info', [CUST_ID_COL, AGE_COL, START_D_COL, SALARY_COL]].copy()
    dfcust[AGE_COL] = dfcust[AGE_COL].astype(int)

    if x_axis == START_D_COL:
        dfcust = dfcust.sort_values(by=[START_D_COL], ascending = False)
        # Account age in days, computed against a single "now"
        elapsed = datetime.now() - pd.to_datetime(dfcust[START_D_COL])
        x_values = elapsed.dt.total_seconds() / 86400.0
    else:
        dfcust = dfcust.sort_values(by=[x_axis])
        x_values = dfcust[x_axis]

    # Number of matching events per customer; the left merge keeps customers
    # without events (as 0) and preserves the sorted row order of dfcust.
    per_customer = dfc.groupby(CUST_ID_COL).size().reset_index(name='count')
    merged_df = dfcust.merge(per_customer, on=CUST_ID_COL, how='left')
    merged_df['count'] = merged_df['count'].fillna(0).astype(float)

    if n_clicks % 2 == 0:
        y_data = (merged_df['count'] / span).tolist()
        trace = [go.Scatter(x=x_values, y=y_data, mode='markers', marker=dict(size=8))]

    else:
        # Bin positionally: merged_df rows are in the same order as x_values,
        # whereas index-based assignment would attach bins to the wrong
        # customers once the frame has been re-sorted.
        merged_df['bins'] = pd.cut(x_values.to_numpy(), bins = num_bins)
        bin_means = merged_df.groupby(by = ['bins'], observed=False)['count'].mean()

        # Labels and values come from the same grouped result, so they stay
        # paired and in numeric bin order.
        x_data = [str(interval) for interval in bin_means.index]
        y_data = [mean / span for mean in bin_means]
        trace = [go.Bar(x = x_data, y = y_data)]

    layout = go.Layout(title='Plot of ' + x_axis + ' vs. Event Frequency',
                       xaxis=dict(title=x_axis if x_axis != START_D_COL else x_axis + ' (number of days ago)'),
                       yaxis=dict(title='Event frequency (per month)'),
                       hovermode='closest')

    fig = go.Figure(data=trace, layout=layout)

    return fig


Run the dash app

In [None]:
# Launch the app from the notebook with debug mode on and the auto-reloader off.
app.run_server(debug=True, use_reloader=False)
# Alternative launch calls, kept for reference (both pass jupyter_mode="external"):
# app.run(jupyter_mode='external',debug=True) 
# app.run_server(debug=True, use_reloader=False, port=1223, jupyter_mode="external")