# In [7]:
import pandas as pd
import numpy as np
import missingno
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from chart_studio.plotly import plotly
from plotly.offline import plot
import plotly.graph_objs as go


###### EDA FUNCTIONS


def eda(df):
    """Print an exploratory overview of a pandas DataFrame.

    The report contains, in order: a three-row preview, the output of
    ``DataFrame.info()``, a preview and missingno matrix of rows containing
    nulls (only if any exist), a summary of fully duplicated rows, and the
    categorical overview produced by ``categorical_eda``.

    Cells that are empty or contain only whitespace are treated as missing.
    The caller's dataframe is not modified; the replacement works on a copy.

    NOTE(review): ``display`` is not imported in the visible part of this
    file; presumably it is supplied by the IPython/Jupyter session - confirm
    before running this outside a notebook.

    Args:
        df: The pandas DataFrame to analyse.

    Raises:
        TypeError: If ``df`` is not a pandas DataFrame.
    """
    # isinstance (instead of an exact type comparison) also accepts DataFrame subclasses
    if not isinstance(df, pd.DataFrame):
        raise TypeError("Only pandas dataframe is allowed as input")

    # replace field that's entirely space (or empty) with NaN
    df = df.replace(r'^\s*$', np.nan, regex=True)

    print("Preview of data:")
    display(df.head(3))

    print("\nTo check: \n (1) Total number of entries \n (2) Column types \n (3) Any null values\n")
    # info() writes its report to stdout and returns None, so wrapping it in
    # print() would append a stray "None" line to the output
    df.info()

    # generate preview of entries with null values
    null_mask = df.isnull()
    if null_mask.any(axis=None):
        print("\nPreview of data with null values:")
        display(df[null_mask.any(axis=1)].head(3))
        missingno.matrix(df)
        plt.show()

    # generate count statistics of duplicate entries
    duplicate_count = int(df.duplicated().sum())
    if duplicate_count > 0:
        print("\n***Number of duplicated entries: ", duplicate_count)
        # keep=False shows every member of each duplicate group, sorted so
        # that identical rows end up next to each other
        display(df[df.duplicated(keep=False)].sort_values(by=list(df.columns)).head())
    else:
        print("\nNo duplicated entries found")

    # EDA of categorical data
    categorical_eda(df)
    
def top5(df):
    """Print the five most frequent values of every non-numeric column.

    For each column of dtype ``object`` or ``category`` a small table is
    printed with the column's values and their occurrence count (labelled
    ``Count``), most frequent first, limited to five rows.

    Args:
        df: The pandas DataFrame to summarise.
    """
    for col in df.select_dtypes(include=['object', 'category']).columns:
        # f-string so that non-string column labels do not break the header
        print(f"Top 5 unique values of {col}")
        # Name the index and the counts explicitly instead of renaming the
        # columns produced by reset_index(): their default names differ
        # between pandas versions, which mislabels the table.
        counts = df[col].value_counts().rename_axis(col).reset_index(name="Count")
        print(counts.head(5))
        print(" ")
    
    
def categorical_eda(df, hue=None):
    """Summarise and plot the non-numeric columns of a dataframe.

    Prints the number of unique values per ``object``/``category`` column,
    prints the most frequent values via ``top5``, and then shows one seaborn
    count plot for every column of dtype ``category``.

    Args:
        df: The pandas DataFrame to analyse.
        hue: Optional value forwarded to ``sns.catplot`` as ``hue``.
    """
    print("\nTo check: \nUnique count of non-numeric data\n")
    non_numeric = df.select_dtypes(include=['object', 'category'])
    print(non_numeric.nunique())

    top5(df)

    # Only true 'category' columns are plotted; plain object columns are
    # covered by the printed summaries above.
    category_columns = df.select_dtypes(include='category').columns
    for column_name in category_columns:
        grid = sns.catplot(x=column_name, kind="count", data=df, hue=hue)
        grid.set_xticklabels(rotation=90)
        plt.show()
        
        
###### FUNNEL ANALYSIS FUNCTIONS


def create_funnel_df(df, steps, from_date=None, to_date=None):
    """Build the per-step user counts of a conversion funnel.

    A user counts for the first step with their earliest event of that name.
    A user counts for any later step if they were counted for the previous
    step and have an event of the later step's name whose timestamp is not
    earlier than their counted timestamp of the previous step; the earliest
    such event is the one carried forward.

    Args:
        df: Events dataframe with at least the columns ``user_id``,
            ``event_name`` and ``timestamp``.
        steps: Ordered event names that make up the funnel.
        from_date: Optional lower bound (inclusive) applied to the counted
            first-step timestamps. Ignored when falsy.
        to_date: Optional upper bound (inclusive) applied to the counted
            first-step timestamps. Ignored when falsy.

    Returns:
        A dataframe with one row per step and the columns ``step``, ``val``
        (number of users), ``pct`` (conversion from the previous step, in
        whole percent), ``val-1`` (``val`` of the previous step) and
        ``pct_from_first`` (conversion from the first step, in whole
        percent). An empty ``steps`` yields an empty dataframe with these
        columns.
    """
    steps = list(steps)
    if not steps:
        # nothing to count; return the expected columns instead of failing
        # on the lookup of the first step's value
        return pd.DataFrame(columns=['step', 'val', 'pct', 'val-1', 'pct_from_first'])

    # filter df for only events in the steps list
    df = df[['user_id', 'event_name', 'timestamp']]
    df = df[df['event_name'].isin(steps)]

    values = []
    previous = None  # counted events of the previous step, one row per user
    for i, step in enumerate(steps):
        step_events = df[df['event_name'] == step]

        if i == 0:
            # earliest event of the first step for every user
            current = step_events \
                .sort_values(['user_id', 'timestamp'], ascending=True) \
                .drop_duplicates(subset=['user_id', 'event_name'], keep='first')

            # filter df of 1st step according to dates
            if from_date:
                current = current[current['timestamp'] >= from_date]

            if to_date:
                current = current[current['timestamp'] <= to_date]
        else:
            # An inner join keeps exactly the users present in both steps.
            # Rows that exist on one side only could never satisfy the
            # timestamp comparison below, so nothing is lost compared to an
            # outer join.
            merged = pd.merge(previous, step_events, on='user_id', how='inner')

            # keep only events that happened after previous step and sort by time
            merged = merged[merged['timestamp_y'] >= merged['timestamp_x']] \
                .sort_values('timestamp_y', ascending=True)

            # take the minimum time of the valid ones for each user
            merged = merged.drop_duplicates(subset=['user_id', 'event_name_x', 'event_name_y'],
                                            keep='first')

            # keep only the necessary columns and rename them to match the original structure
            current = merged[['user_id', 'event_name_y', 'timestamp_y']] \
                .rename(columns={'event_name_y': 'event_name', 'timestamp_y': 'timestamp'})

        # number of users that reached this step
        values.append(len(current))
        previous = current

    funnel_df = pd.DataFrame({'step': steps, 'val': values})
    previous_val = funnel_df['val'].shift(1)

    # Percentages are computed as (count * 100 / base) rather than
    # (count / base * 100): multiplying first keeps exact results exact, so
    # truncating to int cannot turn e.g. 58 into 57 through float error.
    # Rows without a usable previous value (first step, or 0 -> 0) count as 100.
    drop_pct = (funnel_df['val'] - previous_val).abs() * 100 / previous_val
    funnel_df['pct'] = (100 - drop_pct).fillna(100).astype(int)

    # val of the previous step, used to plot faded bars in the background
    funnel_df['val-1'] = previous_val

    # conversion between each step and the first step; 0 when the first step is empty
    first_val = funnel_df['val'].iloc[0]
    funnel_df['pct_from_first'] = (funnel_df['val'] * 100 / first_val).fillna(0).astype(int)

    return funnel_df



def group_funnel_dfs(events, steps, col):
    """Build one funnel dataframe per distinct value of ``col``.

    For every non-null value of ``col`` the events of all users that have at
    least one event with that value are collected and passed to
    ``create_funnel_df``. Values whose users never triggered the first step
    are left out.

    Args:
        events: Events dataframe; must contain ``user_id``, ``event_name``
            and the column named by ``col``.
        steps: Ordered event names that make up the funnel.
        col: Name of the column to group by.

    Returns:
        A dict mapping each kept value of ``col`` to its funnel dataframe.
    """
    # set of user ids observed for each value of the grouping column
    users_by_value = dict(events.groupby([col])['user_id'].apply(set))

    funnels = {}
    for value in events[col].dropna().unique():
        member_ids = users_by_value[value]
        user_events = events[events['user_id'].isin(member_ids)].copy()

        # skip groups in which nobody reached the first funnel step
        reached_first_step = (user_events['event_name'] == steps[0]).any()
        if not reached_first_step:
            continue

        funnels[value] = create_funnel_df(user_events, steps)

    return funnels


def plot_stacked_funnel(events, steps, col=None, from_date=None, to_date=None, step_interval=0):
    """Create a plotly funnel figure, optionally stacked by a grouping column.

    Args:
        events: Events dataframe passed on to the funnel builders.
        steps: Ordered event names that make up the funnel.
        col: Optional column name. When given, one funnel trace is drawn per
            value of that column (via ``group_funnel_dfs``); otherwise a
            single trace named ``'Total'`` is drawn.
        from_date: Forwarded to ``create_funnel_df``; only used when ``col``
            is not given.
        to_date: Forwarded to ``create_funnel_df``; only used when ``col``
            is not given.
        step_interval: Accepted for interface compatibility; not used by
            this function.

    Returns:
        A ``go.Figure`` holding one ``go.Funnel`` trace per funnel.
    """
    # if col is provided, create a funnel_df for each entry in the "col"
    if col:
        funnels = group_funnel_dfs(events, steps, col)
        title = 'Funnel plot per {}'.format(col)
    else:
        funnels = {'Total': create_funnel_df(events, steps, from_date=from_date, to_date=to_date)}
        title = 'Funnel plot'

    # one trace per funnel; the list is handed to go.Figure at the end
    data = [
        go.Funnel(
            name=name,
            y=funnel_df.step.values,
            x=funnel_df.val.values,
            textinfo="label+value+percent previous+percent initial",
            textfont={"size": 22, "color": "black"},
        )
        for name, funnel_df in funnels.items()
    ]

    layout = go.Layout(margin={"l": 180, "r": 0, "t": 30, "b": 0, "pad": 0},
                       paper_bgcolor='rgba(0,0,0,0)',
                       plot_bgcolor='rgba(0,0,0,0)',
                       funnelmode="stack",
                       showlegend=True,
                       hovermode='closest',
                       # use the title chosen above so that the ungrouped
                       # plot is not labelled "Funnel plot per None"
                       title=title,
                       legend=dict(orientation="v",
                                   bgcolor='#E2E2E2',
                                   xanchor='left',
                                   font=dict(
                                       size=12)
                                   )
                       )

    return go.Figure(data, layout)