In [1]:
from dash import Dash, dcc, html, Input, Output, State
from dash.exceptions import PreventUpdate
import advertools as adv
import adviz
import plotly.express as px
import pandas as pd
from dash_bootstrap_templates import load_figure_template
# Register the dash-bootstrap-templates Plotly figure templates so a theme
# name (see `template` below) can be passed to the figures built by this app.
# NOTE(review): 'all' presumably registers every available theme — confirm.
load_figure_template('all')
# Opt in to pandas' future behaviour of not silently downcasting dtypes.
pd.set_option('future.no_silent_downcasting', True)
import dash_bootstrap_components as dbc
# Parquet file holding the crawl output. Defined first so that both the
# schema lookup below and the per-request reads in the callback use the same
# path instead of repeating the literal.
pq_path = 'nasa_crawl.parquet'

# Extra stylesheet from dash-bootstrap-templates.
# NOTE(review): presumably this is what the className='dbc' on the layout
# root relies on to theme the dcc components — confirm.
dbc_css = "https://cdn.jsdelivr.net/gh/AnnMarieW/dash-bootstrap-templates/dbc.min.css"
app = Dash(__name__, external_stylesheets=[dbc.themes.COSMO, dbc_css])

# Plotly figure template name passed to every chart.
template = 'cosmo'

# Column names and types of the crawl file, used to populate the dropdown and
# to decide which chart a selected column gets.
pq_columns = adv.crawlytics.parquet_columns(pq_path)
# Columns whose parquet type is exactly 'string' (word-frequency chart).
str_columns = pq_columns[pq_columns['type'].eq('string')]['column']
# Columns whose parquet type is 'double' or starts with 'int' (ECDF chart).
num_columns = pq_columns[pq_columns['type'].eq('double') | pq_columns['type'].astype(str).str.startswith('int')]['column']


# Page layout: a title, one row with the two controls (column dropdown and
# n-gram slider), and a loading-wrapped container that the callback fills.
column_picker = dbc.Col(
    [
        dbc.Label('Choose column:'),
        html.Br(),
        dcc.Dropdown(id='dropdown', options=pq_columns['column']),
    ],
    lg=3,
)
ngram_picker = dbc.Col(
    [
        dbc.Label('Choose ngrams:'),
        html.Br(),
        dcc.Slider(id='slider', min=1, max=3, step=1, value=1, included=False, dots=True),
    ],
    lg=3,
)
page_margins = {'marginLeft': '7%', 'marginRight': '4%'}

app.layout = html.Div(
    className='dbc',
    style=page_margins,
    children=[
        html.Br(),
        html.H1('Crawl data interactive analytics'),
        html.Br(),
        dbc.Row([column_picker, ngram_picker]),
        html.Br(),
        dcc.Loading(html.Div(id='output'), type='graph'),
    ],
)

def _titled_graph(title, fig):
    """Wrap *fig* in a Div under an H2 heading showing *title*."""
    return html.Div([
        html.H2(title),
        dcc.Graph(figure=fig)
    ])


@app.callback(
    Output('output', 'children'),
    Input('dropdown', 'value'),
    Input('slider', 'value'))
def set_dropdown_options(column, slider):
    """Build the chart shown in the 'output' container for the chosen column.

    The chart type depends on the selected column:

    * ``status``: status-code chart from ``adviz.status_codes``.
    * ``url``: top 20 combinations of the first ``slider`` URL directories.
    * a column listed in ``str_columns``: top 20 words/phrases, with the
      phrase length taken from ``slider``.
    * a column listed in ``num_columns``: ECDF with a marginal histogram.
    * anything else: a short message saying no chart is available.

    Args:
        column: Value of the column dropdown; falsy when nothing is selected.
        slider: Value of the n-gram slider (1-3 in the layout). Used as the
            phrase length for text columns and as the number of directory
            levels for ``url``; ignored by the other charts.

    Returns:
        An ``html.Div`` holding a heading and either a graph or a message.

    Raises:
        PreventUpdate: If no column is selected, leaving the output as is.
    """
    if not column:
        raise PreventUpdate

    # 'url' is always loaded because the ECDF chart uses it as the hover name;
    # avoid requesting it twice when it is itself the selected column.
    columns = ['url'] if column == 'url' else ['url', column]
    df = pd.read_parquet(pq_path, columns=columns)

    if column == 'status':
        fig = adviz.status_codes(df[column], theme=template, height=550)
        return _titled_graph('Status codes', fig)

    if column == 'url':
        urldf = adv.url_to_df(df['url'])
        dir_df = urldf.filter(regex=r'^dir_\d+')
        # Count combinations of the first `slider` directory levels and keep
        # the 20 most frequent ones.
        top_dirs = dir_df.iloc[:, :slider].value_counts().reset_index()[:20]
        # Join the directory levels into one label per row, e.g. 'a/b/'.
        dir_labels = top_dirs.iloc[:, :slider].astype(str).add('/').astype(str).sum(axis=1)
        # Both the frame and the labels are reversed so the most frequent
        # entry ends up at the top of the horizontal bar chart.
        fig = px.bar(
            top_dirs[::-1],
            x='count',
            y=dir_labels[::-1],
            orientation='h',
            template=template,
            height=600,
            width=850)
        fig.layout.yaxis.title = 'URL directory counts'
        return _titled_graph('Top URL directories:', fig)

    if column in str_columns.values:
        word_freq_df = adv.word_frequency(df[column].dropna(), phrase_len=slider)
        fig = px.bar(
            word_freq_df[:20][::-1],
            x='abs_freq',
            y='word',
            labels={'abs_freq': 'Count'},
            orientation='h', height=600, width=850, template=template)
        return _titled_graph(f'Word frequency: {column}', fig)

    if column in num_columns.values:
        fig = px.ecdf(
            df,
            x=column,
            lines=False,
            markers=True,
            ecdfnorm='percent',
            hover_name='url',
            hover_data=[column],
            height=650,
            marginal='histogram',
            template=template)
        fig.layout.yaxis.ticksuffix = '%'
        # This branch plots a distribution, not word counts, so it gets its
        # own heading rather than the word-frequency one.
        return _titled_graph(f'Cumulative distribution: {column}', fig)

    # Columns of any other type (neither 'string' nor numeric) have no chart;
    # say so explicitly instead of returning None and blanking the output.
    return html.Div([
        html.H2(f'No chart available: {column}'),
        html.P('This column is neither a text nor a numeric column, '
               'so there is nothing to plot.')
    ])

# Start the Dash server from the notebook. With jupyter_mode='external' the
# app is not rendered inline; the server address is printed in the cell
# output and the app is opened from there.
app.run(jupyter_mode='external')

Dash app running on http://127.0.0.1:8050/
