In [1]:
from dash import Dash, html, dcc, Input, Output, dash_table
import dash
import pandas as pd
import plotly.express as px

import pandas as pd
from os import path
import numpy as np
import matplotlib.pyplot as plt
import sqlite3

In [2]:
def get_artist_name(artist_code, db_path='CroLyrics_data/info.db'):
    """Look up an artist's name by its code.

    Args:
        artist_code: Value matched against the ``code`` column of the
            ``artists`` table.
        db_path: Path of the SQLite database file to query. Defaults to the
            database used throughout this notebook.

    Returns:
        The first matching row as a one-element tuple ``(name,)``, or ``None``
        when no artist has that code.
    """
    conn = sqlite3.connect(db_path)
    try:
        row = conn.execute(
            "SELECT name FROM artists WHERE code=:code", {'code': artist_code}
        ).fetchone()
    finally:
        # Always release the connection; this helper is called once per artist
        # from the callbacks, so a leak would accumulate quickly.
        conn.close()
    return row

def get_number_of_songs(artist_code, db_path='CroLyrics_data/info.db'):
    """Look up the stored song count of an artist by its code.

    Args:
        artist_code: Value matched against the ``code`` column of the
            ``artists`` table.
        db_path: Path of the SQLite database file to query. Defaults to the
            database used throughout this notebook.

    Returns:
        The first matching row as a one-element tuple ``(number_of_songs,)``,
        or ``None`` when no artist has that code.
    """
    conn = sqlite3.connect(db_path)
    try:
        row = conn.execute(
            "SELECT number_of_songs FROM artists WHERE code=:code",
            {'code': artist_code},
        ).fetchone()
    finally:
        # Close even when the query fails so connections are not leaked.
        conn.close()
    return row

In [3]:
def generate_table(df, word):
    """Build the songs table component shown next to the bar plots.

    Args:
        df: DataFrame whose columns become the table columns and whose rows
            become the table records.
        word: Word interpolated into the heading above the table.

    Returns:
        An ``html.Div`` holding an ``H6`` heading followed by a read-only,
        natively filterable/sortable/paginated ``DataTable``.
    """
    # Every column is rendered as markdown so that link-formatted cells
    # (``[text](url)``) show up as clickable links.
    column_specs = [
        {"name": col, "id": col, "deletable": False, "selectable": False, "presentation": 'markdown'}
        for col in df.columns
    ]

    heading = html.H6(children=f'List of songs mentioning {word}')

    songs_table = dash_table.DataTable(
        id='datatable-interactivity',
        columns=column_specs,
        data=df.to_dict('records'),
        editable=False,
        filter_action="native",
        sort_action="native",
        sort_mode="multi",
        row_selectable=False,
        row_deletable=False,
        page_action="native",
        page_current=0,
        page_size=10,
    )

    return html.Div(children=[heading, songs_table])


In [4]:
def get_subdf(subgroup):
    """Return the module-level DataFrame that belongs to a word subgroup.

    Args:
        subgroup: One of "Location", "Name", "Organization", "General Noun",
            "Verb", "Adjective", "Adverb", "Proper Noun" or "All words".

    Returns:
        The corresponding global DataFrame (``locations``, ``names``, ...,
        ``df_unique_lemmas``).

    Raises:
        ValueError: If ``subgroup`` is not one of the labels above (including
            ``None``, which a cleared dropdown produces).
    """
    # Lambdas defer the global lookup to call time: the frames are created in
    # a later notebook cell, and only the requested one is ever resolved.
    frame_getters = {
        "Location": lambda: locations,
        "Name": lambda: names,
        "Organization": lambda: organizations,
        "General Noun": lambda: nouns,
        "Verb": lambda: verbs,
        "Adjective": lambda: adjectives,
        "Adverb": lambda: adverbs,
        "Proper Noun": lambda: proper_nouns,
        "All words": lambda: df_unique_lemmas,
    }
    try:
        getter = frame_getters[subgroup]
    except (KeyError, TypeError):
        # TypeError covers unhashable input such as a list of selections.
        raise ValueError(f"Unknown word subgroup: {subgroup!r}") from None
    return getter()


def get_artist_code(artist_name, db_path='CroLyrics_data/info.db'):
    """Look up an artist's code by its name.

    Args:
        artist_name: Value matched against the ``name`` column of the
            ``artists`` table.
        db_path: Path of the SQLite database file to query. Defaults to the
            database used throughout this notebook.

    Returns:
        The first matching row as a one-element tuple ``(code,)``, or ``None``
        when no artist has that name.
    """
    conn = sqlite3.connect(db_path)
    try:
        row = conn.execute(
            "SELECT code FROM artists WHERE name=:name", {'name': artist_name}
        ).fetchone()
    finally:
        # Close even when the query fails so connections are not leaked.
        conn.close()
    return row

In [None]:
def get_bar_plot(x, y, colors, x_label, y_label, hover=None, title = ""):
    """Create a horizontal bar chart whose bars are labelled by text.

    Args:
        x: Bar lengths.
        y: Labels drawn as text on the bars (the y tick labels are hidden).
        colors: One colour per bar; used literally via the "identity" map.
        x_label: Title of the x axis.
        y_label: Title of the y axis.
        hover: Optional hover template applied to all traces.
        title: Figure title.

    Returns:
        The configured plotly figure.
    """
    fig = px.bar(
        x=x,
        text=y,
        orientation='h',
        color=colors,
        color_discrete_map="identity",
        title=title,
    )

    figure_margin = {'l': 20, 'r': 20, 't': 25, 'b': 20}
    fig.update_yaxes(title=y_label, showticklabels=False, showgrid=False)
    fig.update_xaxes(title=x_label, showgrid=False)
    fig.update_layout(margin=figure_margin, plot_bgcolor='white')

    # Only override the default hover text when a template was supplied.
    if hover:
        fig.update_traces(hovertemplate=hover)
    return fig

def get_bar_colors(y, clickData, main_color="DimGray", highlight_color="crimson"):
    """Return one colour per bar, highlighting the clicked bar if present.

    Args:
        y: Sequence of bar labels (list, numpy array or pandas Index).
        clickData: Click payload shaped like ``{'points': [{'text': label}]}``,
            or a falsy value when nothing was clicked.
        main_color: Colour used for every non-highlighted bar.
        highlight_color: Colour used for the first bar whose label equals the
            clicked text.

    Returns:
        A list of ``len(y)`` colour strings. If the clicked label is not among
        ``y`` (for example after the set of displayed bars changed), no bar is
        highlighted instead of raising ``IndexError``.
    """
    colors = [main_color] * len(y)
    if clickData:
        clicked_label = clickData['points'][0].get('text')
        for position, label in enumerate(y):
            if label == clicked_label:
                colors[position] = highlight_color
                break  # only the first match is highlighted
    return colors

In [5]:
# Load every artist name from the database, sorted alphabetically, for the
# artist dropdown. try/finally guarantees the connection is released even if
# the query fails.
conn = sqlite3.connect('CroLyrics_data/info.db')
try:
    c = conn.cursor()
    c.execute("SELECT name FROM artists")
    all_artists = sorted(a[0] for a in c.fetchall())
finally:
    conn.close()

In [6]:
# Read the NLP annotation table; the code below relies on its ``upos``,
# ``lemma``, ``Song_ID`` and ``ner`` columns.
nlp_file_path = "CroLyrics_data/nlp_all.csv"
df = pd.read_csv(nlp_file_path)

# Keep only adverbs, adjectives, nouns, verbs and proper nouns. Chaining
# reset_index (instead of inplace=True on a filtered slice) yields a fresh,
# independent frame and keeps the cell idempotent.
df_main_words = df[df.upos.isin(["ADV", "ADJ", "NOUN", "VERB", "PROPN"])].reset_index(drop=True)

# One row per (lemma, song) pair, so value_counts() on ``lemma`` counts songs
# rather than occurrences. drop=False keeps the previous index as a column.
df_unique_lemmas = df_main_words.drop_duplicates(subset=["lemma", "Song_ID"]).reset_index(drop=False)

# Artist_ID is the part of Song_ID before the first underscore; the ``.str``
# accessor leaves missing Song_ID values as NaN instead of raising.
df_unique_lemmas["Artist_ID"] = df_unique_lemmas.Song_ID.str.split("_").str[0]

# Named-entity subsets: proper nouns carrying a matching NER tag.
locations = df_unique_lemmas[(df_unique_lemmas["ner"].isin(["B-LOC", "I-LOC"])) & (df_unique_lemmas["upos"]=="PROPN")]
names = df_unique_lemmas[(df_unique_lemmas["ner"].isin(["B-PER", "I-PER"])) & (df_unique_lemmas["upos"]=="PROPN")]
organizations = df_unique_lemmas[(df_unique_lemmas["ner"].isin(["B-ORG", "I-ORG"])) & (df_unique_lemmas["upos"]=="PROPN")]

# Part-of-speech subsets.
nouns = df_unique_lemmas[(df_unique_lemmas.upos=="NOUN")]
verbs = df_unique_lemmas[(df_unique_lemmas.upos=="VERB")]
adjectives = df_unique_lemmas[(df_unique_lemmas.upos=="ADJ")]
adverbs = df_unique_lemmas[(df_unique_lemmas.upos=="ADV")]
proper_nouns =  df_unique_lemmas[(df_unique_lemmas.upos=="PROPN")]


In [7]:
# Options offered by the word-type dropdowns. get_subdf() additionally accepts
# "Location", "Name" and "Organization", which are not exposed here.
wordtypes = ["All words", "Proper Noun", "General Noun", "Verb", "Adjective", "Adverb"]

In [69]:
# Stylesheet loaded from an external URL; it is passed to Dash below.
external_stylesheets = ['https://codepen.io/chriddyp/pen/bWLwgP.css']

app = Dash(__name__, external_stylesheets=external_stylesheets)


# Page layout, top to bottom:
#   1. controls for the word / artist bar plots,
#   2. the word bar plot, the artist bar plot and the songs table,
#   3. controls for the per-artist bar plot,
#   4. the per-artist bar plot,
#   5. a dcc.Store.
app.layout = html.Div([
    # Row 1: controls for the first row of plots.
    html.Div(className='row', children=[
        html.Div(className='three columns', children = [
                # Word subgroup shown in the word bar plot (initially 'All words').
                dcc.Dropdown(
                    wordtypes,
                    'All words',
                    id='barplot-subgroup',
                ),
                # Number of words to show; the values are strings and are
                # converted with int() in the update_graph callback.
                dcc.RadioItems(
                ['10', '20'],
                '10',
                id='show-top-x',
                labelStyle={'display': 'inline-block'})

                ], style={'width': '30%', 'display': 'inline-block'}),
        html.Div(className='three columns', children = [
            # Absolute song counts vs. percentage of songs for the artist plot.
            dcc.RadioItems(
                ['Absolute', 'Percentage'],
                'Absolute',
                id='abs-vs-perc-artists',
                labelStyle={'display': 'inline-block'}
            )
        ], style={'width': '30%', 'display': 'inline-block', 'padding': '0% 0%'})
    ], style={'padding': '5% 5% 0% 5%'}),

    # Row 2: word bar plot, artist bar plot, songs table.
    html.Div(className='row', children=[
        html.Div(className='three columns', children = [
            dcc.Graph(
                id='barplot-most-common'
            )
        ], style={'display': 'inline-block', 'width': '30%'}),

        html.Div(className='three columns', children = [
            dcc.Graph(id='barplot-artists'),
        ], style={'display': 'inline-block', 'width': '30%'}),

        # Filled by the update_songs_table callback.
        html.Div(className='three columns', id='songs-table', children=[], style={'display': 'inline-block', 'width': '30%'})

    ], style={'padding': '0% 5%'}),

    # Row 3: controls for the per-artist plot.
    html.Div(className='row', children=[
        html.Div(className='three columns', children = [
                # NOTE(review): all_artists[0] raises IndexError if the
                # artists query returned no rows.
                dcc.Dropdown(
                    all_artists,
                    all_artists[0],
                    id='barplot-subgroup-artist',
                ),
                dcc.Dropdown(
                    wordtypes,
                    'All words',
                    id='barplot-subgroup-wordtype',
                )], style={'width': '30%', 'display': 'inline-block'})
        

    ], style={'padding': '5% 5% 0% 5%'}),


    # Row 4: per-artist bar plot.
    html.Div(className='row', children=[
        html.Div(className='three columns', children = [
            # NOTE(review): this initial clickData is not read by any callback
            # visible in this notebook - confirm whether it is still needed.
            dcc.Graph(
                id='barplot-most-common-by-artist',
                clickData={'points': [{'text': 'Hrvatska'}]}
            )
        ], style={'display': 'inline-block', 'width': '30%'})

    ], style={'padding': '0% 5%'}),

   
    # NOTE(review): 'intermediate-df' is not referenced by any callback visible
    # in this notebook - confirm whether it is still needed.
    html.Div([dcc.Store(id='intermediate-df')])
])




def create_artists_plot(artists, counts, total_counts, abs_or_perc, lemma, clickData):
    """Build the bar plot of the artists that mention a word most often.

    Args:
        artists: Artist names, aligned with ``counts``.
        counts: Number of songs per artist that mention ``lemma``.
        total_counts: Total number of songs per artist, aligned with ``counts``.
        abs_or_perc: "Absolute" plots ``counts`` in the given order;
            "Percentage" plots ``counts / total_counts * 100`` sorted
            descending.
        lemma: Word shown in the figure title.
        clickData: Click payload used to highlight one artist bar, or None.

    Returns:
        The plotly figure produced by ``get_bar_plot`` for the first ten
        entries.
    """
    if abs_or_perc == "Absolute":
        x, text = counts, artists
        x_label = "Number of songs"
        hover = 'Artist: %{text} <br>Number of songs: %{x}'
    elif abs_or_perc == "Percentage":
        # Rank (share, artist) pairs from the highest share downwards.
        ranked = sorted(zip(counts/total_counts, artists), reverse=True)
        x = [share*100 for share, _ in ranked]
        text = [artist for _, artist in ranked]
        x_label = "Percentage of songs"
        hover = 'Artist: %{text} <br>% of songs: %{x}'

    # Take the first ten entries and reverse them before plotting.
    bar_values = np.flip(x[0:10])
    bar_labels = np.flip(text[0:10])
    bar_colors = get_bar_colors(bar_labels, clickData)

    return get_bar_plot(
        bar_values,
        bar_labels,
        bar_colors,
        x_label=x_label,
        y_label="Artist",
        hover=hover,
        title=f"Top 10 artists mentioning the word {lemma}",
    )


@app.callback(
    Output('barplot-most-common', 'figure'),
    Input('barplot-most-common', 'clickData'),
    Input('barplot-subgroup', 'value'),
    Input('show-top-x', 'value'))
def update_graph(clickData, subgroup, n):
    """Redraw the bar plot of the most common words of a subgroup.

    Args:
        clickData: Click payload of this same graph; the clicked word's bar is
            highlighted when it is among the displayed bars.
        subgroup: Word subgroup label selected in the dropdown, or None when
            the dropdown has been cleared.
        n: Number of words to display, as a string ('10' or '20').

    Returns:
        The updated plotly figure.

    Raises:
        dash.exceptions.PreventUpdate: When no subgroup is selected, so the
            current figure is kept instead of the callback failing.
    """
    if subgroup is None:
        raise dash.exceptions.PreventUpdate

    # value_counts() is sorted descending, so head() yields the top n words.
    top_counts = get_subdf(subgroup).lemma.value_counts().head(int(n))

    # Reverse the order before plotting.
    x = np.flip(top_counts.values)
    y = np.flip(top_counts.index)

    colors = get_bar_colors(y, clickData)
    return get_bar_plot(
        x, y, colors,
        x_label="Number of songs",
        y_label="Word",
        hover='Word: %{text} <br>Number of songs: %{x}',
    )







@app.callback(
    Output('barplot-artists', 'figure'),
    Input('barplot-most-common', 'clickData'),
    Input('barplot-artists', 'clickData'),
    Input('abs-vs-perc-artists', 'value'),
    Input('barplot-subgroup', 'value'))
def update_artists_plot(clickData, clickData_artist, abs_or_perc, subgroup):
    """Redraw the bar plot of the artists mentioning the selected word.

    Args:
        clickData: Click payload of the word bar plot, or None.
        clickData_artist: Click payload of this artist bar plot, or None.
        abs_or_perc: "Absolute" or "Percentage".
        subgroup: Word subgroup label selected in the dropdown, or None when
            the dropdown has been cleared.

    Returns:
        The updated plotly figure.

    Raises:
        dash.exceptions.PreventUpdate: When no subgroup is selected, so the
            current figure is kept instead of the callback failing.
    """
    if subgroup is None:
        raise dash.exceptions.PreventUpdate

    dataframe = get_subdf(subgroup)

    # Component id of the input that triggered this call.
    input_id = dash.callback_context.triggered[0]['prop_id'].split('.')[0]

    # The clicked word is only trusted when the trigger was a click or the
    # absolute/percentage toggle; otherwise (e.g. a subgroup change) fall back
    # to the most common word of the subgroup.
    keeps_clicked_word = input_id in ("barplot-most-common", "abs-vs-perc-artists", "barplot-artists")
    if clickData and keeps_clicked_word:
        lemma = clickData['points'][0]['text']
    else:
        lemma = dataframe.lemma.value_counts().index[0]

    # Only highlight an artist bar when an artist click caused this update.
    if input_id != "barplot-artists":
        clickData_artist = None

    value_counts_artist = dataframe[dataframe.lemma == lemma].Artist_ID.value_counts()
    artists = value_counts_artist.index.map(lambda x: get_artist_name(x)[0])
    counts = value_counts_artist.values
    total_counts = [get_number_of_songs(a)[0] for a in value_counts_artist.index]

    return create_artists_plot(artists, counts, total_counts, abs_or_perc, lemma, clickData_artist)


@app.callback(
    [Output('songs-table', 'children'), Output('barplot-most-common', 'clickData')],
    [Input('barplot-most-common', 'clickData'),
    Input('barplot-subgroup', 'value'),
    Input('barplot-artists', 'clickData')])
def update_songs_table(clickData_word, subgroup, clickData_artist):
    """Rebuild the table of songs mentioning the selected word.

    Args:
        clickData_word: Click payload of the word bar plot, or None if no word
            has been clicked yet.
        subgroup: Word subgroup label selected in the dropdown, or None when
            the dropdown has been cleared.
        clickData_artist: Click payload of the artist bar plot, or None.

    Returns:
        A tuple ``(table_component, clickData_word)``. When the word was not
        chosen by a click, the ``text`` of an existing ``clickData_word`` is
        overwritten with the subgroup's most common word.

    Raises:
        dash.exceptions.PreventUpdate: When no subgroup is selected, so the
            current outputs are kept instead of the callback failing.
    """
    if subgroup is None:
        raise dash.exceptions.PreventUpdate

    dataframe = get_subdf(subgroup)

    input_id = dash.callback_context.triggered[0]['prop_id']
    word_clicked = input_id == "barplot-most-common.clickData"
    artist_clicked = input_id == "barplot-artists.clickData"

    # An artist bar can be clicked before any word was clicked, in which case
    # clickData_word is still None; fall back to the most common word (the one
    # the artist plot shows in that state) instead of subscripting None.
    if clickData_word and (word_clicked or artist_clicked):
        lemma = clickData_word['points'][0]['text']
    else:
        lemma = dataframe.lemma.value_counts().index[0]
        if clickData_word: clickData_word['points'][0]['text'] = lemma

    # Restrict to the clicked artist only when that click triggered the update.
    selected = dataframe.lemma == lemma
    if artist_clicked and clickData_artist:
        artist_id = get_artist_code(clickData_artist['points'][0]['text'])[0]
        selected = selected & (dataframe.Artist_ID == artist_id)
    songs_ids = dataframe[selected].Song_ID.tolist()

    all_data = []
    conn = sqlite3.connect('CroLyrics_data/info.db')
    try:
        c = conn.cursor()
        # Query in chunks of 999 ids - presumably to stay below SQLite's
        # default limit on bound parameters; TODO confirm.
        for i in range(0, len(songs_ids), 999):
            chunk = songs_ids[i:i+999]
            placeholders = ', '.join('?' for _ in chunk)
            query = 'SELECT name, artist_name, views, url FROM songs_info WHERE code IN (%s)' % placeholders
            c.execute(query, chunk)
            all_data.extend(c.fetchall())
    finally:
        conn.close()

    # Passing the column names to the constructor keeps this valid when the
    # query returned no rows (assigning .columns afterwards would raise).
    df_songs = pd.DataFrame(all_data, columns=["Song", "Artist", "Views", "Lyrics Url"])
    # Turn the song name into a markdown link to its lyrics page.
    df_songs["Song"] = "[" + df_songs["Song"] + "](" + df_songs["Lyrics Url"] + ")"
    df_songs = df_songs.drop("Lyrics Url", axis=1)
    df_songs = df_songs.sort_values("Views", ascending=False)

    return generate_table(df_songs, lemma), clickData_word



@app.callback(
    Output('barplot-most-common-by-artist', 'figure'),
    Input('barplot-subgroup-artist', 'value'),
    Input('barplot-subgroup-wordtype', 'value'))
def update_graph_by_artist(artist, wordtype):
    """Redraw the bar plot of one artist's ten most common words.

    Args:
        artist: Artist name selected in the dropdown, or None when cleared.
        wordtype: Word subgroup label selected in the dropdown, or None when
            cleared.

    Returns:
        The updated plotly figure.

    Raises:
        dash.exceptions.PreventUpdate: When either dropdown is empty or the
            artist is not found in the database, so the current figure is kept
            instead of the callback failing on ``None[0]``.
    """
    if artist is None or wordtype is None:
        raise dash.exceptions.PreventUpdate

    code_row = get_artist_code(artist)
    if code_row is None:
        raise dash.exceptions.PreventUpdate
    artist_id = code_row[0]

    dataframe = get_subdf(wordtype)
    # value_counts() is sorted descending, so head(10) yields the top ten.
    top_counts = dataframe[dataframe.Artist_ID == artist_id].lemma.value_counts().head(10)

    # Reverse the order before plotting.
    fig = px.bar(
        x=np.flip(top_counts.values),
        text=np.flip(top_counts.index),
        orientation='h',
    )

    fig.update_traces(hovertemplate='Word: %{text} <br>Number of songs: %{x}')

    fig.update_yaxes(title=wordtype, showticklabels=False)
    fig.update_xaxes(title="Number of songs")
    fig.update_layout(margin=dict(l=20, r=20, t=20, b=20))
    return fig

# Start the Dash development server with debug mode on and the reloader
# turned off.
# NOTE(review): use_reloader=False is presumably needed because the app is
# launched from inside a notebook kernel - confirm.
if __name__ == '__main__':
    app.run_server(debug=True, use_reloader=False)

Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

Dash is run