In [2]:
from dash import Dash, html, dcc, Input, Output, dash_table, State
import dash
import pandas as pd
import plotly.express as px

import pandas as pd
from os import path
import numpy as np
import matplotlib.pyplot as plt
import sqlite3
import plotly.graph_objects as go
import dash_daq as daq

import plotly.io as pio




In [None]:
main_color="DimGray"
highlight_color="crimson"

colors = {
    'background': '#111111',
    'text': "lightgray",#'white', #'#7FDBFF',
    'graph_background': 'rgb(30,30,30)',
    'bars': px.colors.qualitative.Plotly[0],
    'highlight': px.colors.qualitative.Plotly[1],
    'padding': 'rgb(30,30,30)'
}

In [70]:
wordtypes = ["All words", "Proper Noun", "General Noun", "Verb", "Adjective", "Adverb", "Determiner"]
upos_wordtypes = ["PNOUN", "NOUN", "VERB", "ADJ", "ADV", "DET"]
upos_dict = dict(zip(wordtypes[1:], upos_wordtypes))

In [3]:
def get_artist_name(artist_code):
    conn = sqlite3.connect('CroLyrics_data/info.db')
    c = conn.cursor()
    c.execute("SELECT name FROM artists WHERE code=:code", {'code': artist_code})
    return c.fetchone()

def get_number_of_songs(artist_code):
    conn = sqlite3.connect('CroLyrics_data/info.db')
    c = conn.cursor()
    c.execute("SELECT number_of_songs FROM artists WHERE code=:code", {'code': artist_code})
    return c.fetchone()

In [176]:
def get_top_n(l, n, reverse=False):
    result=[]
    if n>len(l): n=len(l)
    elif n==0: return result
    if not reverse:
        for i in l:
            if i<l[n-1]: break
            result.append(i)
    else:
        l = np.flip(l)
        for i in l:
            if i>l[n-1]: break
            result.append(i)
            
    return result
    

In [4]:
def generate_table(df, word):
    return html.Div(children = [html.H6(children=f'List of songs mentioning {word}'),
    	dash_table.DataTable(
            id='datatable-interactivity',
            columns=[
            {"name": i, "id": i, "deletable": False, "selectable": False, "presentation": 'markdown'} for i in df.columns
        ],
        data=df.to_dict('records'),
        editable=False,
        filter_action="native",
        sort_action="native",
        sort_mode="multi",
        row_selectable=False,
        row_deletable=False,
        page_action="native",
        page_current= 0,
        page_size= 10,
        style_header={
        'backgroundColor': 'rgb(30, 30, 30)',
        'color': 'white'
    },
    style_data={
        'backgroundColor': 'rgb(50, 50, 50)',
        'color': 'white'}

    )])


In [6]:
def get_subdf(subgroup): 
    if subgroup == "Location": dataframe = locations
    elif subgroup == "Name": dataframe = names
    elif subgroup == "Organization": dataframe = organizations
    elif subgroup == "General Noun": dataframe = nouns
    elif subgroup == "Verb": dataframe = verbs
    elif subgroup == "Adjective": dataframe = adjectives
    elif subgroup == "Adverb": dataframe = adverbs
    elif subgroup == "Proper Noun": dataframe = proper_nouns
    elif subgroup == "All words": dataframe = df_unique_lemmas
    elif subgroup == "Determiner": dataframe = determiners
    return dataframe

def get_subdf_by_wordlist(word_list): 
    dataframe = df_unique_lemmas[df_unique_lemmas.lemma.isin(word_list)]
    return dataframe



def get_artist_code(artist_name):
    conn = sqlite3.connect('CroLyrics_data/info.db')
    c = conn.cursor()
    c.execute("SELECT code FROM artists WHERE name=:name", {'name': artist_name})
    return c.fetchone()

In [7]:
def get_bar_plot(x, y, colors, x_label, y_label, hover=None, title = "", template ="plotly_dark", background_color = colors['graph_background']):
    fig = px.bar(x=x, text=y, orientation='h', color = colors, color_discrete_map = "identity", title = title, template=template)
    
    fig.update_yaxes(title=y_label, showticklabels=False, showgrid=False)
    fig.update_xaxes(title=x_label, showgrid=False)
    fig.update_layout(margin=dict(l=50, r=50, t=50, b=50), plot_bgcolor=background_color, paper_bgcolor =background_color)
    if hover: fig.update_traces(hovertemplate=hover)
    return fig

def get_bar_colors(y, highlighted=None, main_color=colors['bars'], highlight_color=colors['highlight']):
    colors = [main_color] * len(y)
    if highlighted: 
        colors[np.where(np.array(y) == highlighted)[0][0]] = highlight_color
    return colors

In [8]:
conn = sqlite3.connect('CroLyrics_data/info.db')
c = conn.cursor()
c.execute("SELECT name FROM artists")
all_artists = [a[0] for a in c.fetchall()]
all_artists.sort()
conn.close()

In [9]:
nlp_file_path = f"CroLyrics_data/nlp_all.csv"
df = pd.read_csv(nlp_file_path)
df_main_words = df[df.upos.isin(["ADV", "ADJ", "NOUN", "VERB", "PROPN", "DET"])]
df_main_words.reset_index(drop=True, inplace=True)

df_unique_lemmas = df_main_words.drop_duplicates(subset=["lemma", "Song_ID"]).reset_index(drop=False)
df_unique_lemmas["Artist_ID"] = df_unique_lemmas.Song_ID.str.split("_").map(lambda x: x[0])
locations = df_unique_lemmas[(df_unique_lemmas["ner"].isin(["B-LOC", "I-LOC"])) & (df_unique_lemmas["upos"]=="PROPN")]#["lemma"]#.value_counts()
names = df_unique_lemmas[(df_unique_lemmas["ner"].isin(["B-PER", "I-PER"])) & (df_unique_lemmas["upos"]=="PROPN")]#["lemma"]#.value_counts()
organizations = df_unique_lemmas[(df_unique_lemmas["ner"].isin(["B-ORG", "I-ORG"])) & (df_unique_lemmas["upos"]=="PROPN")]#["lemma"]#.value_counts()
nouns = df_unique_lemmas[(df_unique_lemmas.upos=="NOUN")]#["lemma"]#.value_counts()
verbs = df_unique_lemmas[(df_unique_lemmas.upos=="VERB")]#["lemma"]#.value_counts()
adjectives = df_unique_lemmas[(df_unique_lemmas.upos=="ADJ")]#["lemma"]#.value_counts()
adverbs = df_unique_lemmas[(df_unique_lemmas.upos=="ADV")]#["lemma"]#.value_counts()
proper_nouns =  df_unique_lemmas[(df_unique_lemmas.upos=="PROPN")]
determiners =  df_unique_lemmas[(df_unique_lemmas.upos=="DET")]



In [10]:
def get_connected_words(df, wordtype1, wordtype2):
  wordtype1_indices = df[df.upos==wordtype1].index
  if wordtype1_indices[-1] == len(df)-1: wordtype1_indices = wordtype1_indices[0:-1]

  wordtype2_series = df.loc[wordtype1_indices+1, "upos"] == wordtype2
  wordtype2_indices = wordtype2_series.index[np.where(wordtype2_series)]
  wordtype1_df = df.loc[wordtype2_indices-1][["lemma", "text", "Song_ID", "Song"]].reset_index(drop=True)
  wordtype1_df.columns = [wordtype1+"_lemma", wordtype1+"_text", "Song_ID", "Song"]
  wordtype2_df = df.loc[wordtype2_indices][["lemma", "text", "Song_ID"]].reset_index(drop=True)
  wordtype2_df.columns = [wordtype2+"_lemma", wordtype2+"_text", "Song_ID_2"]

  result_df = wordtype1_df.merge(wordtype2_df, left_index=True, right_index=True)
  result_df = result_df[result_df.Song_ID == result_df.Song_ID_2].reset_index(drop=True)
  result_df.drop("Song_ID_2", axis=1, inplace=True)
  result_df["concat_lemma"] = result_df[wordtype1+"_lemma"]+ " " +result_df[wordtype2+"_lemma"]
  result_df["concat_text"] = result_df[wordtype1+"_text"]+ " " +result_df[wordtype2+"_text"]
  return result_df


In [18]:
df.upos.unique()

array(['ADV', 'PUNCT', 'ADP', 'PRON', 'AUX', 'INTJ', 'DET', 'ADJ', 'NOUN',
       'PART', 'VERB', 'SCONJ', 'CCONJ', 'X', 'NUM', 'PROPN', 'SYM'],
      dtype=object)

In [31]:
df[df.upos=="CCONJ"]

Unnamed: 0,id,text,lemma,upos,xpos,feats,head,deprel,misc,ner,...,Number[psor],Poss,Definite,Polarity,Reflex,Voice,Foreign,NumType,Animacy,Gender[psor]
46,1,A,a,CCONJ,Cc,,6,discourse,,O,...,,,,,,,,,,
102,6,i,i,CCONJ,Cc,,7,discourse,,O,...,,,,,,,,,,
219,12,i,i,CCONJ,Cc,,13,cc,,O,...,,,,,,,,,,
247,12,i,i,CCONJ,Cc,,13,cc,,O,...,,,,,,,,,,
269,9,I,i,CCONJ,Cc,,10,discourse,,O,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
671000,27,al,ali,CCONJ,Cc,,31,cc,SpaceAfter=No,O,...,,,,,,,,,,
671047,2,i,i,CCONJ,Cc,,3,cc,,O,...,,,,,,,,,,
671050,5,i,i,CCONJ,Cc,,8,cc,,O,...,,,,,,,,,,
671121,2,i,i,CCONJ,Cc,,3,cc,,O,...,,,,,,,,,,


In [44]:
  upos_to_include = ["ADV", "ADP", "PRON", "AUX", "INTJ", "DET", "ADJ", "NOUN", "PART", "VERB", "SCONJ", "CCONJ", 'NUM', 'PROPN']
  word_indices = df[df.lemma=="ljubav"].index

  connected_words_bool = df.loc[word_indices-1, "upos"].isin(upos_to_include)
  connected_words_indices = connected_words_bool.index[np.where(connected_words_bool)]



In [None]:
word_df = df.loc[words_before_indices+1][["lemma", "text", "Song_ID", "Song"]].reset_index(drop=True)

word_before_df = df.loc[words_before_indices][["lemma", "text", "Song_ID", "Song"]].reset_index(drop=True)
word_before_df.columns = ["before_lemma", "before_text", "before_Song_ID"].reset_index(drop=True)

word_after_df = df.loc[words_before_indices][["lemma", "text", "Song_ID", "Song"]].reset_index(drop=True)
word_before_df.columns = ["before_lemma", "before_text", "before_Song_ID"].reset_index(drop=True)

In [54]:
def get_connected_words(df, word, before_or_after = "before", upos_to_include = ["ADV", "ADP", "PRON", "AUX", "INTJ", "DET", "ADJ", "NOUN", "PART", "VERB", "SCONJ", "CCONJ", 'NUM', 'PROPN']):
  word_indices = df[df.lemma==word].index
  

  if before_or_after == "before": shift = -1
  elif before_or_after == "after": shift = 1
    
  connected_words_bool = df.loc[word_indices+shift, "upos"].isin(upos_to_include)
  connected_words_indices = connected_words_bool.index[np.where(connected_words_bool)]

  connected_words_df = df.loc[connected_words_indices][["lemma", "text", "Song_ID"]].reset_index(drop=True)
  connected_words_df.columns = [before_or_after+"_lemma", before_or_after+"_text", "connected_Song_ID"]

  word_df = df.loc[connected_words_indices-shift][["text", "Song_ID"]].reset_index(drop=True)

  result_df = word_df.merge(connected_words_df, left_index=True, right_index=True)
  result_df = result_df[result_df.Song_ID == result_df.connected_Song_ID].reset_index(drop=True)
  result_df.drop("connected_Song_ID", axis=1, inplace=True)

  return result_df

In [64]:
get_connected_words(df, "Zagreb", before_or_after = "before", upos_to_include = ["ADV", "PRON", "DET", "ADJ", "NOUN", "VERB", 'NUM', 'PROPN']).before_lemma.value_counts()[0:20]


nov       1
ti        1
hodati    1
Name: before_lemma, dtype: int64

In [73]:
upos_dict.values()

dict_values(['PNOUN', 'NOUN', 'VERB', 'ADJ', 'ADV', 'DET'])

In [74]:
get_connected_words(df, "ljubav", before_or_after = "after", upos_to_include = upos_dict.values()).after_lemma.value_counts()[0:20]


moj        90
tvoj       33
jedini     31
koji       27
sav        27
naš        21
nemati     19
imati      18
taj        17
moći       15
svoj       15
prav       12
star       12
nov        12
velik      12
nikad      12
živjeti    11
čuvati     11
dati       11
uzeti      10
Name: after_lemma, dtype: int64

In [85]:
get_connected_words(df, "ljubav", before_or_after = "after", upos_to_include = [upos_dict[k] for k in ["Proper Noun"]]).after_lemma.value_counts()[0:20]


Series([], Name: after_lemma, dtype: int64)

In [11]:
adj_noun = get_connected_words(df, "ADJ", "NOUN")
det_noun = get_connected_words(df, "DET", "NOUN")
adv_verb = get_connected_words(df, "ADV", "VERB")


In [12]:
det_noun_unique = det_noun.drop_duplicates(subset=["Song_ID", "concat_lemma"])
adj_noun_unique = adj_noun.drop_duplicates(subset=["Song_ID", "concat_lemma"])
adv_verb_unique = adv_verb.drop_duplicates(subset=["Song_ID", "concat_lemma"])


In [13]:
def get_sankey_data(df, source_words, source_column, target_column, num_targets=5):

    num_sources = len(source_words)

    label = source_words.copy()
    link_size = []
    target = []
    song_ids = []
    for source_word in source_words:
        top_n_words = df[df[source_column]==source_word][target_column].value_counts()[0:num_targets]
        
        link_size.extend(top_n_words.values)
        for target_word in top_n_words.index:
            song_ids.append(df[(df[source_column]==source_word) & (df[target_column]==target_word)].Song_ID.values)

            if not target_word in label:
                label.append(target_word)
                target.append(len(label)-1)
            else: target.append(label.index(target_word))

    source = []
    for i in range(num_sources):
        source.extend([i]*num_targets)
    
    return label, source, target, link_size, song_ids

In [14]:
def get_nodes_color(label, target, link_size):
    first_color = "blue"
    first_color_light = "lightblue"
    second_color = "yellow"
    second_color_light = "lightyellow"
    colors = [first_color, second_color]
    for i, _ in enumerate(label[2:]):
        if target.count(i+2)==1:
            if i<len(target)/2: colors.append(first_color)
            else: colors.append(second_color)

        else:
            compare_sizes = [link_size[i] for i in np.where(np.array(target) == i+2)[0]]
            if compare_sizes[0]>compare_sizes[1]: colors.append(first_color_light)
            else: colors.append(second_color_light)
    return colors

In [15]:

def get_sankey_diagram(label, source, target, link_size):

  node_colors = px.colors.qualitative.Plotly[0:max(source)+1]
  node_colors = node_colors + [main_color]*(len(label)-len(node_colors))
  
  fig = go.Figure(data=[go.Sankey(
      node = dict(
        pad = 15,
        thickness = 20,
        line = dict(color = "black", width = 0.5),
        label = label
      ),
      link = dict(
        source = source,
        target = target,
        value = link_size,
        color = [node_colors[s] for s in source]
        
    ))])

  
  fig.update_traces(node_color = node_colors)
  fig.layout.template = 'plotly_dark'
  fig.update_layout(margin=dict(l=50, r=50, t=50, b=50), plot_bgcolor='rgb(30,30,30)', paper_bgcolor ='rgb(30,30,30)')
  return fig

In [179]:
external_stylesheets = ['https://codepen.io/chriddyp/pen/bWLwgP.css']

app = Dash(__name__, external_stylesheets=external_stylesheets)


app.layout = html.Div([
    html.Div(className='row', children=[html.H3(className='twelve columns', children=['Words used in the largest number of songs'])], style={'padding': '5% 5% 0% 5%'}),
    html.Div(className='row', children=[
        
        html.Div(className='three columns', children = [
                html.H6('Select the words by POS tag'),
                dcc.Dropdown(
                    wordtypes,
                    'All words',
                    id='barplot-subgroup', style={'color': colors['background']}
                ),
                html.Div(children = [
                    html.Span('Show top:'),
                    dcc.RadioItems(
                    ['10', '20'],
                    '10',
                    id='show-top-x',
                    labelStyle={'display': 'inline-block'})]
                ),
                dcc.Input(
                id="search-word",
                type="text",
                placeholder="", 
                debounce=True
                ),

                ], style={'width': '30%', 'display': 'inline-block', 'padding': '0px 0px 10px 0px'}),

        html.Div(className='three columns', children = [
            html.H6('Artists using the word'),
            dcc.RadioItems(
                ['Absolute', 'Percentage'],
                'Absolute',
                id='abs-vs-perc-artists',
                labelStyle={'display': 'inline-block'}),
            dcc.RadioItems(
                ['Descending', 'Ascending'],
                'Descending',
                id='desc-vs-asc-artist',
                labelStyle={'display': 'inline-block'}
            )
        ], style={'width': '30%', 'display': 'inline-block', 'padding': '50px 0px 0px 0px'})
    ], style={'padding': '0% 5% 0% 5%'}),

    html.Div(className='row', children=[
        html.Div(className='three columns', children = [
            dcc.Graph(
                id='barplot-most-common'
            )
        ], style={'display': 'inline-block', 'width': '30%'}),

        html.Div(className='three columns', children = [
            dcc.Graph(id='barplot-artists'),
        ], style={'display': 'inline-block', 'width': '30%'}),

        html.Div(className='three columns', id='songs-table', children=[], style={'display': 'inline-block', 'width': '30%'})

    ], style={'padding': '0% 5%'}),



    html.Div(className='row', children=[
        html.Div(className='three columns', children = [
                dcc.RadioItems(
                    ['5', '10'],
                    '5',
                    id='show-top-x-nouns',
                    labelStyle={'display': 'inline-block'}
                )
        ], style={'display': 'inline-block', 'width': '30%'})
    ], style={'padding': '5% 5% 0% 5%'}),


    html.Div(className='row', children=[
        html.Div(className='three columns', children = [
            dcc.Graph(
                id='sankey-diagram-noun-adjective'
            ),
        ], style={'display': 'inline-block', 'width': '30%'}),

        html.Div(className='three columns', id='songs-table-sankey', children=[], style={'display': 'inline-block', 'width': '30%'})

    ], style={'padding': '0% 5%'}),


    html.Div(className='row', children=[
        html.Div(className='three columns', children = [
            dcc.Input(
            id="sankey-diagram-search-input",
            type="text",
            placeholder="", 
            debounce=True
            ),

            dcc.Dropdown(
                    ["Noun", "Verb", "Adjective", "Adverb"],
                    'Noun',
                    id='sankey-diagram-search-upos', style={'color': colors['background']}
                ),

            dcc.Graph(
                id='sankey-diagram-search'
            ),
        ], style={'display': 'inline-block', 'width': '30%'}),

        html.Div(className='three columns', id='songs-table-sankey-search', children=[], style={'display': 'inline-block', 'width': '30%'}) 
    ], style={'padding': '0% 5%'}),


        html.Div(className='row', children=[
        html.Div(className='three columns', children = [
                dcc.Dropdown(
                    all_artists,
                    all_artists[0],
                    id='barplot-subgroup-artist',
                ),
                dcc.Dropdown(
                    wordtypes,
                    'All words',
                    id='barplot-subgroup-wordtype',
                )], style={'width': '30%', 'display': 'inline-block'})
        

    ], style={'padding': '5% 5% 0% 5%'}),


    html.Div(className='row', children=[
        html.Div(className='three columns', children = [
            dcc.Graph(
                id='barplot-most-common-by-artist'
            )
        ], style={'display': 'inline-block', 'width': '30%'})

    ], style={'padding': '0% 5%'}),


    html.Div(children = [dcc.Store(id='highlighted-word'), dcc.Store(id='search-or-top'), dcc.Store(id='song_ids-sankey'), dcc.Store(id='song_ids-sankey-search')])

   
  
], style={'backgroundColor': colors['background'], 'color': colors['text'], 'height':'100vh', 'width':'100%', 'height':'100%', 'top':'0px', 'left':'0px'})




def create_artists_plot(artists, counts, total_counts, abs_or_perc, desc_or_asc, lemma, clickData):
    if abs_or_perc == "Absolute": 
        x = counts
        text = artists
        x_label = "Number of songs"
        hover = 'Artist: %{text} <br>Number of songs: %{x}'
    elif abs_or_perc == "Percentage": 
        sorted_perc = [[x, y] for x, y in sorted(zip(counts/total_counts, artists), reverse=True)]
        x = [s[0]*100 for s in sorted_perc]
        text = [s[1] for s in sorted_perc]
        x_label = "Percentage of songs"
        hover = 'Artist: %{text} <br>% of songs: %{x}'
    
    if desc_or_asc == "Descending":
        
        x=np.flip(get_top_n(x, 10))
        y=np.flip(text[0:len(x)])

    elif desc_or_asc == "Ascending":
        x=np.flip(get_top_n(x, 10, reverse=True))
        y=text[-len(x):]

    if clickData: highlighted = clickData['points'][0]['text']
    else: highlighted = None
   

    colors = get_bar_colors(y, highlighted)
    fig = get_bar_plot(x, y, colors, x_label=x_label, y_label="Artist", hover= hover, title=f"Artists mentioning the word '{lemma}'")

    return fig

@app.callback(Output('search-word', 'value'), Input('barplot-subgroup', 'value'))
def clear_searchbox(subgroup):
    return ""

@app.callback(
    [Output('barplot-most-common', 'figure'), Output('search-or-top', 'data'), Output('highlighted-word', 'data')],
    [Input('barplot-most-common', 'clickData'),
    Input('barplot-subgroup', 'value'),
    Input('show-top-x', 'value'),
    Input('search-word', 'value'),
    State('search-or-top', 'data')])
def update_graph(clickData, subgroup, n, words, search_or_top):
    n= int(n)
    
    ctx = dash.callback_context
    input_id = ctx.triggered[0]['prop_id']

    if input_id == "barplot-subgroup.value": search_or_top = "top"
    elif input_id == "search-word.value": search_or_top = "search"

    if search_or_top=="search" and len(words)>0:
        wordlist = [w.strip() for w in words.split(",")]
        dataframe = get_subdf_by_wordlist(wordlist)

    else: 
        dataframe = get_subdf(subgroup)
    
    value_counts = dataframe.lemma.value_counts()

    x=np.flip(get_top_n(value_counts.values, n))
    y=np.flip(value_counts[0:len(x)].index)
    
    if clickData: 
        highlighted = clickData['points'][0]['text']
        if not highlighted in y: highlighted = y[-1]
    else: highlighted = y[-1]
    colors = get_bar_colors(y, highlighted)
    
    fig = get_bar_plot(x, y, colors, x_label = "Number of songs", y_label="Word", hover= 'Word: %{text} <br>Number of songs: %{x}')

 
    return fig, search_or_top, highlighted







@app.callback(
    Output('barplot-artists', 'figure'),
    Input('highlighted-word', 'data'),
    Input('barplot-artists', 'clickData'),
    Input('abs-vs-perc-artists', 'value'),
    Input('desc-vs-asc-artist', 'value'),
    Input('barplot-subgroup', 'value'),
    Input('search-word', 'value'),
    Input('search-or-top', 'data'))
def update_artists_plot(highlighted_word, clickData_artist, abs_or_perc, desc_or_asc, subgroup, words, search_or_top):
    
    
    ctx = dash.callback_context
    input_id = ctx.triggered[0]['prop_id']
    if search_or_top == "search" and len(words)>0:
        wordlist = [w.strip() for w in words.split(",")]
        dataframe = get_subdf_by_wordlist(wordlist)

    else: dataframe = get_subdf(subgroup)
    
    

    lemma = highlighted_word
    if input_id != "barplot-artists.clickData": clickData_artist=None

    value_counts_artist = dataframe[dataframe.lemma == lemma].Artist_ID.value_counts()
    artists = value_counts_artist.index.map(lambda x: get_artist_name(x)[0])
    counts = value_counts_artist.values
    total_counts = [get_number_of_songs(a)[0] for a in value_counts_artist.index]
    
    return create_artists_plot(artists, counts, total_counts, abs_or_perc, desc_or_asc, lemma, clickData_artist)


@app.callback(
    Output('songs-table', 'children'),
    Input('highlighted-word', 'data'),
    Input('barplot-subgroup', 'value'),
    Input('barplot-artists', 'clickData'),
    Input('search-word', 'value'),
    Input('search-or-top', 'data'))
def update_songs_table(highlighted_word, subgroup, clickData_artist, words, search_or_top):


    ctx = dash.callback_context
    input_id = ctx.triggered[0]['prop_id']

    if search_or_top == "search" and len(words)>0:
        wordlist = [w.strip() for w in words.split(",")]
        dataframe = get_subdf_by_wordlist(wordlist)

    else: dataframe = get_subdf(subgroup)


    lemma = highlighted_word

    if input_id == "barplot-artists.clickData": 
        artist_id = get_artist_code(clickData_artist['points'][0]['text'])[0]
        songs_ids = dataframe[(dataframe.lemma == lemma) & (dataframe.Artist_ID == artist_id)].Song_ID.values
    else: songs_ids = dataframe[dataframe.lemma == lemma].Song_ID.values

    all_data = []
    conn = sqlite3.connect('CroLyrics_data/info.db')
    c = conn.cursor()
    for i in range(0, len(songs_ids), 999):
        placeholder= '?'
        placeholders= ', '.join(placeholder for unused in songs_ids[i:i+999])
        query= 'SELECT name, artist_name, views, url FROM songs_info WHERE code IN (%s)' % placeholders
        c.execute(query, songs_ids[i:i+999])
        all_data.extend(c.fetchall())

    df_songs = pd.DataFrame(all_data)
    df_songs.columns = ["Song", "Artist", "Views", "Lyrics Url"]
    df_songs["Song"] = "[" + df_songs["Song"] + "](" + df_songs["Lyrics Url"] + ")"
    df_songs = df_songs.drop("Lyrics Url", axis=1)
    df_songs = df_songs.sort_values("Views", ascending=False)

    conn.close()
    return generate_table(df_songs, f"the word '{lemma}'")



@app.callback(
    [Output('sankey-diagram-noun-adjective', 'figure'), Output('song_ids-sankey', 'data')],
    [Input('show-top-x-nouns', 'value')])
def update_sankey_diagram_noun_adj(num_sources):
    num_sources = int(num_sources)
    source_words = list(adj_noun_unique.NOUN_lemma.value_counts()[0:num_sources].index)
    label, source, target, link_size, song_ids = get_sankey_data(adj_noun_unique, source_words, "NOUN_lemma", "ADJ_lemma")
    fig = get_sankey_diagram(label, source, target, link_size)
    return fig, [song_ids, [f"'{label[t]} {label[s]}'" for s,t in zip(source, target)]]

@app.callback(
    Output('songs-table-sankey', 'children'),
    Input('sankey-diagram-noun-adjective', 'clickData'),
    Input('song_ids-sankey', 'data'))
def update_songs_table_sankey(clickData, sankey_data):

    if clickData: point_number = clickData['points'][0]['pointNumber']
    else: point_number = 0
    songs_ids = sankey_data[0][point_number]
    source_target = sankey_data[1][point_number]

    all_data = []
    conn = sqlite3.connect('CroLyrics_data/info.db')
    c = conn.cursor()
    for i in range(0, len(songs_ids), 999):
        placeholder= '?'
        placeholders= ', '.join(placeholder for unused in songs_ids[i:i+999])
        query= 'SELECT name, artist_name, views, url FROM songs_info WHERE code IN (%s)' % placeholders
        c.execute(query, songs_ids[i:i+999])
        all_data.extend(c.fetchall())

    df_songs = pd.DataFrame(all_data)
    df_songs.columns = ["Song", "Artist", "Views", "Lyrics Url"]
    df_songs["Song"] = "[" + df_songs["Song"] + "](" + df_songs["Lyrics Url"] + ")"
    df_songs = df_songs.drop("Lyrics Url", axis=1)
    df_songs = df_songs.sort_values("Views", ascending=False)

    conn.close()
    return generate_table(df_songs, f"the phrase {source_target}")



@app.callback(
    [Output('sankey-diagram-search', 'figure'), Output('song_ids-sankey-search', 'data')],
    [Input('sankey-diagram-search-upos', 'value'), Input("sankey-diagram-search-input", "value")])
def update_sankey_diagram(upos, words):
    if upos == "Noun" or upos == "Adjective": dataframe = adj_noun_unique

    elif upos == "Verb" or upos == "Adverb": dataframe = adv_verb_unique

    if upos == "Adjective": 
        source = "ADJ"
        target = "NOUN"
    elif upos == "Adverb": 
        source = "ADV"
        target = "VERB"

    elif upos=="Noun":
        source = "NOUN"
        target = "ADJ"
    
    else:
        source = "VERB"
        target = "ADV"

    num_sources = 10
    

    #num_sources = int(num_sources)
    #source_words = list(dataframe[source + "_lemma"].value_counts()[0:num_sources].index)
    source_words = [w.strip() for w in words.split(",")]
    label, source, target, link_size, song_ids = get_sankey_data(dataframe, source_words, source + "_lemma", target + "_lemma")
    fig = get_sankey_diagram(label, source, target, link_size)
    return fig, [song_ids, [f"'{label[t]} {label[s]}'" for s,t in zip(source, target)]]


@app.callback(
    Output('songs-table-sankey-search', 'children'),
    Input('sankey-diagram-search', 'clickData'),
    Input('song_ids-sankey-search', 'data'))
def update_songs_table_sankey(clickData, sankey_data):

    if clickData: point_number = clickData['points'][0]['pointNumber']
    else: point_number = 0
    songs_ids = sankey_data[0][point_number]
    source_target = sankey_data[1][point_number]

    all_data = []
    conn = sqlite3.connect('CroLyrics_data/info.db')
    c = conn.cursor()
    for i in range(0, len(songs_ids), 999):
        placeholder= '?'
        placeholders= ', '.join(placeholder for unused in songs_ids[i:i+999])
        query= 'SELECT name, artist_name, views, url FROM songs_info WHERE code IN (%s)' % placeholders
        c.execute(query, songs_ids[i:i+999])
        all_data.extend(c.fetchall())

    df_songs = pd.DataFrame(all_data)
    df_songs.columns = ["Song", "Artist", "Views", "Lyrics Url"]
    df_songs["Song"] = "[" + df_songs["Song"] + "](" + df_songs["Lyrics Url"] + ")"
    df_songs = df_songs.drop("Lyrics Url", axis=1)
    df_songs = df_songs.sort_values("Views", ascending=False)

    conn.close()
    return generate_table(df_songs, f"the phrase {source_target}")




@app.callback(
    Output('barplot-most-common-by-artist', 'figure'),
    Input('barplot-subgroup-artist', 'value'),
    Input('barplot-subgroup-wordtype', 'value'))
def update_graph_by_artist(artist, wordtype):
 
    dataframe = get_subdf(wordtype)
    artist_id = get_artist_code(artist)[0]
    value_counts = dataframe[dataframe.Artist_ID == artist_id].lemma.value_counts()
       
       
    fig = px.bar(
                x=np.flip(value_counts[0:10].values), text=np.flip(value_counts[0:10].index), 
               orientation='h')
    
    fig.update_traces(hovertemplate='Word: %{text} <br>Number of songs: %{x}')
    
    fig.update_yaxes(title=wordtype, showticklabels=False)
    fig.update_xaxes(title="Number of songs")
    fig.update_layout(margin=dict(l=20, r=20, t=20, b=20))
    
    return fig

if __name__ == '__main__':
    app.run_server(debug=True, use_reloader=False)

Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

 * Serving Flask app "__main__" (lazy loading)
 * Envi

In [161]:
get_top_n([10,10,9,9], 2)

[10, 10]