# Dashboard

In [1]:
import ast
import base64
import io

import matplotlib.pyplot as plt
import pandas as pd
import plotly.express as px
from dash import Dash, html, dash_table, dcc
from dash.dependencies import Input, Output
from wordcloud import WordCloud

In [2]:
# Custom colour palette of 15 distinct colours, used as the discrete colour
# sequence of the dashboard histograms. Every entry must be unique: a repeated
# colour makes two categories indistinguishable in the plots.
custom_color_palette = [
    "#ff355e", "#fd5b78", "#ff6037", "#ff9966", "#ff9933",
    "#ffcc33", "#ffff66", "#ccff00", "#66ff66", "#aaf0d1",
    "#16d0cb", "#50bfe6", "#9c27b0", "#ee34d2", "#ff00cc",
]

In [3]:
# Environment switch read by the data-loading cells: True selects the Google
# Drive paths under /content/drive, False selects the local Datasets/ folder.
COLLAB = False # set to True when running on Google Colab

In [4]:
# Choose the poems CSV for the current environment (Colab Drive vs. local folder)
data_path = (
    '/content/drive/MyDrive/data_poems_preprocessed.csv'
    if COLLAB
    else 'Datasets/data_poems_preprocessed.csv'
)
data = pd.read_csv(data_path, encoding='latin-1')

# Preview the first five poems
data.head(5)

Unnamed: 0,title,author,categories,clean_tokens
0,The 80&rsquo;s Miracle Diet,By Melvin Dixon,"['Living', 'Health & Illness']","['free', 'without', 'ask', 'quick', 'delivery'..."
1,All Saints&rsquo;,By Corey Van Landingham,[],"['caravaggio', 'face', 'sink', 'pumpkin', 'bul..."
2,And These Are Just a Few ...,By Melvin Dixon,"['Living', 'Health & Illness', 'Social Comment...","['poem', 'epidemic', 'dead', 'live', 'remember..."
3,ASMR,By Corey Van Landingham,[],"['climb', 'mountain', 'delight', 'world', 'thi..."
4,ASMR,By Corey Van Landingham,[],"['hello', 'tonight', 'trace', 'static', 'bough..."


In [5]:
# Choose the topics CSV for the current environment (Colab Drive vs. local folder)
topic_path = (
    '/content/drive/MyDrive/topic_df.csv'
    if COLLAB
    else 'Datasets/topic_df.csv'
)
topic_df = pd.read_csv(topic_path, encoding='latin-1')

# Preview up to fifteen topics
topic_df.head(15)

Unnamed: 0,Topics
0,"['night', 'come', 'go', 'know', 'sleep', 'hear..."
1,"['water', 'sea', 'wind', 'white', 'sky', 'ligh..."
2,"['poetry', 'say', 'know', 'think', 'poet', 'ma..."
3,"['shall', 'god', 'thy', 'thee', 'may', 'love',..."
4,"['us', 'beggar', 'faintly', 'blue', 'color', '..."
5,"['verse', 'poetry_magazine', 'flower', 'upon',..."
6,"['light', 'flute', 'music', 'bird', 'note', 'e..."
7,"['body', 'eye', 'hand', 'know', 'word', 'love'..."
8,"['room', 'light', 'wall', 'see', 'eye', 'air',..."
9,"['man', 'old', 'say', 'black', 'town', 'back',..."


In [6]:
# Show the full (untruncated) word list stored for the row labelled 1
print(topic_df.loc[1, "Topics"])

['water', 'sea', 'wind', 'white', 'sky', 'light', 'blue', 'green', 'tree', 'black', 'come', 'sun', 'river', 'stone', 'leave', 'eye', 'go', 'fall', 'dark', 'rock']


In [7]:
# Choose the dashboard CSV for the current environment (Colab Drive vs. local folder)
dashdata_path = (
    '/content/drive/MyDrive/dashdata.csv'
    if COLLAB
    else 'Datasets/dashdata.csv'
)
dashdata = pd.read_csv(dashdata_path)

# Preview the first five documents
dashdata.head(5)

Unnamed: 0,Document Index,Predominant Topic,Topic Distribution
0,0,1,"[0, 0.7104108, 0, 0, 0, 0.04767357, 0, 0, 0.05..."
1,1,5,"[0, 0.25395682, 0, 0, 0, 0.3283527, 0.20961152..."
2,2,2,"[0, 0.18636362, 0.5357853, 0, 0, 0, 0, 0, 0.10..."
3,3,10,"[0, 0, 0.23145735, 0.029389093, 0, 0, 0, 0, 0,..."
4,4,12,"[0, 0, 0, 0, 0.11058025, 0, 0, 0, 0, 0, 0.2841..."


In [8]:
# Attach each poem's cleaned tokens to the dashboard frame (rows are matched by index label)
dashdata = dashdata.assign(clean_tokens=data['clean_tokens'])


In [9]:
# Preview the frame now that the clean_tokens column has been added (head() defaults to 5 rows)
dashdata.head()

Unnamed: 0,Document Index,Predominant Topic,Topic Distribution,clean_tokens
0,0,1,"[0, 0.7104108, 0, 0, 0, 0.04767357, 0, 0, 0.05...","['free', 'without', 'ask', 'quick', 'delivery'..."
1,1,5,"[0, 0.25395682, 0, 0, 0, 0.3283527, 0.20961152...","['caravaggio', 'face', 'sink', 'pumpkin', 'bul..."
2,2,2,"[0, 0.18636362, 0.5357853, 0, 0, 0, 0, 0, 0.10...","['poem', 'epidemic', 'dead', 'live', 'remember..."
3,3,10,"[0, 0, 0.23145735, 0.029389093, 0, 0, 0, 0, 0,...","['climb', 'mountain', 'delight', 'world', 'thi..."
4,4,12,"[0, 0, 0, 0, 0.11058025, 0, 0, 0, 0, 0, 0.2841...","['hello', 'tonight', 'trace', 'static', 'bough..."


In [10]:
# Inspect the first document: print the length of each field, then the fields themselves.
# NOTE(review): both columns appear to hold the *text* of a list as read from the CSV,
# so len() counts characters rather than list items — confirm before relying on it.
first_row = dashdata.loc[0, ["Topic Distribution", "clean_tokens"]]

for cell in first_row:
    print(len(cell))

for cell in first_row:
    print(cell)

89
478
[0, 0.7104108, 0, 0, 0, 0.04767357, 0, 0, 0.05683244, 0, 0, 0.024485786, 0.1451873, 0, 0]
['free', 'without', 'ask', 'quick', 'delivery', 'via', 'overnight', 'male', 'special', 'handle', 'ten', 'year', 'incubation', 'lose', 'pound', 'two', 'weeks', 'cocktails', 'perrier', 'twist', 'azt', 'bactrim', 'broil', 'bacon', 'bits', 'egg', 'lipid', 'quiche', 'brunch', 'tongue', 'ablaze', 'toast', 'point', 'soundless', 'howl', 'talented', 'mind', 'best', 'body', 'generation', 'go', 'smoke', 'act', 'dial', 'get', 'operators', 'stand', 'photograph', 'prove', 'pass', 'away']


In [11]:
# Report the frame's dimensions, then preview the first ten rows
print(f"Size: {dashdata.shape}")
dashdata.head(10)



Size: (44860, 4)


Unnamed: 0,Document Index,Predominant Topic,Topic Distribution,clean_tokens
0,0,1,"[0, 0.7104108, 0, 0, 0, 0.04767357, 0, 0, 0.05...","['free', 'without', 'ask', 'quick', 'delivery'..."
1,1,5,"[0, 0.25395682, 0, 0, 0, 0.3283527, 0.20961152...","['caravaggio', 'face', 'sink', 'pumpkin', 'bul..."
2,2,2,"[0, 0.18636362, 0.5357853, 0, 0, 0, 0, 0, 0.10...","['poem', 'epidemic', 'dead', 'live', 'remember..."
3,3,10,"[0, 0, 0.23145735, 0.029389093, 0, 0, 0, 0, 0,...","['climb', 'mountain', 'delight', 'world', 'thi..."
4,4,12,"[0, 0, 0, 0, 0.11058025, 0, 0, 0, 0, 0, 0.2841...","['hello', 'tonight', 'trace', 'static', 'bough..."
5,5,10,"[0, 0, 0.0328935, 0, 0.32178554, 0, 0, 0.05563...","['didier', 'vermont', 'leave', 'glass', 'hand'..."
6,6,10,"[0, 0.2884289, 0.07154473, 0, 0, 0, 0, 0, 0, 0...","['say', 'land', 'remember', 'muddy', 'rush', '..."
7,7,10,"[0.13097183, 0, 0.116376236, 0.13909899, 0.086...","['galleria', 'nazionale', 'arte', 'antica', 'r..."
8,8,8,"[0, 0.18895902, 0, 0.042033803, 0, 0, 0, 0, 0....","['net', 'headfirst', 'prairie', 'spring', 'see..."
9,9,10,"[0, 0.17507647, 0, 0, 0, 0.25174677, 0.0757936...","['kara', 'walker', 'blue', 'everyday', 'brown'..."


TODO:
Add a plot where the user can select a poem and see its clean tokens together with the 3 topic IDs it belongs to.
The user should then be able to look at the words of each of those topic IDs and compare them with the poem's topic.

In [12]:
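# If port 8050 is still held by a previous run of the app, free it from a shell:
#   lsof -i :8050    # find the PID of the process using the port
#   kill <PID>       # stop that process (use the PID reported by lsof)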

In [13]:
# Render pyplot's current figure as an in-memory PNG and base64-encode it.
# NOTE(review): nothing in this cell draws a figure, so savefig() serialises
# whatever pyplot's current figure happens to be — confirm this is intended.
buf = io.BytesIO()
plt.savefig(buf, format='png')
plt.close()
img_base64 = base64.b64encode(buf.getvalue()).decode('utf-8')

# Report how large the encoded payload is
print(f"Length of base64 encoded image: {len(img_base64)}")

# Build an HTML <img> tag that carries the image inline as a data URI
img_str = '<img src="data:image/png;base64,' + img_base64 + '" style="max-width:100%">'


Length of base64 encoded image: 3196


In [14]:
import base64
import io
from dash import Dash, html, dash_table, dcc, Output, Input
import pandas as pd
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Create the Dash application
app = Dash(__name__)

# One display label per row of topic_df: "Topic 1" ... "Topic N"
topics = [f"Topic {number}" for number in range(1, len(topic_df) + 1)]

# Dropdown entries; each label doubles as the option's value
topic_options = [{'label': name, 'value': name} for name in topics]

# Histogram over document indices, coloured by predominant topic
# (legend entries follow the sorted topic ids)
topic_order = sorted(dashdata['Predominant Topic'].unique())
document_histogram = px.histogram(
    dashdata,
    x='Document Index',
    color='Predominant Topic',
    color_discrete_sequence=custom_color_palette,
    title='Predominant LDA Topic Distribution',
    category_orders={'Predominant Topic': topic_order},
)

# Histogram counting how many documents fall under each predominant topic
topic_histogram = px.histogram(
    dashdata,
    x='Predominant Topic',
    color='Predominant Topic',
    color_discrete_sequence=custom_color_palette,
    title='Distribution of Predominant Topics',
)

# Page layout, top to bottom: heading, data table, topic selector,
# word-cloud placeholder (filled by a callback) and the two histograms
app.layout = html.Div([
    html.Div(children='Final Project Machine Learning'),
    dash_table.DataTable(data=dashdata.to_dict('records'), page_size=10),
    dcc.Dropdown(id='topic-dropdown', options=topic_options, value=topics[0]),
    html.Div(id='wordcloud-container'),
    dcc.Graph(figure=document_histogram),
    dcc.Graph(figure=topic_histogram),
])

def _parse_topic_words(raw_topic):
    """Return the words of one topic as a list of strings.

    The ``Topics`` column is loaded from CSV, where each cell holds the text of
    a Python list (e.g. ``"['water', 'sea']"``). Joining that text directly
    would join its individual characters, so it is parsed back into a list
    first. Values that are already lists/tuples are accepted unchanged, and
    text that is not a valid literal is split on whitespace.
    """
    if isinstance(raw_topic, str):
        try:
            parsed = ast.literal_eval(raw_topic)
        except (ValueError, SyntaxError):
            parsed = raw_topic.split()
    else:
        parsed = raw_topic
    if not isinstance(parsed, (list, tuple)):
        parsed = [parsed]
    return [str(word) for word in parsed]


# Callback to update the Word Cloud based on the selected topic
@app.callback(
    Output('wordcloud-container', 'children'),
    [Input('topic-dropdown', 'value')]
)
def update_wordcloud(selected_topic):
    """Render the word cloud for the topic chosen in the dropdown.

    Args:
        selected_topic: Label chosen in ``topic-dropdown`` (an entry of
            ``topics``), or ``None`` when the dropdown has been cleared.

    Returns:
        An ``html.Img`` holding the word cloud as an inline base64 PNG, or an
        ``html.Div`` with a short message when no valid topic is selected or
        the topic has no words.
    """
    # A cleared dropdown sends None; topics.index() would raise on it
    if selected_topic not in topics:
        return html.Div('Select a topic to display its word cloud.')

    # Position of the label in `topics` is the row position in topic_df
    topic_index = topics.index(selected_topic)
    topic_words = _parse_topic_words(topic_df.iloc[topic_index]['Topics'])

    # WordCloud.generate() raises on empty text, so bail out early
    if not topic_words:
        return html.Div(f'No words available for {selected_topic}.')

    wordcloud = WordCloud(width=800, height=400, background_color='white').generate(' '.join(topic_words))

    # Export the cloud straight to PNG instead of drawing it with pyplot:
    # this avoids creating a figure per callback that is never closed and
    # keeps pyplot's global state out of the server's callback threads.
    img_bytes = io.BytesIO()
    wordcloud.to_image().save(img_bytes, format='PNG')
    img_base64 = base64.b64encode(img_bytes.getvalue()).decode()

    # Dash style dictionaries use camelCased keys (maxWidth, not max-width)
    return html.Img(src=f'data:image/png;base64,{img_base64}', style={'maxWidth': '100%'})

# Run the app
# Starts the Dash server in debug mode, but only when this code executes as the
# main module (the guard is skipped if the code is imported from elsewhere).
# NOTE(review): newer Dash releases favour app.run(...) over run_server(...) —
# confirm against the installed Dash version.
if __name__ == '__main__':
    app.run_server(debug=True)


In [15]:
# from dash import Dash, html, dash_table, dcc
# from dash.dependencies import Input, Output
# import pandas as pd
# import plotly.express as px
# from wordcloud import WordCloud
# import matplotlib.pyplot as plt

# # Initialize the app
# app = Dash(__name__)

# # Define the topics
# topics = [f"Topic {i}" for i in range(1, topic_df.shape[0] + 1)]

# # App layout
# app.layout = html.Div([
#     html.Div(children='Final Project Machine Learning'),
    
#     # DataTable displaying the data
#     dash_table.DataTable(data=dashdata.to_dict('records'), page_size=10),
    
#     # Dropdown menu to select a topic
#     dcc.Dropdown(
#         id='topic-dropdown',
#         options=[{'label': topic, 'value': topic} for topic in topics],
#         value=topics[0]  # Default value
#     ),
    
#     dcc.Graph(
#         figure=px.histogram(dashdata, x='Document Index', color='Predominant Topic', 
#                             color_discrete_sequence=custom_color_palette, title='Predominant LDA Topic Distribution',
#                             category_orders={'Predominant Topic': sorted(dashdata['Predominant Topic'].unique())})
#     ),

#     # Histogram showing the distribution of the predominant topics
#     dcc.Graph(
#         figure=px.histogram(dashdata, x='Predominant Topic', color='Predominant Topic',
#                              color_discrete_sequence=custom_color_palette, title='Distribution of Predominant Topics')
#     ),
#     # Word Cloud to display the words associated with the selected topic
#     html.Div(id='wordcloud-container')
# ])

# # Callback to update the Word Cloud based on the selected topic
# @app.callback(
#     Output('wordcloud-container', 'children'),
#     [Input('topic-dropdown', 'value')]
# )
# def update_wordcloud(selected_topic):
#     # Get the index of the selected topic
#     topic_index = topics.index(selected_topic)
    
#     # Get the words associated with the selected topic from topic_df
#     words = ' '.join(topic_df.iloc[topic_index]['Topics'])
    
#     # Generate the Word Cloud
#     wordcloud = WordCloud(width=80, height=40, background_color='white').generate(words)
    
#     # Plot the Word Cloud
#     plt.figure(figsize=(5, 4))
#     plt.imshow(wordcloud, interpolation='bilinear')
#     plt.axis('off')
#     plt.tight_layout()
    
#     # Convert the plot to a base64 encoded string
#     buf = io.BytesIO()
#     plt.savefig(buf, format='png')
#     plt.close()
#     buf.seek(0)
#     img_base64 = base64.b64encode(buf.read()).decode('utf-8')
    
#     # Embed the image directly into the HTML img tag
#     img_str = f'<img src="data:image/png;base64,{img_base64}" style="max-width:100%">'
    
#     return html.Div(html.Div([html.Img(src=img_str)]))


# # Helper function to convert Matplotlib plot to base64 encoded string
# def image_to_base64(plt):
#     import base64
#     import io
    
#     # Save the plot to a BytesIO object
#     buf = io.BytesIO()
#     plt.savefig(buf, format='png')
#     buf.seek(0)
    
#     # Encode the plot as a base64 string
#     img_base64 = base64.b64encode(buf.read()).decode('utf-8')
    
#     return img_base64

# # Run the app
# if __name__ == '__main__':
#     app.run_server(debug=True)
