In [4]:
import ast
import re
import json
#import spacy
import numpy as np
import pandas as pd
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
from sklearn.preprocessing import MultiLabelBinarizer
import plotly.graph_objects as go
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

%matplotlib inline
plt.rcParams['font.family'] = 'Times New Roman'

In [7]:
persona_char = pd.read_csv('../Data/final_persona_char_movie_role.csv')
persona_char.columns

Index(['Wikipedia movie ID', 'character', 'pagerank', 'character_type',
       'full_name', 'freebase_char_map', 'wikipedia_movie_id', 'movie_name',
       'related_name', 'token', 'persona_code', 'persona_distribution',
       'freebase_movie_id', 'character_name', 'actor_dob', 'actor_gender',
       'actor_height', 'actor_ethnicity', 'actor_name', 'actor_age_release',
       'freebase_char_id', 'freebase_actor_id', 'movie_release_date',
       'movie_box_office_revenue', 'movie_runtime', 'movie_genres_cleaned',
       'movie_countries_cleaned', 'movie_languages_cleaned',
       'movie_release_year'],
      dtype='object')

In [10]:
# groupby by persona ids and get the proportion of each persona
persona_char_counts = persona_char.groupby(['persona_code']).size().reset_index(name='counts')
persona_char_counts.to_csv('../Data/persona_char_counts.csv', index=False)

In [2]:
persona_word = json.load(open('../Data/Persona/persona_word.json', 'r'))
categories = list(persona_word.keys())

all_words = []
for category in categories:
    all_words += persona_word[category]["top_verbs"]

all_words = list(set(all_words))

# Create a matrix where rows correspond to categories and columns to words
word_matrix = np.zeros((len(categories), len(all_words)))
for i, category in enumerate(categories):
    for word in persona_word[category]["top_verbs"]:
        word_matrix[i][all_words.index(word)] = persona_word[category]["top_freq"][persona_word[category]["top_verbs"].index(word)]

word_matrix

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [3]:
# Perform PCA
pca = PCA(n_components=2)
pca_result = pca.fit_transform(word_matrix)

x = pca_result[:,0]
y = pca_result[:,1]

f = go.FigureWidget([go.Scatter(x=x, y=y, mode='markers')])
scatter = f.data[0]
colors = ['#a3a7e4'] * 100
scatter.marker.color = colors
scatter.marker.size = [10] * 100
f.layout.hovermode = 'closest'


# create our callback function
def update_point(trace, points, selector):
    c = list(scatter.marker.color)
    s = list(scatter.marker.size)
    for i in points.point_inds:
        c[i] = '#bae2be'
        s[i] = 20
        with f.batch_update():
            scatter.marker.color = c
            scatter.marker.size = s


scatter.on_click(update_point)

display(f)

FigureWidget({
    'data': [{'marker': {'color': [#a3a7e4, #a3a7e4, #a3a7e4, #a3a7e4, #a3a7e4,
                                   #a3a7e4, #a3a7e4, #a3a7e4, #a3a7e4, #a3a7e4,
                                   #a3a7e4, #a3a7e4, #a3a7e4, #a3a7e4, #a3a7e4,
                                   #a3a7e4, #a3a7e4, #a3a7e4, #a3a7e4, #a3a7e4,
                                   #a3a7e4, #a3a7e4, #a3a7e4, #a3a7e4, #a3a7e4,
                                   #a3a7e4, #a3a7e4, #a3a7e4, #a3a7e4, #a3a7e4,
                                   #a3a7e4, #a3a7e4, #a3a7e4, #a3a7e4, #a3a7e4,
                                   #a3a7e4, #a3a7e4, #a3a7e4, #a3a7e4, #a3a7e4,
                                   #a3a7e4, #a3a7e4, #a3a7e4, #a3a7e4, #a3a7e4,
                                   #a3a7e4, #a3a7e4, #a3a7e4, #a3a7e4, #a3a7e4,
                                   #a3a7e4, #a3a7e4, #a3a7e4, #a3a7e4, #a3a7e4,
                                   #a3a7e4, #a3a7e4, #a3a7e4, #a3a7e4, #a3a7e4,
                         

In [3]:
import dash
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input, Output
import plotly.express as px
import plotly.graph_objects as go
from sklearn.decomposition import PCA
import numpy as np

# Sample data
# Replace this with your actual data
matrix = np.random.rand(10, 20) * 1000

# Perform PCA
pca = PCA(n_components=2)
pca_result = pca.fit_transform(matrix)

# Get word names (replace with your actual word names)
word_names = [f"Word_{i}" for i in range(matrix.shape[1])]

# Initialize the Dash app
app = dash.Dash(__name__)

# Define app layout
app.layout = html.Div([
    html.Div([
        dcc.Graph(
            id='scatter-plot',
            figure=px.scatter(
                x=pca_result[:, 0],
                y=pca_result[:, 1],
                labels={'x': 'PC1', 'y': 'PC2'},
                title='PCA 2D Scatter Plot',
                custom_data=np.arange(matrix.shape[0])
            )
        )
    ], style={'width': '49%', 'display': 'inline-block'}),
    html.Div([
        dcc.Graph(
            id='bar-plot',
            figure=go.Figure()
        )
    ], style={'width': '49%', 'display': 'inline-block'})
])

# Update the bar plot based on the selected scatter point
@app.callback(
    Output('bar-plot', 'figure'),
    [Input('scatter-plot', 'selectedData')]
)
def update_bar_chart(selected_data):
    if not selected_data or not selected_data['points']:
        # If no points are selected, show bar plot for all topics
        bar_fig = go.Figure()
        bar_fig.add_trace(go.Bar(x=word_names, y=matrix.mean(axis=0), name='All Topics'))
        bar_fig.update_layout(title='Word Frequencies for All Topics')
    else:
        topic_index = selected_data['points'][0]['pointIndex']
        selected_topic = matrix[topic_index, :]
        # If points are selected, show bar plot for the selected topic
        bar_fig = go.Figure()
        bar_fig.add_trace(go.Bar(x=word_names, y=selected_topic, name=f'Topic {topic_index}'))
        bar_fig.update_layout(title=f'Word Frequencies for Topic {topic_index}')

    return bar_fig

The dash_core_components package is deprecated. Please replace
`import dash_core_components as dcc` with `from dash import dcc`
  import dash_core_components as dcc
The dash_html_components package is deprecated. Please replace
`import dash_html_components as html` with `from dash import html`
  import dash_html_components as html


ValueError: String or int arguments are only possible when a DataFrame or an array is provided in the `data_frame` argument. No DataFrame was provided, but argument 'custom_data_0' is of type str or int.