In [32]:
import pandas as pd
import os
import json
from tqdm import tqdm
import plotly.express as px
import plotly.graph_objects as go

In [33]:
!pip install dash plotly





In [34]:
# Initialise Plotly's offline notebook mode before any figure is displayed.
import plotly.offline as pyo
pyo.init_notebook_mode(connected=True)

In [35]:
def get_demography_df(data_dir="data/demography/"):
    """Load every per-frame demography JSON file in ``data_dir`` into one DataFrame.

    Each file is parsed with ``json.load`` and becomes one row. The frame
    number is read from the second ``_``-separated token of the file name and
    stored in an integer ``frame_id`` column.

    Args:
        data_dir: Directory holding the JSON files. Defaults to the
            ``data/demography/`` folder used by this notebook.

    Returns:
        pandas.DataFrame with one row per file, in directory-listing order.

    Raises:
        IndexError, ValueError: if a file name has no integer as its second
            ``_``-separated token.
    """
    # Keep regular files only: os.listdir also returns sub-directories (for
    # example the ``.ipynb_checkpoints`` folder Jupyter may create), which
    # cannot be opened as JSON.
    files = [
        name for name in os.listdir(data_dir)
        if os.path.isfile(os.path.join(data_dir, name))
    ]
    entries = []
    for file in tqdm(files):
        with open(os.path.join(data_dir, file), "r") as f:
            entry = json.load(f)
        # File names carry the frame number as their second "_" token.
        entry["frame_id"] = int(file.split("_")[1])
        entries.append(entry)
    return pd.DataFrame(entries)

In [36]:
# Load all demography entries once; the cells and callbacks below read this
# module-level DataFrame.
demo_df = get_demography_df()

100%|█████████████████████████████████████████████████████████████████████| 1127/1127 [00:02<00:00, 552.10it/s]


In [37]:
# Preview the first rows to check the loaded columns.
demo_df.head()

Unnamed: 0,emotion,dominant_emotion,region,face_confidence,age,gender,dominant_gender,race,dominant_race,frame_id
0,"{'angry': 0.07739887564168878, 'disgust': 2.20...",sad,"{'x': 191, 'y': 157, 'w': 168, 'h': 292, 'left...",1.0,31,"{'Woman': 22.41971045732498, 'Man': 77.5802910...",Man,"{'asian': 1.7354777289006387, 'indian': 1.1071...",white,10005
1,"{'angry': 0.05867492408528019, 'disgust': 9.89...",sad,"{'x': 196, 'y': 138, 'w': 163, 'h': 295, 'left...",1.0,30,"{'Woman': 16.770778596401215, 'Man': 83.229219...",Man,"{'asian': 4.841800034046173, 'indian': 3.40968...",white,10020
2,"{'angry': 3.0267726747068555, 'disgust': 2.485...",neutral,"{'x': 205, 'y': 115, 'w': 163, 'h': 276, 'left...",1.0,31,"{'Woman': 13.677968084812164, 'Man': 86.322033...",Man,"{'asian': 1.998317427933216, 'indian': 1.90386...",white,10035
3,"{'angry': 1.7797511020489765e-06, 'disgust': 1...",neutral,"{'x': 344, 'y': 80, 'w': 153, 'h': 288, 'left_...",0.98,33,"{'Woman': 93.70995759963989, 'Man': 6.29003867...",Woman,"{'asian': 0.13043524231761694, 'indian': 0.189...",white,10050
4,"{'angry': 17.719951272010803, 'disgust': 7.726...",neutral,"{'x': 349, 'y': 72, 'w': 153, 'h': 304, 'left_...",0.99,33,"{'Woman': 95.17351388931274, 'Man': 4.82648648...",Woman,"{'asian': 1.2738639488816261, 'indian': 2.4188...",white,10065


In [38]:
# Number of rows for every (dominant_race, dominant_emotion) pair
race_emotion_count = (
    demo_df
    .groupby(['dominant_race', 'dominant_emotion'])
    .size()
    .reset_index(name='count')
)

# Grand total over all pairs, used as the normalisation denominator
total_count = race_emotion_count['count'].sum()

# Each pair's share of the grand total, expressed in percent
race_emotion_count['percentage'] = race_emotion_count['count'].div(total_count).mul(100)
race_emotion_count.head()

Unnamed: 0,dominant_race,dominant_emotion,count,percentage
0,asian,angry,4,0.354925
1,asian,fear,12,1.064774
2,asian,happy,7,0.621118
3,asian,neutral,9,0.79858
4,asian,sad,71,6.299911


In [39]:
# Distinct race and emotion labels, in order of first appearance
unique_races = race_emotion_count['dominant_race'].unique().tolist()
unique_emotions = race_emotion_count['dominant_emotion'].unique().tolist()

# Node numbering: races take 0..len(unique_races)-1, emotions follow directly after
race_to_index = dict(zip(unique_races, range(len(unique_races))))
emotion_to_index = {
    emotion: node_id
    for node_id, emotion in enumerate(unique_emotions, start=len(unique_races))
}

# Translate each row's race / emotion into its source / target node number
race_emotion_count['source'] = race_emotion_count['dominant_race'].map(race_to_index)
race_emotion_count['target'] = race_emotion_count['dominant_emotion'].map(emotion_to_index)


In [40]:
# Sankey figure: race nodes feed emotion nodes, link width = percentage of all rows
fig = go.Figure(
    data=[
        go.Sankey(
            node={
                "pad": 15,
                "thickness": 20,
                "line": {"color": "black", "width": 0.5},
                # Label order must match the node numbering: races first, then emotions
                "label": unique_races + unique_emotions,
                "color": "blue",
            },
            link={
                "source": race_emotion_count['source'].tolist(),
                "target": race_emotion_count['target'].tolist(),
                "value": race_emotion_count['percentage'].tolist(),
            },
        )
    ]
)

fig.update_layout(
    title_text="Sankey Diagram: Race and Emotion Flow in Movie",
    font_size=10,
)
fig.show()

In [41]:
from dash import Dash, dcc, html
# import dash_mantine_components as dmc
from dash.dependencies import Input, Output

In [42]:
# Single Dash application instance; the callbacks and layout below attach to it.
app = Dash()

In [43]:
def get_filtered_df(selected_range):
    """Return the rows of ``demo_df`` that fall inside a percentage window.

    ``selected_range`` is a ``[start, end]`` pair of percentages (0-100). Each
    percentage is scaled linearly onto the span between the smallest and the
    largest ``frame_id`` in ``demo_df``; both ends of the window are inclusive.
    """
    frame_ids = demo_df['frame_id']
    lowest = frame_ids.min()
    span = frame_ids.max() - lowest

    # Translate the two slider percentages into absolute frame_id bounds
    lower_pct, upper_pct = selected_range[0], selected_range[1]
    lower_bound = lowest + (lower_pct / 100.0) * span
    upper_bound = lowest + (upper_pct / 100.0) * span

    # between() is inclusive on both sides by default
    in_window = frame_ids.between(lower_bound, upper_bound)
    return demo_df[in_window]

In [44]:
# Callback to update the sankey diagram based on the slider
@app.callback(
    Output('sankey-emotion-race-graph', 'figure'),
    Input('frame-slider', 'value'),
    Input('race-emotion-radio-btn', 'value'))
def update_emotion_race_sankey_graph_slider(selected_range, col_radio):
    """Build the race/emotion Sankey figure for the selected movie segment.

    Args:
        selected_range: Two-element ``[start, end]`` value of the frame
            slider, forwarded to ``get_filtered_df``.
        col_radio: Value of the radio button. ``"dominant_race"`` draws links
            from race nodes to emotion nodes; any other value reverses the
            direction.

    Returns:
        plotly.graph_objects.Figure with one Sankey trace whose link values
        are each (race, emotion) pair's percentage of all counted pairs.
    """
    filtered_df = get_filtered_df(selected_range)

    # Aggregate the slider-filtered rows (not the full demo_df) so that moving
    # the slider actually changes the diagram.
    race_emotion_count = (
        filtered_df
        .groupby(['dominant_race', 'dominant_emotion'])
        .size()
        .reset_index(name='count')
    )
    # Calculate the total count for normalization
    total_count = race_emotion_count['count'].sum()

    # Calculate percentages
    race_emotion_count['percentage'] = (race_emotion_count['count'] / total_count) * 100

    # Get unique lists of races and emotions
    unique_races = race_emotion_count['dominant_race'].unique().tolist()
    unique_emotions = race_emotion_count['dominant_emotion'].unique().tolist()

    # Node numbering: races first, emotions offset by the number of races so
    # it lines up with the `unique_races + unique_emotions` label list below.
    race_to_index = {race: i for i, race in enumerate(unique_races)}
    emotion_to_index = {emotion: i + len(unique_races) for i, emotion in enumerate(unique_emotions)}

    # The radio button picks the flow direction: race -> emotion or emotion -> race
    race_col = "source" if col_radio == "dominant_race" else "target"
    emotion_col = "target" if col_radio == "dominant_race" else "source"
    race_emotion_count[race_col] = race_emotion_count['dominant_race'].map(race_to_index)
    race_emotion_count[emotion_col] = race_emotion_count['dominant_emotion'].map(emotion_to_index)

    # Create the Sankey diagram
    fig = go.Figure(data=[go.Sankey(
        node=dict(
          pad=15,
          thickness=20,
          line=dict(color="black", width=0.5),
          label=unique_races + unique_emotions,
          color="blue"
        ),
        link=dict(
          source=race_emotion_count['source'].tolist(),
          target=race_emotion_count['target'].tolist(),
          value=race_emotion_count['percentage'].tolist()
        ))])

    fig.update_layout(title_text="Sankey Diagram: Race and Emotion Flow in Movie", font_size=10)
    return fig


In [45]:
# Callback to update the gender race bar graph based on the slider
@app.callback(
    Output('gender-emotion-race-bar-graph', 'figure'),
    Input('race-emotion-radio-btn', 'value'),
    Input('frame-slider', 'value'))
def update_gender_emotion_race_graph_slider(col_radio, selected_range):
    """Return a horizontal stacked bar chart of gender share per category.

    ``col_radio`` names the grouping column (race or emotion) and
    ``selected_range`` is the slider value handed to ``get_filtered_df``.
    Within each category the gender counts are rescaled to percentages of
    that category's row total.
    """
    segment = get_filtered_df(selected_range)

    # Rows = categories of col_radio, columns = genders, cells = row counts
    counts = segment.groupby([col_radio, "dominant_gender"]).size().unstack(fill_value=0)
    row_totals = counts.sum(axis=1)
    shares = counts.div(row_totals, axis=0) * 100

    if col_radio == "dominant_race":
        title_val = "Race"
    else:
        title_val = "Emotion"

    # One horizontal bar trace per gender column
    bars = [
        go.Bar(
            name=gender,
            x=shares[gender],
            y=shares.index,
            orientation='h',
        )
        for gender in shares.columns
    ]
    fig = go.Figure(data=bars)

    # Stack the per-gender bars and label the axes
    fig.update_layout(
        barmode='stack',
        title=f'Distribution of Gender by {title_val}',
        xaxis_title=f'Percentage of {title_val}',
        yaxis_title=f'{title_val}',
    )
    return fig

In [46]:
# Callback to update the spider graph based on the slider
@app.callback(
    Output('spider-graph', 'figure'),
    Input('frame-slider', 'value'),
    Input('race-emotion-radio-btn', 'value'))
def update_radial_graph_slider(selected_range, col_radio):
    """Return a radar (Scatterpolar) chart of category shares in the segment.

    Args:
        selected_range: Two-element ``[start, end]`` value of the frame
            slider, forwarded to ``get_filtered_df``.
        col_radio: Column whose categories are plotted around the chart
            (the radio button supplies ``dominant_race`` or
            ``dominant_emotion``).

    Returns:
        plotly.graph_objects.Figure with one filled Scatterpolar trace whose
        radius is each category's percentage of the filtered rows.
    """
    filtered_df = get_filtered_df(selected_range)

    # Name the two output columns explicitly. The default name of the
    # value_counts(normalize=True) result differs between pandas versions
    # ('proportion' only from pandas 2.0 on), so relying on it breaks on
    # older installs.
    specific_df = (
        filtered_df[col_radio]
        .value_counts(normalize=True)
        .mul(100)
        .rename_axis(col_radio)
        .reset_index(name='proportion')
    )
    # Create the figure
    fig = go.Figure(data=go.Scatterpolar(
        r=specific_df['proportion'],
        theta=specific_df[col_radio],
        fill='toself'
    ))

    fig.update_layout(
        title="Percentage representation in movie segment",
        polar=dict(
            radialaxis=dict(
                visible=True
            ),
        ),
        showlegend=False
    )

    return fig

In [47]:
# Callback to update the line graph based on the slider
@app.callback(
    Output('evolution-line-graph', 'figure'),
    Input('frame-slider', 'value'),
    Input('race-emotion-radio-btn', 'value'))
def update_line_graph_slider(selected_range, col_radio):
    """Return a stacked area chart of cumulative appearances per category.

    Args:
        selected_range: Two-element ``[start, end]`` value of the frame
            slider, forwarded to ``get_filtered_df``.
        col_radio: Column whose categories become the individual traces
            (the radio button supplies ``dominant_race`` or
            ``dominant_emotion``).

    Returns:
        plotly.graph_objects.Figure with one stacked line trace per category;
        x is ``frame_id`` and y is the running count of rows seen so far.
    """
    # assign() returns a new frame with the helper column. Writing the column
    # through .loc on the slice returned by get_filtered_df would trigger
    # pandas' SettingWithCopyWarning.
    filtered_df = get_filtered_df(selected_range).assign(count=1)
    pivot_df = filtered_df.pivot_table(index="frame_id", columns=col_radio, values="count", aggfunc="sum", fill_value=0).cumsum()
    # Create the figure
    fig = go.Figure()

    # Adding traces for each race/emotion
    for column in pivot_df.columns:
        fig.add_trace(go.Scatter(
            x=pivot_df.index,
            y=pivot_df[column],
            mode='lines',
            name=column,
            stackgroup='one' # Stacking
        ))

    # updating the layout
    fig.update_layout(
        title= "Representation over time in movie segment",
        xaxis_title="Frame ID",
        yaxis_title="Cumulative count of appearances",
        hovermode="x"
    )

    return fig

In [48]:
# Page layout: title, frame-range slider, race/emotion selector, then two rows of two graphs
app.layout = html.Div(children=[
    html.H1("Movie Visualizations"),

    # Slider works in percent of the movie; get_filtered_df converts it to frame ids
    dcc.RangeSlider(
        id='frame-slider',
        min=0,
        max=100,
        step=5,
        value=[0, 100],
        marks={pct: f'{pct}%' for pct in range(0, 101, 10)},
    ),

    html.H2("Distribution of Race and Emotion"),

    # Selector for the column every graph groups by
    html.Div(
        children=[
            dcc.RadioItems(
                id='race-emotion-radio-btn',
                options=[
                    dict(label='Race', value='dominant_race'),
                    dict(label='Emotion', value='dominant_emotion'),
                ],
                value='dominant_race',
                labelStyle={'display': 'inline-block', 'margin-right': '20px'},
            ),
        ],
        style={'margin-bottom': '20px'},
    ),

    # First row: radar chart next to the cumulative line chart
    html.Div(
        children=[
            dcc.Graph(id='spider-graph', style=dict(display='inline-block', width='50%')),
            dcc.Graph(id='evolution-line-graph', style=dict(display='inline-block', width='50%')),
        ],
        style=dict(display='flex'),
    ),

    # Second row: gender bar chart next to the Sankey diagram
    html.Div(
        children=[
            dcc.Graph(id='gender-emotion-race-bar-graph', style=dict(display='inline-block', width='50%')),
            dcc.Graph(id='sankey-emotion-race-graph', style=dict(display='inline-block', width='50%')),
        ],
        style=dict(display='flex'),
    ),
])

In [49]:
# Start the Dash app with debug mode switched on.
app.run(debug=True)