In [1]:
import plotly.express as px
import pandas as pd
import plotly.io as pio
import src.models.question1 as q1
# Box plot of IMDb average ratings, one box for nominees and one for winners.
# `q1.load_oscar_winners_nominees_all_categories()` comes from the project
# module imported above and returns two frames (winners, nominees).
oscar_winners, oscar_nominees = q1.load_oscar_winners_nominees_all_categories()

# Stack both frames and expose the win flag as the strings "False"/"True".
# The category order, the colour map and the tick values below are all keyed
# by those strings; previously they mixed "false"/"true" with "False"/"True",
# so they could not all match the column values at the same time.
all_movies = pd.concat([oscar_winners, oscar_nominees]).assign(
    any_win=lambda frame: frame["any_win"].astype(str)
)

# Single source of truth for the two category labels, in display order.
WIN_LABELS = ["False", "True"]

# Create the box plot
fig = px.box(
    all_movies,
    x="any_win",
    y="averageRating",
    color="any_win",  # Differentiate by color
    title="Average Ratings Distribution of Oscar Winners and Nominees",
    category_orders={"any_win": WIN_LABELS},  # Nominees first, then winners
    custom_data=["primaryTitle"],  # Made available to the hover template
    color_discrete_map={  # Custom colors
        "False": "#4c72b0",
        "True": "#dd8452"
    }
)

# Update layout
fig.update_layout(
    title={
        'text': "Average Ratings Distribution of Oscar Winners and Nominees",
        'y': 0.9,  # Vertical alignment
        'x': 0.5,  # Horizontal centering
        'xanchor': 'center',
        'yanchor': 'top'
    },
    xaxis=dict(
        title="",
        tickvals=WIN_LABELS,
        ticktext=["Nominees", "Winners"]  # Human-readable labels for False/True
    ),
    yaxis=dict(title="Average IMDB Rating"),
    showlegend=False,  # The x-axis labels already identify each box
    width=800,
)

# Show the movie title and its rating on hover; <extra></extra> hides the
# secondary box that would otherwise repeat the trace name.
fig.update_traces(
  hovertemplate=(
    "<b>Title:</b> %{customdata[0]}<br>"
    "<b>Average Rating:</b> %{y}<extra></extra>"
  ),
)

# Show the plot
fig.show()


In [27]:
import plotly.express as px
import pandas as pd
import plotly.io as pio

# Interactive figure with a dropdown that switches between a box plot and the
# raw points (strip plot) of IMDb ratings for nominees vs. winners.
# `q1` and `pio` are expected to be in scope from earlier imports.
oscar_winners, oscar_nominees = q1.load_oscar_winners_nominees_all_categories()

# Combine the DataFrames
all_movies = pd.concat([oscar_winners, oscar_nominees])
# String labels so the "False"/"True" keys used below match the column values.
all_movies['any_win'] = all_movies['any_win'].astype(str)

# Hover text shared by both views; <extra></extra> hides the trace-name box.
rating_hover = (
    "<b>Title:</b> %{customdata[0]}<br>"
    "<b>Average Rating:</b> %{y}<extra></extra>"
)

# Arguments common to px.box and px.strip so both views stay in sync.
shared_px_args = dict(
    x="any_win",
    y="averageRating",
    color="any_win",
    category_orders={"any_win": ["False", "True"]},
    custom_data=["primaryTitle"],
    color_discrete_map={"False": "#4c72b0", "True": "#dd8452"},
)

# Create Box Plot (visible by default)
fig_box = px.box(all_movies, **shared_px_args)
fig_box.update_traces(hovertemplate=rating_hover, visible=True)

# Create Strip Plot (Raw Points), hidden until selected in the dropdown
fig_strip = px.strip(all_movies, **shared_px_args)
fig_strip.update_traces(
    hovertemplate=rating_hover,
    marker=dict(
        size=8,                # Adjust the size of the circles
        symbol='circle-open',  # Hollow circles
        opacity=1.0
    ),
    visible=False,
    jitter=1,  # Spread points horizontally so overlapping ratings stay visible
)

# Record how many traces each view contributes BEFORE merging, so the
# visibility masks always match the real trace layout (box traces first, then
# strip traces) instead of assuming exactly two of each.
n_box_traces = len(fig_box.data)
n_strip_traces = len(fig_strip.data)

# Combine Both Figures
fig_box.add_traces(fig_strip.data)

show_boxes = [True] * n_box_traces + [False] * n_strip_traces
show_points = [False] * n_box_traces + [True] * n_strip_traces

# Add Dropdown Menu
fig_box.update_layout(
    updatemenus=[
        dict(
            buttons=[
                dict(
                    label="Box Plot",
                    method="update",
                    args=[
                        {"visible": show_boxes},
                        {"title.text": "Average Ratings Distribution (Box Plot)"}
                    ],
                ),
                dict(
                    label="Raw Points",
                    method="update",
                    args=[
                        {"visible": show_points},
                        {"title.text": "Average Ratings Distribution (Raw Points)"}
                    ],
                ),
            ],
            direction="down",
            x=0,
            xanchor="right",
            y=1.15,
            yanchor="top",
        )
    ],
    title={
        'text': "Average Ratings Distribution (Box Plot)",
        'y': 0.9,
        'x': 0.5,
        'xanchor': 'center',
        'yanchor': 'top'
    },
    xaxis=dict(
        title="",
        tickvals=["False", "True"],
        ticktext=["Nominees", "Winners"]
    ),
    yaxis=dict(title="Average IMDB Rating"),
    showlegend=False,
    width=800,
)


# Show the interactive plot
fig_box.show()

# Export as an HTML fragment (full_html=False omits the <html>/<body> wrapper).
pio.write_html(fig_box, file="output/boxplot.html", full_html=False)


In [28]:
import plotly.express as px
import pandas as pd
import src.models.question1 as q1

# 2D scatter of the embedded award categories, coloured by the cluster name
# that q1.category_mapping assigns to each category.
k = 12            # number of clusters handed to q1.perform_kmeans
min_samples = 10  # forwarded to q1.get_embedded_categories

# Embed the categories, run k-means and project the embeddings to 2 dimensions.
embedded_categories, categories = q1.get_embedded_categories(min_samples)
# NOTE(review): `labels` is not used below; colouring comes from
# q1.category_mapping instead. Confirm that this is intended.
labels = q1.perform_kmeans(embedded_categories, k)
reduced_dim = q1.reduce_dim(embedded_categories, 2)

# Cluster name of every category, in the same order as `categories`.
clusters_cat = [q1.category_mapping[cat] for cat in categories]

# One row per category: 2D coordinates plus the labels used for colour/hover.
df_2d = pd.DataFrame(
    dict(
        x=reduced_dim[:, 0],
        y=reduced_dim[:, 1],
        Category=categories,
        Cluster=clusters_cat,
    )
)

fig_2d = px.scatter(
    df_2d,
    x="x",
    y="y",
    color="Cluster",
    custom_data=["Category", "Cluster"],
)

# Hover shows the category and its cluster; <extra></extra> hides the
# secondary trace-name box.
fig_2d.update_traces(
    hovertemplate=(
        "<b>Category:</b> %{customdata[0]}<br>"
        "<b>Cluster:</b> %{customdata[1]}<extra></extra>"
    ),
    marker={"size": 10},
)

# Keep the cluster legend visible and centre the title.
# NOTE(review): the title states PCA; q1.reduce_dim is not visible here, so
# confirm that it actually performs PCA.
fig_2d.update_layout(
    showlegend=True,
    width=700,
    title=dict(
        text="2D Visualization of Categories using PCA",
        y=0.9,
        x=0.5,
        xanchor='center',
        yanchor='top',
    ),
)

# Request a 1:1 aspect ratio between the two axes.
fig_2d.update_xaxes(scaleanchor="y", scaleratio=1)
fig_2d.update_yaxes(scaleanchor="x", scaleratio=1)

fig_2d.show()

# Export as an HTML fragment; `pio` is expected to be imported by an earlier cell.
pio.write_html(fig_2d, file="output/clusters_2d.html", full_html=False)


In [29]:
import plotly.express as px
import pandas as pd
import src.models.question1 as q1

# 3D scatter of the embedded award categories, coloured by the cluster name
# that q1.category_mapping assigns to each category.
k = 12            # number of clusters handed to q1.perform_kmeans
min_samples = 10  # forwarded to q1.get_embedded_categories

# Get data and perform clustering
embedded_categories, categories = q1.get_embedded_categories(min_samples)
# NOTE(review): `labels` is not used below; colouring comes from
# q1.category_mapping instead. Confirm that this is intended.
labels = q1.perform_kmeans(embedded_categories, k)
reduced_dim = q1.reduce_dim(embedded_categories, 3)  # Reduce to 3 dimensions

# Build the cluster names here rather than relying on a `clusters_cat`
# variable left behind by another cell: this keeps the list aligned with the
# `categories` loaded just above even if the cells are run independently or
# `min_samples` is changed in only one of them.
clusters_cat = [q1.category_mapping[cat] for cat in categories]

# Create DataFrame for easier management
df_3d = pd.DataFrame({
    "x": reduced_dim[:, 0],
    "y": reduced_dim[:, 1],
    "z": reduced_dim[:, 2],
    "Category": categories,
    "Cluster": clusters_cat
})

# Create 3D scatter plot with custom data
fig_3d = px.scatter_3d(
    df_3d,
    x="x",
    y="y",
    z="z",
    color="Cluster",
    custom_data=["Category", "Cluster"]
)

# Hover shows the category and its cluster; <extra></extra> hides the
# secondary trace-name box.
fig_3d.update_traces(
    hovertemplate=(
        "<b>Category:</b> %{customdata[0]}<br>"
        "<b>Cluster:</b> %{customdata[1]}<extra></extra>"
    )
)

# Keep the cluster legend visible and centre the title.
# NOTE(review): the title states PCA; q1.reduce_dim is not visible here, so
# confirm that it actually performs PCA.
fig_3d.update_layout(
  showlegend=True,
  width=700,
  title={
      'text': "3D Visualization of Categories using PCA",
      'y': 0.9,
      'x': 0.5,
      'xanchor': 'center',
      'yanchor': 'top'
  },
)


# Show the plot
fig_3d.show()

# Export as an HTML fragment; `pio` is expected to be imported by an earlier cell.
pio.write_html(fig_3d, file="output/clusters_3d.html", full_html=False)

In [30]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go

# Scatter of average rating against release year for nominees and winners,
# with a single least-squares trend line fitted on all movies together.
oscar_winner_nominee_df = pd.concat(q1.load_oscar_winners_nominees_all_categories())

# Human-readable status derived from the win flag.
oscar_winner_nominee_df['status'] = oscar_winner_nominee_df['any_win'].map(
    {True: "Winner", False: "Nominee"}
)

# Per-status subsets, used for the two marker traces.
is_winner = oscar_winner_nominee_df['status'] == 'Winner'
is_nominee = oscar_winner_nominee_df['status'] == 'Nominee'
winners_df = oscar_winner_nominee_df[is_winner]
nominees_df = oscar_winner_nominee_df[is_nominee]

# Degree-1 polynomial fit over the combined data.
# NOTE(review): assumes 'release' and 'averageRating' contain no missing
# values; np.polyfit does not skip NaNs. Verify against the loader.
x = oscar_winner_nominee_df['release'].values
y = oscar_winner_nominee_df['averageRating'].values
slope, intercept = np.polyfit(x, y, 1)

# 100 evenly spaced years spanning the observed release range.
x_line = np.linspace(x.min(), x.max(), 100)
y_line = slope * x_line + intercept

fig = go.Figure()

# Both marker traces share the same hover layout; customdata[0] is the title.
point_hover = (
    "<b>Title:</b> %{customdata[0]}<br>"
    "<b>Release Year:</b> %{x}<br>"
    "<b>Average Rating:</b> %{y}<extra></extra>"
)

# Nominees are added first, then winners.
for group_df, group_name, group_color in (
    (nominees_df, 'Nominee', '#1f87b4'),
    (winners_df, 'Winner', '#ff7f0e'),
):
    fig.add_trace(
        go.Scatter(
            x=group_df['release'],
            y=group_df['averageRating'],
            mode='markers',
            marker=dict(color=group_color, opacity=0.7),
            name=group_name,
            hovertemplate=point_hover,
            # column_stack turns the title column into an (n, 1) array so the
            # template can index it as customdata[0].
            customdata=np.column_stack([group_df["primaryTitle"]]),
        )
    )

# Trend line, added last with a high zorder so it is drawn over the markers.
fig.add_trace(
    go.Scatter(
        x=x_line,
        y=y_line,
        mode='lines',
        line=dict(color='black', width=4),
        name='Trend',
        zorder=10,
    )
)

# Centred title, axis labels and a white template.
fig.update_layout(
    title=dict(
        text="Average Rating of Oscar Winners and Nominees Over Time",
        y=0.9,
        x=0.5,
        xanchor='center',
        yanchor='top',
    ),
    xaxis={"title": "Release Year"},
    yaxis={"title": "Average Rating"},
    legend_title_text="",
    template="plotly_white",
    width=800,
)

fig.show()

# Export as an HTML fragment; `pio` is expected to be imported by an earlier cell.
pio.write_html(fig, file="output/rating_years_scatter.html", full_html=False)


In [31]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go

# Box plot of average ratings grouped by release decade.
oscar_winner_nominee_df = pd.concat(q1.load_oscar_winners_nominees_all_categories())

# Floor each release year to the start of its decade (e.g. 1994 -> 1990).
oscar_winner_nominee_df['decade'] = (oscar_winner_nominee_df['release'] // 10) * 10

# Distinct decades in ascending order; reused for tick values and tick labels.
decades = np.sort(oscar_winner_nominee_df['decade'].unique())

fig = go.Figure()

# A single box trace; plotly draws one box per distinct x value (decade).
fig.add_trace(
    go.Box(
        x=oscar_winner_nominee_df['decade'],
        y=oscar_winner_nominee_df['averageRating'],
        name='Average Rating',
        # boxpoints / jitter are left at their defaults here.
        pointpos=0,    # Position points relative to the box
        marker=dict(opacity=1.0, color='rgba(31, 135, 180, 0.7)'),
        customdata=oscar_winner_nominee_df['primaryTitle'],
        # Title, decade and rating on hover; <extra></extra> hides the
        # secondary trace-name box.
        hovertemplate=(
            "<b>Title:</b> %{customdata}<br>"
            "<b>Decade:</b> %{x}<br>"
            "<b>Average Rating:</b> %{y}<extra></extra>"
        ),
    )
)

# Categorical x axis labelled "1930s", "1940s", ...
fig.update_layout(
    title=dict(
        text="Distribution of Average Ratings by Decade",
        y=0.9,
        x=0.5,
        xanchor='center',
        yanchor='top',
    ),
    xaxis=dict(
        title="Decade",
        type='category',
        tickmode='array',
        tickvals=decades,
        ticktext=[f"{d}s" for d in decades],
    ),
    yaxis={"title": "Average Rating"},
    template="plotly_white",
    width=800,
    legend_title_text="",
)

fig.show()

# Export as an HTML fragment; `pio` is expected to be imported by an earlier cell.
pio.write_html(fig, file="output/rating_decades_box.html", full_html=False)


In [46]:
import plotly.express as px
import plotly.io as pio
import numpy as np
import pandas as pd

# Scatter of number of votes against average rating, coloured by status, with
# a dropdown that switches the x axis between log and linear scale.
oscar_winner_nominee_df = (
    pd.concat(q1.load_oscar_winners_nominees_all_categories())
    # Human-readable status derived from the win flag.
    .assign(status=lambda frame: frame['any_win'].map({True: "Winner", False: "Nominee"}))
    # Sort by the win flag for consistent colour ordering.
    .sort_values(by="any_win")
)

fig = px.scatter(
    oscar_winner_nominee_df,
    x="numVotes",
    y="averageRating",
    color="status",
    custom_data=["primaryTitle"],
    color_discrete_map={"Nominee": "#4c72b0", "Winner": "#dd8452"},
    log_x=True,  # start on a logarithmic x axis
    opacity=0.7,
)

# Title, vote count and rating on hover; <extra></extra> hides the secondary
# trace-name box.
fig.update_traces(
    hovertemplate=(
        "<b>Title:</b> %{customdata[0]}<br>"
        "<b>Num Votes:</b> %{x}<br>"
        "<b>Average Rating:</b> %{y}<extra></extra>"
    ),
)

# One dropdown entry per x-axis scale; each relayouts only `xaxis.type`.
scale_buttons = [
    dict(label=button_label, method="relayout", args=[{"xaxis.type": axis_type}])
    for button_label, axis_type in (("Log Scale", "log"), ("Linear Scale", "linear"))
]

fig.update_layout(
    updatemenus=[
        dict(
            buttons=scale_buttons,
            direction="down",
            x=0,
            xanchor="right",
            y=1.15,
            yanchor="top",
        )
    ],
    title=dict(
        text="Number of Votes vs Average Rating",
        y=0.9,
        x=0.5,
        xanchor='center',
        yanchor='top',
    ),
    xaxis={"title": "Number of Votes"},
    yaxis={"title": "Average Rating"},
    showlegend=True,
    width=800,
    legend_title_text="",
)

# Show interactive plot
fig.show()

# Export as an HTML fragment (full_html=False omits the <html>/<body> wrapper).
pio.write_html(fig, file="output/scatter_numVotes_vs_averageRating.html", full_html=False)


In [32]:
import plotly.graph_objects as go

# Horizontal bar chart of the per-category causal effect returned by
# q1.get_causal_effect_for_base_cat(), with standard errors as error bars.
# Both returned objects are used as mappings keyed by category name.
cat_effects, cat_standard_errors = q1.get_causal_effect_for_base_cat()

# Sort categories by causal effect, descending
sorted_cat_effects = sorted(cat_effects.items(), key=lambda x: x[1], reverse=True)

# Parallel lists in sorted order: names, effects and matching standard errors.
categories = [name for name, _ in sorted_cat_effects]
effects = [effect for _, effect in sorted_cat_effects]
errors = [cat_standard_errors[name] for name in categories]

# Create the figure
fig = go.Figure()

# Bar trace with error bars. The standard errors are also attached as
# `customdata` so the hover template can show the value belonging to the
# hovered bar: `error_x.array` is the whole array and is not one of the
# documented per-point hovertemplate variables, whereas `customdata` is.
fig.add_trace(
    go.Bar(
        x=effects,
        y=categories,
        orientation='h',
        marker=dict(color='rgba(31, 135, 180, 0.7)'),
        name="Causal Effect",
        error_x=dict(
            type='data',
            array=errors,
            visible=True,
            color='black'
        ),
        customdata=errors,
        hovertemplate=(
            "<b>Category:</b> %{y}<br>"
            "<b>Causal Effect:</b> %{x:.2f} ± %{customdata:.2f}<extra></extra>"
        ),
    )
)

# Update layout with sorted bars
fig.update_layout(
    title=dict(
        text="Causal Effect of Winning an Oscar on the Rating of a Movie",
        x=0.5,  # Center the title
        xanchor="center"
    ),
    xaxis=dict(title="Causal Effect (in rating points)"),
    yaxis=dict(
        title="Category",
        categoryorder="array",
        categoryarray=categories[::-1]  # Reverse to show most positive on top
    ),
    template="plotly_white",
    width=800,
    height=800
)

fig.show()


# Export as an HTML fragment; `pio` is expected to be imported by an earlier cell.
pio.write_html(fig, file="output/raw_causal_effects_bar.html", full_html=False)


In [33]:
import plotly.graph_objects as go

# Horizontal bar chart of the per-category causal effect returned by
# q1.get_causal_effect_for_new_cat(), with standard errors as error bars.
# Both returned objects are used as mappings keyed by category name.
cat_effects, cat_standard_errors = q1.get_causal_effect_for_new_cat()

# Sort categories by causal effect, descending
sorted_cat_effects = sorted(cat_effects.items(), key=lambda x: x[1], reverse=True)

# Parallel lists in sorted order: names, effects and matching standard errors.
categories = [name for name, _ in sorted_cat_effects]
effects = [effect for _, effect in sorted_cat_effects]
errors = [cat_standard_errors[name] for name in categories]

# Create the figure
fig = go.Figure()

# Bar trace with error bars. The standard errors are also attached as
# `customdata` so the hover template can show the value belonging to the
# hovered bar: `error_x.array` is the whole array and is not one of the
# documented per-point hovertemplate variables, whereas `customdata` is.
fig.add_trace(
    go.Bar(
        x=effects,
        y=categories,
        orientation='h',
        marker=dict(color='rgba(31, 135, 180, 0.7)'),
        name="Causal Effect",
        error_x=dict(
            type='data',
            array=errors,
            visible=True,
            color='black'
        ),
        customdata=errors,
        hovertemplate=(
            "<b>Category:</b> %{y}<br>"
            "<b>Causal Effect:</b> %{x:.2f} ± %{customdata:.2f}<extra></extra>"
        ),
    )
)

# Update layout with sorted bars
fig.update_layout(
    title=dict(
        text="Causal Effect of Winning an Oscar on the Rating of a Movie",
        x=0.5,  # Center the title
        xanchor="center"
    ),
    xaxis=dict(title="Causal Effect (in rating points)"),
    yaxis=dict(
        title="Category",
        categoryorder="array",
        categoryarray=categories[::-1]  # Reverse to show most positive on top
    ),
    template="plotly_white",
    width=800,
    height=800
)

fig.show()


# Export as an HTML fragment; `pio` is expected to be imported by an earlier cell.
pio.write_html(fig, file="output/new_causal_effects_bar.html", full_html=False)

In [34]:
import plotly.express as px
import pandas as pd
import plotly.io as pio

# Ratings grouped by the number of Oscars a movie won, shown either as box
# plots or as raw points, selectable from a dropdown.
oscar_movies_df = q1.load_oscar_movies_all_categories()

# Quick look at the first titles (output is recorded below the cell).
print(oscar_movies_df[["primaryTitle"]].head())

# Sum the `winner` column per (title, rating) pair.
# NOTE(review): distinct movies sharing both title and rating would be merged
# by this grouping; confirm that this is acceptable.
nb_oscars = (
    oscar_movies_df[["primaryTitle", "averageRating", "winner"]]
    .groupby(["primaryTitle", "averageRating"])
    .sum(numeric_only=True)[["winner"]]
    .reset_index()
)

# Treat the win count as a categorical (string) label for plotting.
nb_oscars['winner'] = nb_oscars['winner'].astype(str)

# Order the string labels by their integer value rather than alphabetically.
unique_winners = sorted(nb_oscars['winner'].unique(), key=int)
category_orders = {"winner": unique_winners}

# Hover text shared by both views; <extra></extra> hides the trace-name box.
title_rating_hover = (
    "<b>Title:</b> %{customdata[0]}<br>"
    "<b>Average Rating:</b> %{y}<extra></extra>"
)

# Arguments common to px.box and px.strip so both views stay in sync.
common_px_kwargs = dict(
    x="winner",
    y="averageRating",
    color="winner",
    category_orders=category_orders,
    custom_data=["primaryTitle"],
    # Qualitative palette suited to several categories.
    color_discrete_sequence=px.colors.qualitative.Safe,
)

# Box view, visible by default.
fig_box = px.box(nb_oscars, **common_px_kwargs)
fig_box.update_traces(hovertemplate=title_rating_hover, visible=True)

# Raw-points view, hidden until selected in the dropdown.
fig_strip = px.strip(nb_oscars, **common_px_kwargs)
fig_strip.update_traces(
    hovertemplate=title_rating_hover,
    marker={"size": 8, "symbol": 'circle-open', "opacity": 1.0},
    visible=False,
    jitter=1,  # spread overlapping points horizontally
)

# Merge: the box traces come first, followed by the strip traces.
fig_box.add_traces(fig_strip.data)

# One box trace and one strip trace per win-count category, so each
# visibility mask has 2 * num_categories entries.
num_categories = len(unique_winners)
visible_box = [position < num_categories for position in range(2 * num_categories)]
visible_strip = [position >= num_categories for position in range(2 * num_categories)]

# Dropdown entries: (label, visibility mask, title shown for that view).
view_buttons = [
    dict(
        label=view_label,
        method="update",
        args=[{"visible": view_mask}, {"title.text": view_title}],
    )
    for view_label, view_mask, view_title in (
        (
            "Box Plot",
            visible_box,
            "Average Ratings Distribution by Number of Oscars Won (Box Plot)",
        ),
        (
            "Raw Points",
            visible_strip,
            "Average Ratings Distribution by Number of Oscars Won (Raw Points)",
        ),
    )
]

fig_box.update_layout(
    updatemenus=[
        dict(
            buttons=view_buttons,
            direction="down",
            x=0,
            xanchor="right",
            y=1.15,
            yanchor="top",
        )
    ],
    title=dict(
        text="Average Ratings Distribution by Number of Oscars Won (Box Plot)",
        y=0.9,
        x=0.5,
        xanchor='center',
        yanchor='top',
    ),
    xaxis={"title": "Number of Oscars Won"},
    yaxis={"title": "Average IMDB Rating"},
    showlegend=False,
    width=800,
)

# Show the interactive plot
fig_box.show()

# Export as an HTML fragment (full_html=False omits the <html>/<body> wrapper).
pio.write_html(fig_box, file="output/nb_oscars_boxplot.html", full_html=False)


       primaryTitle
0   a ship comes in
1        the circus
2    glorious betsy
3  the last command
4  the last command
