## How does the genre of a movie influence the type of ending it has (happy, tragic, open-ended)? Certain genres may have a higher likelihood of happy or tragic endings.

In [10]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import cm
import numpy as np
import pickle
import plotly.express as px
import plotly.graph_objects as go
from ipywidgets import widgets, Output
from IPython.display import display
from scipy.stats import shapiro, levene, f_oneway, kruskal
import scikit_posthocs as sp
from scikit_posthocs import posthoc_dunn

In [11]:
# path
DATA_FOLDER = '../data/'
MOVIE_DATASET = DATA_FOLDER + 'movies_dataset_final_1.tsv'

# Dataset loading
movies = pd.read_csv(MOVIE_DATASET, sep='\t')

In [12]:
copper_colormap = cm.get_cmap('copper', 256)  # 256 niveaux
copper_colorscale = [
    [i / 255, f"rgb({int(255 * r)}, {int(255 * g)}, {int(255 * b)})"]
    for i, (r, g, b, _) in enumerate(copper_colormap(np.linspace(0, 1, 256)))
]


The get_cmap function was deprecated in Matplotlib 3.7 and will be removed two minor releases later. Use ``matplotlib.colormaps[name]`` or ``matplotlib.colormaps.get_cmap(obj)`` instead.



What kind of genres do we have in this dataset ? And how many movies for each genre ?

In [13]:
#Drop rows with no score
movies['Movie genres'] = movies['Movie genres'].str.lower()
genre_counts = movies['Movie genres'].str.split(', ').explode().value_counts().reset_index()
genre_counts.columns = ['Genre', 'Number of movies']
genre_counts = genre_counts.sort_values(by='Number of movies', ascending=False)
print("Number of genres:", len(genre_counts))

Number of genres: 346


## Cleaning and Filtering Movie Genres

We observe that we have 346 different movie genres. We will keep only the most common ones, and regroup the same genres that have not the exact same string.

In [14]:
genre_counts = genre_counts[genre_counts['Number of movies'] > 500]
print("Number of genres of our interest for our analysis: ", len(genre_counts))

Number of genres of our interest for our analysis:  36


In [15]:
# New dataframe 
movies_genres = movies.copy()
movies_genres = movies_genres.dropna(subset=['Movie genres'])
movies_genres = movies_genres.dropna(subset=['Score'])
movies_genres['Movie genres'] = movies_genres['Movie genres'].str.split(', ')
movies_genres = movies_genres.explode('Movie genres')
movies_genres = movies_genres.reset_index(drop=True)
movies_genres['Movie genres'] = movies_genres['Movie genres'].str.lower()

# Remove the rows of the movies_genres dataframe which genre is not in the genre_counts dataframe
movies_genres = movies_genres[movies_genres['Movie genres'].isin(genre_counts['Genre'])]
movies_genres = movies_genres.reset_index(drop=True)

In [16]:
import matplotlib.colors as mcolors
import matplotlib.pyplot as plt
import plotly.express as px
# Calculate sample sizes of the genres that we keep
sample_sizes = movies_genres.groupby('Movie genres').size().reset_index(name='Count')
sample_sizes = sample_sizes.sort_values(by='Count', ascending=False)
# Define copper colorscale in valid format
copper_colorscale = [
    (255 / 255, 242 / 255, 230 / 255),
    (230 / 255, 169 / 255, 148 / 255),
    (202 / 255, 94 / 255, 91 / 255),
    (153 / 255, 51 / 255, 51 / 255),
    (102 / 255, 0 / 255, 0 / 255)
]

# Create a LinearSegmentedColormap
copper_interpolated = mcolors.LinearSegmentedColormap.from_list("copper", copper_colorscale)

# Interpolate colors for the genres
num_genres = sample_sizes.shape[0]
genre_colors = {
    genre: mcolors.rgb2hex(copper_interpolated(i / (num_genres - 1)))
    for i, genre in enumerate(sample_sizes['Movie genres'])
}

# Create bar chart with the custom color scale
fig = px.bar(
    sample_sizes,
    x='Movie genres',
    y='Count',
    color='Movie genres',
    title="Number of Movies per Genre",
    color_discrete_map=genre_colors
)

fig.update_layout(
    xaxis_title="Genre",
    yaxis_title="Number of Movies",
    width=900,
    height=600
)

fig.show()


In [17]:
# Mean ending score for each genre
genre_scores = movies_genres.groupby('Movie genres').agg({'Score': 'mean', 'Movie genres': 'count'})
genre_scores.columns = ['Mean score', 'Number of movies']
#print("Genres that have a negative average score: ", genre_scores[genre_scores['Mean score'] < 0].index)

genre_scores = genre_scores.reset_index()
genre_scores = genre_scores.sort_values(by='Mean score', ascending=True)
genre_scores = genre_scores.reset_index(drop=True)


In [19]:
copper_colormap = cm.get_cmap('copper', 256)  # 256 niveaux
copper_colorscale = [
    [i / 255, f"rgb({int(255 * r)}, {int(255 * g)}, {int(255 * b)})"]
    for i, (r, g, b, _) in enumerate(copper_colormap(np.linspace(0, 1, 256)))
]
# Create bubble chart
fig = px.scatter(
    genre_scores,
    x='Mean score',
    y='Movie genres',
    size='Number of movies',
    color='Mean score',
    color_continuous_scale=copper_colorscale,  # Adjust color scheme
    title='Distribution of genres by mean ending score',
    labels={'Mean_Score': 'Mean Score', 'Genre': 'Genre', 'Movie_Count': 'Number of Movies'}
)

# Improve layout
fig.update_traces(marker=dict(opacity=0.8, line=dict(width=1, color='DarkSlateGrey')))
fig.update_layout(
    xaxis_title="Mean score",
    yaxis_title="Genre",
    coloraxis_colorbar=dict(title="Mean Score"),
    height=600
)

fig.show()


The get_cmap function was deprecated in Matplotlib 3.7 and will be removed two minor releases later. Use ``matplotlib.colormaps[name]`` or ``matplotlib.colormaps.get_cmap(obj)`` instead.



In [22]:
# Round scores to 2 decimals to aggregate close values
movies_genres['Rounded2 Score'] = movies_genres['Score'].round(2)

# Get unique genres
genres = movies_genres['Movie genres'].unique()

# Create output widget for displaying the plot
output = Output()

# Create figure function
def create_figure(selected_genre):
    # Filter for the selected genre
    filtered_df = movies_genres[movies_genres['Movie genres'] == selected_genre]
    
    # Aggregate movie counts by rounded scores
    score_counts = filtered_df.groupby('Rounded2 Score').size().reset_index(name='Number of Movies')
    
    # Create a bar plot for the distribution
    fig = px.bar(
        score_counts,
        x='Rounded2 Score',
        y='Number of Movies',
        hover_data={'Rounded2 Score': True, 'Number of Movies': True},
        title=f"Movie Score Distribution for Genre: {selected_genre}",
        labels={'Rounded2 Score': 'Score', 'Number of Movies': 'Number of Movies'},
        template="plotly_white"
    )
    
    fig.update_layout(
        xaxis_title="Score",
        yaxis_title="Number of Movies",
        height=400
    )
    
    return fig

# Update plot function
def update_plot(change):
    selected_genre = change['new']
    with output:
        output.clear_output(wait=True)  # Clear previous plot
        fig = create_figure(selected_genre)
        fig.show()

# Create Dropdown widget
genre_dropdown = widgets.Dropdown(
    options=genres,
    value=genres[0],
    description='Genre:',
)

# Attach update function to the dropdown
genre_dropdown.observe(update_plot, names='value')

# Display the dropdown and initial plot
display(genre_dropdown, output)

# Show the initial plot
with output:
    fig = create_figure(genre_dropdown.value)
    fig.show()


Dropdown(description='Genre:', options=('thriller', 'science fiction', 'horror', 'adventure', 'action', 'psych…

Output()

In [27]:
# Step 1: Group scores by genres
groups = [group['Score'].values for _, group in movies_genres.groupby('Movie genres')]

# Step 2: Verify assumptions
print("\nStep 2: Checking Assumptions")

# 2.1 Normality Test (Shapiro-Wilk)
print("\nNormality Test:")
for i, group in enumerate(groups):
    if len(group) >= 3:  # Shapiro requires at least 3 data points
        stat, p = shapiro(group)
        print(f"Genre {i+1} (size={len(group)}) : W={stat:.3f}, p-value={p:.3f}")
    else:
        print(f"Genre {i+1} (size={len(group)}) : Not enough data for Shapiro-Wilk test.")

# 2.2 Homogeneity of Variances Test (Levene)
if len(groups) > 1:  # Levene requires at least two groups
    stat, p = levene(*groups)
    print("\nHomogeneity of Variances Test (Levene):")
    print(f"Statistic={stat:.3f}, p-value={p:.3f}")
else:
    print("\nNot enough groups to perform Levene's test.")

# Step 3: Perform statistical test (ANOVA or Kruskal-Wallis)
if all([len(group) >= 3 for group in groups]) and len(groups) > 1:
    if all([shapiro(group)[1] > 0.05 for group in groups]) and p > 0.05:
        # Use ANOVA if normality and homogeneity of variances are satisfied
        stat, p = f_oneway(*groups)
        print("\nANOVA Test:")
        print(f"F-statistic={stat:.3f}, p-value={p:.3f}")
    else:
        # Use Kruskal-Wallis if assumptions are violated
        stat, p = kruskal(*groups)
        print("\nKruskal-Wallis Test (Non-parametric Alternative):")
        print(f"Statistic={stat:.3f}, p-value={p:.3f}")
else:
    print("\nNot enough valid data or groups to perform statistical tests.")



Step 2: Checking Assumptions

Normality Test:
Genre 1 (size=3297) : W=0.975, p-value=0.000
Genre 2 (size=2332) : W=0.969, p-value=0.000
Genre 3 (size=874) : W=0.990, p-value=0.000
Genre 4 (size=2053) : W=0.980, p-value=0.000
Genre 5 (size=1174) : W=0.982, p-value=0.000
Genre 6 (size=558) : W=0.972, p-value=0.000
Genre 7 (size=1808) : W=0.979, p-value=0.000
Genre 8 (size=754) : W=0.985, p-value=0.000
Genre 9 (size=552) : W=0.970, p-value=0.000
Genre 10 (size=514) : W=0.964, p-value=0.000
Genre 11 (size=6122) : W=0.983, p-value=0.000
Genre 12 (size=727) : W=0.992, p-value=0.000
Genre 13 (size=502) : W=0.994, p-value=0.054
Genre 14 (size=2276) : W=0.987, p-value=0.000
Genre 15 (size=988) : W=0.980, p-value=0.000
Genre 16 (size=532) : W=0.962, p-value=0.000
Genre 17 (size=9457) : W=0.985, p-value=0.000
Genre 18 (size=1807) : W=0.982, p-value=0.000
Genre 19 (size=1351) : W=0.982, p-value=0.000
Genre 20 (size=2208) : W=0.975, p-value=0.000
Genre 21 (size=1834) : W=0.981, p-value=0.000
Genre


scipy.stats.shapiro: For N > 5000, computed p-value may not be accurate. Current N is 6122.


scipy.stats.shapiro: For N > 5000, computed p-value may not be accurate. Current N is 9457.



In [28]:
# Step 4: Perform Dunn's test
dunn_results = posthoc_dunn(
    movies_genres, 
    val_col='Score',    # Column with the continuous variable
    group_col='Movie genres',  # Column with the categorical variable
    p_adjust='bonferroni'      # Adjust p-values for multiple comparisons
)

# Step 3: Display results
print("\nDunn's Test Pairwise Comparisons:")
print(dunn_results)


Dunn's Test Pairwise Comparisons:
                              action  action/adventure    adaptation  \
action                  1.000000e+00      1.000000e+00  1.000000e+00   
action/adventure        1.000000e+00      1.000000e+00  1.000000e+00   
adaptation              1.000000e+00      1.000000e+00  1.000000e+00   
adventure               1.266856e-06      8.025556e-05  1.000000e+00   
animation               1.941936e-10      1.501176e-08  7.506449e-02   
black comedy            8.076488e-01      1.000000e+00  1.000000e+00   
black-and-white         1.000000e+00      1.000000e+00  1.000000e+00   
bollywood               6.258061e-09      1.345581e-07  3.953796e-02   
children's/family       6.647612e-15      2.491226e-13  3.067032e-06   
chinese                 1.000000e+00      1.000000e+00  1.000000e+00   
comedy                  1.324805e-40      7.087164e-30  4.757992e-07   
comedy-drama            1.446463e-12      6.429989e-11  4.520375e-04   
coming of age           1.174

In [30]:
# Transform the p-values to a logarithmic scale
log_dunn_results = -np.log10(dunn_results)  # Use -log10 to make small p-values more prominent

# Adjust hover text for the log scale
hover_text = dunn_results.applymap(lambda x: f"p-value: {x:.4e}\n-log10(p): {-np.log10(x):.4f}").values

# Create the heatmap
heatmap = go.Figure(
    data=go.Heatmap(
        z=log_dunn_results.values,
        x=dunn_results.columns,
        y=dunn_results.index,
        text=hover_text,
        hoverinfo="text",
        colorscale=copper_colorscale,  # Change colorscale as desired
        zmin=0,                # Min value for color range
        zmax=np.nanmax(log_dunn_results.values),  # Max value for color range
        colorbar=dict(title="-log10(p-value)"),
    )
)

# Update layout
heatmap.update_layout(
    title="Dunn Test Pairwise Comparison Heatmap (Log Scale)",
    xaxis=dict(title="Genres"),
    yaxis=dict(title="Genres"),
    width=700,
    height=600,
)

# Show the heatmap
heatmap.show()


DataFrame.applymap has been deprecated. Use DataFrame.map instead.

