In [61]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick # For percentage formatting
from matplotlib.gridspec import GridSpec
import seaborn as sns # For despine
import pycountry

In [62]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.io as pio

In [63]:
import plot_style
import importlib
# Re-import the local plot_style module so that edits made to it since the
# first import take effect in this kernel, then activate the style it defines.
importlib.reload(plot_style) # Reload to ensure latest changes
plot_style.activate_plot_style()  # prints a confirmation message (see cell output)


Custom plot style 'publication_quality_plots' applied.


In [64]:
# Load the movie table from Parquet; the path is relative to the notebook's
# working directory.
df = pd.read_parquet("../data/movies.parquet")

In [65]:
# Inspect the column names available in the loaded table.
df.columns

Index(['title_id', 'title', 'original_title', 'release_year',
       'runtime_minutes', 'genre', 'imdb_rating', 'vote_count',
       'production_country'],
      dtype='object')

In [66]:
# Print number of unique production country values.
# NOTE: this counts distinct raw strings in the column. The column stores
# pipe-delimited combinations such as 'AU|DE|HU|US' (see the column preview
# further down), so the figure is the number of distinct combinations, not the
# number of individual countries.
print(f"Number of unique production country values: {df['production_country'].nunique()}")

Number of unique production country values: 118177


In [68]:
# Genres the analysis focuses on, taken from the shared plot_style module so
# that every figure uses the same list.
FOCUS_GENRES = plot_style.MOVIE_GENRES
print(FOCUS_GENRES)

['Drama', 'Comedy', 'Documentary', 'Romance', 'Action', 'Crime', 'Thriller', 'Horror', 'Adventure', 'Mystery']


In [70]:
# Preview the two multi-valued columns: 'production_country' is pipe-delimited
# and 'genre' is comma-delimited (and may be missing).
df[["production_country", "genre"]].head()

Unnamed: 0,production_country,genre
0,AU|DE|HU|US,Romance
1,RU|US,"Documentary,News,Sport"
2,ES,
3,AU|DE|GB|HU|RS|SG|US,"Action,Adventure,Biography"
4,FR|US,Drama


In [71]:
# Explode production country and genre columns.
# Both columns hold delimited strings ('AU|DE|US', 'Action,Adventure'), and
# DataFrame.explode() only expands list-likes -- scalar strings are returned
# unchanged -- so the values must be split into lists first.
# `assign` builds a new frame, so `df` itself is left untouched.
df_exploded = (
    df.assign(
        production_country=df["production_country"].str.split("|"),
        genre=df["genre"].str.split(","),
    )
    .explode("production_country")
    .explode("genre")
)

In [72]:
# --- 1. Handle production_country ---
def get_alpha3(alpha2_code):
    """Convert a 2-letter ISO 3166-1 alpha-2 country code to its alpha-3 code.

    Parameters
    ----------
    alpha2_code : str or missing value
        Country code such as ``"US"``. Surrounding whitespace and letter case
        are ignored.

    Returns
    -------
    str or None
        The alpha-3 code (e.g. ``"USA"``), or ``None`` when the input is
        missing, empty, not a string, or not known to pycountry.
    """
    # Anything that is not a string (None, NaN, pd.NA, a list from a column
    # that has not been exploded yet, ...) cannot be a country code. Checking
    # the type first also avoids truth-testing pd.NA or the array that
    # pd.isna() returns for list-likes, both of which raise.
    if not isinstance(alpha2_code, str):
        return None
    code = alpha2_code.strip().upper()
    if not code:
        return None
    try:
        country = pycountry.countries.get(alpha_2=code)
    except LookupError:
        # NOTE(review): older pycountry releases are believed to raise KeyError
        # (a LookupError) for unknown codes instead of returning None --
        # confirm against the installed version.
        return None
    # `country` is None when the code is unknown.
    return getattr(country, "alpha_3", None)


In [73]:
# First, split the pipe-delimited 'production_country' field into a list
df['production_country'] = df['production_country'].str.split('|')

# Explode the list into individual rows: one row per (movie, country) pair
df = df.explode('production_country')

# Now convert to alpha-3. There are only a few hundred distinct codes but
# millions of exploded rows, so look each distinct code up once and map the
# result back instead of calling pycountry for every row.
alpha3_by_alpha2 = {
    code: get_alpha3(code)
    for code in df['production_country'].dropna().unique()
}
# Codes that are missing or unknown end up as missing values in the new column.
df['country_alpha3'] = df['production_country'].map(alpha3_by_alpha2)


In [74]:
# Create a mapping for country names for hover text later
def get_country_name(alpha3_code):
    """Return the country name for a 3-letter ISO 3166-1 alpha-3 code.

    Parameters
    ----------
    alpha3_code : str or missing value
        Country code such as ``"ESP"``. Surrounding whitespace and letter case
        are ignored for the lookup.

    Returns
    -------
    str
        The country name from pycountry; ``"Unknown"`` when the input is
        missing, blank, or not a string; the input code itself when pycountry
        does not know it (so hover text still shows something useful).
    """
    # Type check first: truth-testing pd.NA, or the array pd.isna() returns
    # for list-likes, would raise.
    if not isinstance(alpha3_code, str) or not alpha3_code.strip():
        return "Unknown"
    try:
        country = pycountry.countries.get(alpha_3=alpha3_code.strip().upper())
    except LookupError:
        # NOTE(review): older pycountry releases are believed to raise KeyError
        # for unknown codes instead of returning None -- confirm.
        country = None
    # Fallback to the code if the name is not found.
    return getattr(country, "name", alpha3_code)


In [75]:
# Rows without an alpha-3 code cannot be placed on the map, so keep only the
# rows where the conversion succeeded (copy so later column writes are safe).
has_alpha3 = df['country_alpha3'].notna()
df_countries = df.loc[has_alpha3].copy()

# Human-readable country name for each remaining row.
country_names = df_countries['country_alpha3'].apply(get_country_name)
df_countries['country_name'] = country_names

print(f"Original rows: {len(df)}, Rows after country conversion: {len(df_countries)}")
preview_columns = ['production_country', 'country_alpha3', 'country_name']
print(df_countries[preview_columns].sample(5))


Original rows: 2467021, Rows after country conversion: 2283946
       production_country country_alpha3 country_name
269764                 ES            ESP        Spain
183903                 FR            FRA       France
305205                 IN            IND        India
381189                 ES            ESP        Spain
73561                  IS            ISL      Iceland


In [76]:
# --- 2. Handle genre ---
# Turn the comma-delimited genre string into one row per movie-genre pair and
# trim stray whitespace around each genre label.
df_exploded = (
    df_countries
    .assign(genre=lambda frame: frame['genre'].str.split(','))
    .explode('genre')
    .assign(genre=lambda frame: frame['genre'].str.strip())
)


In [77]:
# Show the first rows after exploding: each (title, country) pair now appears
# once per genre.
print("\nSample of exploded genres:")
print(df_exploded[['title', 'country_alpha3', 'genre']].head())



Sample of exploded genres:
                           title country_alpha3        genre
0                     Miss Jerry            AUS      Romance
0                     Miss Jerry            DEU      Romance
0                     Miss Jerry            HUN      Romance
0                     Miss Jerry            USA      Romance
1  The Corbett-Fitzsimmons Fight            RUS  Documentary


In [86]:
# --- 3. Select the focus genres shown on the interactive map ---
FOCUS_GENRES = plot_style.MOVIE_GENRES
# Keep the focus genres that actually occur in the data, in their original
# order. 'Drama' is NOT excluded by this filter (it is one of the focus genres
# and is present in the data), so the label below no longer claims it is.
# The distinct genres are computed once instead of once per candidate genre.
genres_in_data = set(df_exploded['genre'].dropna().unique())
INTERACTIVE_GENRES = [genre for genre in FOCUS_GENRES if genre in genres_in_data]
print(f"\nFocus Genres for map: {INTERACTIVE_GENRES}")



Focus Genres for map (excluding Drama): ['Drama', 'Comedy', 'Documentary', 'Romance', 'Action', 'Crime', 'Thriller', 'Horror', 'Adventure', 'Mystery']


In [87]:
# Restrict the movie-genre rows to the genres offered on the interactive map.
is_interactive_genre = df_exploded['genre'].isin(INTERACTIVE_GENRES)
df_genre_filtered = df_exploded.loc[is_interactive_genre].copy()


In [88]:
# --- 4. Calculate Total Movies per Country ---
# The denominator comes from df_countries, i.e. before any genre filtering.
# Distinct title_ids are counted so a title can never be counted twice for the
# same country.
total_movies_per_country = (
    df_countries
    .groupby(['country_alpha3', 'country_name'])['title_id']
    .nunique()
    .reset_index(name='total_movies')
)
print("\nTotal movies per country (sample):")
top_countries = total_movies_per_country.sort_values('total_movies', ascending=False)
print(top_countries.head())



Total movies per country (sample):
    country_alpha3    country_name  total_movies
198            USA   United States        136485
91             IND           India         46514
100            JPN           Japan         23689
68             GBR  United Kingdom         21604
65             FRA          France         12764


In [89]:
# --- 5. Calculate Genre-Specific Movies per Country ---
# df_genre_filtered holds one row per movie-genre pair for the selected genres,
# so counting distinct title_ids per (country, genre) gives the numerator.
genre_movies_per_country = (
    df_genre_filtered
    .groupby(['country_alpha3', 'country_name', 'genre'])['title_id']
    .nunique()
    .reset_index(name='genre_specific_movies')
)
print("\nGenre-specific movies per country (sample):")
print(genre_movies_per_country.head())



Genre-specific movies per country (sample):
  country_alpha3 country_name        genre  genre_specific_movies
0            ABW        Aruba       Comedy                      1
1            ABW        Aruba  Documentary                      2
2            ABW        Aruba        Drama                      1
3            AFG  Afghanistan       Action                      5
4            AFG  Afghanistan    Adventure                      3


In [90]:
# --- 6. Combine and Calculate Metric ---
# Attach each country's overall movie count to its per-genre counts. A left
# join keeps every (country, genre) row.
df_metric = genre_movies_per_country.merge(
    total_movies_per_country,
    how='left',
    on=['country_alpha3', 'country_name'],
)


In [91]:
# Genre strength = share of a country's movies that belong to the genre, in %.
genre_share = df_metric['genre_specific_movies'] / df_metric['total_movies']
df_metric['genre_strength_pct'] = genre_share * 100

print("\nFinal metric data (sample):")
strongest_first = df_metric.sort_values('genre_strength_pct', ascending=False)
print(strongest_first.head())


Final metric data (sample):
     country_alpha3                      country_name        genre  \
1450            VCT  Saint Vincent and the Grenadines  Documentary   
982             MSR                        Montserrat  Documentary   
62              ATA                        Antarctica  Documentary   
571             GUF                     French Guiana  Documentary   
60              ASM                    American Samoa       Comedy   

      genre_specific_movies  total_movies  genre_strength_pct  
1450                      1             1               100.0  
982                       1             1               100.0  
62                        3             3               100.0  
571                       1             1               100.0  
60                        1             1               100.0  


In [92]:
# Optional: countries with only a handful of movies produce extreme, noisy
# percentages (e.g. 1 of 1 = 100%), so require a minimum number of movies.
MIN_TOTAL_MOVIES_THRESHOLD = 10 # Example threshold
has_enough_movies = df_metric['total_movies'] >= MIN_TOTAL_MOVIES_THRESHOLD
df_metric_filtered = df_metric.loc[has_enough_movies]
print(f"\nRows in metric df: {len(df_metric)}, Rows after filtering for >= {MIN_TOTAL_MOVIES_THRESHOLD} total movies: {len(df_metric_filtered)}")
# The plot uses the filtered frame for a cleaner visualisation.
# To include every country instead, use: prepared_data = df_metric.copy()
prepared_data = df_metric_filtered.copy()


Rows in metric df: 1509, Rows after filtering for >= 10 total movies: 1405


In [94]:
import pandas as pd
import pycountry
import plotly.graph_objects as go
import plotly.io as pio
import plot_style # Your custom style file with GENRE_COLOR_MAP

# --- Helper function to lighten hex color ---
def lighten_hex_color_to_hex(hex_color, amount=0.85):
    """Blend a hex colour towards white and return it as a hex string.

    Each RGB channel ``c`` becomes ``int(c + (255 - c) * amount)``, clamped to
    ``0..255``. ``amount=0`` returns the colour unchanged and ``amount=1``
    returns white.

    Parameters
    ----------
    hex_color : str
        ``'#rrggbb'``, or the CSS shorthand ``'#rgb'`` (expanded to
        ``'#rrggbb'`` before lightening).
    amount : float, default 0.85
        Fraction of the distance to white to move each channel.

    Returns
    -------
    str
        Lower-case ``'#rrggbb'``. The neutral grey ``'#EEEEEE'`` is returned
        when ``hex_color`` is not a string, lacks the leading ``'#'``, has the
        wrong length, or contains non-hex characters.
    """
    fallback = '#EEEEEE'
    if not isinstance(hex_color, str) or not hex_color.startswith('#'):
        return fallback
    digits = hex_color.lstrip('#')
    if len(digits) == 3:
        # CSS shorthand: '#abc' means '#aabbcc'.
        digits = ''.join(ch * 2 for ch in digits)
    # Validate explicitly: int(x, 16) on its own also accepts signs and
    # whitespace (e.g. '+1', ' 1'), which are not valid colour digits.
    if len(digits) != 6 or any(ch not in '0123456789abcdefABCDEF' for ch in digits):
        return fallback
    try:
        channels = [int(digits[i:i + 2], 16) for i in (0, 2, 4)]
        lightened = [min(255, max(0, int(c + (255 - c) * amount))) for c in channels]
    except ValueError:
        # e.g. `amount` is NaN and cannot be converted to an integer
        return fallback
    return "#%02x%02x%02x" % tuple(lightened)

# --- Data Preparation ---
# Re-load the raw movie table so that this cell rebuilds `prepared_data` and
# `INTERACTIVE_GENRES` from scratch. Note that this rebinds `df`, replacing
# whatever an earlier cell stored under that name.
df = pd.read_parquet("../data/movies.parquet")

def get_alpha3(alpha2_code):
    """Convert a 2-letter ISO 3166-1 alpha-2 country code to its alpha-3 code.

    Surrounding whitespace and letter case are ignored. Returns ``None`` when
    the input is missing, empty, not a string, or not known to pycountry.
    """
    # Type check first: truth-testing pd.NA, or the array pd.isna() returns
    # for list-likes, would raise.
    if not isinstance(alpha2_code, str):
        return None
    code = alpha2_code.strip().upper()
    if not code:
        return None
    try:
        country = pycountry.countries.get(alpha_2=code)
    except LookupError:
        # NOTE(review): older pycountry releases are believed to raise KeyError
        # for unknown codes instead of returning None -- confirm.
        return None
    # `country` is None when the code is unknown.
    return getattr(country, "alpha_3", None)

# The freshly loaded 'production_country' column holds pipe-delimited codes
# (e.g. 'AU|DE|HU|US'). Split and explode to one row per (movie, country)
# BEFORE converting: looking up the combined string fails for every
# co-production, and those movies would then be silently dropped below.
df = (
    df.assign(production_country=df['production_country'].str.split('|'))
    .explode('production_country')
)
df['country_alpha3'] = df['production_country'].apply(get_alpha3)
# Rows whose code could not be converted cannot be plotted.
df_countries = df.dropna(subset=['country_alpha3']).copy()

def get_country_name(alpha3_code):
    """Return the country name for a 3-letter ISO 3166-1 alpha-3 code.

    Returns ``"Unknown"`` when the input is missing, blank, or not a string,
    and the input code itself when pycountry does not know it.
    """
    # Type check first: truth-testing pd.NA, or the array pd.isna() returns
    # for list-likes, would raise.
    if not isinstance(alpha3_code, str) or not alpha3_code.strip():
        return "Unknown"
    try:
        country = pycountry.countries.get(alpha_3=alpha3_code.strip().upper())
    except LookupError:
        # NOTE(review): older pycountry releases are believed to raise KeyError
        # for unknown codes instead of returning None -- confirm.
        country = None
    # Fall back to the code so hover text still shows something useful.
    return getattr(country, "name", alpha3_code)
# Human-readable country name for every row that has an alpha-3 code.
country_names = df_countries['country_alpha3'].apply(get_country_name)
df_countries['country_name'] = country_names

# One row per movie-genre pair, with whitespace trimmed from each genre label.
df_exploded = (
    df_countries
    .assign(genre=lambda frame: frame['genre'].str.split(','))
    .explode('genre')
    .assign(genre=lambda frame: frame['genre'].str.strip())
)

FOCUS_GENRES_FROM_STYLE = plot_style.MOVIE_GENRES
# Distinct genres present in the data, computed once (not once per candidate
# genre). Missing values are dropped so NaN can never become a selectable
# genre; the plotting code below calls string methods on the genre name.
genres_in_data = list(df_exploded['genre'].dropna().unique())
INTERACTIVE_GENRES = [
    genre for genre in FOCUS_GENRES_FROM_STYLE
    if genre in genres_in_data
]

if not INTERACTIVE_GENRES:
    # Fallback when none of the styled genres occur in the data: offer every
    # genre except 'Drama', or all genres if 'Drama' is the only one present.
    available_genres = [g for g in genres_in_data if g != 'Drama'] or genres_in_data
    INTERACTIVE_GENRES = available_genres
    if not INTERACTIVE_GENRES:
        raise ValueError("No suitable interactive genres found in the data.")
print(f"Using interactive genres: {INTERACTIVE_GENRES}")


# Keep only the movie-genre rows for the genres offered on the map.
is_interactive_genre = df_exploded['genre'].isin(INTERACTIVE_GENRES)
df_genre_filtered = df_exploded.loc[is_interactive_genre].copy()

# Denominator: distinct titles per country, before any genre filtering.
total_movies_per_country = (
    df_countries
    .groupby(['country_alpha3', 'country_name'])['title_id']
    .nunique()
    .reset_index(name='total_movies')
)
# Numerator: distinct titles per (country, genre).
genre_movies_per_country = (
    df_genre_filtered
    .groupby(['country_alpha3', 'country_name', 'genre'])['title_id']
    .nunique()
    .reset_index(name='genre_specific_movies')
)

# Left join so every (country, genre) row keeps its country total.
df_metric = genre_movies_per_country.merge(
    total_movies_per_country,
    how='left',
    on=['country_alpha3', 'country_name'],
)

# Guard against division by zero: a zero total becomes missing, and rows whose
# percentage is therefore missing are removed.
df_metric['total_movies_safe'] = df_metric['total_movies'].replace(0, pd.NA)
genre_share = df_metric['genre_specific_movies'] / df_metric['total_movies_safe']
df_metric['genre_strength_pct'] = genre_share * 100
df_metric.dropna(subset=['genre_strength_pct'], inplace=True)

# Countries below this many movies are hidden so tiny samples do not dominate.
MIN_TOTAL_MOVIES_THRESHOLD = 10
meets_threshold = df_metric['total_movies'] >= MIN_TOTAL_MOVIES_THRESHOLD
prepared_data = df_metric.loc[meets_threshold].copy()

if prepared_data.empty:
    if df_metric.empty:
        # Nothing to plot at all.
        raise ValueError("No data available for plotting after all preparation.")
    # The threshold removed everything: fall back to the unfiltered metric.
    print(f"Warning: No data after MIN_TOTAL_MOVIES_THRESHOLD ({MIN_TOTAL_MOVIES_THRESHOLD}). Using unfiltered metric data.")
    prepared_data = df_metric.copy()
# --- End of Data Prep ---


# --- Interactive Plotting with Enhancements ---
pio.templates.default = "plotly_white"
fig = go.Figure()

# The map opens on the first interactive genre.
initial_genre = INTERACTIVE_GENRES[0]
initial_df_slice = prepared_data.loc[prepared_data['genre'] == initial_genre]

# Colour ramp endpoints: a lightened tint up to the genre's own colour
# (grey when the genre has no entry in the colour map).
initial_genre_color = plot_style.GENRE_COLOR_MAP.get(initial_genre, '#777777')
initial_light_genre_color = lighten_hex_color_to_hex(initial_genre_color, 0.85)

# Upper end of the colour scale: the largest strength for this genre, with a
# fallback of 1.0 when there is no usable positive maximum.
initial_max_strength = (
    0.0 if initial_df_slice.empty else initial_df_slice['genre_strength_pct'].max()
)
if pd.isna(initial_max_strength) or initial_max_strength <= 0:
    initial_max_strength = 1.0

# Hover box: country name and code, genre strength, genre-specific count and
# overall count. <extra></extra> suppresses the secondary hover box.
initial_hover_template = (
    "<b>%{customdata[0]}</b> (%{location})<br>"
    "Genre Strength: %{z:.2f}%<br>"
    "%{text} Movies: %{customdata[1]}<br>"
    "Total Movies: %{customdata[2]}"
    "<extra></extra>"
)

# The single choropleth trace; the genre buttons later overwrite its data.
initial_trace = go.Choropleth(
    uid=f"choropleth_{initial_genre.lower().replace(' ', '_')}",  # unique ID for the trace
    name='',
    locationmode='ISO-3',
    locations=initial_df_slice['country_alpha3'],
    z=initial_df_slice['genre_strength_pct'],
    zmin=0,
    zmax=initial_max_strength,  # dynamic max for this genre
    colorscale=[[0, initial_light_genre_color], [1, initial_genre_color]],
    colorbar_title='Genre Strength (%)',
    text=initial_df_slice['genre'],
    customdata=initial_df_slice[['country_name', 'genre_specific_movies', 'total_movies']],
    hovertemplate=initial_hover_template,
)
fig.add_trace(initial_trace)

# One button per genre. Clicking a button swaps the data, colour ramp and
# colour range of the single choropleth trace and rewrites the figure title.
buttons = []
for i, genre_to_select in enumerate(INTERACTIVE_GENRES):
    df_slice = prepared_data.loc[prepared_data['genre'] == genre_to_select]
    genre_color = plot_style.GENRE_COLOR_MAP.get(genre_to_select, '#777777')
    light_genre_color = lighten_hex_color_to_hex(genre_color, 0.85)

    # Colour-scale maximum for this genre; 1.0 when no positive maximum exists.
    current_genre_max_strength = (
        0.0 if df_slice.empty else df_slice['genre_strength_pct'].max()
    )
    if pd.isna(current_genre_max_strength) or current_genre_max_strength <= 0:
        current_genre_max_strength = 1.0

    button_label = f'<span style="color:{genre_color}; font-weight:bold;">{genre_to_select}</span>'

    # Trace properties: each value is wrapped in an outer list, one entry per
    # trace being updated (there is only one trace).
    hover_columns = ['country_name', 'genre_specific_movies', 'total_movies']
    trace_updates = {
        'z': [df_slice['genre_strength_pct'].tolist()],
        'locations': [df_slice['country_alpha3'].tolist()],
        'customdata': [df_slice[hover_columns].values.tolist()],
        'text': [df_slice['genre'].tolist()],
        'colorscale': [[[0, light_genre_color], [1, genre_color]]],
        'zmax': [current_genre_max_strength],
        'zmin': [0],
    }
    # Layout properties: only the title changes.
    layout_updates = {
        'title.text': f'Global Genre Specialization: <span style="color:{genre_color};">{genre_to_select}</span> Movies'
    }

    buttons.append(
        dict(
            method='update',  # modifies the existing trace and the layout
            label=button_label,
            args=[trace_updates, layout_updates],
        )
    )

map_width = 1280
map_height = 720

# Base map appearance: no frame or coastlines, grey land, transparent background.
geo_settings = dict(
    showframe=False,
    showcoastlines=False,
    projection_type='natural earth',
    landcolor='rgb(205, 205, 205)',
    bgcolor='rgba(255,255,255,0)',
    subunitcolor='rgb(255, 255, 255)',
)

# Horizontal row of genre buttons, centred above the map.
genre_menu = dict(
    type="buttons",
    direction="left",
    active=0,
    buttons=buttons,
    pad={"r": 10, "t": 10, "b":10},
    showactive=True,
    x=0.5,
    xanchor="center",
    y=1.12,
    yanchor="top",
    bgcolor='rgba(0,0,0,0)',
    font=dict(size=11)
)

fig.update_layout(
    width=map_width,
    height=map_height,
    title_text=f'Global Genre Specialization: <span style="color:{initial_genre_color};">{initial_genre}</span> Movies',
    title_x=0.5,
    geo=geo_settings,
    updatemenus=[genre_menu],
    margin=dict(t=130, b=40, l=40, r=40),  # extra top margin leaves room for the buttons
    paper_bgcolor='white',
    plot_bgcolor='white',
)

# Keep the colour bar visible for the choropleth, then render the figure.
fig.update_traces(showscale=True)
fig.show()

Using interactive genres: ['Drama', 'Comedy', 'Documentary', 'Romance', 'Action', 'Crime', 'Thriller', 'Horror', 'Adventure', 'Mystery']


In [95]:
# Export the interactive map as an HTML file and open it in the default
# browser once written (auto_open=True).
fig.write_html("../figures/interactive_genre_map.html", auto_open=True)