In [31]:
import pandas as pd
import numpy as np 
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import pycountry_convert as pc
from plotly.subplots import make_subplots
import plotly.graph_objects as go


In [32]:
# Load the preprocessed movies table from a pickle file.
# NOTE: unpickling can execute arbitrary code, so only load files produced by
# this project's own processing step (never an untrusted download).
movies = pd.read_pickle("./data/Processed/movies_cleaned.pkl")

In [33]:
# Inspect the available columns of the loaded table.
movies.columns

Index(['actor_name', 'wikiID', 'freebaseID', 'movie_title',
       'movie_release_date', 'movie_bo_revenue', 'movie_runtime',
       'fbid_languages', 'fbid_countries', 'fbid_genres', 'year',
       'character_name', 'actor_date_of_birth', 'actor_gender',
       'actor_height_meters', 'actor_ethni_fbid', 'actor_age_at_movie_release',
       'fbid_char_actor_map', 'fbid_char', 'fbid_actor', 'summary', 'budget',
       'popularity', 'vote_average', 'imdbid', 'id', 'director',
       'director_gender', 'producer', 'producer_gender', 'writer',
       'writer_gender'],
      dtype='object')

In [34]:
# Narrow the frame to the three columns used by the plots in this notebook.
movies = movies.loc[:, ["year", "fbid_countries", "actor_gender"]]

In [37]:
# Number of countries kept for the per-country line chart. The previous
# comments and variable name said "top 25" while the code took 15; the value
# actually used (15) is kept so the figure does not change.
TOP_N_COUNTRIES = 15

# Keep only the rows whose actor is female
female_df = movies[movies['actor_gender'] == 'F']

# Flatten the list of countries: one row per (original row, country)
female_df = female_df.explode('fbid_countries')

# Aggregate total count for each country (value_counts sorts descending)
total_count_by_country = female_df['fbid_countries'].value_counts()

# Identify the TOP_N_COUNTRIES countries with the most rows
top_countries = total_count_by_country.head(TOP_N_COUNTRIES).index

# Filter the DataFrame to include only those top countries
filtered_df = female_df[female_df['fbid_countries'].isin(top_countries)]

# Group by year and country, then count the rows (one per female actor entry)
grouped_df = filtered_df.groupby(['year', 'fbid_countries']).size().reset_index(name='count')

def plot_movies(grouped_df, group_col='fbid_countries',
                title="Evolution of the Number of Movies with Female Actors by Country",
                yaxis_title="Number of Movies with Female Actors"):
    """Draw one line per group showing its yearly count.

    Parameters
    ----------
    grouped_df : pandas.DataFrame
        Long-format table with a 'year' column, a 'count' column and the
        grouping column named by ``group_col``.
    group_col : str, default 'fbid_countries'
        Column whose distinct values each become one trace.
    title : str
        Figure title. The default is the original per-country title.
    yaxis_title : str
        Y-axis label. The default is the original label.

    Returns
    -------
    plotly.graph_objects.Figure
        Figure with one ``Scatter`` trace per distinct value of ``group_col``,
        added in order of first appearance.

    Notes
    -----
    Calling ``plot_movies(grouped_df)`` behaves exactly as before; the extra
    keyword arguments only make the function reusable for other groupings.
    NOTE(review): the counts passed in by this notebook are row counts of the
    female-actor frame, so the default "Number of Movies" wording may not match
    what is being counted - confirm before publishing the figure.
    """
    # Create subplots
    fig = make_subplots()

    # Get the distinct groups, in order of first appearance
    groups = grouped_df[group_col].unique()

    # Add a trace for each group
    for group in groups:
        group_df = grouped_df[grouped_df[group_col] == group]
        fig.add_trace(go.Scatter(x=group_df['year'], y=group_df['count'],
                                 name=group))

    # Update layout
    fig.update_layout(title=title,
                      xaxis_title="Year",
                      yaxis_title=yaxis_title)

    return fig

# Build the per-country line chart from the aggregated counts and render it.
# `fig` is kept as a variable because the next cell exports it to HTML.
fig = plot_movies(grouped_df)
fig.show()

In [38]:
file_path = 'countries_movies.html'  # You can change the path and file name as needed
# Write the current figure to a standalone HTML file.
fig.write_html(file_path)

In [39]:
def country_to_continent(country_name):
    """Return the continent name for ``country_name``, or "Unknown".

    The lookup goes country name -> ISO alpha-2 code -> continent code ->
    continent name using ``pycountry_convert``.

    Parameters
    ----------
    country_name : str
        Country name to look up.

    Returns
    -------
    str
        The continent name, or the literal string "Unknown" when any step of
        the lookup fails (e.g. the name is not recognised by the library).
    """
    try:
        country_alpha2 = pc.country_name_to_country_alpha2(country_name)
        country_continent_code = pc.country_alpha2_to_continent_code(country_alpha2)
        return pc.convert_continent_code_to_continent_name(country_continent_code)
    except Exception:
        # Best-effort lookup: any ordinary failure maps to "Unknown". Unlike a
        # bare `except:`, this lets KeyboardInterrupt/SystemExit propagate so a
        # long `.apply(...)` over the frame can still be interrupted.
        return "Unknown"  # For countries not found in the pycountry_convert database

# Flatten the list of countries and repeat the other columns for each country
movies = movies.explode('fbid_countries')

# NOTE(review): the country -> continent mapping is currently disabled (the
# `.apply(country_to_continent)` call is commented out), so the 'continent'
# column is just a copy of the raw country values. Everything below that is
# labelled "continent" is therefore really per country - confirm whether the
# mapping should be re-enabled before trusting the "by Continent" figure.
movies['continent'] = movies['fbid_countries']#.apply(country_to_continent)

# Filter out only female actors
female_df = movies[movies['actor_gender'] == 'F']

# Group by year and 'continent', then count the rows in each group
grouped_df = female_df.groupby(['year', 'continent']).size().reset_index(name='count')
# Drop the "Unknown" bucket; with the mapping disabled this only removes rows
# whose country value is literally the string 'Unknown'.
grouped_df = grouped_df.loc[grouped_df['continent'] != 'Unknown']



# Line chart of yearly counts, one trace per value of the 'continent' column
def plot_continents(grouped_df):
    """Build a Plotly figure with one line per distinct 'continent' value.

    ``grouped_df`` must provide the columns 'year', 'continent' and 'count'.
    Traces are added in the order the 'continent' values first appear, and the
    finished figure is returned (it is not shown here).
    """
    figure = make_subplots()

    for label in grouped_df['continent'].unique():
        # Rows belonging to this trace only
        subset = grouped_df[grouped_df['continent'] == label]
        trace = go.Scatter(x=subset['year'], y=subset['count'], name=label)
        figure.add_trace(trace)

    figure.update_layout(
        title="Evolution of the Number of Movies with Female Actors by Continent",
        xaxis_title="Year",
        yaxis_title="Number of Movies with Female Actors",
    )

    return figure

# Build the line chart from the 'continent'-keyed counts and render it.
# `fig` is kept as a variable because the next cell exports it to HTML.
fig = plot_continents(grouped_df)
fig.show()

In [40]:
file_path = 'continent_movie.html'  # You can change the path and file name as needed
# Write the current figure to a standalone HTML file.
fig.write_html(file_path)

In [41]:
import random

# Country labels that need a map position.
# NOTE(review): these look like the distinct production-country values of the
# dataset - confirm the source. The list mixes present-day countries with
# historical states (e.g. 'Soviet Union', 'Weimar Republic'), UK constituent
# countries, and at least one non-country label ('German Language').
all_countries = [
    'United States of America', 'United Kingdom', 'Canada', 'India',
    'Poland', 'Germany', 'Hungary', 'France', 'Yugoslavia',
    'South Africa', 'England', 'Finland', 'South Korea', 'Ireland',
    'Hong Kong', 'China', 'Denmark', 'Italy', 'Mexico', 'Spain',
    'Israel', 'Australia', 'New Zealand', 'Japan', 'Belgium',
    'Luxembourg', 'Sweden', 'West Germany', 'Argentina', 'Chile',
    'Greece', 'Netherlands', 'Austria', 'Norway', 'Brazil', 'Colombia',
    'Switzerland', 'Lebanon', 'Scotland', 'Romania', 'Monaco',
    'Indonesia', 'Jamaica', 'Uruguay', 'Bolivia', 'Malta', 'Peru',
    'Serbia and Montenegro', 'Bulgaria', 'Serbia', 'Weimar Republic',
    'Turkey', 'Croatia', 'Morocco', 'Taiwan', 'Cameroon',
    'Soviet Union', 'Russia', 'Thailand', 'Algeria', 'Kuwait',
    'United Arab Emirates', 'Czech Republic', 'Iran', 'Iceland',
    'Egypt', 'Singapore', 'Malaysia', 'Vietnam', 'Philippines',
    'Pakistan', 'Czechoslovakia', 'Isle of Man',
    'Kingdom of Great Britain', 'Aruba',
    'Socialist Federal Republic of Yugoslavia', 'Cuba', 'Korea',
    'Portugal', 'Puerto Rico', 'Bosnia and Herzegovina', 'Lithuania',
    'Slovenia', 'Montenegro', 'Republic of Macedonia', 'Senegal',
    'Burkina Faso', 'Tunisia', 'Slovakia', 'Bahamas', 'Costa Rica',
    'Panama', 'Estonia', 'Albania', 'Nazi Germany', 'Kenya', 'Ukraine',
    'Libya', 'Wales', 'Cyprus', 'Palestinian territories',
    'Mandatory Palestine', 'Bhutan', 'Venezuela', 'Slovak Republic',
    'German Language', 'Afghanistan', 'Zimbabwe'
]

# Generating random coordinates for countries whose exact coordinates are unknown
# Latitude range: -90 to 90, Longitude range: -180 to 180
def generate_random_coordinates(rng=None):
    """Return a random ``(latitude, longitude)`` pair of whole degrees.

    Parameters
    ----------
    rng : random.Random, optional
        Random generator to draw from. Pass a seeded ``random.Random`` to get
        reproducible placeholders; when omitted, the module-level ``random``
        functions are used, exactly as before.

    Returns
    -------
    tuple of int
        ``(lat, lon)`` with ``-90 <= lat <= 90`` and ``-180 <= lon <= 180``
        (both bounds inclusive, as ``randint`` is inclusive).

    Notes
    -----
    The result is a placeholder, not a real location: a marker drawn at these
    coordinates will not sit on the country it stands for.
    """
    source = random if rng is None else rng
    return (source.randint(-90, 90), source.randint(-180, 180))

# Coordinates for known countries, as (latitude, longitude) in degrees.
# Entries are a mix of rough country centres (whole degrees) and capital-city
# positions (decimal values); historical states reuse the position noted in
# the trailing comment.
# 'Scotland' used to appear twice in this literal; the later entry silently
# overrode the earlier one, so only that effective value is kept (at the
# position of the first occurrence, which is where the key sat in the dict).
known_country_coordinates = {
    'United States of America': (38, -97),
    'United Kingdom': (55, -3),
    'Canada': (56, -106),
    'India': (21, 78),
    'Poland': (52, 19),
    'Germany': (51, 9),
    'Hungary': (47, 20),
    'France': (46, 2),
    'South Africa': (-30, 25),
    'Finland': (64, 26),
    'South Korea': (36, 128),
    'Ireland': (53, -8),
    'Hong Kong': (22, 114),
    'China': (35, 105),
    'Denmark': (56, 10),
    'Italy': (43, 12),
    'Mexico': (23, -102),
    'Spain': (40, -4),
    'Australia': (-25, 135),
    'Japan': (36, 138),
    'Israel': (31.5, 34.75),
    'New Zealand': (-41, 174),
    'Belgium': (50.85, 4.35),
    'Luxembourg': (49.61, 6.13),
    'Sweden': (59.33, 18.06),
    'Argentina': (-34.61, -58.38),
    'Chile': (-33.45, -70.66),
    'Greece': (37.98, 23.72),
    'Netherlands': (52.37, 4.89),
    'Austria': (48.21, 16.37),
    'Norway': (59.91, 10.75),
    'Brazil': (-15.78, -47.93),
    'Colombia': (4.61, -74.08),
    'Switzerland': (46.95, 7.45),
    'Lebanon': (33.89, 35.49),
    'Scotland': (55.9533, -3.1883),  # Using Edinburgh, Scotland
    'Romania': (44.43, 26.1),
    'Monaco': (43.73, 7.42),
    'Indonesia': (-6.2, 106.85),
    'Jamaica': (18.0179, -76.8099),
    'Uruguay': (-34.9011, -56.1645),
    'Bolivia': (-16.4897, -68.1193),
    'Malta': (35.8989, 14.5146),
    'Peru': (-12.0464, -77.0428),
    'Bulgaria': (42.6977, 23.3219),
    'Turkey': (39.9334, 32.8597),
    'Croatia': (45.8150, 15.9819),
    'Morocco': (33.5731, -7.5898),
    'Taiwan': (25.0330, 121.5654),
    'Cameroon': (3.8480, 11.5021),
    'Russia': (55.7558, 37.6176),
    'Thailand': (13.7563, 100.5018),
    'Algeria': (36.7372, 3.0863),
    'Kuwait': (29.3759, 47.9774),
    'United Arab Emirates': (24.4539, 54.3773),
    'Czech Republic': (50.0755, 14.4378),
    'Iran': (35.6892, 51.3890),
    'Iceland': (64.1466, -21.9426),
    'Egypt': (30.0444, 31.2357),
    'Singapore': (1.3521, 103.8198),
    'Malaysia': (3.1390, 101.6869),
    'Vietnam': (21.0285, 105.8542),
    'Philippines': (14.5995, 120.9842),
    'Pakistan': (33.6844, 73.0479),
    'Yugoslavia': (44.7866, 20.4489),  # Using Belgrade, Serbia
    'West Germany': (52.5200, 13.4050),  # Using Berlin, Germany
    'Soviet Union': (55.7558, 37.6176),  # Using Moscow, Russia
    'Serbia and Montenegro': (44.7866, 20.4489),  # Using Belgrade, Serbia
    'Weimar Republic': (52.5200, 13.4050),  # Using Berlin, Germany
    'Socialist Federal Republic of Yugoslavia': (44.7866, 20.4489),
    'Kingdom of Great Britain': (51.5074, -0.1278),  # Using London, UK
    'Nazi Germany': (52.5200, 13.4050),  # Using Berlin, Germany
    'Czechoslovakia': (50.0755, 14.4378),  # Using Prague, Czech Republic
    'Mandatory Palestine': (31.7683, 35.2137),  # Using Jerusalem
    'Palestinian territories': (31.9522, 35.2332),  # Using a central location
    'England': (51.5074, -0.1278),  # Using London, England
    'Wales': (51.4816, -3.1791),  # Using Cardiff, Wales
    'Isle of Man': (54.2361, -4.5481),
    'Aruba': (12.5211, -69.9683),
    'Cuba': (23.1136, -82.3666),
    'Korea': (37.5665, 126.9780),  # Using Seoul, South Korea
    'Portugal': (38.7223, -9.1393),
    'Puerto Rico': (18.2208, -66.5901),
    'Bosnia and Herzegovina': (43.8563, 18.4131),
    'Lithuania': (54.6872, 25.2797),
    'Slovenia': (46.0569, 14.5058)
}

{'United States of America': (38, -97),
 'United Kingdom': (55, -3),
 'Canada': (56, -106),
 'India': (21, 78),
 'Poland': (52, 19),
 'Germany': (51, 9),
 'Hungary': (47, 20),
 'France': (46, 2),
 'Yugoslavia': (44.7866, 20.4489),
 'South Africa': (-30, 25),
 'England': (51.5074, -0.1278),
 'Finland': (64, 26),
 'South Korea': (36, 128),
 'Ireland': (53, -8),
 'Hong Kong': (22, 114),
 'China': (35, 105),
 'Denmark': (56, 10),
 'Italy': (43, 12),
 'Mexico': (23, -102),
 'Spain': (40, -4),
 'Israel': (31.5, 34.75),
 'Australia': (-25, 135),
 'New Zealand': (-41, 174),
 'Japan': (36, 138),
 'Belgium': (50.85, 4.35),
 'Luxembourg': (49.61, 6.13),
 'Sweden': (59.33, 18.06),
 'West Germany': (52.52, 13.405),
 'Argentina': (-34.61, -58.38),
 'Chile': (-33.45, -70.66),
 'Greece': (37.98, 23.72),
 'Netherlands': (52.37, 4.89),
 'Austria': (48.21, 16.37),
 'Norway': (59.91, 10.75),
 'Brazil': (-15.78, -47.93),
 'Colombia': (4.61, -74.08),
 'Switzerland': (46.95, 7.45),
 'Lebanon': (33.89, 35.49)

In [42]:
# Labels from `all_countries` that have no entry in `known_country_coordinates`,
# kept in their original list order and displayed as the cell output.
countries_missing_coordinates = [
    name
    for name in all_countries
    if name not in known_country_coordinates
]
countries_missing_coordinates

['Serbia',
 'Montenegro',
 'Republic of Macedonia',
 'Senegal',
 'Burkina Faso',
 'Tunisia',
 'Slovakia',
 'Bahamas',
 'Costa Rica',
 'Panama',
 'Estonia',
 'Albania',
 'Kenya',
 'Ukraine',
 'Libya',
 'Cyprus',
 'Bhutan',
 'Venezuela',
 'Slovak Republic',
 'German Language',
 'Afghanistan',
 'Zimbabwe']

In [None]:


# Animated bubble map: one bubble per country, sized by the number of
# female-actor rows, with one animation frame per year.

# Recompute the per-country counts here instead of reusing `grouped_df`: that
# frame's 'continent' column only holds country names while the continent
# mapping in the earlier cell is commented out, so relying on it is fragile.
country_counts = (
    movies.loc[movies['actor_gender'] == 'F', ['year', 'fbid_countries']]
    .reset_index(drop=True)          # unique index, safe to explode again
    .explode('fbid_countries')       # no-op when the column is already flat
    .groupby(['year', 'fbid_countries'])
    .size()
    .reset_index(name='count')
)

# Only countries with a coordinate entry can be drawn. Report the others
# instead of failing with a KeyError while the frames are being built.
has_coordinates = country_counts['fbid_countries'].isin(list(known_country_coordinates))
countries_not_drawn = sorted(
    map(str, country_counts.loc[~has_coordinates, 'fbid_countries'].unique())
)
if countries_not_drawn:
    print(f"Omitted from the map (no coordinates): {', '.join(countries_not_drawn)}")
country_counts = country_counts[has_coordinates]
if country_counts.empty:
    raise ValueError("No female-actor rows with known country coordinates to plot")

# Initialize the figure with the first year
years = sorted(country_counts['year'].unique())
first_year = years[0]
initial_year_data = country_counts[country_counts['year'] == first_year]

# Adjust the size of the spheres
sizeref_value = 2. * country_counts['count'].max() / (40. ** 2) / 3  # Reduced sizeref for larger spheres


def _bubble_trace(year_data):
    """Return the Scattergeo trace for the rows of a single year."""
    positions = [known_country_coordinates[country] for country in year_data['fbid_countries']]
    return go.Scattergeo(
        lon=[lon for _, lon in positions],
        lat=[lat for lat, _ in positions],
        text=[f'{country}: {count} female actors'
              for country, count in zip(year_data['fbid_countries'], year_data['count'])],
        marker=dict(
            size=year_data['count'],
            sizemode='area',
            sizeref=sizeref_value,
            showscale=True
        )
    )


# The initial trace now carries lon/lat too, so the map is not empty before
# the animation is started.
fig = go.Figure(data=[_bubble_trace(initial_year_data)])

# Add frames for each year; the frame name is the year as a string
frames = [
    go.Frame(
        data=[_bubble_trace(country_counts[country_counts['year'] == year])],
        name=str(year)
    )
    for year in years
]

fig.frames = frames

# Update layout and add slider
fig.update_layout(
    title_text='Number of Female Actors by Country Over Time',
    showlegend=False,
    geo=dict(
        showland=True,
        landcolor="rgb(243, 243, 243)",
        countrycolor="rgb(204, 204, 204)",
    ),
    updatemenus=[{
        "buttons": [
            {
                "args": [None, {"frame": {"duration": 500, "redraw": True}, "fromcurrent": True}],
                "label": "Play",
                "method": "animate"
            },
            {
                "args": [[None], {"frame": {"duration": 0, "redraw": True}, "mode": "immediate", "transition": {"duration": 0}}],
                "label": "Pause",
                "method": "animate"
            }
        ],
        "direction": "left",
        "pad": {"r": 10, "t": 87},
        "showactive": False,
        "type": "buttons",
        "x": 0.1,
        "xanchor": "right",
        "y": 0,
        "yanchor": "top"
    }],
    sliders=[{
        "active": 0,
        "yanchor": "top",
        "xanchor": "left",
        "currentvalue": {
            "font": {"size": 20},
            "prefix": "Year:",
            "visible": True,
            "xanchor": "right"
        },
        "transition": {"duration": 300, "easing": "cubic-in-out"},
        "pad": {"b": 10, "t": 50},
        "len": 0.9,
        "x": 0.1,
        "y": 0,
        # Each step targets a frame by name, so it must use the same
        # str(year) that was given to the frame.
        "steps": [{"args": [[str(year)], {"frame": {"duration": 300, "redraw": True}, "mode": "immediate", "transition": {"duration": 300}}],
                  "label": str(year), "method": "animate"} for year in years]
    }]
)

# Show the figure
fig.show()