In [2]:
import ast
import json

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from tqdm import tqdm


In [1]:
# !pip install wikipedia-api beautifulsoup4

# Generation of movies_director.csv

In [3]:
# Usage example: print every infobox field returned for a single Wikipedia page
page_id = 28463795  # Wikipedia page ID for the movie "Brun Bitter"
infobox_data = get_wikipedia_infobox_by_id(page_id)

# A falsy result (nothing returned, or an empty mapping) prints nothing.
for key, value in (infobox_data or {}).items():
    print(f"{key}: {value}")

Directed by: Sølve Skagen
Written by: Sølve Skagen Gunnar Staalesen
Starring: Frank Krog Kristin Kajander Anne Krigsvoll
Release date: 17 November 1998 ( 1998-11-17 )
Running time: 83 minutes
Country: Norway
Language: Norwegian


In [10]:
# Load the `wikipedia_movie_id` column of data/imdb_ratings.csv.
# `usecols` restricts parsing to that single column, the only one used here.
wiki_movies_id = pd.read_csv('data/imdb_ratings.csv', usecols=['wikipedia_movie_id'])['wikipedia_movie_id']

In [38]:
# Number of movie ids to look up
wiki_movies_id.shape[0]

44868

In [None]:
# Collect the "Directed by" infobox field for every movie id.
# NOTE(review): one helper call per id (44,868 ids in the recorded run); presumably each
# call hits Wikipedia, so consider caching the resulting table on disk — confirm.
director_records = []
for movie_id in tqdm(wiki_movies_id, desc="Processing movies"):
    infobox_data = get_wikipedia_infobox_by_id(movie_id)
    # Skip pages for which no infobox was returned or that list no director.
    if infobox_data is None or "Directed by" not in infobox_data:
        continue
    director_records.append({"wikipedia_movie_id": movie_id, "Director": infobox_data["Directed by"]})

# Explicit columns keep the expected schema even when nothing was collected,
# so later `movie_directors['Director']` accesses do not raise KeyError.
movie_directors = pd.DataFrame(director_records, columns=["wikipedia_movie_id", "Director"])

In [40]:
# Remove fully identical (movie id, director) rows, keeping the first occurrence.
movie_directors = movie_directors.drop_duplicates(keep="first")

In [58]:
# Name/gender reference table; `attribute_gender_to_dir` reads its
# Name, Gender and Probability columns.
gender_db = pd.read_csv(filepath_or_buffer="src/data/name_gender_dataset.csv")

In [62]:
def attribute_gender_to_dir(name_dir, name_db=None):
    """Guess a director's gender from the words of their name.

    Every whitespace-separated word of ``name_dir`` is looked up in the ``Name``
    column of the reference table. Among the words that are found, the one with
    the highest ``Probability`` decides the result; on a tie the earliest word
    wins. When a name occurs on several rows, only its first row is considered.

    Parameters
    ----------
    name_dir : str or any
        Director name. Non-string values are converted with ``str``; missing
        values (``None`` or NaN) return ``None`` without any lookup.
    name_db : pandas.DataFrame, optional
        Table with ``Name``, ``Gender`` and ``Probability`` columns. Defaults to
        the notebook-level ``gender_db``.

    Returns
    -------
    The ``Gender`` value of the best-matching word, or ``None`` when no word is
    found with a probability greater than 0.
    """
    # A missing director would otherwise be searched as the literal text "nan"/"None".
    if name_dir is None or (isinstance(name_dir, float) and name_dir != name_dir):
        return None
    if not isinstance(name_dir, str):
        name_dir = str(name_dir)

    words = name_dir.split()
    if not words:
        return None

    db = gender_db if name_db is None else name_db

    # One pass over the reference table per director (instead of three per word):
    # keep only the rows matching any word, then the first row of each name.
    matches = db.loc[db['Name'].isin(words), ['Name', 'Gender', 'Probability']]
    first_rows = matches.drop_duplicates(subset='Name', keep='first').set_index('Name')

    current_best_prob = 0
    gender = None
    for word in words:
        if word not in first_rows.index:
            continue
        prob = first_rows.at[word, 'Probability']
        # Strict comparison: an equal probability never replaces an earlier word.
        if prob > current_best_prob:
            current_best_prob = prob
            gender = first_rows.at[word, 'Gender']
    return gender
                

In [65]:
# Give each director the gender of their best-matching name word found in gender_db["Name"];
# directors with no matching word get None (a missing value), not an "Unknown" label.
tqdm.pandas()  # provides `progress_apply` on pandas objects
movie_directors['Gender'] = movie_directors['Director'].progress_apply(attribute_gender_to_dir)

100%|██████████| 42718/42718 [38:24<00:00, 18.54it/s]  


In [69]:
# Keep only the directors for which a gender was found.
has_gender = movie_directors['Gender'].notna()
movie_directors = movie_directors[has_gender]

In [11]:
# Persist the director table (row index not written) for the analysis part.
movie_directors.to_csv(path_or_buf='data/movies_director.csv', index=False)

# Analysis

In [3]:
# Reload the director table from disk.
# NOTE(review): the recorded run raised FileNotFoundError on this relative path —
# presumably the kernel's working directory did not contain data/; confirm before re-running.
movie_directors = pd.read_csv(filepath_or_buffer='data/movies_director.csv')

FileNotFoundError: [Errno 2] No such file or directory: 'data/movies_director.csv'

In [None]:
# Preview the first five rows
movie_directors.head(n=5)

In [14]:
# Number of rows of the director table per Gender value
fig = px.histogram(
    data_frame=movie_directors,
    x='Gender',
    title='Gender distribution of movies director',
)
fig.show()

In [75]:
# Goal: plot the distribution of movie box office revenue by gender of the movie director.

# Step 1 - attach director name and gender to the movie metadata (matched on
# wikipedia_movie_id), then keep only the movies for which a director is known.
movies_metadata = pd.read_csv('data/movie.metadata_filtered.csv')
directors_by_movie = movie_directors.set_index('wikipedia_movie_id')
movies_with_director_columns = movies_metadata.join(directors_by_movie, on='wikipedia_movie_id')
has_director = movies_with_director_columns['Director'].notna()
merged_movies_and_directors = movies_with_director_columns[has_director]

In [76]:
# Box office revenue of the movies whose director gender is 'F'
merged_movies_and_directors.loc[merged_movies_and_directors['Gender'] == 'F', 'box_office_revenue']

9                NaN
42        34331783.0
51               NaN
53         3960327.0
70               NaN
            ...     
60131            NaN
60147    494471524.0
60161            NaN
60182            NaN
60193            NaN
Name: box_office_revenue, Length: 4035, dtype: float64

In [77]:
# Per-gender subsets of the merged table
director_gender = merged_movies_and_directors['Gender']
masc_directors = merged_movies_and_directors[director_gender == 'M']
fem_directors = merged_movies_and_directors[director_gender == 'F']

# Movies with both a revenue and a runtime, ordered by release date
box_office_merged = (
    merged_movies_and_directors
    .dropna(subset=['box_office_revenue', 'movie_runtime'])
    .sort_values('release_date')
)

In [88]:
# Step 2 - revenue over release date: marker size is the runtime, colour the director's gender
fig = px.scatter(
    data_frame=box_office_merged,
    x="release_date",
    y="box_office_revenue",
    size="movie_runtime",
    color="Gender",
    hover_name="movie_name",
    range_y=[0, 3_000_000_000],  # y axis range: 0 to 3 billion
)
fig.show()

In [96]:
def map_cluster(mapping, elem):
    """Return the generic name whose variants contain ``elem``.

    ``mapping`` associates each generic name with a container of variants.
    The first generic name (in the mapping's iteration order) whose variants
    contain ``elem`` is returned; when there is none, ``elem`` is returned as is.
    """
    return next(
        (generic for generic, variants in mapping.items() if elem in variants),
        elem,
    )

In [97]:
# Load the cluster definitions. The encoding is given explicitly so that reading
# the JSON file does not depend on the platform's default text encoding.
with open("src/data/clusters.json", 'r', encoding="utf-8") as file:
    data = json.load(file)

# Genre clusters, passed to `map_cluster` as its generic -> variants mapping.
genres_cluster = data['Genres']

In [98]:
def _cluster_genres(genres, mapping):
    """Return the order-preserving, de-duplicated list of clustered genres of one movie.

    ``genres`` may be an actual sequence of labels or the text form of one: a
    column read back from CSV holds strings, and iterating such a string directly
    would map its individual characters instead of its genres.
    Missing values (``None`` / NaN) and blank strings give an empty list.
    """
    if genres is None or (isinstance(genres, float) and genres != genres):
        return []
    if isinstance(genres, str):
        if not genres.strip():
            return []
        try:
            parsed = ast.literal_eval(genres)
        except (ValueError, SyntaxError):
            parsed = None
        # NOTE(review): assumes the stored text is a Python-literal list such as
        # "['Drama', 'Comedy']" — confirm against the CSV; any other text is kept
        # as one single label.
        genres = parsed if isinstance(parsed, (list, tuple, set)) else [genres]
    # dict.fromkeys drops duplicates while keeping first-seen order.
    return list(dict.fromkeys(map_cluster(mapping, elem) for elem in genres))


# Replace each movie's genres by their generic clusters (safe to re-run: lists pass through unchanged).
merged_movies_and_directors['genres'] = merged_movies_and_directors['genres'].apply(
    lambda genres: _cluster_genres(genres, genres_cluster)
)