In [1]:
import pandas as pd
import numpy as np
import re
import json
import os

from src.data_completion import *
from src.data_preprocessing import *
from src.data_loading import *
from src.data_fetching import *
from src.data_visualization import *

import statsmodels.formula.api as smf

from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
import plotly.express as px
%load_ext autoreload
%autoreload 2

In [None]:

# Plotly demo on the iris sample dataset bundled with plotly.express:
# draw a scatter plot, display it, then export it as an HTML file.
df = px.data.iris()
fig = px.scatter(
    df,
    x="sepal_width",
    y="sepal_length",
    color="species",
    title="Sepal Dimensions",
)
fig.show()
fig.write_html("plot.html")

In [3]:
# Read the preprocessed CSV exports. The date columns are converted to
# datetime right after loading, for the three frames that need it.
full_movie_data_preprocessed = pd.read_csv(
    'data/preprocessed/full_movie_data_preprocessed.csv'
).assign(
    movie_release_date=lambda movies: pd.to_datetime(movies['movie_release_date'])
)
full_characters_data_preprocessed = pd.read_csv(
    'data/preprocessed/full_characters_data_preprocessed.csv'
)
subset_movie_with_full_data_on_lead_actors = pd.read_csv(
    'data/preprocessed/subset_movie_with_full_data_on_lead_actors.csv'
)
lead_actors_data_on_subset_movie = pd.read_csv(
    'data/preprocessed/lead_actors_data_on_subset_movie.csv'
).assign(actor_dob=lambda actors: pd.to_datetime(actors['actor_dob']))
characters_data_on_subset_movie = pd.read_csv(
    'data/preprocessed/characters_data_on_subset_movie.csv'
).assign(actor_dob=lambda characters: pd.to_datetime(characters['actor_dob']))


In [4]:
# Work on a deep copy so the loaded frame stays separate from this one.
movie_data_completed = full_movie_data_preprocessed.copy(deep=True)
# Last expression of the cell: its summary statistics are rendered as output.
movie_data_completed.describe()

Unnamed: 0,wikipedia_movie_id,movie_release_date,box_office_revenue,runtime,averageRating,numVotes,release_year,adjusted_box_office
count,3493.0,3493,3493.0,3493.0,3493.0,3493.0,3493.0,3493.0
mean,7225853.0,1995-05-02 01:52:57.440595456,41679770.0,107.578214,6.268394,60651.38,1994.808188,90749580.0
min,3746.0,1958-01-29 00:00:00,1.0,61.0,1.7,506.0,1958.0,1.913719
25%,1076567.0,1987-10-11 00:00:00,4106588.0,95.0,5.7,5905.0,1987.0,9125168.0
50%,3058252.0,1996-10-11 00:00:00,14715070.0,104.0,6.4,18507.0,1996.0,32386250.0
75%,9932614.0,2005-06-10 00:00:00,41000000.0,116.0,6.9,59226.0,2005.0,94593130.0
max,36814250.0,2012-11-02 00:00:00,963420400.0,198.0,8.8,2318314.0,2012.0,4022758000.0
std,8926649.0,,79774460.0,18.176518,0.972713,130017.2,11.856645,180633200.0


In [33]:
import plotly.graph_objects as go

def plotly_histogram(
    data: pd.DataFrame,
    columns: list,
    titles: list,
    labels: list,
    bins=50,
    log_scale=False,
    kdes=False,
    hue=None,
):
    """Build one Plotly histogram figure per requested column.

    Args:
        data (pd.DataFrame): The dataset to plot.
        columns (list): The columns from the dataset to plot.
        titles (list): The title of each figure, one per entry of ``columns``.
        labels (list): The x label of each figure, one per entry of ``columns``.
        bins (int or list, optional): The number of bins passed to Plotly as
            ``nbins``; a single value is applied to every column. Defaults to 50.
        log_scale (bool or list, optional): If true, the x axis is switched to
            a log scale, and so is the y axis when ``hue`` is None (the y axis
            stays linear when ``hue`` is given). Defaults to False.
        kdes (bool or list, optional): If true, a marginal box plot is added
            to the figure. Despite the parameter name, no KDE curve is drawn.
            Defaults to False.
        hue (str or None, optional): Name of the column in the dataset for
            color grouping. Defaults to None.

    Returns:
        list: One 500x500 Plotly figure per entry of ``columns``, in the same
        order. The figures are neither shown nor saved here.

    Raises:
        ValueError: If ``titles``, ``labels`` or a list-valued ``bins``,
            ``log_scale`` or ``kdes`` has fewer entries than ``columns``.
    """
    n_plots = len(columns)

    def _per_column(value, name):
        """Broadcast a scalar option to every column, or length-check a list."""
        if not isinstance(value, list):
            return [value] * n_plots
        _require_enough(value, name)
        return value

    def _require_enough(values, name):
        """Fail before any figure is built instead of mid-loop with an IndexError."""
        if len(values) < n_plots:
            raise ValueError(
                f"'{name}' has {len(values)} entries but {n_plots} columns were requested"
            )

    _require_enough(titles, "titles")
    _require_enough(labels, "labels")
    bins = _per_column(bins, "bins")
    log_scale = _per_column(log_scale, "log_scale")
    kdes = _per_column(kdes, "kdes")

    figs = []
    # zip stops after len(columns) items, so extra trailing entries in the
    # per-column sequences are ignored, as they were with index-based access.
    for col, title, label, nbins, use_log, with_box in zip(
        columns, titles, labels, bins, log_scale, kdes
    ):
        hist_fig = px.histogram(
            data,
            x=col,
            color=hue,
            nbins=nbins,
            title=title,
            labels={col: label},
            marginal="box" if with_box else None,
        )

        layout = dict(width=500, height=500)
        if use_log:
            layout.update(
                xaxis=dict(type="log"),
                # Counts are only drawn on a log axis when there is no color grouping.
                yaxis=dict(type="log" if hue is None else "linear"),
            )
        hist_fig.update_layout(**layout)
        figs.append(hist_fig)

    return figs

def histo_acots_plotly(actors: pd.DataFrame, hue=None):
    """Build the Plotly histograms describing the actors.

    Requests one 25-bin histogram for each of the "actor_height",
    "actor_age_at_release" and "actor_dob" columns.

    Args:
        actors (pd.DataFrame): The actors dataset.
        hue (str or None): Name of the column in the dataset for color grouping. Defaults to None.

    Returns:
        The result of ``plotly_histogram`` for these three columns.
    """
    # One (column, figure title, x-axis label) entry per histogram.
    specs = [
        ("actor_height", "Height of the actor", "Height (m)"),
        (
            "actor_age_at_release",
            "Age of the actor at the release of the movie",
            "Age (years)",
        ),
        ("actor_dob", "Date of birth of the actor", "Date of birth"),
    ]
    # Transpose the specs into the three parallel lists plotly_histogram expects.
    columns, titles, labels = (list(field) for field in zip(*specs))

    return plotly_histogram(actors, columns, titles, labels, bins=25, hue=hue)


def histo_movies_plotly(
    movies: pd.DataFrame,
    hue: str | None = None,
    axes: list[plt.Axes] = None,
):
    """Build the Plotly histograms describing the movies.

    Requests one 50-bin histogram for each of the "runtime",
    "box_office_revenue" and "movie_release_date" columns; only the box
    office revenue is requested on a log scale.

    Args:
        movies (pd.DataFrame): The movie dataset.
        hue (str | None, optional): Name of the column in the dataset for
            color grouping. Defaults to None.
        axes (list[plt.Axes], optional): Never read by this function.
            NOTE(review): looks like a leftover from a matplotlib version of
            this helper - confirm before removing. Defaults to None.

    Returns:
        The result of ``plotly_histogram`` for these three columns.
    """
    # One (column, figure title, x-axis label, log scale?) entry per histogram.
    specs = [
        ("runtime", "Runtime of the movie", "Runtime (min)", False),
        (
            "box_office_revenue",
            "Box office revenue of the movie",
            "Box office revenue (dollars, log scale)",
            True,
        ),
        ("movie_release_date", "Release date of the movie", "Release date", False),
    ]
    # Transpose the specs into the parallel lists plotly_histogram expects;
    # log_scale has to be a real list to be treated as per-column flags.
    columns, titles, labels, log_scale = (list(field) for field in zip(*specs))

    return plotly_histogram(
        movies,
        columns,
        titles,
        labels,
        bins=50,
        log_scale=log_scale,
        hue=hue,
    )


In [34]:
# Build the movie histograms (runtime, box office revenue, release date).
figs = histo_movies_plotly(movie_data_completed)
# `figs` is a list of figures, so each one is displayed individually.
for movie_fig in figs:
    movie_fig.show()

In [36]:
# Export the first and third histograms (runtime, release date) as HTML files;
# the second one (figs[1], box office revenue) is not exported here.
# NOTE(review): "moviers_release_date.html" looks like a typo for "movies_..." -
# kept unchanged in case something already references this file name.
html_exports = {0: "movies_runtime.html", 2: "moviers_release_date.html"}
for fig_index, html_path in html_exports.items():
    figs[fig_index].write_html(html_path)