In [10]:
import numpy as np
import pandas as pd 
from datasets import load_dataset
import tqdm as tqdm
import matplotlib.pyplot as plt
import plotly.express as px


In [11]:
# Fetch the TMDB regression dataset through `datasets.load_dataset` and convert
# its "train" split into a pandas DataFrame for the rest of the analysis.
regression_dataset = load_dataset("ada-datadruids/regression_dataset_tmdb")
train_split = regression_dataset["train"]
regression_dataset_df = train_split.to_pandas()

In [12]:
# Display the column index of the TMDB frame to see which fields are available.
regression_dataset_df.columns

Index(['id', 'title', 'vote_average', 'vote_count', 'status', 'release_date',
       'revenue', 'runtime', 'adult', 'backdrop_path', 'budget', 'homepage',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'tagline', 'genres',
       'production_companies', 'production_countries', 'spoken_languages',
       'keywords', 'based_on_book'],
      dtype='object')

In [13]:
# Read the merged book/movie table from disk (path is relative to the notebook's
# working directory) and show its columns.
# NOTE(review): the displayed columns include 'Unnamed: 0', presumably an index
# that was written to the CSV — confirm whether it should be dropped on load.
final_dataset_path = "../data/final_dataset.csv"
books_movie_dataset = pd.read_csv(final_dataset_path)
books_movie_dataset.columns

Index(['Unnamed: 0', 'movie_id', 'movie_cluster', 'movie_name', 'release_date',
       'runtime_x', 'language', 'country', 'genres', 'movie_year', 'index',
       'id', 'BookTitle', 'BookYear', 'Author', 'full name', 'tmdb_id',
       'id_goodreads', 'imdbid', 'runtime_y', 'revenue', 'budget', 'rb_ratio',
       'book_id', 'normalized_rating_x', 'standardized_rating_x',
       'normalized_rating_y', 'standardized_rating_y', 'length',
       'review_count'],
      dtype='object')

In [14]:
from utils.general_utils import adjust_for_inflation_final_dataset

# Keep only the rows whose revenue is not equal to zero, then pass both the full
# and the filtered frame to the project helper.
# NOTE(review): the helper's return value is not visible here — presumably the
# filtered frame with inflation-adjusted monetary columns; confirm in
# utils.general_utils.
revenue_is_nonzero = books_movie_dataset['revenue'].ne(0)
books_movie_dataset_cleaned = books_movie_dataset[revenue_is_nonzero]
books_movie_dataset_cleaned = adjust_for_inflation_final_dataset(
    books_movie_dataset,
    books_movie_dataset_cleaned,
)

In [15]:
# Count the rows per (book_id, BookTitle) pair and order the result so the most
# frequently occurring books come first. Each row is presumably one movie
# adaptation of the book — verify against the dataset construction.
book_adaptations = (
    books_movie_dataset_cleaned
    .groupby(['book_id', 'BookTitle'])
    .size()
    .reset_index(name='adaptation_count')
    .sort_values(by='adaptation_count', ascending=False)
)

# Show only the books that occur more than once.
adapted_more_than_once = book_adaptations['adaptation_count'] > 1
print(book_adaptations[adapted_more_than_once])

        book_id                                   BookTitle  adaptation_count
297    414895.0                       La Planète des singes                 7
629  35031085.0                                Frankenstein                 7
405    897831.0                                The Exorcist                 5
114     34268.0  Peter Pan, or The Boy Who Wouldn't Grow Up                 5
47       7190.0                     Les Trois Mousquetaires                 5
..          ...                                         ...               ...
50       7604.0                                      Lolita                 2
16       2493.0                            The Time Machine                 2
618  19161905.0                                 The Getaway                 2
632  40940649.0                                 I Am Legend                 2
574   6356906.0                     The Apple Dumpling Gang                 2

[87 rows x 3 columns]


In [16]:
def _rows_for_book(book_title):
    """Return the rows of ``books_movie_dataset_cleaned`` whose ``BookTitle`` equals ``book_title``."""
    title_matches = books_movie_dataset_cleaned['BookTitle'] == book_title
    return books_movie_dataset_cleaned[title_matches]


# One frame per book among the five with the highest adaptation counts.
# NOTE(review): the counts were grouped by (book_id, BookTitle) but these
# selections match on BookTitle only — confirm no two books share a title.
Planete_singes = _rows_for_book('La Planète des singes')
frankenstein = _rows_for_book('Frankenstein')
exorcist = _rows_for_book('The Exorcist')
peter_pan = _rows_for_book("Peter Pan, or The Boy Who Wouldn't Grow Up")
trois_mousquetaires = _rows_for_book('Les Trois Mousquetaires')

In [29]:
# Titles of the five books with the highest adaptation counts.
books_with_multiple_films = [
    'La Planète des singes',
    'Frankenstein',
    'The Exorcist',
    "Peter Pan, or The Boy Who Wouldn't Grow Up",
    'Les Trois Mousquetaires',
]

# BookIdentifier holds the title for the selected books and NaN for every other
# row. The column is added with `.assign` (which returns a new frame that is then
# rebound to the same name) instead of item assignment, so a frame that may be a
# slice of another frame is never written to in place and re-running the cell
# yields the same result.
is_selected_book = books_movie_dataset_cleaned['BookTitle'].isin(books_with_multiple_films)
books_movie_dataset_cleaned = books_movie_dataset_cleaned.assign(
    BookIdentifier=books_movie_dataset_cleaned['BookTitle'].where(is_selected_book)
)

# Keep only the rows that belong to one of the selected books; the explicit copy
# makes the result independent of the source frame.
books_filtered = books_movie_dataset_cleaned[
    books_movie_dataset_cleaned['BookIdentifier'].notna()
].copy()

In [31]:
# Scatter plot of revenue against movie release year for the selected books,
# one colour per book.
fig = px.scatter(
    books_filtered,
    x='movie_year',
    y='revenue',
    color='BookIdentifier',
    # Explicit custom_data fixes the order of the values that the hover template
    # below reads as customdata[0] and customdata[1].
    custom_data=['BookTitle', 'BookYear'],
    title='Revenue for different Books adapted multiple Times',
    # Human-readable names for the columns that are actually drawn
    # (axis titles and legend title).
    labels={
        'movie_year': 'Movie Release Year',
        'revenue': 'Revenue',
        'BookIdentifier': 'Book',
    },
    template='plotly_white'
)

# The y value is the revenue, so the hover line is labelled accordingly.
fig.update_traces(
    hovertemplate='<b>Book Title:</b> %{customdata[0]}<br>' +
                  '<b>Book Release Year:</b> %{customdata[1]}<br>' +
                  '<b>Movie Release Year:</b> %{x}<br>' +
                  '<b>Revenue:</b> %{y:,.0f}<br>'
)

fig.show()
