# Dataset from IMDB downloaded from https://developer.imdb.com/non-commercial-datasets/
## Please download title.basics.tsv.gz and title.ratings.tsv.gz, unzip them, and put them in the data/ folder

In [11]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import importlib
import os
import kagglehub
import ast
import math
import warnings
warnings.filterwarnings("ignore")
import plotly.express as px
import json

In [2]:
import sys
# Make the project's helper modules importable. The entry is a relative path,
# so it is resolved against the current working directory of the kernel.
sys.path.append('scripts/')
# NOTE(review): the names merge_goodreads, merge_cmu and merge_imdb imported
# here as modules are rebound to DataFrames by later cells of this notebook;
# after those cells run, the module objects are no longer reachable by name.
import scraping, merge_goodreads, merge_cmu, merge_imdb
# The star imports are what make the helpers callable unqualified in later
# cells (e.g. scrap_book_to_movie, clean_spaces, merge_with_CMU,
# merge_with_imdb) -- presumably each comes from the matching module; confirm.
from scraping import *
from merge_goodreads import *
from merge_cmu import *
from merge_imdb import *

# Scraping data from wikipedia
We must first define the URLs that we will scrape data from. They will allow us to build a mapping between books and their film adaptations.

In [3]:
# Wikipedia list pages to scrape, in the order they will be processed
urls = [
    "https://en.wikipedia.org/wiki/List_of_fiction_works_made_into_feature_films_(0%E2%80%939,_A%E2%80%93C)",
    "https://en.wikipedia.org/wiki/List_of_fiction_works_made_into_feature_films_(D%E2%80%93J)",
    "https://en.wikipedia.org/wiki/List_of_fiction_works_made_into_feature_films_(K%E2%80%93R)",
    "https://en.wikipedia.org/wiki/List_of_fiction_works_made_into_feature_films_(S%E2%80%93Z)",
    "https://en.wikipedia.org/wiki/List_of_short_fiction_made_into_feature_films",
    "https://en.wikipedia.org/wiki/List_of_children%27s_books_made_into_feature_films",
]

# Individual names for each page, in the same order as the list above
url_0_C, url_D_J, url_K_R, url_S_Z, url_short, url_kids = urls

Then, we scrape and process data from these Wikipedia pages.

In [4]:
# Scrape every selected page and post-process each resulting table
dataframes = [
    scrap_post_processing(scrap_book_to_movie(page_url))
    for page_url in urls
]

# Stack the per-page tables, drop rows that are exact duplicates,
# and renumber the index from 0
book_adaptations = (
    pd.concat(dataframes, ignore_index=True)
    .drop_duplicates()
    .reset_index(drop=True)
)

We now have a dataframe with 4941 film adaptations together with the book they are adapting.

# Merge with Goodreads
We will now merge the book to movie mapping with the goodreads dataset to have additional information on the books.


First, we download the dataset from Kaggle.

In [5]:
# Fetch the Goodreads dataset through kagglehub. The returned value is kept in
# `path` and handed to books_csv_to_df when the Goodreads frame is built.
path = kagglehub.dataset_download("bahramjannesarr/goodreads-book-datasets-10m")



In [6]:
# Goodreads side: add normalised author / title columns used only as join keys
df_goodreads = books_csv_to_df(path)
df_goodreads = df_goodreads.assign(
    merge_authors=clean_spaces(df_goodreads['Authors']),
    merge_names=clean_spaces(df_goodreads['Name']),
).assign(
    merge_names=lambda frame: remove_parenthesis(frame['merge_names']),
)

# Wikipedia side: same normalisation, applied to a copy so that
# book_adaptations itself is left untouched
df_movies = book_adaptations.copy()
df_movies = df_movies.assign(
    merge_authors=clean_spaces(df_movies['BookAuthor']),
    merge_names=clean_spaces(df_movies['BookTitle']),
).assign(
    merge_names=lambda frame: remove_parenthesis(frame['merge_names']),
)

# Columns kept after the join (still under their Goodreads names at this point)
columns = [
    'FilmYear', 'FilmTitle',
    'BookAuthor', 'BookTitle', 'BookStartYear', 'BookEndYear',
    'Description', 'Rating', 'RatingDistTotal', 'Language',
]

merge_goodreads = (
    # Right join: every scraped adaptation is kept, with or without a Goodreads match
    df_goodreads.merge(
        right=df_movies,
        how="right",
        on=['merge_authors', 'merge_names'],
        copy=False,
    )
    # Keep a single row per original adaptation entry
    .drop_duplicates(subset=df_movies.columns)
    .reset_index(drop=True)
    .loc[:, columns]
    .rename(columns={
        'RatingDistTotal': 'BookRatingNb',
        'Description': 'BookDescription',
        'Rating': 'BookRating',
        'Language': 'BookLanguage',
    })
    # Keep only the text after the last ":" of the rating-count string;
    # any non-string value (e.g. a missing match) becomes None
    .assign(
        BookRatingNb=lambda frame: frame['BookRatingNb'].map(
            lambda raw: raw.split(":")[-1] if isinstance(raw, str) else None
        )
    )
)
merge_goodreads

Unnamed: 0,FilmYear,FilmTitle,BookAuthor,BookTitle,BookStartYear,BookEndYear,BookDescription,BookRating,BookRatingNb,BookLanguage
0,2002,25th Hour,David Benioff,The 25th Hour,2001,2001,David Benioff's character-driven debut novel; ...,3.81,5015,
1,2015,Grasshopper,Kōtarō Isaka,3 Assassins,2004,2004,,,,
2,1961,"Murder, She Said",Agatha Christie,4.50 from Paddington,1957,1957,"For an instant the two trains ran together, si...",3.95,41729,eng
3,2008,Crime Is Our Business,Agatha Christie,4.50 from Paddington,1957,1957,"For an instant the two trains ran together, si...",3.95,41729,eng
4,1990,Die Hard 2,Walter Wager,58 Minutes,1987,1987,,3.55,278,
...,...,...,...,...,...,...,...,...,...,...
4936,2015,Z for Zachariah,Robert C. O'Brien,Z for Zachariah,1974,1974,,3.66,13608,eng
4937,2005,Zathura,Chris Van Allsburg,Zathura,2002,2002,,3.78,2465,
4938,1999,Zenon: Girl of the 21st Century,Marilyn Sadler,Zenon: Girl of the 21st Century,1997,1997,,,,
4939,2001,Zenon: The Zequel,Marilyn Sadler,Zenon: Girl of the 21st Century,1997,1997,,,,


# Merge with CMU
We will now merge this data with the CMU dataset to add extra information on these films.

In [7]:
# Merge the book/adaptation frame with the CMU movie metadata.
# NOTE(review): per the original comment the match is on film title and film
# year -- this happens inside merge_with_CMU; confirm against that helper.
# NOTE(review): this assignment rebinds the name `merge_cmu`, which was also
# imported as a module at the top of the notebook.
merge_cmu = merge_with_CMU(merge_goodreads)
# Last expression of the cell: display the merged frame
merge_cmu

Unnamed: 0,wikipedia_id,movie_name,movie_date,box_office,runtime,language,countries,genres,clean_name,BookAuthor,BookTitle,BookStartYear,BookEndYear,BookDescription,BookRating,BookRatingNb,BookLanguage
0,77856,Mary Poppins,1964,102272727.0,139.0,English Language,United States of America,"Children's/Family, Musical, Fantasy, Comedy, D...",marypoppins,P. L. Travers,Mary Poppins,1934,1988,,4.03,110287,eng
1,26878691,Mysterious Island,1982,,100.0,Standard Mandarin,Hong Kong,"Action/Adventure, Wuxia, Martial Arts Film, Ch...",mysteriousisland,Jules Verne,The Mysterious Island,1874,1874,,4.11,42044,eng
6,11633165,Innocence,1997,,110.0,Turkish Language,Turkey,"Crime Fiction, Drama, Crime Drama",innocence,Frank Wedekind,"Mine-Haha, or On the Bodily Education of Young...",1903,1903,,,,
7,1369204,Juarez,1939,,125.0,"English Language, Spanish Language",United States of America,"Costume drama, Biographical film, Historical f...",juarez,Bertita Harding,The Phantom Crown: The Story of Maximilian & C...,1934,1934,,,,
8,164388,The Great Santini,1979,4702575.0,115.0,English Language,United States of America,"Family Drama, Drama",thegreatsantini,Pat Conroy,The Great Santini,1976,1976,,4.14,29091,eng
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8097,2238130,Crack-Up,1946,,93.0,English Language,United States of America,"Thriller, Psychological thriller, Black-and-wh...",crack-up,Fredric Brown,Madman's Holiday,1943,1943,,,,
8098,24180469,Angel,1966,,7.0,English Language,Canada,"Short Film, Animation",angel,Elizabeth Taylor,Angel,1957,1957,"At the turn of the century, 15-year-old Angeli...",3.85,1924,
8099,5714284,The Four Feathers,1939,,130.0,English Language,"England, United Kingdom","Adventure, War film, Action/Adventure, Drama, ...",thefourfeathers,A. E. W. Mason,The Four Feathers,1902,1902,"Just before sailing off to war in the Sudan, B...",3.91,4516,eng
8104,4492078,How to Eat Fried Worms,2006,13040527.0,98.0,English Language,United States of America,"Family Film, Drama, Comedy, Film adaptation",howtoeatfriedworms,Thomas Rockwell,How to Eat Fried Worms,1973,1973,<i>How to Eat Fried Worms</i> has happily repu...,3.76,43948,


Now we have more information on the films that are an adaptation of a book, such as their genres. Let's add more information such as the film's rating by merging with IMDB's dataset.

# Merge with IMDB
## Dataset from IMDB downloaded from https://developer.imdb.com/non-commercial-datasets/
### Please download title.basics.tsv.gz and title.ratings.tsv.gz, unzip them, and put them in the data/ folder

In [8]:
# Enrich the CMU-merged adaptations with IMDB data via merge_with_imdb.
# NOTE(review): the "lines dropped during merge with IMDB" message shown below
# the cell is presumably printed by the helper -- confirm.
# NOTE(review): this assignment rebinds the name `merge_imdb`, which was also
# imported as a module at the top of the notebook.
merge_imdb = merge_with_imdb(merge_cmu)
# Last expression of the cell: display the merged frame
merge_imdb

lines dropped during merge with IMDB:  935


Unnamed: 0,wikipedia_id,MovieName,MovieYear,MovieBoxOffice,MovieRuntime,MovieLanguage,MovieCountries,MovieGenre,BookAuthor,BookTitle,BookStartYear,BookEndYear,BookDescription,BookRating,BookRatingNb,BookLanguage,MovieRating,MovieRatingNb
0,5954041,The Fairylogue and Radio-Plays,1908,,120.0,English Language,United States of America,"Silent film, Black-and-white",L. Frank Baum,The Wonderful Wizard of Oz,1900,1900,,3.99,348112,,5.2,78
1,19236804,Cleopatra,1912,,88.0,English Language,United States of America,"History, Silent film, Drama",Margaret George,The Memoirs of Cleopatra,1997,1997,,4.18,17986,eng,5.1,636
2,1090641,Atlantis,1913,,113.0,"English Language, Danish Language",Denmark,"Silent film, Drama, Indie, Black-and-white",Gerhart Hauptmann,Atlantis,1912,1912,,,,,6.5,502
3,22521524,Ivanhoe,1913,,,"Silent film, English Language",United States of America,"Swashbuckler films, Silent film, Drama, Adventure",Sir Walter Scott,Ivanhoe,1820,1820,,,,,5.6,98
4,18979350,Cinderella,1914,,52.0,"Silent film, English Language",United States of America,"Silent film, Fantasy, Black-and-white",Charles Perrault,Cinderella,1697,1697,The former Well Loved Tales series have been r...,3.97,1937,eng,6.0,1100
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2836,32621340,The Client,2011,15780280.0,122.0,Korean Language,South Korea,"Crime Fiction, Mystery, Thriller, Drama",John Grisham,The Client,1993,1993,,4.01,378641,eng,6.8,2040
2837,34647290,Love,2012,,150.0,Standard Mandarin,"China, Taiwan","Romance Film, Drama",Leo Tolstoy,Anna Karenina,1873,1887,,4.05,631550,eng,6.5,1668
2838,35020822,Helpless,2012,16175929.0,117.0,Korean Language,South Korea,"Thriller, Mystery",Miyuki Miyabe,All She Was Worth,1992,1992,Here is a deftly written thriller that is also...,3.69,2921,,6.7,2662
2839,34956831,Nightmare,2011,,,Mandarin Chinese,China,Horror,Cornell Woolrich,Nightmare,1942,1942,,,,,5.7,21


We now have 2841 films that are adaptations of known books and that we can use for analysis.

# Merge with CMU plots

In [9]:
# Load the CMU plot summaries: a tab-separated file read without a header row,
# with the two columns named explicitly
cmu_plots = pd.read_csv(
    "../MovieSummaries/plot_summaries.txt",
    sep='\t',
    header=None,
    names=['wikipedia_id', 'MoviePlot'],
)

In [10]:
# Attach the plot summary to each adapted movie; the left join keeps movies
# that have no matching plot
adapted_movies_df = merge_imdb.merge(
    right=cmu_plots,
    how="left",
    on='wikipedia_id',
    copy=False,
)

# Save the adapted-movies table to disk
adapted_movies_df.to_csv(
    '../data/adapted_movies.csv',
    index=False,
    errors='ignore',
)

# All movies merge (non-adapted movies included)

In [11]:
# All CMU movies (adapted or not), restricted to rows with a usable release
# year. Written as one chain so that no column is assigned on a filtered
# slice of the frame (chained assignment).
cmu_movies = (
    clean_cmu("../MovieSummaries/movie.metadata.tsv")
    # drop rows whose date is missing ...
    .dropna(subset=['movie_date'])
    # ... and rows whose date is the literal string "nan"
    .loc[lambda movies: movies.movie_date != "nan"]
    # the remaining dates are converted to integers
    .astype({'movie_date': 'int64'})
)

# Enrich with IMDB data, then attach plot summaries; the left join keeps
# movies that have no matching plot
all_movies_df = merge_with_imdb(cmu_movies).merge(
    right=cmu_plots,
    how="left",
    on='wikipedia_id',
    copy=False,
)

# Save the full movie table to disk
all_movies_df.to_csv('../data/all_movies.csv', index=False, errors='ignore')
all_movies_df

lines dropped during merge with IMDB:  30201


Unnamed: 0,wikipedia_id,MovieName,MovieYear,MovieBoxOffice,MovieRuntime,MovieLanguage,MovieCountries,MovieGenre,MovieRating,MovieRatingNb,MoviePlot
0,10109752,Miss Jerry,1894,,,Silent film,United States of America,"Short Film, Silent film, Indie, Black-and-white",5.4,215,After finding out that her father is suffering...
1,28703057,The Corbett-Fitzsimmons Fight,1897,100000.0,,,,Sports,5.2,541,The film no longer exists in its entirety; how...
2,142995,The Story of the Kelly Gang,1906,,70.0,English Language,Australia,"Crime Fiction, Silent film, Biography, Indie, ...",6.0,947,The Story of the Kelly Gangs tone is of sorrow...
3,4849466,L'Enfant prodigue,1907,,90.0,French Language,France,"Short Film, Silent film, Drama",5.7,28,
4,32986669,Robbery Under Arms,1907,,,Silent film,Australia,"Silent film, Drama",4.3,27,Key scenes of the film included the branding o...
...,...,...,...,...,...,...,...,...,...,...,...
44633,21216680,TN-07 AL 4777,2009,,115.0,Tamil Language,India,"Thriller, Drama",7.2,35,
44634,28032359,Love At Seventh Sight,2009,,100.0,Mandarin Chinese,China,"Romantic drama, Romance Film, Drama, Chinese M...",5.5,11,
44635,16794053,Quarantine,2008,41319906.0,89.0,English Language,United States of America,"Thriller, Science Fiction, Horror, Zombie Film...",3.9,201,{{plot}} A television reporter named Angela Vi...
44636,12005,The Return of Godzilla,1984,4116395.0,103.0,"Japanese Language, Russian Language, English L...",Japan,"Science Fiction, Japanese Movies, Monster",6.8,2830,A Japanese fishing vessel is trying to find it...


In [14]:
# Load the mapping from each general genre to its subgenres.
# The encoding is given explicitly so that decoding does not depend on the
# platform's default locale encoding.
with open('../data/genre_categories.json', encoding='utf-8') as f:
    genre_mapping = json.load(f)

# Flatten the mapping into one record per (genre, subgenre) pair
data = [
    {"Genre": genre, "Subgenre": subgenre}
    for genre, subgenres in genre_mapping.items()
    for subgenre in subgenres
]

df = pd.DataFrame(data)

# Two-level sunburst: general genres on the inner ring, subgenres on the outer
fig = px.sunburst(
    df,
    path=["Genre", "Subgenre"],
    title="Mapping of General Genres to Subgenres",
    template="plotly"
)

# Export a standalone HTML copy of the figure, then render it inline
fig.write_html('../data/genre_sunburst.html')

fig.show()