# Dataset from IMDB downloaded from https://developer.imdb.com/non-commercial-datasets/
## Please download title.basics.tsv.gz and title.ratings.tsv.gz, unzip them, and put them in the data/ folder

In [27]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import importlib
import os
import kagglehub
import ast
import math
import warnings
warnings.filterwarnings("ignore")

In [28]:
import sys
sys.path.append('scripts/')
import scraping, merge_goodreads, merge_cmu, merge_imdb
from scraping import *
from merge_goodreads import *
from merge_cmu import *
from merge_imdb import *

# Scraping data from Wikipedia
We must first define the URLs that we will scrape data from. They will allow us to build a mapping between books and their film adaptations.

In [29]:
# Wikipedia list pages of written works that were made into feature films.
# The four alphabetical fiction pages come first, then the short-fiction list
# and the children's-books list.
url_0_C = "https://en.wikipedia.org/wiki/List_of_fiction_works_made_into_feature_films_(0%E2%80%939,_A%E2%80%93C)"
url_D_J = "https://en.wikipedia.org/wiki/List_of_fiction_works_made_into_feature_films_(D%E2%80%93J)"
url_K_R = "https://en.wikipedia.org/wiki/List_of_fiction_works_made_into_feature_films_(K%E2%80%93R)"
url_S_Z = "https://en.wikipedia.org/wiki/List_of_fiction_works_made_into_feature_films_(S%E2%80%93Z)"
url_short = "https://en.wikipedia.org/wiki/List_of_short_fiction_made_into_feature_films"
url_kids = "https://en.wikipedia.org/wiki/List_of_children%27s_books_made_into_feature_films"

# Every page to scrape, in the order it will be visited.
urls = [
    url_0_C,
    url_D_J,
    url_K_R,
    url_S_Z,
    url_short,
    url_kids,
]

Then, we scrape and process the data from these Wikipedia pages.

In [30]:
# Scrape each selected page and post-process the table it yields.
dataframes = [
    scrap_post_processing(scrap_book_to_movie(page_url))
    for page_url in urls
]

# Stack the per-page tables, discard exact duplicate rows and renumber the
# index from zero before saving the book-to-film mapping to disk.
book_adaptations = (
    pd.concat(dataframes, ignore_index=True)
    .drop_duplicates()
    .reset_index(drop=True)
)
book_adaptations.to_csv('book_adaptations.csv', index=False)

We now have a dataframe with 4941 film adaptations, each paired with the book it adapts.

# Merge with Goodreads
We will now merge the book-to-movie mapping with the Goodreads dataset to obtain additional information on the books.


First, we download the dataset from Kaggle.

In [None]:
# Fetch the Goodreads dataset through kagglehub; the returned value is stored
# in `path` and later passed to books_csv_to_df.
# NOTE(review): presumably a local directory and requires network/Kaggle access — confirm.
path = kagglehub.dataset_download("bahramjannesarr/goodreads-book-datasets-10m")

In [None]:
# Work on a copy so the scraped mapping itself is left untouched.
df_movies = book_adaptations.copy()
df_goodreads = books_csv_to_df(path)

# Give both tables the same pair of join keys: an author key and a title key,
# the title key additionally being passed through remove_parenthesis.
for frame, author_col, title_col in (
    (df_goodreads, 'Authors', 'Name'),
    (df_movies, 'BookAuthor', 'BookTitle'),
):
    frame['merge_authors'] = clean_spaces(frame[author_col])
    frame['merge_names'] = clean_spaces(frame[title_col])
    frame['merge_names'] = remove_parenthesis(frame['merge_names'])

# Right join: every adaptation row is kept whether or not Goodreads has a match.
# NOTE: from here on the name `merge_goodreads` refers to this DataFrame and no
# longer to the module imported under the same name.
join_keys = ['merge_authors', 'merge_names']
merge_goodreads = df_goodreads.merge(right=df_movies, how="right", on=join_keys, copy=False)

# Keep a single row per distinct combination of the movie-side columns.
merge_goodreads = merge_goodreads.drop_duplicates(subset=df_movies.columns).reset_index(drop=True)

# Keep only the columns of interest and give the Goodreads ones a "Book" prefix.
columns = ['FilmYear', 'FilmTitle', 'BookAuthor', 'BookTitle', 'BookStartYear', 'BookEndYear', 'Description', 'Rating', 'RatingDistTotal', 'Language']
book_column_names = {
    'RatingDistTotal': 'BookRatingNb',
    'Description': 'BookDescription',
    'Rating': 'BookRating',
    'Language': 'BookLanguage',
}
merge_goodreads = merge_goodreads[columns].rename(columns=book_column_names)


def _text_after_last_colon(value):
    """Return the part of a string after its last ':'; None for non-strings."""
    if not isinstance(value, str):
        return None
    return value.rpartition(":")[2]


merge_goodreads['BookRatingNb'] = merge_goodreads['BookRatingNb'].apply(_text_after_last_colon)
merge_goodreads

# Merge with CMU
We will now merge this data with the CMU dataset to add extra information on these films.

In [None]:
# Enrich the book/film table with CMU movie data via merge_with_CMU.
# NOTE(review): the match is said to be on film title and film year — confirm in merge_cmu.py.
# NOTE: this assignment rebinds `merge_cmu`, shadowing the module imported under that name.
merge_cmu = merge_with_CMU(merge_goodreads)
merge_cmu  # bare expression so the notebook displays the result

We now have more information on the films that are adaptations of books, such as their genres. Let's add further information, such as each film's rating, by merging with the IMDB dataset.

# Merge with IMDB
## Dataset from IMDB downloaded from https://developer.imdb.com/non-commercial-datasets/
### Please download title.basics.tsv.gz and title.ratings.tsv.gz, unzip them, and put them in the data/ folder

In [None]:
# Add IMDB information to the CMU-enriched table via merge_with_imdb.
# NOTE(review): presumably reads the IMDB title.basics / title.ratings files from data/ — confirm in merge_imdb.py.
# NOTE: this assignment rebinds `merge_imdb`, shadowing the module imported under that name.
merge_imdb = merge_with_imdb(merge_cmu)
merge_imdb  # bare expression so the notebook displays the result

We now have 2841 film samples that are adaptations of known books and that we can use for analysis.

# Merge with CMU plots

In [17]:
# Read the CMU plot summaries as headerless, tab-separated rows and name the
# two columns explicitly.
cmu_plots = pd.read_csv(
    "../MovieSummaries/plot_summaries.txt",
    sep='\t',
    header=None,
    names=['wikipedia_id', 'MoviePlot'],
)

In [21]:
# Attach a plot summary to each adapted movie; the left join keeps movies
# for which no summary exists.
adapted_movies_df = pd.merge(
    merge_imdb,
    cmu_plots,
    how="left",
    on='wikipedia_id',
    copy=False,
)
adapted_movies_df.to_csv('adapted_movies.csv', index=False, errors='ignore')

# All movies merge (non-adapted movies included)

In [None]:
# Build the table of all CMU movies (adapted or not) with IMDB data and plots.
# Rows whose movie_date is missing, or is the literal string "nan", are removed
# first so that the column can be cast to int64.
cmu_movies = (
    clean_cmu("../MovieSummaries/movie.metadata.tsv")
    .dropna(subset=['movie_date'])
    .loc[lambda movies: movies['movie_date'] != "nan"]
    .astype({'movie_date': 'int64'})
)

# Add IMDB information, then left-join the plot summaries so that movies
# without a summary are still kept.
all_movies_df = merge_with_imdb(cmu_movies).merge(
    cmu_plots,
    how="left",
    on='wikipedia_id',
    copy=False,
)
all_movies_df.to_csv('all_movies.csv', index=False, errors='ignore')
all_movies_df