In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

# Scraping data from wikipedia
We must first define the url that we will scrap data from. They will allow us to make a mapping between books and their film adaptation.

In [2]:
# URL of the Wikipedia page
url_0_C = "https://en.wikipedia.org/wiki/List_of_fiction_works_made_into_feature_films_(0%E2%80%939,_A%E2%80%93C)"
url_D_J = "https://en.wikipedia.org/wiki/List_of_fiction_works_made_into_feature_films_(D%E2%80%93J)"
url_K_R = "https://en.wikipedia.org/wiki/List_of_fiction_works_made_into_feature_films_(K%E2%80%93R)"
url_S_Z = "https://en.wikipedia.org/wiki/List_of_fiction_works_made_into_feature_films_(S%E2%80%93Z)"
url_short = "https://en.wikipedia.org/wiki/List_of_short_fiction_made_into_feature_films"
url_kids = "https://en.wikipedia.org/wiki/List_of_children%27s_books_made_into_feature_films"

urls = [url_0_C, url_D_J, url_K_R, url_S_Z, url_short, url_kids]

Then, we will define a series of functions that we will use when scraping and processing its result

In [3]:
# Scraps a url to extract a list of fiction works and their film adaptation
def scrap_book_to_movie(url): 
    response = requests.get(url)
    result = []
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')
        content = soup.find('div', {'class': 'mw-parser-output'})
        tables = content.find_all('table', {'class': 'wikitable'})
        for table in tables:
            rows = table.find_all('tr')
            
            for row in rows:
                cells = row.find_all('td')

                # splits into book and movie
                cell_tab = [cell.get_text(strip=True) for cell in cells]
                result.append(cell_tab)    

    else:
        print(f"Failed to retrieve the page. Status code: {response.status_code}")

    result = pd.DataFrame(result)
    result.columns = ['fiction_work', 'film_adaptations']
    return result

In [4]:
# Extracts years in format "2000", "1999-2000" or "1999-present"
def extract_years(text):
    years = re.findall(r'\((?:[^)]*?)(\d{4}(?:[–-](?:\d{4}|present))?)(?:[^)]*?)\)', text)
    return years[0] if years else None

In [5]:
# Cleanup authors feature
def clean_authors(authors): 
    authors = authors.fillna("")
    authors = authors.replace("unknown", "")

    authors = authors.apply(lambda a: re.sub(r'and([A-Z])', r', \1', a))
    authors = authors.apply(lambda a: a.replace('and ', ','))

    authors = authors.apply(lambda a: a.replace('(series)', ''))
    authors = authors.apply(lambda a: a.replace('various authors', ''))
    
    authors = authors.apply(lambda a: re.sub(r'\[.*?\]', '', a))
    authors = authors.apply(lambda a: re.sub(r'\(as[^)]+\)', '', a))
    authors = authors.apply(lambda a: re.sub(r'\(pseudonym[^)]+\)', '', a))
    return authors

In [6]:
# Extracts authors from a text
def extract_authors(text): 
    if any(substring in text for substring in [" fils", " père", " III", " Sr.", " Jr."]):
        text[-1] = text[-1].replace(r'\[.*?\]', '')
        return ",".join(text[-2:])
    if len(text) > 1 :
        return text[-1]
    return None

In [7]:
# Extracts several features from a dataframe while sanitizing them
def extract_features(df):
    df['title_book'] = df['fiction_work'].str.split('(').str[0]
    df['title_book'] = df['title_book'].apply(lambda t: t.replace('"', ''))

    df_split_comma = df['fiction_work'].str.split(',')
    df['author_book'] = df_split_comma.apply(extract_authors)
    df['author_book'] = clean_authors(df['author_book'])

    df['year_book'] = df['fiction_work'].apply(extract_years)

    df['title_film'] = df['film_adaptations'].str.split('(').str[0]
    df['year_film'] = df['film_adaptations'].apply(extract_years)

    df = df.drop(['fiction_work', 'film_adaptations'], axis = 1)

    return df

In [8]:
# Filling in values "same as above" and "same as below" with the data above or below respectively
def clean_same_as_above_below(df):
    indexes = df.index[df['title_film'] == 'same as above'].tolist()
    target_ind =[(index - 1) for index in indexes]
    df['title_film'][indexes] = df['title_film'][target_ind]
    df['year_film'][indexes] = df['year_film'][target_ind]

    indexes = df.index[df['title_film'] == 'same as below'].tolist()
    target_ind =[(index + 1) for index in indexes]
    df['title_film'][indexes] = df['title_film'][target_ind]
    df['year_film'][indexes] = df['year_film'][target_ind]

    return df

In [9]:
# Final processing on scrapping result - drops nan, null columns, fills empty rows and cleans the features
def scrap_post_processing(df): 
    df.loc[df['film_adaptations'].isnull() & df['fiction_work'].notnull(), ['film_adaptations']] = df['fiction_work']
    df.loc[df['film_adaptations'] == df['fiction_work'], ['fiction_work']] = None

    # fill nan fiction_work values with the last non null value of fiction_work
    df['fiction_work'] = df['fiction_work'].ffill()
    # drop nan where both columns are nan
    df = df.dropna(subset=['film_adaptations'])

    df = extract_features(df)

    df = clean_same_as_above_below(df)

    return df

In [10]:
# Launches the scrapping on every url selected
dataframes = []
for url in urls: 
    df = scrap_book_to_movie(url)
    clean_df = scrap_post_processing(df)
    dataframes.append(clean_df)

book_adaptations = pd.concat(dataframes).reset_index(drop=True)
book_adaptations = book_adaptations.drop_duplicates().reset_index(drop=True)
book_adaptations

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['title_book'] = df['fiction_work'].str.split('(').str[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['title_book'] = df['title_book'].apply(lambda t: t.replace('"', ''))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['author_book'] = df_split_comma.apply(extract_authors)
A value is tryi

Unnamed: 0,title_book,author_book,year_book,title_film,year_film
0,The 25th Hour,David Benioff,2001,25th Hour,2002
1,3 Assassins,Kōtarō Isaka,2004,Grasshopper,2015
2,4.50 from Paddington,Agatha Christie,1957,"Murder, She Said",1961
3,4.50 from Paddington,Agatha Christie,1957,Crime Is Our Business,2008
4,58 Minutes,Walter Wager,1987,Die Hard 2,1990
...,...,...,...,...,...
4936,Z for Zachariah,Robert C. O'Brien,1974,Z for Zachariah,2015
4937,Zathura,Chris Van Allsburg,2002,Zathura,2005
4938,Zenon: Girl of the 21st Century,Marilyn Sadler,1997,Zenon: Girl of the 21st Century,1999
4939,Zenon: Girl of the 21st Century,Marilyn Sadler,1997,Zenon: The Zequel,2001


We now have a dataframe with 3549 film adaptations together with the book they are adapting.

# Merge with Goodreads
We will now merge the book to movie mapping with the goodreads dataset to have additional information on the books.


First we download the dataset from kaggle

In [11]:
import os
import kagglehub

path = kagglehub.dataset_download("bahramjannesarr/goodreads-book-datasets-10m")

Download already complete (482791291 bytes).
Extracting files...


KeyboardInterrupt: 

In [None]:
def books_csv_to_df(path):
    book_csv_list = os.listdir(path)[:-1]
    book_csv_path_list = [os.path.join(path, book_csv) for book_csv in book_csv_list]
    dataframes = []
    for path, name in zip(book_csv_path_list, book_csv_list):
        df = pd.read_csv(path)
        dataframes.append(df)

    df_goodreads = pd.concat(dataframes)
    return df_goodreads

In [12]:
def clean_spaces(column):
    return column.apply(lambda name: str(name).lower().replace(" ", ""))

In [None]:
def remove_parenthesis(column):
    return column.apply(lambda name: re.sub(r"\(.*?\)", "", str(name)))

In [None]:
path = "..\data"

df_movies = book_adaptations.copy()
df_goodreads = books_csv_to_df(path)

df_goodreads['merge_authors'] = clean_spaces(df_goodreads['Authors'])
df_goodreads['merge_names'] = clean_spaces(df_goodreads['Name'])
df_goodreads['merge_names'] = remove_parenthesis(df_goodreads['merge_names'])


df_movies['merge_authors'] = clean_spaces(df_movies['author_book'])
df_movies['merge_names'] = clean_spaces(df_movies['title_book'])
df_movies['merge_names'] = remove_parenthesis(df_movies['merge_names'])

merge_goodreads = df_goodreads.merge(right=df_movies, how="right", left_on=['merge_authors', 'merge_names'], right_on=['merge_authors', 'merge_names'], copy=False)
merge_goodreads = merge_goodreads.drop_duplicates(subset = df_movies.columns).reset_index(drop=True)
merge_goodreads = merge_goodreads.drop(columns = ['merge_authors', 'merge_names', 'Authors', 'Name'])

# Merge with CMU
We will now merge this data with the CMU dataset to add extra information on these films.

First, we define a function that we will use to clean titles and compare them consistently when merging datasets based on the film's title. We also define any function used for cleaning later on.

In [None]:
# cleanup of titles consistent accross datasets
def clean_title(title):
    return title.lower().replace(" ", "")

# clean a json-like representation into a list of the values of the json-pairs
def clean_json_format(list):
    dict = ast.literal_eval(list)
    return ', '.join(dict.values())

Then we can proceed with the merging with CMU

In [None]:
import ast

# Merge df with CMU depending on title_film and year_film
def merge_with_CMU(df):    
    cmu_movies = pd.read_csv("../MovieSummaries/movie.metadata.tsv", sep='\t', usecols=[2,3,4,5,6,7,8], names=['movie_name', 'movie_date', 'box_office', 'runtime', 'language', 'countries', 'genres'])
    
    # Clean the move name and place it in another column to save the original version. This will also be used when merging with IMDB
    name_column = cmu_movies['movie_name']
    new_name_column = name_column.apply(clean_title)
    cmu_movies['clean_name'] = new_name_column

    # Clean the title from the dataset scrapped to allow merge on title with CMU
    title_column = df['title_film']
    new_title_column = title_column.apply(clean_title)
    df['title_film'] = new_title_column

    # Clean json values to have a nicer representation
    cmu_movies['language'] = cmu_movies['language'].apply(clean_json_format)
    cmu_movies['countries'] = cmu_movies['countries'].apply(clean_json_format)
    cmu_movies['genres'] = cmu_movies['genres'].apply(clean_json_format)
    
    # Extract only the year of the Movie release date
    date_column = cmu_movies['movie_date']
    new_date_column = date_column.apply(lambda x : str(x)[0:4])
    cmu_movies['movie_date'] = new_date_column
    
    # Do the merge
    merge_cmu = cmu_movies.merge(right=df, how="inner", left_on=['clean_name', 'movie_date'], right_on=['title_film', 'year_film'], copy=False)
    merge_cmu = merge_cmu.drop(['title_film', 'year_film'], axis=1)
    merge_cmu = merge_cmu.drop_duplicates()
    merge_cmu = merge_cmu.dropna(subset=['movie_date'])
    merge_cmu['movie_date'] = merge_cmu['movie_date'].astype('int64')
    return merge_cmu

merge_cmu = merge_with_CMU(merge_goodreads)
merge_cmu

Unnamed: 0,movie_name,movie_date,box_office,runtime,language,countries,genres,clean_name,title_book,author_book,year_book
0,Mary Poppins,1964,102272727.0,139.0,English Language,United States of America,"Children's/Family, Musical, Fantasy, Comedy, D...",marypoppins,Mary Poppins,P. L. Travers,1934–1988
1,Mysterious Island,1982,,100.0,Standard Mandarin,Hong Kong,"Action/Adventure, Wuxia, Martial Arts Film, Ch...",mysteriousisland,The Mysterious Island,Jules Verne,1874
2,Juarez,1939,,125.0,"English Language, Spanish Language",United States of America,"Costume drama, Biographical film, Historical f...",juarez,The Phantom Crown: The Story of Maximilian & C...,Bertita Harding,1934
3,The Great Santini,1979,4702575.0,115.0,English Language,United States of America,"Family Drama, Drama",thegreatsantini,The Great Santini,Pat Conroy,1976
4,The Castle,1968,,88.0,German Language,West Germany,"Mystery, Drama",thecastle,The Castle,Franz Kafka,1926
...,...,...,...,...,...,...,...,...,...,...,...
2292,The Russia House,1990,22998000.0,129.0,"English Language, Russian Language","United States of America, United Kingdom","Thriller, Film adaptation, Drama, Political th...",therussiahouse,The Russia House,John le Carré,1989
2293,Ivanhoe,1913,,,English Language,United Kingdom,"Silent film, Drama, Historical drama",ivanhoe,Ivanhoe,Sir Walter Scott,1820
2294,Rising Sun,1993,107198790.0,129.0,English Language,United States of America,"Thriller, Crime Fiction, Mystery, Drama, Suspe...",risingsun,Rising Sun,Michael Crichton,1992
2295,The Deluge,1974,,315.0,Polish Language,Poland,War film,thedeluge,The Deluge,Henryk Sienkiewicz,1886


Now we have more information on the films that are an adaptation of a book, such as their genres. Let's add more information such as the film's rating by merging with IMDB's dataset.

# Merge with IMDB
The merging with IMDB takes place in two steps. First, we must merge with *title.basics.tsv* to add IMDB's *titleId* feature to each film.

In [None]:
data_folder = "data/"

# merge a dataframe with 'title.basics.tsv' and append 'imdbID' and 'isAdult' as new features
def merge_with_imdb_id(df):
    titles = pd.read_csv(data_folder + "title.basics.tsv", sep='\t', header=0, usecols=[0, 1, 2, 3, 4, 5], names=['imdbID', 'titleType', 'imdbPrimaryTitle', 'imdbOriginalTitle', 'isAdult', 'imdbYear'], dtype={'isAdult': 'string'})

    # sanitize imdbYear, isAdult and titleType
    titles['imdbYear'] = pd.to_numeric(titles['imdbYear'], errors='coerce')
    titles['isAdult'] = pd.to_numeric(titles['isAdult'], errors='coerce')
    titles = titles.dropna()
    titles = titles[titles['titleType'] == 'movie']

    # merge with both the imdb's original and primary titles to consider both cases
    titles['imdbOriginalTitle'] = titles['imdbOriginalTitle'].apply(clean_title)
    titles['imdbPrimaryTitle'] = titles['imdbPrimaryTitle'].apply(clean_title)
    mergeOnOriginal = pd.merge(titles, df, how='inner', left_on=['imdbOriginalTitle', 'imdbYear'], right_on=['clean_name', 'movie_date'])
    mergeOnPrimary = pd.merge(titles, df, how='inner', left_on=['imdbPrimaryTitle', 'imdbYear'], right_on=['clean_name', 'movie_date'])
    
    merge = pd.concat([mergeOnOriginal, mergeOnPrimary], axis=0)
    merge = merge.drop_duplicates(subset=['imdbID'])
    merge = merge.drop(['imdbPrimaryTitle', 'imdbOriginalTitle', 'imdbYear', 'titleType'], axis=1)
    
    return merge

In [None]:
# merge a dataframe with 'title.ratings.tsv' to add rating and number of votes
def merge_with_imdb_ratings(df):
    imdb_ratings = pd.read_csv(data_folder + "title.ratings.tsv", sep='\t', header=0, names=['imdbID', 'rating', 'numVotes'])

    # sanitize rating and numVotes
    imdb_ratings['rating'] = pd.to_numeric(imdb_ratings['rating'])
    imdb_ratings['numVotes'] = pd.to_numeric(imdb_ratings['numVotes'])
    imdb_ratings = imdb_ratings.dropna()

    # merge using the common imdbID
    return pd.merge(df, imdb_ratings, how='inner', on=['imdbID'])

In [None]:
# merge a dataframe with IMDB's datasets
def merge_with_imdb(df):
    # perform merging
    merged_id = merge_with_imdb_id(df)
    merge_imdb = merge_with_imdb_ratings(merged_id)

    # cleanup merging
    merge_imdb = merge_imdb.drop(['imdbID', 'clean_name'], axis=1)
    
    print('lines dropped during merge with IMDB: ', len(df) - len(merge_imdb))
    return merge_imdb

merge_imdb = merge_with_imdb(merge_cmu)
merge_imdb.to_csv('merge_imdb.csv', index=False)
merge_imdb

lines dropped during merge with IMDB:  357


Unnamed: 0,isAdult,movie_name,movie_date,box_office,runtime,language,countries,genres,title_book,author_book,year_book,rating,numVotes
0,0,The Fairylogue and Radio-Plays,1908,,120.0,English Language,United States of America,"Silent film, Black-and-white",The Wonderful Wizard of Oz,L. Frank Baum,1900,5.2,76
1,0,Atlantis,1913,,113.0,"English Language, Danish Language",Denmark,"Silent film, Drama, Indie, Black-and-white",Atlantis,Gerhart Hauptmann,1912,6.5,500
2,0,Ivanhoe,1913,,,"Silent film, English Language",United States of America,"Swashbuckler films, Silent film, Drama, Adventure",Ivanhoe,Sir Walter Scott,1820,5.6,97
3,0,"His Majesty, the Scarecrow of Oz",1914,,60.0,English Language,United States of America,"Silent film, Adventure, Children's/Family, Bla...",The Wonderful Wizard of Oz,L. Frank Baum,1900,5.3,533
4,0,The Jungle,1914,,,"Silent film, English Language",,Silent film,The Jungle,Upton Sinclair,1906,6.8,70
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1935,0,"Oslo, August 31st",2011,,95.0,Norwegian Language,Norway,Drama,Will O' the Wisp,Pierre Drieu La Rochelle,1931,7.6,30513
1936,0,The Assassins,2012,,,Standard Mandarin,China,Drama,Romance of the Three Kingdoms,Luo Guanzhong,,5.5,2147
1937,0,Dangerous Liaisons,2012,,110.0,"English Language, Standard Mandarin","Singapore, South Korea, China","Mystery, Romance Film, Drama",The Dangerous Liaisons,Pierre Choderlos de Laclos,1782,6.0,2092
1938,0,Helpless,2012,16175929.0,117.0,Korean Language,South Korea,"Thriller, Mystery",All She Was Worth,Miyuki Miyabe,1992,6.7,2642


We now have 1940 film samples that are adaptations from known books and which can use for analysis.