# Scrape Wikipedia tables of films based on books

Parent page: https://en.wikipedia.org/wiki/Lists_of_works_of_fiction_made_into_feature_films

In [1]:
import wikipedia
import requests
from bs4 import BeautifulSoup
import pandas as pd
from io import StringIO

Get main page HTML

In [2]:
main_page_wiki = wikipedia.page("Lists of works of fiction made into feature films")
main_page_soup = BeautifulSoup(main_page_wiki.html())

Get relevant \<a\> tags

In [3]:
sub_page_tags = [
    tag for tag in main_page_soup.select("a") if tag.text.startswith("List of")
]

# print for sanity
for title in [tag.text for tag in sub_page_tags]:
    print(title)

List of fiction works made into feature films (0–9, A–C)
List of fiction works made into feature films (D–J)
List of fiction works made into feature films (K–R)
List of fiction works made into feature films (S–Z)
List of short fiction made into feature films
List of children's books made into feature films
List of films based on comics
List of comics and comic strips made into feature films
List of plays adapted into feature films
List of non-fiction works made into feature films


Function to extract all DataFrames from a single subpage (which contains multiple tables, one per starting letter)

In [4]:
def collect_tables(subpage_title):
    subpage_frames = []
    subpage_title = "List of fiction works made into feature films (0–9, A–C)"
    subpage = wikipedia.page(subpage_title)
    subpage_soup = BeautifulSoup(subpage.html())
    wikitables = subpage_soup.select("table.wikitable")
    for table in wikitables:
        # convert soup object to str to represent HTML
        # pass that to pandas to extract DataFrame
        table_df = pd.read_html(StringIO(str(table)))[0]
        subpage_frames.append(table_df)

    return subpage_frames

For each page, collect tables

In [5]:
all_frames = []

for sub_page_tag in sub_page_tags:
    subpage_tables = collect_tables(sub_page_tag.text)
    all_frames.extend(subpage_tables)

len(all_frames)

40

In [6]:
films_to_books = pd.concat(all_frames, axis=0, ignore_index=True)
print(films_to_books.shape)
films_to_books.head()

(7320, 2)


Unnamed: 0,Fiction work(s),Film adaptation(s)
0,"The 25th Hour (2001), David Benioff",25th Hour (2002)
1,"3 Assassins (グラスホッパー, Gurasuhoppā) (2004), Kōt...",Grasshopper (2015)
2,"4.50 from Paddington (1957), Agatha Christie","Murder, She Said (1961)"
3,"4.50 from Paddington (1957), Agatha Christie",Crime Is Our Business (French: Le Crime est no...
4,"58 Minutes (1987), Walter Wager",Die Hard 2 (1990)


Let's make some simplifying assumptions:

- author is always at the end of the "fiction work" after the last comma
- if we find 4 numbers + any other character (we have things like "1996-present" for book series) in brackets in either the fiction work or the film, it's the publication date
- the book/film title is everything up to the first `(` (this may not end well, who knows?)

In [9]:
# author
# note: in book ratings data, multiple authors are comma separated
films_to_books["author"] = (
    films_to_books["Fiction work(s)"]
    .str.split(",")
    .str[-1]
    .str.replace(" and ", ", ")
    .str.strip()
    .str.upper()
)

# publication dates
films_to_books["book_published_date"] = films_to_books["Fiction work(s)"].str.extract(
    r"\((\d{4}.*?)\)"
)
films_to_books["film_published_date"] = films_to_books[
    "Film adaptation(s)"
].str.extract(r"\((\d{4}.*?)\)")


# titles
def get_clean_title(title):
    if "(" in title:
        return title[: title.index("(")].upper().strip()
    return title.upper().strip()


films_to_books["book_title"] = films_to_books["Fiction work(s)"].apply(get_clean_title)
films_to_books["film_title"] = films_to_books["Film adaptation(s)"].apply(
    get_clean_title
)

films_to_books.sample(20, random_state=42)

Unnamed: 0,Fiction work(s),Film adaptation(s),author,book_published_date,film_published_date,book_title,film_title
2574,"Billy Bathgate (1989), E. L. Doctorow",Billy Bathgate (1991),E. L. DOCTOROW,1989,1991,BILLY BATHGATE,BILLY BATHGATE
1562,"The Age of Innocence (1921), Edith Wharton",The Age of Innocence (1928),EDITH WHARTON,1921,1928,THE AGE OF INNOCENCE,THE AGE OF INNOCENCE
4477,"The Adventures of Tom Sawyer (1876), Mark Twain",The Animated Adventures of Tom Sawyer (1998)[N 1],MARK TWAIN,1876,1998,THE ADVENTURES OF TOM SAWYER,THE ANIMATED ADVENTURES OF TOM SAWYER
2826,"Cimarron (1929), Edna Ferber",Cimarron (1960),EDNA FERBER,1929,1960,CIMARRON,CIMARRON
3720,The Adventures of Pinocchio (Italian: Le avven...,Pinocchio (2019),CARLO COLLODI,1883,2019,THE ADVENTURES OF PINOCCHIO,PINOCCHIO
2415,"The Apprenticeship of Duddy Kravitz (1959), Mo...",The Apprenticeship of Duddy Kravitz (1974),MORDECAI RICHLER,1959,1974,THE APPRENTICESHIP OF DUDDY KRAVITZ,THE APPRENTICESHIP OF DUDDY KRAVITZ
3463,"The Caine Mutiny (1952), Herman Wouk",The Caine Mutiny Court-Martial (1955)[N 1],HERMAN WOUK,1952,1955,THE CAINE MUTINY,THE CAINE MUTINY COURT-MARTIAL
1512,The Adventures of Pinocchio (Italian: Le avven...,Pinocchio and the Emperor of the Night (1987),CARLO COLLODI,1883,1987,THE ADVENTURES OF PINOCCHIO,PINOCCHIO AND THE EMPEROR OF THE NIGHT
1891,Blood and Sand (Spanish: Sangre y arena) (1908...,Blood and Sand (1916),VICENTE BLASCO IBÁÑEZ,1908,1916,BLOOD AND SAND,BLOOD AND SAND
4336,A Connecticut Yankee in King Arthur's Court (1...,A Connecticut Yankee (1931),MARK TWAIN,1889,1931,A CONNECTICUT YANKEE IN KING ARTHUR'S COURT,A CONNECTICUT YANKEE


In [10]:
films_to_books.to_parquet("./data/films_to_books.parquet", index=False)