In [1]:
import numpy as np
import pandas as pd


Base data (film to book linkage)

In [2]:
films_to_books = pd.read_parquet("./data/films_to_books.parquet")
films_to_books.head()

Unnamed: 0,Fiction work(s),Film adaptation(s),author,book_published_date,film_published_date,book_title,film_title
0,"The 25th Hour (2001), David Benioff",25th Hour (2002),DAVID BENIOFF,2001,2002,THE 25TH HOUR,25TH HOUR
1,"3 Assassins (グラスホッパー, Gurasuhoppā) (2004), Kōt...",Grasshopper (2015),KŌTARŌ ISAKA,2004,2015,3 ASSASSINS,GRASSHOPPER
2,"4.50 from Paddington (1957), Agatha Christie","Murder, She Said (1961)",AGATHA CHRISTIE,1957,1961,4.50 FROM PADDINGTON,"MURDER, SHE SAID"
3,"4.50 from Paddington (1957), Agatha Christie",Crime Is Our Business (French: Le Crime est no...,AGATHA CHRISTIE,1957,2008,4.50 FROM PADDINGTON,CRIME IS OUR BUSINESS
4,"58 Minutes (1987), Walter Wager",Die Hard 2 (1990),WALTER WAGER,1987,1990,58 MINUTES,DIE HARD 2


Film published date is a bit messy, take the first 4 characters as the year (some are things like "1972; Australian") and convert to float

In [3]:
films_to_books["film_published_date"] = (
    films_to_books["film_published_date"].str[:4].astype(float)
)
films_to_books.head()

Unnamed: 0,Fiction work(s),Film adaptation(s),author,book_published_date,film_published_date,book_title,film_title
0,"The 25th Hour (2001), David Benioff",25th Hour (2002),DAVID BENIOFF,2001,2002.0,THE 25TH HOUR,25TH HOUR
1,"3 Assassins (グラスホッパー, Gurasuhoppā) (2004), Kōt...",Grasshopper (2015),KŌTARŌ ISAKA,2004,2015.0,3 ASSASSINS,GRASSHOPPER
2,"4.50 from Paddington (1957), Agatha Christie","Murder, She Said (1961)",AGATHA CHRISTIE,1957,1961.0,4.50 FROM PADDINGTON,"MURDER, SHE SAID"
3,"4.50 from Paddington (1957), Agatha Christie",Crime Is Our Business (French: Le Crime est no...,AGATHA CHRISTIE,1957,2008.0,4.50 FROM PADDINGTON,CRIME IS OUR BUSINESS
4,"58 Minutes (1987), Walter Wager",Die Hard 2 (1990),WALTER WAGER,1987,1990.0,58 MINUTES,DIE HARD 2


In [4]:
films_to_books["book_published_date"].unique()

array(['2001', '2004', '1957', '1987', '1936', '1998', '1996', '1985',
       '1970', '1969', '2000', '1971', '1999', '1816', '1911', '1966',
       '1996–present', '1866', '1824', '1884', '1883', None, '1876',
       '1959', '1923', '1935', '2017', '1955', '1979', '1953', '1921',
       '1961', '1968', '1887', '1960', '1946', '1919', '1929', '1992',
       '2009', '1895', '1976', '1993', '1978', '1948', '1990', '1965',
       '1995', '1991', '1976–2008', '1925', '1927', '1943', '1977',
       '2005', '1958', '1942', '1939', '1957–2012', '1956', '2014',
       '1944', '1909', '1908', '1937', '1933', '1938', '1951', '1873',
       '1949', '1963', '1983', '1982', '1877', '1854', '1912', '1975',
       '1914', '1997', '1981', '1989', '1931', '1928', '1954', '1972',
       '1900', '1947', '1890', '2003', '1907', '1924', '1926', '1945',
       '1930', '1932', '1885', '1880', '1941', '1952', '1932–1968',
       '1888', '1941–1989', '1984', '1964', '1973', '1916', '1934',
       '1906', '1980

Delete any publication date that isn't a number

In [5]:
films_to_books.loc[
    films_to_books["book_published_date"].str.isnumeric() == False,
    "book_published_date",
] = np.nan

films_to_books["book_published_date"] = films_to_books["book_published_date"].astype(
    float
)

## Join book ratings data

In [6]:
book_ratings = pd.read_parquet("./data/book_ratings.parquet")
print(book_ratings.shape)
book_ratings.head()

(10000, 8)


Unnamed: 0,goodreads_book_id,authors,original_publication_year,original_title,title,average_rating,small_image_url,average_rating_normalised
0,2767052,SUZANNE COLLINS,2008.0,THE HUNGER GAMES,"THE HUNGER GAMES (THE HUNGER GAMES, #1)",4.34,https://images.gr-assets.com/books/1447303603s...,86.8
1,3,"J.K. ROWLING, MARY GRANDPRÉ",1997.0,HARRY POTTER AND THE PHILOSOPHER'S STONE,HARRY POTTER AND THE SORCERER'S STONE (HARRY P...,4.44,https://images.gr-assets.com/books/1474154022s...,88.8
2,41865,STEPHENIE MEYER,2005.0,TWILIGHT,"TWILIGHT (TWILIGHT, #1)",3.57,https://images.gr-assets.com/books/1361039443s...,71.4
3,2657,HARPER LEE,1960.0,TO KILL A MOCKINGBIRD,TO KILL A MOCKINGBIRD,4.25,https://images.gr-assets.com/books/1361975680s...,85.0
4,4671,F. SCOTT FITZGERALD,1925.0,THE GREAT GATSBY,THE GREAT GATSBY,3.89,https://images.gr-assets.com/books/1490528560s...,77.8


Based on previous exploration, join only on title

In [7]:
print(len(films_to_books))

merged_book_ratings = films_to_books.merge(
    book_ratings, left_on="book_title", right_on="original_title", how="inner"
)

print(
    f"{len(merged_book_ratings)} books merged on title alone ({100 * (len(merged_book_ratings) / len(films_to_books)):.1f}%)"
)

merged_book_ratings.head()

7320
2270 books merged on title alone (31.0%)


Unnamed: 0,Fiction work(s),Film adaptation(s),author,book_published_date,film_published_date,book_title,film_title,goodreads_book_id,authors,original_publication_year,original_title,title,average_rating,small_image_url,average_rating_normalised
0,"4.50 from Paddington (1957), Agatha Christie","Murder, She Said (1961)",AGATHA CHRISTIE,1957.0,1961.0,4.50 FROM PADDINGTON,"MURDER, SHE SAID",140278,AGATHA CHRISTIE,1957.0,4.50 FROM PADDINGTON,"4:50 FROM PADDINGTON (MISS MARPLE, #8)",3.92,https://s.gr-assets.com/assets/nophoto/book/50...,78.4
1,"4.50 from Paddington (1957), Agatha Christie",Crime Is Our Business (French: Le Crime est no...,AGATHA CHRISTIE,1957.0,2008.0,4.50 FROM PADDINGTON,CRIME IS OUR BUSINESS,140278,AGATHA CHRISTIE,1957.0,4.50 FROM PADDINGTON,"4:50 FROM PADDINGTON (MISS MARPLE, #8)",3.92,https://s.gr-assets.com/assets/nophoto/book/50...,78.4
2,"About a Boy (1998), Nick Hornby",About a Boy (2002),NICK HORNBY,1998.0,2002.0,ABOUT A BOY,ABOUT A BOY,4271,NICK HORNBY,1998.0,ABOUT A BOY,ABOUT A BOY,3.79,https://images.gr-assets.com/books/1382004144s...,75.8
3,"Absolute Power (1996), David Baldacci",Absolute Power (1997),DAVID BALDACCI,1996.0,1997.0,ABSOLUTE POWER,ABSOLUTE POWER,15159,DAVID BALDACCI,1995.0,ABSOLUTE POWER,ABSOLUTE POWER,4.15,https://images.gr-assets.com/books/1328399707s...,83.0
4,"The Accidental Tourist (1985), Anne Tyler",The Accidental Tourist (1988),ANNE TYLER,1985.0,1988.0,THE ACCIDENTAL TOURIST,THE ACCIDENTAL TOURIST,60792,"ANNE TYLER, JENNIFER BASSETT",1985.0,THE ACCIDENTAL TOURIST,THE ACCIDENTAL TOURIST,3.9,https://images.gr-assets.com/books/1502220282s...,78.0


# Films

Let's try joining on:

- title ONLY
- title AND release date

In [8]:
film_ratings = pd.read_parquet("./data/film_metacritic_ratings.parquet")
print(film_ratings.shape)
film_ratings.head()

(12846, 12)


Unnamed: 0.1,Unnamed: 0,Title,Release Date,Description,Rating,No of Persons Voted,Directed by,Written by,Duration,Genres,film_title,film_published_date
0,0,Dekalog (1988),"Mar 22, 1996",This masterwork by Krzysztof Kieślowski is one...,74.0,118,Krzysztof Kieslowski,"Krzysztof Kieslowski, Krzysztof Piesiewicz",9 h 32 m,Drama,DEKALOG (1988),1996
1,1,Three Colors: Red,"Nov 23, 1994",Krzysztof Kieslowski closes his Three Colors t...,83.0,241,Krzysztof Kieslowski,"Krzysztof Kieslowski, Krzysztof Piesiewicz, Ag...",1 h 39 m,"Drama,Mystery,Romance",THREE COLORS: RED,1994
2,2,The Conformist,"Oct 22, 1970","Set in Rome in the 1930s, this re-release of B...",73.0,106,Bernardo Bertolucci,"Alberto Moravia, Bernardo Bertolucci",1 h 47 m,Drama,THE CONFORMIST,1970
3,3,Tokyo Story,"Mar 13, 1972",Yasujiro Ozu’s Tokyo Story follows an aging co...,81.0,147,Yasujirô Ozu,"Kôgo Noda, Yasujirô Ozu",2 h 16 m,Drama,TOKYO STORY,1972
4,4,The Leopard (re-release),"Aug 13, 2004","Set in Sicily in 1860, Luchino Visconti's spec...",78.0,85,Luchino Visconti,"Giuseppe Tomasi di Lampedusa, Suso Cecchi D'Am...",3 h 7 m,"Drama,History",THE LEOPARD,2004


In [9]:
print(len(films_to_books))

merged_film_ratings = films_to_books.merge(film_ratings, on="film_title", how="inner")

print(
    f"{len(merged_film_ratings)} films merged on title alone ({100 * (len(merged_film_ratings) / len(films_to_books)):.1f}%)"
)

7320
3650 films merged on title alone (49.9%)


What about adding date to disambiguate between films with the same title?

In [10]:
print(len(films_to_books))

merged_film_ratings_2 = films_to_books.merge(
    film_ratings, on=["film_title", "film_published_date"], how="inner"
)

print(
    f"{len(merged_film_ratings_2)} films merged on title AND date ({100 * (len(merged_film_ratings_2) / len(films_to_books)):.1f}%)"
)

7320
1480 films merged on title AND date (20.2%)


Let's join Metacritic ratings only on title

In [11]:
print(len(merged_book_ratings))

merged_film_ratings = merged_book_ratings.merge(
    film_ratings, on="film_title", how="inner"
)

print(merged_film_ratings.shape)

merged_film_ratings.head()

2270
(1570, 26)


Unnamed: 0,Fiction work(s),Film adaptation(s),author,book_published_date,film_published_date_x,book_title,film_title,goodreads_book_id,authors,original_publication_year,...,Title,Release Date,Description,Rating,No of Persons Voted,Directed by,Written by,Duration,Genres,film_published_date_y
0,"About a Boy (1998), Nick Hornby",About a Boy (2002),NICK HORNBY,1998.0,2002.0,ABOUT A BOY,ABOUT A BOY,4271,NICK HORNBY,1998.0,...,About a Boy,"May 17, 2002",About a Boy is about a man (Grant) -- a handso...,74.0,99,"Chris Weitz, \n \n Paul Weitz","Nick Hornby, Peter Hedges, Chris Weitz, Paul W...",1 h 41 m,"Comedy,Drama,Romance",2002
1,"Absolute Power (1996), David Baldacci",Absolute Power (1997),DAVID BALDACCI,1996.0,1997.0,ABSOLUTE POWER,ABSOLUTE POWER,15159,DAVID BALDACCI,1995.0,...,Absolute Power,"Feb 14, 1997","He was where he shouldn't have been, saw what ...",65.0,42,Clint Eastwood,"David Baldacci, William Goldman",2 h 1 m,"Action,Crime,Drama,Thriller",1997
2,"The Accidental Tourist (1985), Anne Tyler",The Accidental Tourist (1988),ANNE TYLER,1985.0,1988.0,THE ACCIDENTAL TOURIST,THE ACCIDENTAL TOURIST,60792,"ANNE TYLER, JENNIFER BASSETT",1985.0,...,The Accidental Tourist,"Dec 23, 1988",An author of travel books (Hurt) sees his worl...,66.0,13,Lawrence Kasdan,"Anne Tyler, Frank Galati, Lawrence Kasdan",2 h 1 m,"Comedy,Drama,Romance",1988
3,The Adventures of Sherlock Holmes (serialised ...,Sherlock Holmes (1916),ARTHUR CONAN DOYLE,,1916.0,THE ADVENTURES OF SHERLOCK HOLMES,SHERLOCK HOLMES,3590,ARTHUR CONAN DOYLE,1892.0,...,Sherlock Holmes,"Dec 25, 2009",Sherlock Holmes has made his reputation findin...,77.0,558,Guy Ritchie,"Michael Robert Johnson, Anthony Peckham, Simon...",2 h 8 m,"Action,Adventure,Mystery",2009
4,The Adventures of Sherlock Holmes (serialised ...,Sherlock Holmes (1922),ARTHUR CONAN DOYLE,,1922.0,THE ADVENTURES OF SHERLOCK HOLMES,SHERLOCK HOLMES,3590,ARTHUR CONAN DOYLE,1892.0,...,Sherlock Holmes,"Dec 25, 2009",Sherlock Holmes has made his reputation findin...,77.0,558,Guy Ritchie,"Michael Robert Johnson, Anthony Peckham, Simon...",2 h 8 m,"Action,Adventure,Mystery",2009


In [14]:
final_data = (
    merged_film_ratings[
        [
            "book_title",
            "author",
            "book_published_date",
            "average_rating_normalised",
            "film_title",
            "film_published_date_y",
            "Rating",
        ]
    ]
    .drop_duplicates()
    .rename(
        columns={
            "average_rating_normalised": "avg_book_rating",
            "Rating": "avg_film_rating",
            "film_published_date_y": "film_published_date",
        }
    )
)

print(final_data.shape)
final_data.sort_values("book_title")

(87, 7)


Unnamed: 0,book_title,author,book_published_date,avg_book_rating,film_title,film_published_date,avg_film_rating
134,A CLOCKWORK ORANGE,ANTHONY BURGESS,1962.0,79.6,A CLOCKWORK ORANGE,1971,73.0
145,A CONNECTICUT YANKEE IN KING ARTHUR'S COURT,MARK TWAIN,1889.0,75.2,BLACK KNIGHT,2001,93.0
0,ABOUT A BOY,NICK HORNBY,1998.0,75.8,ABOUT A BOY,2002,74.0
1,ABSOLUTE POWER,DAVID BALDACCI,1996.0,83.0,ABSOLUTE POWER,1997,65.0
6,AFTER,ANNA TODD,2017.0,74.8,AFTER,2019,99.0
...,...,...,...,...,...,...,...
131,THE CITY OF EMBER,JEANNE DUPRAU,2003.0,77.0,CITY OF EMBER,2008,64.0
133,THE CLIENT,JOHN GRISHAM,1993.0,79.4,THE CLIENT,1994,71.0
136,THE COLOR PURPLE,ALICE WALKER,1982.0,83.4,THE COLOR PURPLE,1985,82.0
138,THE COLOR PURPLE,ALICE WALKER,1982.0,83.4,THE COLOR PURPLE,2023,82.0


Quick check - any films better?

In [15]:
final_data[final_data["avg_film_rating"] > final_data["avg_book_rating"]]

Unnamed: 0,book_title,author,book_published_date,avg_book_rating,film_title,film_published_date,avg_film_rating
6,AFTER,ANNA TODD,2017.0,74.8,AFTER,2019,99.0
8,AFTER,ANNA TODD,2017.0,74.8,AFTER,2014,99.0
10,AFTER,ANNA TODD,2017.0,74.6,AFTER,2019,99.0
12,AFTER,ANNA TODD,2017.0,74.6,AFTER,2014,99.0
21,AMERICAN PSYCHO,BRET EASTON ELLIS,1991.0,76.2,AMERICAN PSYCHO,2000,85.0
22,THE AMITYVILLE HORROR,JAY ANSON,1977.0,76.4,THE AMITYVILLE HORROR,2005,79.0
24,THE AMITYVILLE HORROR,JAY ANSON,1977.0,76.4,THE AMITYVILLE HORROR,1979,79.0
30,THE ANDROMEDA STRAIN,MICHAEL CRICHTON,1969.0,77.4,THE ANDROMEDA STRAIN,1971,80.0
35,ASK THE DUST,JOHN FANTE,1939.0,82.8,ASK THE DUST,2006,85.0
36,ATONEMENT,IAN MCEWAN,2001.0,77.6,ATONEMENT,2007,81.0


In [17]:
film_ratings[film_ratings["film_title"] == "CRASH"]

Unnamed: 0.1,Unnamed: 0,Title,Release Date,Description,Rating,No of Persons Voted,Directed by,Written by,Duration,Genres,film_title,film_published_date
5174,6563,Crash,"May 6, 2005","A provocative, unflinching look at the complex...",86.0,564,Paul Haggis,"Paul Haggis, Bobby Moresco",1 h 52 m,"Crime,Drama,Thriller",CRASH,2005
5175,6564,Crash,"May 6, 2005","A provocative, unflinching look at the complex...",86.0,564,Paul Haggis,"Paul Haggis, Bobby Moresco",1 h 52 m,"Crime,Drama,Thriller",CRASH,2005
5176,6565,Crash,"Mar 21, 1997",The immediate subject matter of Crash is the s...,86.0,564,Paul Haggis,"Paul Haggis, Bobby Moresco",1 h 52 m,"Crime,Drama,Thriller",CRASH,1997
5177,6566,Crash,"Mar 21, 1997",The immediate subject matter of Crash is the s...,86.0,564,Paul Haggis,"Paul Haggis, Bobby Moresco",1 h 52 m,"Crime,Drama,Thriller",CRASH,1997
