In [23]:
import pandas as pd
from pathlib import Path
import requests
import tarfile
from wikimapper import WikiMapper
from datapackage import Package
import numpy as np

## CMU Movie Dataset
Primary dataset for our analysis, downloaded from the [CMU Official Website](http://www.cs.cmu.edu/~ark/personas/). The main file movie.metadata.tsv is stored in the `data` folder. 

Before running this, please remove the `data` folder from your working directory. 

In [None]:
MOVIE_CMU_URL = "http://www.cs.cmu.edu/~ark/personas/data/MovieSummaries.tar.gz"
response = requests.get(MOVIE_CMU_URL, stream=True)
file = tarfile.open(fileobj=response.raw, mode="r|gz")
file.extractall(path='.')

Path("MovieSummaries").rename("data")
data_path = Path("data")

for file_name in ["character.metadata.tsv", "name.clusters.txt", "plot_summaries.txt", "README.txt", "tvtropes.clusters.txt"]:
    file_path = data_path / file_name
    if file_path.exists():
        file_path.unlink()

cmu_cols = ["movie_wikipedia_id", "movie_freebase_id", "movie_title", "movie_release", "movie_revenue", "movie_runtime", "movie_languages", "movie_countries", "movie_genres"]
cmu_df = (pd.read_csv(
    data_path / "movie.metadata.tsv", 
    sep="\t", 
    header=None, 
    names=cmu_cols, 
    usecols=["movie_wikipedia_id", "movie_title", "movie_release", "movie_revenue", "movie_runtime",  "movie_languages", "movie_countries", "movie_genres"])
    .assign(
        movie_release=lambda df: df.movie_release.astype(str).str.slice(0, 4).replace("nan", pd.NA).astype("Int32"),
    )
)

## Mapping Wikipedia IDs to Wikidata IDs

Our analysis requires connecting movies to books they're based on. We use the [wikimapper](https://github.com/jcklie/wikimapper) Python library for mapping Wikipedia IDs (available in the CMU movie dataset) to Wikidata IDs (which we will need to join the results of the Wikidata Query Service), requiring a Wikipedia SQL dump for creating an index.


The following commands may take around one hour to finish. 

In [28]:
!wikimapper download enwiki-latest --dir data
!wikimapper create enwiki-latest --dumpdir data --target data/index_enwiki-latest.db

2023-11-15 15:06:03,101 - wikimapper.download - INFO - Downloading [https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-page.sql.gz] to [data/enwiki-latest-page.sql.gz]
2023-11-15 15:13:47,592 - wikimapper.download - INFO - Downloading [https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-page_props.sql.gz] to [data/enwiki-latest-page_props.sql.gz]
2023-11-15 15:15:15,525 - wikimapper.download - INFO - Downloading [https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-redirect.sql.gz] to [data/enwiki-latest-redirect.sql.gz]
2023-11-15 15:15:52,647 - wikimapper.processor - INFO - Creating index for [enwiki-latest] in [data/index_enwiki-latest.db]
2023-11-15 15:15:52,662 - wikimapper.processor - INFO - Parsing pages dump
2023-11-15 15:24:09,978 - wikimapper.processor - INFO - Creating database index on 'wikipedia_title'
2023-11-15 15:25:20,468 - wikimapper.processor - INFO - Parsing page properties dump
2023-11-15 15:28:12,513 - wikimapper.processor - INFO - Parsing redirects du

Enrich the CMU Movie Dataframe by mapping the wikipedia id to the wikidata id. 

In [37]:
mapper = WikiMapper(data_path / "index_enwiki-latest.db")
cmu_df = (cmu_df.assign(
            movie_wikidata_id = lambda x: x.movie_wikipedia_id.apply(
                lambda wikipedia_id: mapper.wikipedia_id_to_id(wikipedia_id)
                )
            )
            .drop(columns=["movie_wikipedia_id"])
         )

## Identifying Movies Based on Books Using Wikidata
We identify book-adaptations using the Wikidata database, focusing on the P144 (based on) relation. A SPARQL query on Wikidata Query Service helps us extract interconnected pairs of movie and book entities along with book attributes.

In [38]:
WIKI_DATA_SERVICE_URL = 'https://query.wikidata.org/sparql'
query = '''
SELECT DISTINCT ?movie ?book ?bookLabel ?authorLabel ?instanceOfLabel ?countryLabel ?pubDateLabel ?genreLabel ?awardLabel ?seriesLabel ?goodreadsLabel
WHERE 
{
  VALUES ?bookType { wd:Q47461344 wd:Q7725634 wd:Q571 wd:Q14406742 wd:Q21198342 wd:Q277759 }
  VALUES ?movieType { wd:Q11424 wd:Q506240 }

  ?book wdt:P31 ?bookType.
  OPTIONAL {?book wdt:P50 ?author}
  OPTIONAL {?book wdt:P31 ?instanceOf}
  OPTIONAL {?book wdt:P495 ?country}
  OPTIONAL {?book wdt:P577 ?pubDate}
  OPTIONAL {?book wdt:P136 ?genre}
  OPTIONAL {?book wdt:P166 ?award}
  OPTIONAL {?book wdt:P179 ?series}
  OPTIONAL {?book wdt:P8383 ?goodreads}

  ?movie wdt:P31 ?movieType;          
         wdt:P144 ?book.

  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
}
'''
query_result = requests.get(WIKI_DATA_SERVICE_URL, params = {'format': 'json', 'query': query})
wikidata_df =pd.DataFrame(query_result.json()['results']['bindings'])
for column in wikidata_df.columns:
    wikidata_df[column] = wikidata_df[column].apply(lambda x: x['value'] if isinstance(x, dict) and 'value' in x else x)
  
def get_list(series: pd.Series) -> list:
    return list(set(series.dropna().tolist()))

def mode(x: pd.Series) -> pd.Series:
    modes = x.mode()
    if len(modes) > 0:
        return modes.iloc[0]
    return None

categories = {
    'fiction': {'novel', 'short novel', 'novella', 'serialized fiction', 'short story', 'war fiction', 'magic realist fiction', 'metafiction', 'science fiction', 'suspense in literature', 'horror novel', 'horror fiction', 'crime fiction', 'psychological thriller', 'speculative/fantastic fiction', 'adventure fiction', 'detective fiction', 'noir fiction', 'political novel', 'vampire fiction', 'dystopian fiction', 'social science fiction', 'techno-thriller', 'thriller', 'fantasy', 'Gothic novel', 'picaresque novel', 'mystery fiction', 'post-apocalyptic fiction', 'philosophical fiction', 'romantic fiction', 'Bildungsroman', 'roman à clef', 'comedy', 'black comedy'},
    'non_fiction': {'nonfiction', 'memoir', 'autobiography', 'biographical novel', 'biography', 'essay'},
    'children': {'children\'s literature', 'children\'s fiction', 'young adult fiction', 'children\'s novel'},
    'historical': {'historical fiction', 'historical novel'},
    'drama': {'play', 'drama', 'tragedy'},
    'anime': {'adventure anime and manga', 'drama anime and manga'},
    'fantasy': {'magic realist fiction', 'fantasy', 'vampire fiction', 'fairy tale'},
    'science_fiction': {'science fiction', 'dystopian fiction', 'social science fiction', 'techno-thriller', 'post-apocalyptic fiction'},
    'horror': {'horror novel', 'horror fiction'},
    'thriller': {'psychological thriller', 'thriller'},
    'detective': {'detective fiction', 'noir fiction', 'mystery fiction', 'cloak and dagger novel'},
    'satire': {'satire', 'satirical fiction', 'metafiction'},
    'comedy': {'comedy', 'black comedy'},
}

wikidata_df = (wikidata_df
                .assign(
                    movie_wikidata_id = lambda x: x.movie.str.split('/').str[-1],
                    book_wikidata_id = lambda x: x.book.str.split('/').str[-1],
                    book_release = lambda x: pd.to_datetime(x.pubDateLabel, errors='coerce').dt.year.astype('Int64')
                )
                .groupby(['movie_wikidata_id', 'book_wikidata_id'])
                .agg(
                    book_title = pd.NamedAgg(column='bookLabel', aggfunc=mode),
                    book_author = ('authorLabel', 'first'),
                    book_release = ('book_release', 'first'),
                    book_country = ('countryLabel', 'first'),
                    book_goodreads_id = ('goodreadsLabel', 'first'),
                    series = ('seriesLabel', 'first'),
                    instance_of = pd.NamedAgg(column='instanceOfLabel', aggfunc=get_list),
                    genre = pd.NamedAgg(column='genreLabel', aggfunc=get_list),
                    award = pd.NamedAgg(column='awardLabel', aggfunc=get_list)
                )
                .assign(
                    book_part_of_series = lambda x: x.series.notnull().astype(int),
                    literary_work = lambda x: x.instance_of.apply(lambda y: 'literary work' in y).astype(int),
                    written_work = lambda x: x.instance_of.apply(lambda y: 'written work' in y).astype(int),
                    comic_book_seris = lambda x: x.instance_of.apply(lambda y: 'comic book series' in y).astype(int),
                    book_series = lambda x: x.instance_of.apply(lambda y: 'book series' in y).astype(int),
                    manga_series = lambda x: x.instance_of.apply(lambda y: 'manga series' in y).astype(int),
                    book_fiction = lambda x: x.genre.apply(lambda y: len(set(y).intersection(categories['fiction'])) > 0).astype(int),
                    book_non_fiction = lambda x: x.genre.apply(lambda y: len(set(y).intersection(categories['non_fiction'])) > 0).astype(int),
                    book_children = lambda x: x.genre.apply(lambda y: len(set(y).intersection(categories['children'])) > 0).astype(int),
                    book_historical = lambda x: x.genre.apply(lambda y: len(set(y).intersection(categories['historical'])) > 0).astype(int),
                    book_drama = lambda x: x.genre.apply(lambda y: len(set(y).intersection(categories['drama'])) > 0).astype(int),
                    book_anime = lambda x: x.genre.apply(lambda y: len(set(y).intersection(categories['anime'])) > 0).astype(int),
                    book_fantasy = lambda x: x.genre.apply(lambda y: len(set(y).intersection(categories['fantasy'])) > 0).astype(int),
                    book_science_fiction = lambda x: x.genre.apply(lambda y: len(set(y).intersection(categories['science_fiction'])) > 0).astype(int),
                    book_horror = lambda x: x.genre.apply(lambda y: len(set(y).intersection(categories['horror'])) > 0).astype(int),
                    book_thriller = lambda x: x.genre.apply(lambda y: len(set(y).intersection(categories['thriller'])) > 0).astype(int),
                    book_detective = lambda x: x.genre.apply(lambda y: len(set(y).intersection(categories['detective'])) > 0).astype(int),
                    book_satire = lambda x: x.genre.apply(lambda y: len(set(y).intersection(categories['satire'])) > 0).astype(int),
                    book_comedy = lambda x: x.genre.apply(lambda y: len(set(y).intersection(categories['comedy'])) > 0).astype(int),
                    book_won_price = lambda x: x.award.apply(lambda y: len(y) > 0).astype(int),
                )
                .drop(['instance_of', 'genre', 'award', 'series'], axis=1)
                .reset_index()
                )

  book_release = lambda x: pd.to_datetime(x.pubDateLabel, errors='coerce').dt.year.astype('Int64')


## Setting Up Kaggle API for Dataset Access

Some of the datasets we use are from Kaggle. To access certain datasets on Kaggle, you'll need a Kaggle account and an API token. Kaggle provides an API that allows you to programmatically download datasets directly into your Jupyter Notebook environment. Here's how to set up and use the Kaggle API token:

### 1. Kaggle Account and Token

If you don't already have a Kaggle account, you can sign up for one at [Kaggle](https://www.kaggle.com/). Once you have an account, follow these steps to create an API token:

- Log in to your Kaggle account.
- Go to your account settings page by clicking on your profile picture in the upper right-hand corner of the Kaggle website and selecting "Account."
- Scroll down to the "API" section and click on the "Create New API Token" button. This will download a file called `kaggle.json` containing your API credentials.

### 2. Storing the Kaggle API Token

To use the Kaggle API in your Jupyter Notebook, you need to store the `kaggle.json` at a designated folder. To do copy the `kaggle.json` in your working directory and run the following commands:


In [None]:
! mkdir ~/.kaggle
! mv kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json

## Goodreads Dataset
For further information about the books, we use a [Kaggle](https://www.kaggle.com) dataset curated from [Goodreads](https://www.goodreads.com). In particular we will investigate how the book ratings on goodread influence the respective book-adaptations. 

In [None]:
!kaggle datasets download -d mdhamani/goodreads-books-100k
!unzip goodreads-books-100k.zip
!rm goodreads-books-100k.zip
!mv GoodReads_100k_books.csv data/

In [39]:
goodreads_df = (pd.read_csv("data/GoodReads_100k_books.csv", usecols=['link', 'pages', 'rating', 'totalratings'])
                .assign(
                    book_goodreads_id = lambda x: x.link.str.split('/').str[-1].str.replace('.', '-').str.split('-').str[0],
                )
                .drop(columns=['link'])
                .rename(columns={'pages': 'book_pages', 'rating': 'book_rating', 'totalratings': 'book_total_ratings'})
                )

## Enhancing Revenue Data with TMDB
The CMU dataset lacks comprehensive revenue data, so we supplement it with revenue information from [The Movie Database](https://www.themoviedb.org).

In [None]:
!kaggle datasets download -d rounakbanik/the-movies-dataset
!unzip the-movies-dataset.zip
!rm the-movies-dataset.zip
!mv movies_metadata.csv data/
!rm credits.csv
!rm keywords.csv
!rm links.csv
!rm links_small.csv
!rm ratings.csv
!rm ratings_small.csv

In [40]:
def replace_jpg(x):
    return np.nan if isinstance(x, str) and x.endswith('.jpg') else x


tmdb_df = (pd.read_csv("data/movies_metadata.csv")
            .assign(
                    movie_budget = lambda df: df.budget.apply(replace_jpg).astype("Int64").replace(0, pd.NA),
                    movie_revenue_tmdb = lambda df: df.revenue.replace(0.0, pd.NA).astype("Int64")
            )
            .loc[:, ['imdb_id', 'movie_budget', 'movie_revenue_tmdb']]
          )
tmdb_df.drop_duplicates(subset=['imdb_id'], inplace=True)

  tmdb_df = (pd.read_csv("data/movies_metadata.csv")


## Normalizing Revenues and Budgets with CPI
To make 70 years' worth of revenue and budget data comparable, we adjust for inflation using the US Consumer Price Index (CPI).

In [67]:
package = Package('https://datahub.io/core/cpi-us/datapackage.json')
cpi_df = (pd.DataFrame(package.resources[1].read(), columns=['date', 'cpi', '_'])
          .assign(
                year = lambda df: df.date.astype(str).str.slice(0, 4).astype("Int32"),
                inflation_adjustment = lambda df: (df.cpi.iloc[-1] / df.cpi).astype(float)
            )
          .drop(columns=['date', 'cpi', '_'])
          .drop_duplicates(subset=['year'])
          .reset_index(drop=True)
          .assign(inflation_adjustment = lambda x: x.inflation_adjustment.astype(float))
        )

## IMDb Ratings
We load IMDb movie ratings to assess the 'goodness' of movies according to users' opinions.

In [3]:
!curl -o title.ratings.tsv.gz https://datasets.imdbws.com/title.ratings.tsv.gz
!gunzip title.ratings.tsv.gz
!mv title.ratings.tsv data/
!rm title.ratings.tsv.gz

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 6722k  100 6722k    0     0  9941k      0 --:--:-- --:--:-- --:--:--  9.8M
rm: title.ratings.tsv.gz: No such file or directory
mv: rename data/title.ratings.tsv to data/imdb_ratings.tsv: No such file or directory


In [42]:
imdb_df = (pd.read_csv("data/title.ratings.tsv", sep='\t')
            .rename(columns={
                'tconst': 'imdb_id', 
                'averageRating': 'imdb_rating', 
                'numVotes': 'imdb_total_votes'})
            )

## IMDB ID
To join IMDb and TMDB datasets, we query IMDb IDs from Wikidata.

In [43]:
query = '''
SELECT ?movie ?IMDB_ID
WHERE
{
VALUES ?movieType { wd:Q11424 wd:Q506240 }
?movie wdt:P31 ?movieType.
?movie wdt:P345 ?IMDB_ID.

SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
}
'''
query_result = requests.get(WIKI_DATA_SERVICE_URL, params = {'format': 'json', 'query': query})
wikidata_imdb_df =pd.DataFrame(query_result.json()['results']['bindings'])
for column in wikidata_imdb_df.columns:
    wikidata_imdb_df[column] = wikidata_imdb_df[column].apply(lambda x: x['value'] if isinstance(x, dict) and 'value' in x else x)
    

In [44]:
wikidata_imdb_df = (wikidata_imdb_df
                    .assign(
                        movie_wikidata_id = lambda x: x.movie.str.split('/').str[-1],
                        imdb_id = lambda x: x.IMDB_ID
                    )
                    .loc[:, ['movie_wikidata_id', 'imdb_id']]
                    )

## Merge Datasets

In [73]:
book_adaptation_df = (cmu_df
                        .merge(wikidata_imdb_df, on='movie_wikidata_id', how='left')
                        .merge(imdb_df, on='imdb_id', how='left')
                        .merge(tmdb_df, on='imdb_id', how='left')
                        .merge(wikidata_df, on='movie_wikidata_id', how='left')
                        .merge(goodreads_df, on='book_goodreads_id', how='left')
                        .merge(cpi_df, left_on='movie_release', right_on='year', how='left')
                        .assign(
                            movie_budget = lambda x: x.movie_budget.astype(float),
                            movie_revenue = lambda x: x.movie_revenue.fillna(x.movie_revenue_tmdb).astype(float),
                            movie_is_adaptation = lambda x: x.book_wikidata_id.notna()
                            )
                        .assign(
                            movie_budget = lambda df: df.movie_budget * df.inflation_adjustment,
                            movie_revenue = lambda df: df.movie_revenue * df.inflation_adjustment
                        )
                        .drop(columns=['movie_wikidata_id', 'imdb_id', 'movie_revenue_tmdb', 'book_goodreads_id',
                                        'year', 'inflation_adjustment', 'book_wikidata_id'])
                     )

In [75]:
book_adaptation_df.to_csv(data_path / "book_adaptation.csv", index=False)