In [79]:
import pandas as pd
from wikimapper import WikiMapper

# Lookup object used below to translate Wikipedia page ids into Wikidata ids
# (via `mapper.wikipedia_id_to_id`). It is backed by a local index database file;
# presumably a precomputed wikimapper index of the English Wikipedia - confirm.
mapper = WikiMapper("data/index_enwiki-latest.db")

## Load CMU Movie Dataset

In [80]:
# Column names for the header-less CMU movie metadata TSV, in file order.
cmu_movie_cols = ['wikipedia_id', 'freebase_id', 'cmu_movie_title', 'release_data', 'box_office_revenue', 'runtime', 'languages', 'countries', 'genres']

# Only the Wikipedia page id and the title are kept; every page id is then
# translated to a Wikidata id through the mapper.
cmu_movie_df = (
    pd.read_csv('data/movie.metadata.tsv', sep='\t', header=None, names=cmu_movie_cols)
    .loc[:, ['wikipedia_id', 'cmu_movie_title']]
    .assign(wikidata_id=lambda movies: movies['wikipedia_id'].apply(mapper.wikipedia_id_to_id))
)

## Find movies based on books / book series
To find all movies that are based on books, we run the following query on the [Wikidata Query Service](https://query.wikidata.org/):

<code>
SELECT DISTINCT ?movie ?book
  
WHERE 
{

  VALUES ?bookType { wd:Q47461344 wd:Q7725634 wd:Q571 wd:Q14406742 wd:Q21198342 wd:Q277759}

  VALUES ?movieType { wd:Q11424 wd:Q506240 }
  
  ?book wdt:P31 ?bookType.   

  ?movie wdt:P31 ?movieType;          
        
        wdt:P144 ?book.

  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
}
</code>

It searches for instances of `film` or `television film` that are based on an instance of `literary work`, `written work`, `book`, `comic book series`, `manga series` or `book series`. 

In [81]:
# The query result stores full entity URIs; only the part after the last '/'
# is kept as the identifier, and the original URI columns are discarded.
movie_book_df = pd.read_csv('data/movie_book.csv')
movie_book_df = (
    movie_book_df
    .assign(
        movie_wikidata_id=movie_book_df['movie'].str.rsplit('/', n=1).str[-1],
        book_wikidata_id=movie_book_df['book'].str.rsplit('/', n=1).str[-1],
    )
    .drop(columns=['movie', 'book'])
)

In [82]:
# Attach the CMU movie rows to each (movie, book) pair and keep only the pairs
# whose movie was found in the CMU data (i.e. has a Wikipedia page id).
movie_book_df = (
    movie_book_df
    .merge(cmu_movie_df, how='left', left_on='movie_wikidata_id', right_on='wikidata_id')
    .loc[lambda pairs: pairs['wikipedia_id'].notna()]
    .reset_index(drop=True)
    # the left join turned the ids into floats; cast back once the NaNs are gone
    .assign(movie_wikipedia_id=lambda pairs: pairs['wikipedia_id'].astype(int))
    [['movie_wikipedia_id', 'book_wikidata_id']]
)

## Get information about books
To be able to merge the books with the Goodreads data later on, we retrieve the title and author of each book from Wikidata:
<code>

SELECT DISTINCT

  ?book ?bookLabel

  ?title

  ?authorLabel
  
WHERE 
{

  VALUES ?book { query_string }

  OPTIONAL { ?book wdt:P1476 ?title. }
  
  OPTIONAL { ?book wdt:P50 ?author. }

  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
}

</code>

In [83]:
# Space-separated list of `wd:<id>` terms, one per distinct book id.
query_string = " ".join(
    f'wd:{book_id}' for book_id in movie_book_df['book_wikidata_id'].unique()
)

In [84]:
# Print the id list as plain text so it can be pasted in place of `query_string`
# inside the `VALUES ?book { ... }` clause of the SPARQL query shown above.
print(query_string)

wd:Q480 wd:Q844 wd:Q1834 wd:Q2222 wd:Q2870 wd:Q4347 wd:Q4784 wd:Q2515 wd:Q6511 wd:Q8275 wd:Q6911 wd:Q8258 wd:Q9184 wd:Q11829 wd:Q11880 wd:Q11831 wd:Q12098 wd:Q11678 wd:Q11336 wd:Q16438 wd:Q11859 wd:Q13912 wd:Q13911 wd:Q27321 wd:Q22726 wd:Q18813 wd:Q18796 wd:Q28172 wd:Q24323 wd:Q26085 wd:Q25551 wd:Q25338 wd:Q26541 wd:Q26505 wd:Q36097 wd:Q37293 wd:Q35160 wd:Q38793 wd:Q38213 wd:Q29478 wd:Q32136 wd:Q31285 wd:Q41542 wd:Q47209 wd:Q46887 wd:Q40185 wd:Q40205 wd:Q46751 wd:Q46758 wd:Q45735 wd:Q45192 wd:Q50857 wd:Q50948 wd:Q60220 wd:Q43361 wd:Q43714 wd:Q43891 wd:Q70784 wd:Q70806 wd:Q80038 wd:Q80817 wd:Q80771 wd:Q81240 wd:Q81810 wd:Q81689 wd:Q61324 wd:Q72309 wd:Q82737 wd:Q82464 wd:Q82428 wd:Q83755 wd:Q83797 wd:Q83585 wd:Q91926 wd:Q53592 wd:Q53945 wd:Q91499 wd:Q75840 wd:Q92640 wd:Q92582 wd:Q95159 wd:Q120288 wd:Q119567 wd:Q127149 wd:Q117182 wd:Q116122 wd:Q112407 wd:Q95029 wd:Q106199 wd:Q130295 wd:Q135515 wd:Q140527 wd:Q144569 wd:Q138746 wd:Q140036 wd:Q140016 wd:Q137629 wd:Q137308 wd:Q137869 wd:Q1477

In [85]:
def get_list(series: pd.Series) -> list:
    """Return the distinct non-null values of `series` as a list.

    Values are listed in order of first appearance, so the result is the same
    on every run (a round trip through `set` gives an arbitrary order for
    strings).

    Parameters
    ----------
    series : pd.Series
        Values to collect; nulls are ignored.

    Returns
    -------
    list
        Unique non-null values; empty when `series` has no non-null value.
    """
    return series.dropna().drop_duplicates().tolist()

def mode(x: pd.Series) -> object:
    """Return the most frequent value of `x`, or None when there is none.

    Parameters
    ----------
    x : pd.Series
        Values to inspect.

    Returns
    -------
    object
        The first entry of `x.mode()` (a single value, not a Series), or
        `None` when `x.mode()` is empty.
    """
    modes = x.mode()
    return None if modes.empty else modes.iloc[0]

# Maps a coarse category name to the set of genre labels that count towards it.
# Each key is looked up below (`categories['<key>']`) to build the matching
# `is_<key>` indicator column from a book's genre labels.
# The sets overlap on purpose: a label can belong to a broad category and to a
# narrower one (e.g. 'science fiction' is in both 'fiction' and 'science_fiction').
categories = {
    # broad category; also contains the labels of several narrower categories below
    'fiction': {'novel', 'short novel', 'novella', 'serialized fiction', 'short story', 'war fiction', 'magic realist fiction', 'metafiction', 'science fiction', 'suspense in literature', 'horror novel', 'horror fiction', 'crime fiction', 'psychological thriller', 'speculative/fantastic fiction', 'adventure fiction', 'detective fiction', 'noir fiction', 'political novel', 'vampire fiction', 'dystopian fiction', 'social science fiction', 'techno-thriller', 'thriller', 'fantasy', 'Gothic novel', 'picaresque novel', 'mystery fiction', 'post-apocalyptic fiction', 'philosophical fiction', 'romantic fiction', 'Bildungsroman', 'roman à clef', 'comedy', 'black comedy'},
    'non_fiction': {'nonfiction', 'memoir', 'autobiography', 'biographical novel', 'biography', 'essay'},
    'children': {'children\'s literature', 'children\'s fiction', 'young adult fiction', 'children\'s novel'},
    'historical': {'historical fiction', 'historical novel'},
    'drama': {'play', 'drama', 'tragedy'},
    'anime': {'adventure anime and manga', 'drama anime and manga'},
    # narrower genre categories
    'fantasy': {'magic realist fiction', 'fantasy', 'vampire fiction', 'fairy tale'},
    'science_fiction': {'science fiction', 'dystopian fiction', 'social science fiction', 'techno-thriller', 'post-apocalyptic fiction'},
    'horror': {'horror novel', 'horror fiction'},
    'thriller': {'psychological thriller', 'thriller'},
    'detective': {'detective fiction', 'noir fiction', 'mystery fiction', 'cloak and dagger novel'},
    'satire': {'satire', 'satirical fiction', 'metafiction'},
    'comedy': {'comedy', 'black comedy'},
}

def _has_label(label_lists: pd.Series, label: str) -> pd.Series:
    """Return a 0/1 flag per row: 1 when `label` is in that row's list of labels."""
    return label_lists.apply(lambda labels: label in labels).astype(int)


def _has_any_label(label_lists: pd.Series, wanted: set) -> pd.Series:
    """Return a 0/1 flag per row: 1 when the row's labels share an element with `wanted`."""
    return label_lists.apply(lambda labels: not wanted.isdisjoint(labels)).astype(int)


# One `is_<category>` indicator per entry of `categories`, in dictionary order.
# `wanted=labels` binds the current set when the lambda is created, so every
# lambda keeps its own category instead of sharing the last one of the loop.
_genre_flags = {
    f'is_{name}': (lambda x, wanted=labels: _has_any_label(x.genre, wanted))
    for name, labels in categories.items()
}

# Build one row per book: the input holds several rows per book, which are
# collapsed with `groupby` and then turned into 0/1 indicator columns.
book_wikidata_df = (
    pd.read_csv('data/wikidata_book.csv')
    .assign(
        # the `book` column holds a URI; the id is the part after the last '/'
        book_wikidata_id = lambda x: x.book.str.split('/').str[-1],
        # NOTE(review): dates that pandas cannot represent as a timestamp
        # (e.g. before the year 1677) are coerced to missing here - confirm
        # that losing the year of very old books is acceptable.
        year = lambda x: pd.to_datetime(x.pubdate, errors='coerce').dt.year.astype('Int64'),
    )
    .drop(['book'], axis=1)
    .groupby('book_wikidata_id')
    .agg(
        title = pd.NamedAgg(column='bookLabel', aggfunc=mode),
        author = ('authorLabel', 'first'),
        year = ('year', 'first'),
        instance_of = pd.NamedAgg(column='instanceofLabel', aggfunc=get_list),
        form = pd.NamedAgg(column='formLabel', aggfunc=get_list),
        genre = pd.NamedAgg(column='genreLabel', aggfunc=get_list),
        award = pd.NamedAgg(column='awardLabel', aggfunc=get_list),
        series = ('seriesLabel', 'first'),
    )
    .assign(
        part_of_series = lambda x: x.series.notnull().astype(int),
        # indicators derived from the "instance of" labels
        is_literary_work = lambda x: _has_label(x.instance_of, 'literary work'),
        is_written_work = lambda x: _has_label(x.instance_of, 'written work'),
        # column name kept as-is (including its spelling) so the exported CSV schema does not change
        is_comic_book_seris = lambda x: _has_label(x.instance_of, 'comic book series'),
        is_book_series = lambda x: _has_label(x.instance_of, 'book series'),
        is_manga_series = lambda x: _has_label(x.instance_of, 'manga series'),
        # indicators derived from the "form" labels
        is_novel = lambda x: _has_label(x.form, 'novel'),
        is_short_story = lambda x: _has_label(x.form, 'short story'),
        # fixed: this flag used to test for 'fiction' instead of 'play'
        is_play = lambda x: _has_label(x.form, 'play'),
        is_novella = lambda x: _has_label(x.form, 'novella'),
        # indicators derived from the genre labels (is_fiction ... is_comedy)
        **_genre_flags,
        # 1 when at least one award label is present; column name kept for CSV compatibility
        won_price = lambda x: x.award.apply(lambda y: len(y) > 0).astype(int),
    )
    # the list-valued helper columns are not exported
    .drop(['instance_of', 'form', 'genre', 'award', 'series'], axis=1)
)

# `book_wikidata_id` is the index after the groupby, hence index=True.
book_wikidata_df.to_csv('dataset/book_metadata.csv', index=True)


In [86]:
# Long table of (book id, author, title) candidates: both the entity label and
# the explicit title column are used as possible titles of a book.
wikidata_meta_df = (
    pd.read_csv('data/wikidata_book.csv')
    .assign(book_wikidata_id=lambda raw: raw['book'].str.rsplit('/', n=1).str[-1])
    .drop(columns='book')
    .rename(columns={'authorLabel': 'author', 'title': 'title_wikidata'})
    .melt(
        id_vars=['book_wikidata_id', 'author'],
        value_vars=['bookLabel', 'title_wikidata'],
        value_name='title',
    )
    # which of the two source columns a title came from is not needed
    .drop(columns='variable')
    .loc[lambda candidates: candidates['title'].notna()]
)

## Join with Goodreads Id

In [87]:

# The Goodreads dump is split over several CSV files: one for ids 1-100k,
# nineteen 100k-wide files up to 2000k, and three 1000k-wide files up to 5000k.
goodreads_paths = (
    ['archive/book1-100k.csv']
    + [f'archive/book{i*100}k-{(i+1)*100}k.csv' for i in range(1, 20)]
    + ['archive/book2000k-3000k.csv', 'archive/book3000k-4000k.csv', 'archive/book4000k-5000k.csv']
)
goodreads_df_list = [pd.read_csv(path) for path in goodreads_paths]

goodreads_df = (
    pd.concat(goodreads_df_list)
    .assign(
        # the page count is stored under two differently-cased column names
        n_pages=lambda books: books['pagesNumber'].fillna(books['PagesNumber']),
        # the total is the part after the first ':' in `RatingDistTotal`
        n_ratings=lambda books: books['RatingDistTotal'].str.split(':').str[1].astype(int),
    )
    .rename(columns={'Id': 'goodreads_id', 'Authors': 'authors', 'Name': 'title', 'CountsOfReview': 'n_reviews', 'Rating': 'rating', 'PublishYear': 'year', 'Description': 'summary'})
    [['goodreads_id', 'title', 'authors', 'year', 'n_pages', 'n_ratings', 'n_reviews', 'rating', 'summary']]
)

# One row per (book, author): the '/'-separated author string is split and exploded.
goodreads_meta_df = (
    goodreads_df[['goodreads_id', 'title', 'authors']]
    .assign(author=lambda books: books['authors'].str.split('/'))
    .drop(columns='authors')
    .explode('author')
)

In [88]:
def clean_title(title_series: pd.Series) -> pd.Series:
    """Normalise book titles so that titles from different sources can be matched.

    Steps, in order: drop everything from the first '(' and from the first ':'
    on, lower-case, write the word "and" as "&", remove '.' and "'", turn '-'
    into a space, collapse whitespace runs and strip the ends.

    Only the stand-alone word "and" is rewritten; words that merely contain
    those letters (e.g. "sandman") are left untouched.

    Parameters
    ----------
    title_series : pd.Series
        Raw titles (strings); missing values stay missing.

    Returns
    -------
    pd.Series
        Normalised titles, aligned with `title_series`.
    """
    return (title_series
            .str.split('(').str[0]
            .str.split(':').str[0]
            .str.lower()
            .str.replace(r'\band\b', '&', regex=True)
            # literal replacements: `regex=False` keeps '.' from ever being read as "any character"
            .str.replace('.', '', regex=False)
            .str.replace("'", '', regex=False)
            .str.replace('-', ' ', regex=False)
            .str.replace(r'\s+', ' ', regex=True)
            .str.strip()
    )

def clean_author(author_series: pd.Series) -> pd.Series:
    """Reduce author names to "<first initial> <last name token>" in lower case.

    The last name token is the final piece of the name after splitting on
    whitespace, '-' and "'", with any '.' removed. Surrounding whitespace is
    stripped first, so a trailing space no longer produces an empty last name.

    Parameters
    ----------
    author_series : pd.Series
        Raw author names (strings); missing values stay missing.

    Returns
    -------
    pd.Series
        Normalised author keys, aligned with `author_series`.
    """
    stripped = author_series.str.strip()
    initial_letter = stripped.str[0].str.lower()
    last_name = (stripped
                 # no capture group: the separators are not needed in the result
                 .str.split(r"[\s\-']", regex=True)
                 .str[-1]
                 .str.replace('.', '', regex=False)
                 .str.lower()
                 )
    return initial_letter + " " + last_name

In [89]:
# Add the normalised title / author keys used for matching against Wikidata.
goodreads_meta_df = goodreads_meta_df.assign(
    title_clean=clean_title(goodreads_meta_df['title']),
    author_clean=clean_author(goodreads_meta_df['author']),
)

In [90]:
# Add the normalised title / author keys used for matching against Goodreads.
wikidata_meta_df = wikidata_meta_df.assign(
    title_clean=clean_title(wikidata_meta_df['title']),
    author_clean=clean_author(wikidata_meta_df['author']),
)

In [91]:
# Link table between Wikidata books and Goodreads entries: two records match
# when both the normalised title and the normalised author key are equal.
wikidata_to_goodreads = (
    wikidata_meta_df
    .merge(goodreads_meta_df, on=['title_clean', 'author_clean'], how='left')
    .query('goodreads_id.notnull()')
    .assign(goodreads_id = lambda x: x.goodreads_id.astype(int))
    .loc[:, ['book_wikidata_id', 'goodreads_id']]
    # `wikidata_meta_df` repeats a book once per source row and title variant,
    # so the same (book, Goodreads entry) pair shows up many times; keep one of
    # each so that later per-book statistics count every Goodreads entry once.
    .drop_duplicates()
)

In [92]:
# Pull the full Goodreads record for every linked Goodreads id.
relevant_goodreads_df = pd.merge(
    wikidata_to_goodreads,
    goodreads_df,
    how='inner',
    on='goodreads_id',
)

In [93]:
# Collapse all Goodreads entries linked to a book into one row per book.
book_ratings_df = (
    relevant_goodreads_df
    .groupby('book_wikidata_id')
    .agg(
        title = pd.NamedAgg(column='title', aggfunc=mode),
        author = pd.NamedAgg(column='authors', aggfunc=mode),
        year = ('year', 'min'),
        n_pages = ('n_pages', 'median'),
        n_ratings = ('n_ratings', 'max'),
        n_reviews = ('n_reviews', 'max'),
        rating = ('rating', 'median'),
        summary = pd.NamedAgg(column='summary', aggfunc=mode)
    )
    .assign(
        # Whole number of pages. `// 1` drops the fractional part of a median
        # such as 250.5 (same result as the previous int cast for page counts),
        # and the nullable 'Int64' dtype keeps a book without any page count as
        # missing instead of raising on the cast.
        n_pages = lambda x: (x.n_pages // 1).astype('Int64'),
    )
    .reset_index()
)
book_ratings_df.to_csv('dataset/book_ratings.csv', index=False)

## Book Summaries

In [96]:
# The book summaries file is a header-less TSV; column names are supplied here.
cmu_book_summaries_df = pd.read_csv(
    'booksummaries/booksummaries.txt',
    sep='\t',
    header=None,
    names=['wikipedia_id', 'freebase_id', 'title', 'author', 'pub_date', 'genres', 'summary'],
)

In [98]:
# Drop the unused Freebase id and translate each Wikipedia page id into a
# Wikidata id through the mapper.
cmu_book_summaries_df = (
    cmu_book_summaries_df
    .drop(columns='freebase_id')
    .assign(wikidata_id=lambda books: books['wikipedia_id'].apply(mapper.wikipedia_id_to_id))
)

In [103]:
# Export the CMU summaries of the books that have a movie adaptation.
# `movie_book_df` holds one row per (movie, book) pair, so a book adapted into
# several movies appears several times; the book ids are de-duplicated first so
# that the export does not repeat identical rows for such books.
(movie_book_df
 .loc[:, ['book_wikidata_id']]
 .drop_duplicates()
 .merge(cmu_book_summaries_df, left_on='book_wikidata_id', right_on='wikidata_id', how='inner')
 .loc[:,['book_wikidata_id', 'title', 'author', 'pub_date', 'genres', 'summary']]
 .to_csv('dataset/book_summaries.csv', index=False)
 )


In [105]:
# Persist the (movie Wikipedia id, book Wikidata id) link table without the row index.
movie_book_df.to_csv(
    'dataset/movie_book.csv',
    index=False,
)