In [1]:
import pandas as pd
import numpy as np
from wikimapper import WikiMapper

# Load the wikidata index 
The file has to be downloaded using the following [link](https://public.ukp.informatik.tu-darmstadt.de/wikimapper/).

In [2]:
mapper = WikiMapper("data/index_enwiki-latest.db")

## Load CMU Movie Dataset

In [3]:
# Define columns names
cmu_movie_cols = ['wikipedia_id', 'freebase_id', 'cmu_movie_title', 'release_date', 'box_office_revenue', 'runtime', 'languages', 'countries', 'genres']

# Open movie dataset and remove 'freebase_id' since it is not available anymore
# Add movie wikidata ID using the mapper
cmu_movie_df = (pd.read_csv('data/movie.metadata.tsv', sep='\t', header=None, names=cmu_movie_cols)
                .drop(['freebase_id'], axis=1)
                .assign(
                    wikidata_id = lambda x: x.wikipedia_id.apply(lambda y: mapper.wikipedia_id_to_id(y))
                )
)

## Find movies based on books / book series
To find all movies that are based on books we run the following SPARQL query on the [Wikidata Query Service](https://query.wikidata.org/):

<code>
SELECT DISTINCT ?movie ?book
  
WHERE 
{

  VALUES ?bookType { wd:Q47461344 wd:Q7725634 wd:Q571 wd:Q14406742 wd:Q21198342 wd:Q277759}

  VALUES ?movieType { wd:Q11424 wd:Q506240 }
  
  ?book wdt:P31 ?bookType.   

  ?movie wdt:P31 ?movieType;          
        
        wdt:P144 ?book.

  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
}
</code>

It searches for instances of `film` or `television film` that are based on an instance of `literary work`, `written work`, `book`, `comic book series`, `manga series` or `book series`. This query gives a csv file which can be found in `data/raw_wiki/raw_movie_book.csv`. 

In [4]:
# Open the csv file we just created and extract the wikidata ID of movies and their corresponding book.
movie_book_df = (pd.read_csv('data/raw_wiki/raw_movie_book.csv')
                    .assign(
                        movie_wikidata_id = lambda x: x.movie.str.split('/').str[-1],
                        book_wikidata_id = lambda x: x.book.str.split('/').str[-1],
                    )
                    .drop(['movie', 'book'], axis=1)
                )

In [5]:
# Merge the CMU dataframe with the movie_book_df containing the wikidata ID
movie_book_df = (movie_book_df.merge(cmu_movie_df, left_on='movie_wikidata_id', right_on='wikidata_id', how='left')
              .query('wikipedia_id.notnull()')
              .reset_index(drop=True)
              .assign(movie_wikipedia_id = lambda x: x.wikipedia_id.astype(int))
              .loc[:, ['movie_wikipedia_id', 'book_wikidata_id']]
            )

In [6]:
display(movie_book_df)

Unnamed: 0,movie_wikipedia_id,book_wikidata_id
0,18920019,Q480
1,21447227,Q480
2,2205704,Q480
3,7379134,Q480
4,10117133,Q480
...,...,...
4677,9767560,Q120669834
4678,1750951,Q123168810
4679,61191,Q121775426
4680,6851697,Q122186265


## Get information about books
To later on be able to merge the books with goodreads data get title and author from [Wikidata Query Service](https://query.wikidata.org/). The 'query_string' must be replaced with the output of the next code cell which gives all the wikidata ID of the books we are looking for.
<code>

SELECT DISTINCT

  ?book ?bookLabel

  ?title

  ?authorLabel

  ?pubdate
  
WHERE 
{

  VALUES ?book { query_string }

  OPTIONAL { ?book wdt:P1476 ?title. }
  
  OPTIONAL { ?book wdt:P50 ?author. }

  OPTIONAL { ?book wdt:P577 ?pubdate. }

  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
}

</code>

This query gives a csv file containing data of books, which can be found in `data/raw_wiki/wikidata_book.csv`.

In [None]:
# Copy the output of this cell into 'query_string' within the above query
query_string = " ".join([f'wd:{wikidata_id}' for wikidata_id in movie_book_df.book_wikidata_id.unique()])
print(query_string)

In [7]:
def get_list(series: pd.Series) -> list:
    return list(set(series.dropna().tolist()))

def mode(x: pd.Series) -> pd.Series:
    modes = x.mode()
    if len(modes) > 0:
        return modes.iloc[0]
    return None

categories = {
    'fiction': {'novel', 'short novel', 'novella', 'serialized fiction', 'short story', 'war fiction', 'magic realist fiction', 'metafiction', 'science fiction', 'suspense in literature', 'horror novel', 'horror fiction', 'crime fiction', 'psychological thriller', 'speculative/fantastic fiction', 'adventure fiction', 'detective fiction', 'noir fiction', 'political novel', 'vampire fiction', 'dystopian fiction', 'social science fiction', 'techno-thriller', 'thriller', 'fantasy', 'Gothic novel', 'picaresque novel', 'mystery fiction', 'post-apocalyptic fiction', 'philosophical fiction', 'romantic fiction', 'Bildungsroman', 'roman à clef', 'comedy', 'black comedy'},
    'non_fiction': {'nonfiction', 'memoir', 'autobiography', 'biographical novel', 'biography', 'essay'},
    'children': {'children\'s literature', 'children\'s fiction', 'young adult fiction', 'children\'s novel'},
    'historical': {'historical fiction', 'historical novel'},
    'drama': {'play', 'drama', 'tragedy'},
    'anime': {'adventure anime and manga', 'drama anime and manga'},
    'fantasy': {'magic realist fiction', 'fantasy', 'vampire fiction', 'fairy tale'},
    'science_fiction': {'science fiction', 'dystopian fiction', 'social science fiction', 'techno-thriller', 'post-apocalyptic fiction'},
    'horror': {'horror novel', 'horror fiction'},
    'thriller': {'psychological thriller', 'thriller'},
    'detective': {'detective fiction', 'noir fiction', 'mystery fiction', 'cloak and dagger novel'},
    'satire': {'satire', 'satirical fiction', 'metafiction'},
    'comedy': {'comedy', 'black comedy'},
}

book_wikidata_df = (pd.read_csv('data/raw_wiki/wikidata_book.csv')
                    .assign(
                            book_wikidata_id = lambda x: x.book.str.split('/').str[-1],
                            year = lambda x: pd.to_datetime(x.pubdate, errors='coerce').dt.year.astype('Int64'),
                    )
                    .drop(['book'], axis=1)
                    .groupby('book_wikidata_id')
                    .agg(
                            title = pd.NamedAgg(column='bookLabel', aggfunc=mode),
                            author = ('authorLabel', 'first'),
                            year = ('year', 'first'),
                            instance_of = pd.NamedAgg(column='instanceofLabel', aggfunc=get_list),
                            form = pd.NamedAgg(column='formLabel', aggfunc=get_list),
                            genre = pd.NamedAgg(column='genreLabel', aggfunc=get_list),
                            award = pd.NamedAgg(column='awardLabel', aggfunc=get_list),
                            series = ('seriesLabel', 'first'),
                        )
                    .assign(
                        part_of_series = lambda x: x.series.notnull().astype(int),
                        is_literary_work = lambda x: x.instance_of.apply(lambda y: 'literary work' in y).astype(int),
                        is_written_work = lambda x: x.instance_of.apply(lambda y: 'written work' in y).astype(int),
                        is_comic_book_seris = lambda x: x.instance_of.apply(lambda y: 'comic book series' in y).astype(int),
                        is_book_series = lambda x: x.instance_of.apply(lambda y: 'book series' in y).astype(int),
                        is_manga_series = lambda x: x.instance_of.apply(lambda y: 'manga series' in y).astype(int),
                        is_novel = lambda x: x.form.apply(lambda y: 'novel' in y).astype(int),
                        is_short_story = lambda x: x.form.apply(lambda y: 'short story' in y).astype(int),    
                        is_play = lambda x: x.form.apply(lambda y: 'fiction' in y).astype(int),
                        is_novella = lambda x: x.form.apply(lambda y: 'novella' in y).astype(int),
                        is_fiction = lambda x: x.genre.apply(lambda y: len(set(y).intersection(categories['fiction'])) > 0).astype(int),
                        is_non_fiction = lambda x: x.genre.apply(lambda y: len(set(y).intersection(categories['non_fiction'])) > 0).astype(int),
                        is_children = lambda x: x.genre.apply(lambda y: len(set(y).intersection(categories['children'])) > 0).astype(int),
                        is_historical = lambda x: x.genre.apply(lambda y: len(set(y).intersection(categories['historical'])) > 0).astype(int),
                        is_drama = lambda x: x.genre.apply(lambda y: len(set(y).intersection(categories['drama'])) > 0).astype(int),
                        is_anime = lambda x: x.genre.apply(lambda y: len(set(y).intersection(categories['anime'])) > 0).astype(int),
                        is_fantasy = lambda x: x.genre.apply(lambda y: len(set(y).intersection(categories['fantasy'])) > 0).astype(int),
                        is_science_fiction = lambda x: x.genre.apply(lambda y: len(set(y).intersection(categories['science_fiction'])) > 0).astype(int),
                        is_horror = lambda x: x.genre.apply(lambda y: len(set(y).intersection(categories['horror'])) > 0).astype(int),
                        is_thriller = lambda x: x.genre.apply(lambda y: len(set(y).intersection(categories['thriller'])) > 0).astype(int),
                        is_detective = lambda x: x.genre.apply(lambda y: len(set(y).intersection(categories['detective'])) > 0).astype(int),
                        is_satire = lambda x: x.genre.apply(lambda y: len(set(y).intersection(categories['satire'])) > 0).astype(int),
                        is_comedy = lambda x: x.genre.apply(lambda y: len(set(y).intersection(categories['comedy'])) > 0).astype(int),
                        won_price = lambda x: x.award.apply(lambda y: len(y) > 0).astype(int),
                    )
                    .drop(['instance_of', 'form', 'genre', 'award', 'series'], axis=1)
)

book_wikidata_df.to_csv('dataset/book_metadata.csv', index=True)


KeyError: "Column(s) ['awardLabel', 'formLabel', 'genreLabel', 'instanceofLabel', 'seriesLabel'] do not exist"

In [None]:
wikidata_meta_df = (pd.read_csv('data/wikidata_book.csv')
                    .assign(
                        book_wikidata_id = lambda x: x.book.str.split('/').str[-1]
                        )
                    .drop(['book'], axis=1)
                    .rename(columns={'authorLabel': 'author', 'title': 'title_wikidata'})
                    .melt(id_vars=['book_wikidata_id', 'author'], value_vars=['bookLabel', 'title_wikidata'], value_name='title')
                    .drop(columns=['variable'])
                    .query('title.notnull()')
                    )

## Join with Goodreads Id

In [None]:

goodreads_df_list = []
goodreads_df_list.append(pd.read_csv('archive/book1-100k.csv'))
for i in range(1,20):
    goodreads_df_list.append(pd.read_csv(f'archive/book{i*100}k-{(i+1)*100}k.csv'))
goodreads_df_list.append(pd.read_csv('archive/book2000k-3000k.csv'))
goodreads_df_list.append(pd.read_csv('archive/book3000k-4000k.csv'))
goodreads_df_list.append(pd.read_csv('archive/book4000k-5000k.csv'))
goodreads_df = (pd.concat(goodreads_df_list)
                .assign(
                    n_pages = lambda x: x.pagesNumber.fillna(x.PagesNumber),
                    n_ratings = lambda x: x.RatingDistTotal.str.split(':').str[1].astype(int)
                    )
                .rename(columns={'Id': 'goodreads_id', 'Authors': 'authors', 'Name': 'title', 'CountsOfReview': 'n_reviews', 'Rating': 'rating', 'PublishYear': 'year', 'Description': 'summary'})
                .loc[:, ['goodreads_id', 'title', 'authors', 'year', 'n_pages', 'n_ratings', 'n_reviews', 'rating', 'summary']]
                )
goodreads_meta_df = (goodreads_df.assign(
                        author = lambda x: x.authors.str.split('/'),
                     )
                     .loc[:, ['goodreads_id', 'title', 'author']]
                     .explode('author')
                     )

In [None]:
def clean_title(title_series: pd.Series) -> pd.Series:
    return (title_series
            .str.split('(').str[0]
            .str.split(':').str[0]
            .str.lower()
            .str.replace('and', '&')
            .str.replace('.', '')
            .str.replace("'", '')
            .str.replace('-', ' ')
            .str.replace(r'\s+', ' ', regex=True)
            .str.strip()
    )

def clean_author(author_series: pd.Series) -> pd.Series:
    initial_letter = (author_series
                      .str.strip()
                      .str[0]
                      .str.lower())
    last_name = (author_series
                 .str.split(r"(\s|-|')", regex=True)
                 .str[-1]
                 .str.replace('.', '')
                 .str.replace("'", '')
                 .str.replace(r'\s+', ' ', regex=True)
                 .str.strip()
                 .str.lower()
                 )
    return initial_letter + " " + last_name

In [None]:
goodreads_meta_df = (goodreads_meta_df
                        .assign(
                            title_clean = lambda x: clean_title(x.title),
                            author_clean = lambda x: clean_author(x.author)
                    ))

In [None]:
wikidata_meta_df = (wikidata_meta_df
                    .assign(
                        title_clean = lambda x: clean_title(x.title),
                        author_clean = lambda x: clean_author(x.author)
                    ))

In [None]:
wikidata_to_goodreads = (wikidata_meta_df
                            .merge(goodreads_meta_df, on=['title_clean', 'author_clean'], how='left')
                            .query('goodreads_id.notnull()')
                            .assign(goodreads_id = lambda x: x.goodreads_id.astype(int))
                            .loc[:, ['book_wikidata_id', 'goodreads_id']])

In [None]:
relevant_goodreads_df = (wikidata_to_goodreads
                            .merge(goodreads_df, on='goodreads_id', how='inner')
                        )

In [None]:
book_ratings_df = (relevant_goodreads_df
 .groupby('book_wikidata_id')
 .agg(
     title = pd.NamedAgg(column='title', aggfunc=mode),
     author = pd.NamedAgg(column='authors', aggfunc=mode),
     year = ('year', 'min'),
     n_pages = ('n_pages', 'median'),
     n_ratings = ('n_ratings', 'max'),
     n_reviews = ('n_reviews', 'max'),
     rating = ('rating', 'median'),
     summary = pd.NamedAgg(column='summary', aggfunc=mode)
 )
.assign(
    n_pages = lambda x: x.n_pages.astype(int),
)
.reset_index()
)
book_ratings_df.to_csv('dataset/book_ratings.csv', index=False)

## Book Summaries

In [None]:
cmu_book_summaries_df = pd.read_csv('booksummaries/booksummaries.txt', 
                                    sep='\t', header=None, 
                                    names=['wikipedia_id', 'freebase_id', 'title', 'author', 'pub_date', 'genres', 'summary'])

In [None]:
cmu_book_summaries_df = (cmu_book_summaries_df
                        .drop(['freebase_id'], axis=1)
                        .assign(
                                wikidata_id = lambda x: x.wikipedia_id.apply(lambda y: mapper.wikipedia_id_to_id(y))
                                )
                        )

In [None]:
(movie_book_df
 .merge(cmu_book_summaries_df, left_on='book_wikidata_id', right_on='wikidata_id', how='inner')
 .loc[:,['book_wikidata_id', 'title', 'author', 'pub_date', 'genres', 'summary']]
 .to_csv('dataset/book_summaries.csv', index=False)
 )


In [None]:
movie_book_df.to_csv('dataset/movie_book.csv', index=False)

# Clean CMU movies dataset

In [7]:
# Open the CMU dataset
movie_df = pd.read_csv("data/movie.metadata.tsv", sep='\t',names=['movie_wikipedia_id', 'freebase_id', 'cmu_movie_title', 'movie_release_date', 
                                                                  'movie_box_office_revenue', 'runtime', 'movie_languages', 'movie_countries', 
                                                                  'movie_genres']).drop('freebase_id', axis=1).assign(
                    movie_wikidata_id = lambda x: x.movie_wikipedia_id.apply(lambda y: mapper.wikipedia_id_to_id(y))
                )

#Clean columns
movie_df['movie_genres'] = movie_df['movie_genres'].apply(lambda x: np.take(x.split('"'), np.linspace(3,len(x.split('"'))-2, int((len(x.split('"'))-1)/4)).tolist()))
movie_df['movie_languages'] = movie_df['movie_languages'].apply(lambda x: np.take(x.split('"'), np.linspace(3,len(x.split('"'))-2, int((len(x.split('"'))-1)/4)).tolist()))
movie_df['movie_countries'] = movie_df['movie_countries'].apply(lambda x: np.take(x.split('"'), np.linspace(3,len(x.split('"'))-2, int((len(x.split('"'))-1)/4)).tolist()))
display(movie_df)

Unnamed: 0,movie_wikipedia_id,cmu_movie_title,movie_release_date,movie_box_office_revenue,runtime,movie_languages,movie_countries,movie_genres,movie_wikidata_id
0,975900,Ghosts of Mars,2001-08-24,14010832.0,98.0,[English Language],[United States of America],"[Thriller, Science Fiction, Horror, Adventure,...",Q261700
1,3196793,Getting Away with Murder: The JonBenét Ramsey ...,2000-02-16,,95.0,[English Language],[United States of America],"[Mystery, Biographical film, Drama, Crime Drama]",Q16250726
2,28463795,Brun bitter,1988,,83.0,[Norwegian Language],[Norway],"[Crime Fiction, Drama]",Q4978832
3,9363483,White Of The Eye,1987,,110.0,[English Language],[United Kingdom],"[Thriller, Erotic thriller, Psychological thri...",Q7995657
4,261236,A Woman in Flames,1983,,106.0,[German Language],[Germany],[Drama],Q869644
...,...,...,...,...,...,...,...,...,...
81736,35228177,Mermaids: The Body Found,2011-03-19,,120.0,[English Language],[United States of America],[Drama],Q6819873
81737,34980460,Knuckle,2011-01-21,,96.0,[English Language],"[Ireland, United Kingdom]","[Biographical film, Drama, Documentary]",Q12125420
81738,9971909,Another Nice Mess,1972-09-22,,66.0,[English Language],[United States of America],"[Satire, Comedy]",Q4770308
81739,913762,The Super Dimension Fortress Macross II: Lover...,1992-05-21,,150.0,[Japanese Language],[Japan],"[Science Fiction, Japanese Movies, Adventure, ...",Q2663931


# Add movies ratings to CMU movies dataset

To add movies ratings from IMDB to the CMU dataset, use the following query on [Wikidata Query Service](https://query.wikidata.org/).
First, we look for the IMDB ID of the movies.  

<code>

SELECT

?movie 

?IMDB_ID 

WHERE 

{

  VALUES ?movieType { wd:Q11424 wd:Q506240 }
  
  ?movie wdt:P31 ?movieType.

  ?movie wdt:P345 ?IMDB_ID.
  
  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
  
}

</code>

This query gives a csv file containing IMDB ID and wikidata ID, which can be found in `data/raw_wiki/IMDb_id.csv`.

The non-commercial IMDb rating data set can be found [here](https://developer.imdb.com/non-commercial-datasets/). It contains the movie ratings (score out of 10) with the corresponding IMDb_ID. This dataset is in `data/IMDb_ratings.tsv`.

In [8]:
# Open IMDb_ratings file and clean the ID (remove 'tt' at the beginning)
ratings_df = pd.read_csv('data/IMDb_ratings.tsv', sep='\t').rename(columns={"tconst" : "imdb_id", "averageRating" : "movie_rating"})
ratings_df['imdb_id'] = ratings_df['imdb_id'].str[2:]
ratings_df.head()

Unnamed: 0,imdb_id,movie_rating,numVotes
0,1,5.7,2004
1,2,5.8,269
2,3,6.5,1902
3,4,5.5,178
4,5,6.2,2685


Now, let's add the movie ratings to the movie dataframe :

In [9]:
# Open IMDb_id and merge with ratings
IMDb_ID_df = pd.read_csv('data/raw_wiki/IMDb_id.csv').assign(wikidata_id = lambda x: x.movie.str.split('/').str[-1]).assign(imdb_id = lambda x: x['IMDB_ID'].str[2:]).drop(['movie', 'IMDB_ID'], axis=1)
IMDb_ID_df = IMDb_ID_df.merge(ratings_df, on='imdb_id', how='left').copy()
IMDb_ID_df.head(5)

# Merge rating to movie_df using the wikidata_id
final_movie_df = movie_df.merge(IMDb_ID_df, left_on='movie_wikidata_id', right_on='wikidata_id', how='left').drop('wikidata_id', axis=1)
display(final_movie_df)

Unnamed: 0,movie_wikipedia_id,cmu_movie_title,movie_release_date,movie_box_office_revenue,runtime,movie_languages,movie_countries,movie_genres,movie_wikidata_id,imdb_id,movie_rating,numVotes
0,975900,Ghosts of Mars,2001-08-24,14010832.0,98.0,[English Language],[United States of America],"[Thriller, Science Fiction, Horror, Adventure,...",Q261700,0228333,4.9,56854.0
1,3196793,Getting Away with Murder: The JonBenét Ramsey ...,2000-02-16,,95.0,[English Language],[United States of America],"[Mystery, Biographical film, Drama, Crime Drama]",Q16250726,0245916,6.0,69.0
2,28463795,Brun bitter,1988,,83.0,[Norwegian Language],[Norway],"[Crime Fiction, Drama]",Q4978832,0094806,5.6,40.0
3,9363483,White Of The Eye,1987,,110.0,[English Language],[United Kingdom],"[Thriller, Erotic thriller, Psychological thri...",Q7995657,0094320,6.1,2888.0
4,261236,A Woman in Flames,1983,,106.0,[German Language],[Germany],[Drama],Q869644,0083949,6.0,621.0
...,...,...,...,...,...,...,...,...,...,...,...,...
81906,35228177,Mermaids: The Body Found,2011-03-19,,120.0,[English Language],[United States of America],[Drama],Q6819873,1816585,4.6,1710.0
81907,34980460,Knuckle,2011-01-21,,96.0,[English Language],"[Ireland, United Kingdom]","[Biographical film, Drama, Documentary]",Q12125420,1606259,6.8,3191.0
81908,9971909,Another Nice Mess,1972-09-22,,66.0,[English Language],[United States of America],"[Satire, Comedy]",Q4770308,0362411,5.8,110.0
81909,913762,The Super Dimension Fortress Macross II: Lover...,1992-05-21,,150.0,[Japanese Language],[Japan],"[Science Fiction, Japanese Movies, Adventure, ...",Q2663931,,,


Finally, let's modify the 'release_date' column to keep only the release year for each movie.

In [10]:
# Add a column with movie release year to the dataframe

# Keep only year in the string 'year-month-day'
final_movie_df['movie_release_year'] = final_movie_df['movie_release_date'].str.split('-').str[0].astype("Int32")

# Drop the 'movie_release_date' column
final_movie_df.drop('movie_release_date', axis=1)

Unnamed: 0,movie_wikipedia_id,cmu_movie_title,movie_box_office_revenue,runtime,movie_languages,movie_countries,movie_genres,movie_wikidata_id,imdb_id,movie_rating,numVotes,movie_release_year
0,975900,Ghosts of Mars,14010832.0,98.0,[English Language],[United States of America],"[Thriller, Science Fiction, Horror, Adventure,...",Q261700,0228333,4.9,56854.0,2001
1,3196793,Getting Away with Murder: The JonBenét Ramsey ...,,95.0,[English Language],[United States of America],"[Mystery, Biographical film, Drama, Crime Drama]",Q16250726,0245916,6.0,69.0,2000
2,28463795,Brun bitter,,83.0,[Norwegian Language],[Norway],"[Crime Fiction, Drama]",Q4978832,0094806,5.6,40.0,1988
3,9363483,White Of The Eye,,110.0,[English Language],[United Kingdom],"[Thriller, Erotic thriller, Psychological thri...",Q7995657,0094320,6.1,2888.0,1987
4,261236,A Woman in Flames,,106.0,[German Language],[Germany],[Drama],Q869644,0083949,6.0,621.0,1983
...,...,...,...,...,...,...,...,...,...,...,...,...
81906,35228177,Mermaids: The Body Found,,120.0,[English Language],[United States of America],[Drama],Q6819873,1816585,4.6,1710.0,2011
81907,34980460,Knuckle,,96.0,[English Language],"[Ireland, United Kingdom]","[Biographical film, Drama, Documentary]",Q12125420,1606259,6.8,3191.0,2011
81908,9971909,Another Nice Mess,,66.0,[English Language],[United States of America],"[Satire, Comedy]",Q4770308,0362411,5.8,110.0,1972
81909,913762,The Super Dimension Fortress Macross II: Lover...,,150.0,[Japanese Language],[Japan],"[Science Fiction, Japanese Movies, Adventure, ...",Q2663931,,,,1992


### The final movie dataframe is ready ! Let's save it for the analysis :

In [11]:
final_movie_df.to_csv('data/final_movie_metadata.csv')