In [1]:
# Prepares Studio Ghibli.csv for both content-based filtering and collaborative filtering experiments.
# Inputs: Studio Ghibli.csv (film metadata) plus MovieLens's movies.csv and ratings.csv.
# This script processes the data and generates three output CSV files:
# 1. prepared_ghibli.csv: Contains Ghibli movie IDs extracted from MovieLens's movies dataset and cleaned-up titles (parenthesised alternate titles/years removed, a trailing ", The" moved to the front).
# 2. prepared_genres.csv: Lists Ghibli movie titles alongside their genres.
# 3. prepared_ratings.csv: Combines Ghibli movie ratings from the MovieLens ratings dataset with their corresponding movie titles.

import re

import pandas as pd

# Load the raw Studio Ghibli table and preview its first five rows.
movies = pd.read_csv('Studio Ghibli.csv')
print(movies.head(5))

                                             Name  Year              Director  \
0            When Marnie Was There\n       (2014)  2014  Hiromasa Yonebayashi   
1  The Tale of The Princess Kaguya\n       (2013)  2013         Isao Takahata   
2                   The Wind Rises\n       (2013)  2013        Hayao Miyazaki   
3            From Up on Poppy Hill\n       (2011)  2011         Goro Miyazaki   
4     The Secret World of Arrietty\n       (2010)  2010  Hiromasa Yonebayashi   

         Screenplay          Budget        Revenue    Genre 1    Genre 2  \
0  Joan G. Robinson  $1150000000.00   $34949567.00  Animation      Drama   
1    Riko Sakaguchi    $49300000.00   $24366656.00  Animation      Drama   
2       Tatsuo Hori    $30000000.00  $117932401.00      Drama  Animation   
3    Hayao Miyazaki    $22000000.00   $61037844.00  Animation      Drama   
4       Mary Norton    $23000000.00  $149480483.00    Fantasy  Animation   

   Genre 3 Duration  
0      NaN   1h 43m  
1  Fantasy  

In [2]:
# Cross-check Studio Ghibli.csv against movies.csv and keep the movies found in both.
movies_df = pd.read_csv('movies.csv')

# One distinctive title fragment per Ghibli film. The list is stored lower-cased
# because it is matched against lower-cased titles (here and in the genres cell).
keywords = [
    "Kaguya", "Wind Rises", "Ponyo", "Poppy Hill", "Arrietty", "Only Yesterday",
    "Spirited Away", "Whisper of the Heart", "Grave of the fireflies", "Totoro",
    "Mononoke", "Moving Castle", "Castle in the Sky", "Kiki",
    "Pom Poko", "Porco Rosso", "Valley of the Wind"
]
keywords = [kw.lower() for kw in keywords]

# Clean up and standardize titles
movies_df['name'] = (
    movies_df['title']
    .str.strip()
    .str.replace(r"\s\([^()]*\)", "", regex=True)  # Remove every "(...)" group together with the whitespace before it
    .str.replace(r"^(.*), The$", r"The \1", regex=True)  # "X, The" -> "The X"
)

# Filter movies based on keywords. Each keyword is escaped so it is matched as
# literal text; a keyword containing a regex metacharacter (".", "(", "+", ...)
# can then never change the meaning of the alternation.
keyword_pattern = '|'.join(re.escape(kw) for kw in keywords)
filtered_movies = movies_df[
    movies_df['name'].str.lower().str.contains(keyword_pattern, regex=True, na=False)
].copy()  # independent frame rather than a slice of movies_df

# Save the movieIds and cleaned titles of Ghibli movies into a separate file.
# NOTE(review): these names keep the MovieLens spelling (e.g. "Laputa: Castle in the Sky",
# "The Tale of Princess Kaguya"), which differs from the spelling in Studio Ghibli.csv
# ("Castle in the Sky", "The Tale of The Princess Kaguya") - confirm before joining
# the prepared files on the title column.
filtered_movies[['movieId', 'name']].to_csv('prepared_ghibli.csv', index=False)

# Display the result
print(filtered_movies[['movieId', 'name']])

      movieId                                name
2260     3000                   Princess Mononoke
3984     5618                       Spirited Away
4025     5690              Grave of the Fireflies
4151     5971                  My Neighbor Totoro
4348     6350           Laputa: Castle in the Sky
4769     7099  Nausicaä of the Valley of the Wind
5546    26662             Kiki's Delivery Service
5566    26743                      Only Yesterday
5572    26776                         Porco Rosso
5596    26903                Whisper of the Heart
5791    31658                Howl's Moving Castle
5842    32456                            Pom Poko
6944    65261                               Ponyo
7499    83132        The Secret World of Arrietty
8047    98604               From Up on Poppy Hill
8241   104283                      The Wind Rises
8520   114554         The Tale of Princess Kaguya


In [5]:
# Grab the same list of Ghibli movies and their genres.
# Relies on the lower-cased `keywords` list defined in the MovieLens matching cell.
studio_ghibli_df = pd.read_csv('Studio Ghibli.csv')
studio_ghibli_df['Name_cleaned'] = studio_ghibli_df['Name'].str.strip().str.lower()

# Escape each keyword so it is matched as literal text inside the regex alternation.
keyword_pattern = '|'.join(re.escape(kw) for kw in keywords)
filtered_ghibli = studio_ghibli_df[
    studio_ghibli_df['Name_cleaned'].str.contains(keyword_pattern, na=False)
].copy()

# Merge Genre 1, 2, 3 into one comma-separated string. Missing (NaN) or blank
# genres are skipped directly, so no ", ," artefacts have to be patched afterwards.
genre_columns = ['Genre 1', 'Genre 2', 'Genre 3']
filtered_ghibli['Genres'] = [
    ', '.join(genre.strip() for genre in row if pd.notna(genre) and genre.strip())
    for row in filtered_ghibli[genre_columns].itertuples(index=False, name=None)
]

# Clean up movie names to exclude the new line and the "(year)" suffix
filtered_ghibli['Name'] = (
    filtered_ghibli['Name']
    .str.replace(r'\n', '', regex=True)
    .str.replace(r'\s*\(\d{4}\)', '', regex=True)
    .str.strip()  # drop any whitespace left around the title
)

# Save as a separate file. to_csv() returns None when given a path, so keep the
# frame itself in `ghibli_genres` instead of binding the return value of to_csv().
ghibli_genres = filtered_ghibli[['Name', 'Genres']]
ghibli_genres.to_csv('prepared_genres.csv', index=False)

print(ghibli_genres)


                                  Name                         Genres
1      The Tale of The Princess Kaguya      Animation, Drama, Fantasy
2                       The Wind Rises      Drama, Animation, Romance
3                From Up on Poppy Hill               Animation, Drama
4         The Secret World of Arrietty     Fantasy, Animation, Family
5                                Ponyo     Animation, Fantasy, Family
8                       Only Yesterday      Animation, Drama, Romance
9                        Spirited Away     Animation, Family, Fantasy
11                Whisper of the Heart       Animation, Drama, Family
12              Grave of the Fireflies          Animation, Drama, War
13                  My Neighbor Totoro     Fantasy, Animation, Family
14                   Princess Mononoke  Adventure, Fantasy, Animation
15                Howl's Moving Castle  Fantasy, Animation, Adventure
16                   Castle in the Sky  Adventure, Fantasy, Animation
17             Kiki'

In [7]:
# Keep only the ratings whose movieId belongs to a Ghibli movie and label each with its title.
ghibli_movies = pd.read_csv('prepared_ghibli.csv')
ratings = pd.read_csv('ratings.csv')

ghibli_ratings = (
    ratings.loc[ratings['movieId'].isin(ghibli_movies['movieId'])]  # ratings of Ghibli movies only
    .merge(ghibli_movies, how='left', on='movieId')  # add the columns of prepared_ghibli.csv (the title)
    .drop('timestamp', axis=1)  # the rating timestamp is not written to the prepared file
)

# Write the prepared ratings to disk and show them
ghibli_ratings.to_csv('prepared_ratings.csv', index=False)
print(ghibli_ratings)

     userId  movieId  rating                     name
0         7     5618     5.0            Spirited Away
1        15     5618     3.0            Spirited Away
2        15     5971     2.0       My Neighbor Totoro
3        16     3000     4.0        Princess Mononoke
4        16     5618     4.5            Spirited Away
..      ...      ...     ...                      ...
329     606    26662     4.0  Kiki's Delivery Service
330     606    26776     4.0              Porco Rosso
331     606    31658     4.0     Howl's Moving Castle
332     608     5618     3.5            Spirited Away
333     610     5618     4.0            Spirited Away

[334 rows x 4 columns]
