In [1]:
import requests
import pandas as pd
import time
import json
import zipfile
import os
import re
import ast


import warnings
warnings.filterwarnings('ignore')


### TMDB API Pull with Anime Keyword

In [89]:
def get_anime_info(api_key, keyword_id, genre_id, start_page=1, end_page=5000):
    all_anime = []
    for page in range(start_page, end_page + 1):
        url = "https://api.themoviedb.org/3/discover/tv"
        params = {
            'api_key': api_key,
            'with_genres': genre_id,
            'with_keywords': keyword_id,
            'language': 'en-US',
            'sort_by': 'popularity.desc',
            'page': page
        }
        response = requests.get(url, params=params)
        if response.status_code == 200:
            all_anime.extend(response.json().get('results', []))
            if page % 50 == 0:
                pd.DataFrame(all_anime).to_csv(f'anime_data_{page}.csv')
                all_anime = []
        else:
            break
        time.sleep(0.5)
    return all_anime

api_key = '2c818416a26da0b84f6a14364b6aa0eb'

# Get the keyword ID for 'anime'
anime_keyword_id = get_keyword_id(api_key, 'anime')

# Animation genre ID (also used for anime)
genre_id = 16

if anime_keyword_id:
    anime_info = get_anime_info(api_key, anime_keyword_id, genre_id, start_page=1, end_page=5000)
    if anime_info:
        df_anime = pd.DataFrame(anime_info)
        df_anime.to_csv('final_anime_data.csv')
        print("Data fetching complete.")
    else:
        print("No anime information found")
else:
    print("Anime keyword ID not found")


No anime information found


In [5]:
df_anime = pd.read_csv('zipped_data/anime_data_tmdb.csv')
df_anime.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 271 entries, 0 to 270
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Unnamed: 0         271 non-null    int64  
 1   backdrop_path      151 non-null    object 
 2   genre_ids          271 non-null    object 
 3   id                 271 non-null    int64  
 4   name               271 non-null    object 
 5   origin_country     271 non-null    object 
 6   original_language  271 non-null    object 
 7   original_name      271 non-null    object 
 8   overview           220 non-null    object 
 9   popularity         271 non-null    float64
 10  poster_path        254 non-null    object 
 11  vote_average       271 non-null    float64
 12  vote_count         271 non-null    int64  
 13  first_air_date     179 non-null    object 
dtypes: float64(2), int64(3), object(9)
memory usage: 29.8+ KB


In [6]:
df_anime_1000 = pd.read_csv('zipped_data/anime_data_100.csv')
df_anime_1000.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Unnamed: 0         1000 non-null   int64  
 1   backdrop_path      983 non-null    object 
 2   first_air_date     999 non-null    object 
 3   genre_ids          1000 non-null   object 
 4   id                 1000 non-null   int64  
 5   name               1000 non-null   object 
 6   origin_country     1000 non-null   object 
 7   original_language  1000 non-null   object 
 8   original_name      1000 non-null   object 
 9   overview           981 non-null    object 
 10  popularity         1000 non-null   float64
 11  poster_path        1000 non-null   object 
 12  vote_average       1000 non-null   float64
 13  vote_count         1000 non-null   int64  
dtypes: float64(2), int64(3), object(9)
memory usage: 109.5+ KB


In [7]:
df_anime_1500 = pd.read_csv('zipped_data/anime_data_150.csv')
df_anime_1500.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Unnamed: 0         1000 non-null   int64  
 1   backdrop_path      905 non-null    object 
 2   first_air_date     969 non-null    object 
 3   genre_ids          1000 non-null   object 
 4   id                 1000 non-null   int64  
 5   name               1000 non-null   object 
 6   origin_country     1000 non-null   object 
 7   original_language  1000 non-null   object 
 8   original_name      1000 non-null   object 
 9   overview           921 non-null    object 
 10  popularity         1000 non-null   float64
 11  poster_path        997 non-null    object 
 12  vote_average       1000 non-null   float64
 13  vote_count         1000 non-null   int64  
dtypes: float64(2), int64(3), object(9)
memory usage: 109.5+ KB


### Combining TMDB anime datasets

In [13]:
csv_file_paths = ['zipped_data/anime_data_50.csv', 'zipped_data/anime_data_100.csv', 'zipped_data/anime_data_150.csv', 'zipped_data/anime_data_250.csv']

# Read each csv file into a dataframe
dataframes = [pd.read_csv(file_path) for file_path in csv_file_paths]


In [14]:
combined_anime_tmdb = pd.concat(dataframes, ignore_index=True)

In [15]:
# Convert list to string
combined_anime_tmdb['genre_ids'] = combined_anime_tmdb['genre_ids'].apply(lambda x: ','.join(map(str, x)) if isinstance(x, list) else x)

# Drop irrelevant columns
combined_anime_tmdb = combined_anime_tmdb.drop(columns=['origin_country', 'backdrop_path', 'poster_path', 'Unnamed: 0'])

# Remove duplicates
combined_anime_tmdb = combined_anime_tmdb.drop_duplicates()

In [16]:
combined_anime_tmdb.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3271 entries, 0 to 3270
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   first_air_date     3146 non-null   object 
 1   genre_ids          3271 non-null   object 
 2   id                 3271 non-null   int64  
 3   name               3271 non-null   object 
 4   original_language  3271 non-null   object 
 5   original_name      3271 non-null   object 
 6   overview           3120 non-null   object 
 7   popularity         3271 non-null   float64
 8   vote_average       3271 non-null   float64
 9   vote_count         3271 non-null   int64  
dtypes: float64(2), int64(2), object(6)
memory usage: 281.1+ KB


In [17]:
# Save to csv file
combined_anime_tmdb.to_csv('zipped_data/anime_tmbd_master.csv', index=False)

### Adding data from MyAnimeList

- animes.csv
- profiles.csv
- reviews.csv

In [3]:
zip_file_path = 'zipped_data/Archive.zip'

extract_path = 'zipped_data/extracted_csv_files'

if not os.path.exists(extract_path):
    os.makedirs(extract_path)

# Extract csv files
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    # Extract all csv files
    for file_name in zip_ref.namelist():
        if file_name.endswith('.csv'):
            zip_ref.extract(file_name, path=extract_path)


In [2]:
animes_df = pd.read_csv('zipped_data/extracted_csv_files/animes.csv')
animes_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19311 entries, 0 to 19310
Data columns (total 12 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   uid         19311 non-null  int64  
 1   title       19311 non-null  object 
 2   synopsis    18336 non-null  object 
 3   genre       19311 non-null  object 
 4   aired       19311 non-null  object 
 5   episodes    18605 non-null  float64
 6   members     19311 non-null  int64  
 7   popularity  19311 non-null  int64  
 8   ranked      16099 non-null  float64
 9   score       18732 non-null  float64
 10  img_url     19131 non-null  object 
 11  link        19311 non-null  object 
dtypes: float64(3), int64(3), object(6)
memory usage: 1.8+ MB


In [3]:
profiles_df = pd.read_csv('zipped_data/extracted_csv_files/profiles.csv')
profiles_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 81727 entries, 0 to 81726
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   profile          81727 non-null  object
 1   gender           53856 non-null  object
 2   birthday         46807 non-null  object
 3   favorites_anime  81727 non-null  object
 4   link             81727 non-null  object
dtypes: object(5)
memory usage: 3.1+ MB


In [4]:
reviews_df = pd.read_csv('zipped_data/extracted_csv_files/reviews.csv')
reviews_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 192136 entries, 0 to 192135
Data columns (total 10 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   uid         192134 non-null  object
 1   profile     192122 non-null  object
 2   anime_uid   192121 non-null  object
 3   score       192115 non-null  object
 4   scores      192113 non-null  object
 5   link        192028 non-null  object
 6   Unnamed: 6  3 non-null       object
 7   Unnamed: 7  2 non-null       object
 8   Unnamed: 8  1 non-null       object
 9   Unnamed: 9  1 non-null       object
dtypes: object(10)
memory usage: 14.7+ MB


### Splitting the dictionary key-value pairs from the 'scores' column into separate columns for readability

In [5]:
# Function to convert a string to a dictionary, return None if conversion fails
def safe_literal_eval(s):
    try:
        return ast.literal_eval(s)
    except:
        return None

# Apply function to the 'scores' column
reviews_df['scores'] = reviews_df['scores'].apply(lambda x: safe_literal_eval(x) if pd.notnull(x) else None)

# Drop rows where 'scores' is None
reviews_df = reviews_df.dropna(subset=['scores'])

# Split the 'scores' column into separate columns
scores_df = reviews_df['scores'].apply(pd.Series)

# Merge new columns back in
combined_reviews = pd.concat([reviews_df, scores_df], axis=1)

combined_reviews.head()


Unnamed: 0,uid,profile,anime_uid,score,scores,link,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Overall,Story,Animation,Sound,Character,Enjoyment
0,255938,DesolatePsyche,34096,8,"{'Overall': '8', 'Story': '8', 'Animation': '8...",https://myanimelist.net/reviews.php?id=255938,,,,,8,8,8,10,9,8
1,259117,baekbeans,34599,10,"{'Overall': '10', 'Story': '10', 'Animation': ...",https://myanimelist.net/reviews.php?id=259117,,,,,10,10,10,10,10,10
2,253664,skrn,28891,7,"{'Overall': '7', 'Story': '7', 'Animation': '9...",https://myanimelist.net/reviews.php?id=253664,,,,,7,7,9,8,8,8
3,8254,edgewalker00,2904,9,"{'Overall': '9', 'Story': '9', 'Animation': '9...",https://myanimelist.net/reviews.php?id=8254,,,,,9,9,9,10,10,9
4,291149,aManOfCulture99,4181,10,"{'Overall': '10', 'Story': '10', 'Animation': ...",https://myanimelist.net/reviews.php?id=291149,,,,,10,10,8,9,10,10


In [6]:
# Drop irrelevant/duplicate columns
columns_to_drop = ['link', 'score', 'Unnamed: 6', 'Unnamed: 7', 'Unnamed: 8', 'Unnamed: 9']
combined_reviews = combined_reviews.drop(columns=columns_to_drop, errors='ignore')

# Sanity check
combined_reviews['Overall'].value_counts().sum()

192107

### Joining MyAnimeList data files

In [7]:
# Merge reviews dataset with animes dataset on anime_uid
merged_reviews_animes = pd.merge(combined_reviews, animes_df, left_on='anime_uid', right_on='uid', suffixes=('_review', '_anime'))

# Merge combined dataset with the profiles dataset on profile
myanimelist_combined = pd.merge(merged_reviews_animes, profiles_df, on='profile')

myanimelist_combined.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 588437 entries, 0 to 588436
Data columns (total 26 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   uid_review       588437 non-null  object 
 1   profile          588437 non-null  object 
 2   anime_uid        588437 non-null  object 
 3   scores           588437 non-null  object 
 4   Overall          588437 non-null  object 
 5   Story            588437 non-null  object 
 6   Animation        588437 non-null  object 
 7   Sound            588437 non-null  object 
 8   Character        588437 non-null  object 
 9   Enjoyment        588437 non-null  object 
 10  uid_anime        588437 non-null  int64  
 11  title            588437 non-null  object 
 12  synopsis         586629 non-null  object 
 13  genre            588437 non-null  object 
 14  aired            588437 non-null  object 
 15  episodes         583455 non-null  float64
 16  members          588437 non-null  int6

### Handling missing values

In [8]:
# Drop null values from 'birthday' and 'gender' columns
myanimelist_combined = myanimelist_combined.dropna(subset=['birthday', 'gender'])
myanimelist_combined.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 349877 entries, 0 to 588433
Data columns (total 26 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   uid_review       349877 non-null  object 
 1   profile          349877 non-null  object 
 2   anime_uid        349877 non-null  object 
 3   scores           349877 non-null  object 
 4   Overall          349877 non-null  object 
 5   Story            349877 non-null  object 
 6   Animation        349877 non-null  object 
 7   Sound            349877 non-null  object 
 8   Character        349877 non-null  object 
 9   Enjoyment        349877 non-null  object 
 10  uid_anime        349877 non-null  int64  
 11  title            349877 non-null  object 
 12  synopsis         349036 non-null  object 
 13  genre            349877 non-null  object 
 14  aired            349877 non-null  object 
 15  episodes         346915 non-null  float64
 16  members          349877 non-null  int6

In [14]:
# Save to csv
myanimelist_combined.to_csv('zipped_data/myanimelist_data.csv', index=False)

### Checking matching values on additional IMDb dataset

In [18]:
imdb_anime_df = pd.read_csv('zipped_data/imdb_anime.csv')

In [19]:
matches = imdb_anime_df['Title'].isin(myanimelist_combined['title'])
exists = matches.any()

print(f"Are there any matching titles? {exists}")
if exists:
    print("Matching titles:", imdb_anime_df['Title'][matches])


Are there any matching titles? True
Matching titles: 0                              One Piece
8                                 Bleach
14                                Naruto
16                          Vinland Saga
20                       Hunter x Hunter
                      ...               
45669                     Mob Psycho 100
45692                          RobiHachi
45694                     Mob Psycho 100
45702    Boruto: Naruto Next Generations
45714                          One Piece
Name: Title, Length: 7479, dtype: object


In [20]:
# Find matching titles
matches = imdb_anime_df['Title'].isin(['title'])

# Count the number of matching records
num_matching_titles = matches.sum()

print(f"Number of matching titles: {num_matching_titles}")

Number of matching titles: 0


### Identifying non-matching values between IMDb & MyAnimeList datasets

In [22]:
non_matches = imdb_anime_df[~imdb_anime_df['Title'].isin(myanimelist_combined['title'])]
non_matches.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 38238 entries, 1 to 45716
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Title            38238 non-null  object
 1   Genre            38238 non-null  object
 2   User Rating      18878 non-null  object
 3   Number of Votes  18878 non-null  object
 4   Runtime          26701 non-null  object
 5   Year             38113 non-null  object
 6   Summary          17616 non-null  object
 7   Stars            11046 non-null  object
 8   Certificate      22411 non-null  object
 9   Metascore        314 non-null    object
 10  Gross            18878 non-null  object
 11  Episode          38238 non-null  object
 12  Episode Title    28335 non-null  object
dtypes: object(13)
memory usage: 4.1+ MB


### API Pull Request to Identify True Anime Records in Remaining IMDb dataset

In [48]:
def is_anime(title, api_key):
    url = 'https://api.myanimelist.net/v2/anime'
    headers = {'Authorization': f'Bearer {api_key}'}
    params = {'q': title, 'limit': 1}

    try:
        response = requests.get(url, headers=headers, params=params)
        if response.status_code == 200:
            return True
        else:
            return None
    except Exception as e:
        print(f"Error occurred while fetching data for {title}: {e}")
        return None

api_key = '8547440239b36521577596c71a7c7be4'

# Iterate over each title in the DataFrame and check if it's an anime
non_matches['is_anime'] = non_matches['Title'].apply(lambda x: is_anime(x, api_key))

# Pause between requests to adhere to rate limits
time.sleep(1)

# Filtering to find titles identified as anime
anime_titles_df = non_matches[non_matches['is_anime'] == True]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  non_matches['is_anime'] = non_matches['Title'].apply(lambda x: is_anime(x, api_key))


In [50]:
anime_titles_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 0 entries
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Title            0 non-null      object
 1   Genre            0 non-null      object
 2   User Rating      0 non-null      object
 3   Number of Votes  0 non-null      object
 4   Runtime          0 non-null      object
 5   Year             0 non-null      object
 6   Summary          0 non-null      object
 7   Stars            0 non-null      object
 8   Certificate      0 non-null      object
 9   Metascore        0 non-null      object
 10  Gross            0 non-null      object
 11  Episode          0 non-null      object
 12  Episode Title    0 non-null      object
 13  is_anime         0 non-null      object
dtypes: object(14)
memory usage: 0.0+ bytes


#### Unable to confirm the remaining 26k+ records in IMDb as Anime. Therefore, will continue with the combined MyAnimeList datasets for our master dataset.

### Creating Master Dataset

In [47]:
# Save to csv
master_data = myanimelist_combined
master_data.to_csv('zipped_data/master_anime_data.csv', index=False)