In [3]:
import requests
import pandas as pd
import time
import json



### TMDB API Pull with Anime Keyword

In [89]:
def get_anime_info(api_key, keyword_id, genre_id, start_page=1, end_page=5000):
    all_anime = []
    for page in range(start_page, end_page + 1):
        url = "https://api.themoviedb.org/3/discover/tv"
        params = {
            'api_key': api_key,
            'with_genres': genre_id,
            'with_keywords': keyword_id,
            'language': 'en-US',
            'sort_by': 'popularity.desc',
            'page': page
        }
        response = requests.get(url, params=params)
        if response.status_code == 200:
            all_anime.extend(response.json().get('results', []))
            if page % 50 == 0:
                pd.DataFrame(all_anime).to_csv(f'anime_data_{page}.csv')
                all_anime = []
        else:
            break
        time.sleep(0.5)
    return all_anime

api_key = '2c818416a26da0b84f6a14364b6aa0eb'

# Get the keyword ID for 'anime'
anime_keyword_id = get_keyword_id(api_key, 'anime')

# Animation genre ID (also used for anime)
genre_id = 16

if anime_keyword_id:
    anime_info = get_anime_info(api_key, anime_keyword_id, genre_id, start_page=1, end_page=5000)
    if anime_info:
        df_anime = pd.DataFrame(anime_info)
        df_anime.to_csv('final_anime_data.csv')
        print("Data fetching complete.")
    else:
        print("No anime information found")
else:
    print("Anime keyword ID not found")


No anime information found


In [5]:
df_anime = pd.read_csv('data/anime_data_tmdb.csv')
df_anime.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 271 entries, 0 to 270
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Unnamed: 0         271 non-null    int64  
 1   backdrop_path      151 non-null    object 
 2   genre_ids          271 non-null    object 
 3   id                 271 non-null    int64  
 4   name               271 non-null    object 
 5   origin_country     271 non-null    object 
 6   original_language  271 non-null    object 
 7   original_name      271 non-null    object 
 8   overview           220 non-null    object 
 9   popularity         271 non-null    float64
 10  poster_path        254 non-null    object 
 11  vote_average       271 non-null    float64
 12  vote_count         271 non-null    int64  
 13  first_air_date     179 non-null    object 
dtypes: float64(2), int64(3), object(9)
memory usage: 29.8+ KB


In [6]:
df_anime_1000 = pd.read_csv('data/anime_data_100.csv')
df_anime_1000.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Unnamed: 0         1000 non-null   int64  
 1   backdrop_path      983 non-null    object 
 2   first_air_date     999 non-null    object 
 3   genre_ids          1000 non-null   object 
 4   id                 1000 non-null   int64  
 5   name               1000 non-null   object 
 6   origin_country     1000 non-null   object 
 7   original_language  1000 non-null   object 
 8   original_name      1000 non-null   object 
 9   overview           981 non-null    object 
 10  popularity         1000 non-null   float64
 11  poster_path        1000 non-null   object 
 12  vote_average       1000 non-null   float64
 13  vote_count         1000 non-null   int64  
dtypes: float64(2), int64(3), object(9)
memory usage: 109.5+ KB


In [7]:
df_anime_1500 = pd.read_csv('data/anime_data_150.csv')
df_anime_1500.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Unnamed: 0         1000 non-null   int64  
 1   backdrop_path      905 non-null    object 
 2   first_air_date     969 non-null    object 
 3   genre_ids          1000 non-null   object 
 4   id                 1000 non-null   int64  
 5   name               1000 non-null   object 
 6   origin_country     1000 non-null   object 
 7   original_language  1000 non-null   object 
 8   original_name      1000 non-null   object 
 9   overview           921 non-null    object 
 10  popularity         1000 non-null   float64
 11  poster_path        997 non-null    object 
 12  vote_average       1000 non-null   float64
 13  vote_count         1000 non-null   int64  
dtypes: float64(2), int64(3), object(9)
memory usage: 109.5+ KB


### Combining TMDB anime datasets

In [13]:
csv_file_paths = ['data/anime_data_50.csv', 'data/anime_data_100.csv', 'data/anime_data_150.csv', 'data/anime_data_250.csv']

# Read each csv file into a dataframe
dataframes = [pd.read_csv(file_path) for file_path in csv_file_paths]


In [14]:
combined_anime_tmdb = pd.concat(dataframes, ignore_index=True)

In [15]:
# Convert list to string
combined_anime_tmdb['genre_ids'] = combined_anime_tmdb['genre_ids'].apply(lambda x: ','.join(map(str, x)) if isinstance(x, list) else x)

# Drop irrelevant columns
combined_anime_tmdb = combined_anime_tmdb.drop(columns=['origin_country', 'backdrop_path', 'poster_path', 'Unnamed: 0'])

# Remove duplicates
combined_anime_tmdb = combined_anime_tmdb.drop_duplicates()

In [16]:
combined_anime_tmdb.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3271 entries, 0 to 3270
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   first_air_date     3146 non-null   object 
 1   genre_ids          3271 non-null   object 
 2   id                 3271 non-null   int64  
 3   name               3271 non-null   object 
 4   original_language  3271 non-null   object 
 5   original_name      3271 non-null   object 
 6   overview           3120 non-null   object 
 7   popularity         3271 non-null   float64
 8   vote_average       3271 non-null   float64
 9   vote_count         3271 non-null   int64  
dtypes: float64(2), int64(2), object(6)
memory usage: 281.1+ KB


In [17]:
# Save to csv file
combined_anime_tmdb.to_csv('data/anime_tmbd_master.csv', index=False)

### Adding data from MyAnimeList

In [18]:
import zipfile
import os

zip_file_path = 'data/myanimelist.zip'

extract_path = 'extracted_csv_files'

if not os.path.exists(extract_path):
    os.makedirs(extract_path)

# Extract csv files
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    # Extract all csv files
    for file_name in zip_ref.namelist():
        if file_name.endswith('.csv'):
            zip_ref.extract(file_name, path=extract_path)


In [20]:
animes_df = pd.read_csv('data/extracted_csv_files/animes.csv')
animes_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19311 entries, 0 to 19310
Data columns (total 12 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   uid         19311 non-null  int64  
 1   title       19311 non-null  object 
 2   synopsis    18336 non-null  object 
 3   genre       19311 non-null  object 
 4   aired       19311 non-null  object 
 5   episodes    18605 non-null  float64
 6   members     19311 non-null  int64  
 7   popularity  19311 non-null  int64  
 8   ranked      16099 non-null  float64
 9   score       18732 non-null  float64
 10  img_url     19131 non-null  object 
 11  link        19311 non-null  object 
dtypes: float64(3), int64(3), object(6)
memory usage: 1.8+ MB


In [21]:
profiles_df = pd.read_csv('data/extracted_csv_files/profiles.csv')
profiles_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 81727 entries, 0 to 81726
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   profile          81727 non-null  object
 1   gender           53856 non-null  object
 2   birthday         46807 non-null  object
 3   favorites_anime  81727 non-null  object
 4   link             81727 non-null  object
dtypes: object(5)
memory usage: 3.1+ MB


In [22]:
reviews_df = pd.read_csv('data/extracted_csv_files/reviews.csv')
reviews_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 192112 entries, 0 to 192111
Data columns (total 7 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   uid        192112 non-null  int64 
 1   profile    192112 non-null  object
 2   anime_uid  192112 non-null  int64 
 3   text       192112 non-null  object
 4   score      192112 non-null  int64 
 5   scores     192112 non-null  object
 6   link       192112 non-null  object
dtypes: int64(3), object(4)
memory usage: 10.3+ MB


### Joining MyAnimeList data files

In [23]:
# Join anime_df and reviews_df
myanimelist_merged = animes_df.merge(reviews_df, left_on='uid', right_on='anime_uid', how='outer')

# Join the result with profiles_df
myanimelist_df = myanimelist_merged.merge(profiles_df, on='profile', how='outer')


In [153]:
myanimelist_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 820627 entries, 0 to 820626
Data columns (total 23 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   uid_x            820627 non-null  int64  
 1   title            820627 non-null  object 
 2   synopsis         817619 non-null  object 
 3   genre            820627 non-null  object 
 4   aired            820627 non-null  object 
 5   episodes         814581 non-null  float64
 6   members          820627 non-null  int64  
 7   popularity       820627 non-null  int64  
 8   ranked           787639 non-null  float64
 9   score_x          820048 non-null  float64
 10  img_url          820349 non-null  object 
 11  link_x           820627 non-null  object 
 12  uid_y            811636 non-null  float64
 13  profile          811636 non-null  object 
 14  anime_uid        811636 non-null  float64
 15  text             811636 non-null  object 
 16  score_y          811636 non-null  floa

In [154]:
myanimelist_df.head()

Unnamed: 0,uid_x,title,synopsis,genre,aired,episodes,members,popularity,ranked,score_x,...,profile,anime_uid,text,score_y,scores,link_y,gender,birthday,favorites_anime,link
0,28891,Haikyuu!! Second Season,Following their participation at the Inter-Hig...,"['Comedy', 'Sports', 'Drama', 'School', 'Shoun...","Oct 4, 2015 to Mar 27, 2016",25.0,489888,141,25.0,8.82,...,skrn,28891.0,\n \n \n \n ...,7.0,"{'Overall': '7', 'Story': '7', 'Animation': '9...",https://myanimelist.net/reviews.php?id=253664,,,"['918', '2904', '11741', '17074', '23273', '32...",https://myanimelist.net/profile/skrn
1,28891,Haikyuu!! Second Season,Following their participation at the Inter-Hig...,"['Comedy', 'Sports', 'Drama', 'School', 'Shoun...","Oct 4, 2015 to Mar 27, 2016",25.0,489888,141,25.0,8.82,...,skrn,28891.0,\n \n \n \n ...,7.0,"{'Overall': '7', 'Story': '7', 'Animation': '9...",https://myanimelist.net/reviews.php?id=253664,,,"['918', '2904', '11741', '17074', '23273', '32...",https://myanimelist.net/profile/skrn
2,28891,Haikyuu!! Second Season,Following their participation at the Inter-Hig...,"['Comedy', 'Sports', 'Drama', 'School', 'Shoun...","Oct 4, 2015 to Mar 27, 2016",25.0,489888,141,25.0,8.82,...,skrn,28891.0,\n \n \n \n ...,7.0,"{'Overall': '7', 'Story': '7', 'Animation': '9...",https://myanimelist.net/reviews.php?id=253664,,,"['918', '2904', '11741', '17074', '23273', '32...",https://myanimelist.net/profile/skrn
3,28891,Haikyuu!! Second Season,Following their participation at the Inter-Hig...,"['Comedy', 'Sports', 'Drama', 'School', 'Shoun...","Oct 4, 2015 to Mar 27, 2016",25.0,489888,141,25.0,8.82,...,skrn,28891.0,\n \n \n \n ...,7.0,"{'Overall': '7', 'Story': '7', 'Animation': '9...",https://myanimelist.net/reviews.php?id=253664,,,"['918', '2904', '11741', '17074', '23273', '32...",https://myanimelist.net/profile/skrn
4,28891,Haikyuu!! Second Season,Following their participation at the Inter-Hig...,"['Comedy', 'Sports', 'Drama', 'School', 'Shoun...","Oct 4, 2015 to Mar 27, 2016",25.0,489888,141,25.0,8.82,...,skrn,28891.0,\n \n \n \n ...,7.0,"{'Overall': '7', 'Story': '7', 'Animation': '9...",https://myanimelist.net/reviews.php?id=253664,,,"['918', '2904', '11741', '17074', '23273', '32...",https://myanimelist.net/profile/skrn


In [24]:
# Save to csv
myanimelist_df.to_csv('data/myanimelist_data.csv', index=False)

### Checking matching values on additional MyAnimeList dataset

In [34]:
top_anime_df = pd.read_csv('data/Anime_Top10000.csv')
top_anime_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Anime_Name       10000 non-null  object 
 1   Anime_Episodes   10000 non-null  object 
 2   Anime_Air_Years  10000 non-null  object 
 3   Anime_Rating     10000 non-null  float64
 4   Synopsis         10000 non-null  object 
dtypes: float64(1), object(4)
memory usage: 390.8+ KB


In [35]:
matches = top_anime_df['Anime_Name'].isin(myanimelist_df['title'])

# Count the number of matching records
num_matching_records = matches.sum()

print(f"Number of matching records: {num_matching_records}")


Number of matching records: 9342


### Identify matching columns from both MyAnimeList datasets

In [36]:
import re

def extract_number_to_float(s):
    # Use regular expression to find numbers in the string
    matches = re.findall(r'\d+', str(s))
    if matches:
        # Convert the first found number to float
        return float(matches[0])
    else:
        # Return NaN if no number is found
        return float('nan')

# Apply the function to the entire column
top_anime_df['Anime_Episodes'] = top_anime_df['Anime_Episodes'].apply(extract_number_to_float)

In [37]:
top_anime_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Anime_Name       10000 non-null  object 
 1   Anime_Episodes   9950 non-null   float64
 2   Anime_Air_Years  10000 non-null  object 
 3   Anime_Rating     10000 non-null  float64
 4   Synopsis         10000 non-null  object 
dtypes: float64(2), object(3)
memory usage: 390.8+ KB


### Merging MyAnimeList datasets

In [38]:
top_anime_columns = ['Anime_Name', 'Anime_Episodes', 'Anime_Air_Years', 'Anime_Rating', 'Synopsis']
myanimelist_columns = ['title', 'episodes', 'aired', 'score_x', 'synopsis']

# Merge data
merged_animes_df = top_anime_df.merge(myanimelist_df, left_on=top_anime_columns, right_on=myanimelist_columns, how='outer', indicator=True)

# Filter out non-matching records from top_anime_df
non_matching_top_anime = merged_animes_df[merged_animes_df['_merge'] == 'left_only']


In [39]:
# Keep only columns from top_anime_df
non_matching_top_anime = non_matching_top_anime[top_anime_df.columns]

# Concatenate with myanimelist_df
anime_master_df = pd.concat([myanimelist_df, non_matching_top_anime], ignore_index=True)

### Combining identical columns

In [40]:
anime_master_df['title_master'] = anime_master_df['title'].combine_first(anime_master_df['Anime_Name'])
anime_master_df['synopsis_master'] = anime_master_df['synopsis'].combine_first(anime_master_df['Synopsis'])
anime_master_df['num_episodes'] = anime_master_df['episodes'].combine_first(anime_master_df['Anime_Episodes'])
anime_master_df['air_date'] = anime_master_df['aired'].combine_first(anime_master_df['Anime_Air_Years'])
anime_master_df['ratings'] = anime_master_df['score_x'].combine_first(anime_master_df['Anime_Rating'])


### Dropping unnecessary and duplicate columns

In [41]:
cleaned_anime_df = anime_master_df.drop(['Anime_Name', 'Anime_Episodes', 
                      'Anime_Air_Years', 'Anime_Rating',
                      'title', 'score_x', 'aired', 'link_y', 'link_x', 
                      'link', 'members', 'uid_y', 'uid_x', 'Synopsis', 
                      'score_y', 'anime_uid', 'ranked'], axis=1)


In [42]:
cleaned_anime_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 830627 entries, 0 to 830626
Data columns (total 16 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   synopsis         817619 non-null  object 
 1   genre            820627 non-null  object 
 2   episodes         814581 non-null  float64
 3   popularity       820627 non-null  float64
 4   img_url          820349 non-null  object 
 5   profile          811636 non-null  object 
 6   text             811636 non-null  object 
 7   scores           811636 non-null  object 
 8   gender           600087 non-null  object 
 9   birthday         523667 non-null  object 
 10  favorites_anime  811636 non-null  object 
 11  title_master     830627 non-null  object 
 12  synopsis_master  827619 non-null  object 
 13  num_episodes     824531 non-null  float64
 14  air_date         830627 non-null  object 
 15  ratings          830048 non-null  float64
dtypes: float64(4), object(12)
memory usage

In [43]:
cleaned_anime_df.to_csv('data/anime_master_data.csv', index=False)

### Checking matching values on additional IMDb dataset

In [44]:
imdb_anime_df = pd.read_csv('data/imdb_anime.csv')

In [45]:
matches = imdb_anime_df['Title'].isin(cleaned_anime_df['title_master'])
exists = matches.any()

print(f"Are there any matching titles? {exists}")
if exists:
    print("Matching titles:", imdb_anime_df['Title'][matches])


Are there any matching titles? True
Matching titles: 0                              One Piece
4                         Jujutsu Kaisen
8                                 Bleach
14                                Naruto
16                          Vinland Saga
                      ...               
45693                       Juuni Taisen
45694                     Mob Psycho 100
45702    Boruto: Naruto Next Generations
45714                          One Piece
45715                      Marmalade Boy
Name: Title, Length: 18958, dtype: object


In [46]:
# Find matching titles
matches = imdb_anime_df['Title'].isin(cleaned_anime_df['title_master'])

# Count the number of matching records
num_matching_titles = matches.sum()

print(f"Number of matching titles: {num_matching_titles}")

Number of matching titles: 18958


### Identifying non-matching values between IMDb & MyAnimeList datasets

In [47]:
non_matches = imdb_anime_df[~imdb_anime_df['Title'].isin(cleaned_anime_df['title_master'])]
non_matches.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 26759 entries, 1 to 45716
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Title            26759 non-null  object
 1   Genre            26759 non-null  object
 2   User Rating      15438 non-null  object
 3   Number of Votes  15438 non-null  object
 4   Runtime          20922 non-null  object
 5   Year             26640 non-null  object
 6   Summary          14815 non-null  object
 7   Stars            6844 non-null   object
 8   Certificate      13946 non-null  object
 9   Metascore        300 non-null    object
 10  Gross            15438 non-null  object
 11  Episode          26759 non-null  object
 12  Episode Title    18300 non-null  object
dtypes: object(13)
memory usage: 2.9+ MB


### API Pull Request to Identify True Anime Records in Remaining IMDb dataset

In [48]:
def is_anime(title, api_key):
    url = 'https://api.myanimelist.net/v2/anime'
    headers = {'Authorization': f'Bearer {api_key}'}
    params = {'q': title, 'limit': 1}

    try:
        response = requests.get(url, headers=headers, params=params)
        if response.status_code == 200:
            return True
        else:
            return None
    except Exception as e:
        print(f"Error occurred while fetching data for {title}: {e}")
        return None

api_key = '8547440239b36521577596c71a7c7be4'

# Iterate over each title in the DataFrame and check if it's an anime
non_matches['is_anime'] = non_matches['Title'].apply(lambda x: is_anime(x, api_key))

# Pause between requests to adhere to rate limits
time.sleep(1)

# Filtering to find titles identified as anime
anime_titles_df = non_matches[non_matches['is_anime'] == True]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  non_matches['is_anime'] = non_matches['Title'].apply(lambda x: is_anime(x, api_key))


In [50]:
anime_titles_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 0 entries
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Title            0 non-null      object
 1   Genre            0 non-null      object
 2   User Rating      0 non-null      object
 3   Number of Votes  0 non-null      object
 4   Runtime          0 non-null      object
 5   Year             0 non-null      object
 6   Summary          0 non-null      object
 7   Stars            0 non-null      object
 8   Certificate      0 non-null      object
 9   Metascore        0 non-null      object
 10  Gross            0 non-null      object
 11  Episode          0 non-null      object
 12  Episode Title    0 non-null      object
 13  is_anime         0 non-null      object
dtypes: object(14)
memory usage: 0.0+ bytes


#### Unable to confirm the remaining 26k+ records in IMDb as Anime. Therefore, will continue with the combine TMDB & MyAnimeList datasets for our master dataset.