In [1]:
# Anime Recommender System using AniList API
# This notebook builds a content-based anime recommender system using data from the AniList GraphQL API.

!pip install -q requests beautifulsoup4 fuzzywuzzy python-Levenshtein
import requests
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
from bs4 import BeautifulSoup
from fuzzywuzzy import process
tqdm.pandas()

def clean_description(text):
    """Return ``text`` with markup stripped, or ``''`` when ``text`` is falsy.

    Non-empty input is parsed with BeautifulSoup's ``'html.parser'`` and the
    result of ``get_text()`` is returned.
    """
    return BeautifulSoup(text, 'html.parser').get_text() if text else ''

def fetch_anime_page(page=1, per_page=50, timeout=30):
    """Fetch one page of anime from the AniList GraphQL endpoint.

    Results are requested with ``type: ANIME`` sorted by ``POPULARITY_DESC``.

    Args:
        page: Page number, sent as the ``$page`` GraphQL variable.
        per_page: Page size, sent as the ``$perPage`` GraphQL variable.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, so one stalled
            connection would hang the whole crawl.

    Returns:
        The decoded JSON response as a dict. An empty dict is returned (and
        a message printed) when the request fails, times out, comes back
        with an HTTP error status, or the body is not valid JSON.
    """
    url = 'https://graphql.anilist.co'
    query = '''
    query ($page: Int, $perPage: Int) {
      Page(page: $page, perPage: $perPage) {
        pageInfo {
          total
          currentPage
          lastPage
          hasNextPage
        }
        media(type: ANIME, sort: POPULARITY_DESC) {
          id
          title {
            romaji
            english
            native
          }
          trailer {
            id
            site
          }
          description(asHtml: false)
          genres
          tags {
            name
            rank
            isGeneralSpoiler
          }
          averageScore
          popularity
          episodes
          duration
          startDate {
            year
            month
            day
          }
          coverImage {
            large
          }
          studios(isMain: true) {
            nodes {
              name
            }
          }
        }
      }
    }
    '''
    variables = {'page': page, 'perPage': per_page}
    try:
        response = requests.post(
            url,
            json={'query': query, 'variables': variables},
            timeout=timeout,
        )
        response.raise_for_status()
        return response.json()
    # ValueError covers a non-JSON body on requests versions whose decode
    # error does not derive from RequestException.
    except (requests.RequestException, ValueError) as e:
        print(f"Error fetching page {page}: {e}")
        return {}

# Crawl up to `max_pages` pages of the most popular anime and flatten each
# record into one dict per title.
anime_list = []
max_pages = 20

for page in tqdm(range(1, max_pages + 1)):
    data = fetch_anime_page(page)
    # fetch_anime_page returns {} on failure; a response can also carry
    # `data: null`. Stop cleanly in both cases instead of raising.
    page_data = (data.get('data') or {}).get('Page')
    if not page_data:
        break
    media = page_data.get('media') or []
    for anime in media:
        # Nested objects may come back as null, so fall back to empty
        # containers rather than indexing into None.
        titles = anime.get('title') or {}
        start = anime.get('startDate') or {}
        trailer = anime.get('trailer') or {}
        cover = anime.get('coverImage') or {}
        studio_nodes = (anime.get('studios') or {}).get('nodes') or []
        year = start.get('year')
        anime_list.append({
            'id': anime['id'],
            'title': titles.get('english') or titles.get('romaji'),
            'romaji': titles.get('romaji'),
            'native': titles.get('native'),
            'description': clean_description(anime.get('description')),
            'genres': ', '.join(anime.get('genres') or []),
            'tags': ', '.join([tag['name'] for tag in anime.get('tags') or [] if not tag['isGeneralSpoiler']]),
            'average_score': anime.get('averageScore'),
            'popularity': anime.get('popularity'),
            'episodes': anime.get('episodes'),
            'duration': anime.get('duration'),
            # Unknown year -> None instead of the string "None-1-1";
            # missing month/day still default to 1.
            'start_date': f"{year}-{start.get('month') or 1}-{start.get('day') or 1}" if year else None,
            'trailer_url': f"https://www.youtube.com/watch?v={trailer['id']}" if trailer.get('site') == 'youtube' else None,
            'cover_image': cover.get('large'),
            'studios': ', '.join([studio['name'] for studio in studio_nodes]),
            'anilist_url': f"https://anilist.co/anime/{anime['id']}"
        })
    # Do not keep requesting pages once the API reports there are none left.
    if not (page_data.get('pageInfo') or {}).get('hasNextPage', True):
        break

# Build the metadata frame. Rows without a title are dropped and the index is
# reset so that index labels equal row positions: `similarity` below is a
# plain positional matrix, and a gap left by dropna would misalign label-based
# lookups against it.
anime_df = (
    pd.DataFrame(anime_list)
    .dropna(subset=['title'])
    .reset_index(drop=True)
    .assign(
        # One text blob per title: genres, tags and the cleaned description.
        combined_features=lambda frame: frame['genres'] + ' ' + frame['tags'] + ' ' + frame['description'].fillna('')
    )
)

# TF-IDF over the combined text (English stop words removed), then cosine
# similarity between every pair of rows.
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(anime_df['combined_features'])
similarity = cosine_similarity(tfidf_matrix)

def recommend_anime(title, top_n=10):
    """Print the ``top_n`` titles most similar to the one matching ``title``.

    The query is lower-cased and fuzzy-matched against the lower-cased
    ``anime_df['title']`` column. A match scoring below 80 is treated as
    "not found" and only a suggestion is printed.

    Args:
        title: Free-text title to look up.
        top_n: Maximum number of recommendations to print.

    Returns:
        None. Results are written to stdout.
    """
    title = title.lower()
    titles = anime_df['title'].str.lower().tolist()
    # extractOne returns None when there is nothing to match against.
    best_match = process.extractOne(title, titles) if titles else None
    if best_match is None:
        print(f"Anime '{title}' not found.")
        return
    if best_match[1] < 80:
        print(f"Anime '{title}' not found. Did you mean '{best_match[0]}'?")
        return

    # Work with the row *position*: `similarity` is indexed positionally, so
    # using the DataFrame index label here breaks whenever labels have gaps.
    pos = titles.index(best_match[0])

    # Exclude the query row explicitly instead of assuming it sorts first;
    # ties or an all-zero feature row would otherwise let it slip into the
    # results while a genuine recommendation is discarded.
    ranked = sorted(
        ((i, score) for i, score in enumerate(similarity[pos]) if i != pos),
        key=lambda pair: pair[1],
        reverse=True,
    )[:top_n]

    print(f"Top {top_n} recommendations for '{anime_df['title'].iloc[pos]}':\n")
    for i, score in ranked:
        print(f"{anime_df['title'].iloc[i]} (Score: {score:.2f})")

# Example
recommend_anime("Naruto")

import pickle

# Persist the metadata frame and the similarity matrix, one pickle file each.
for out_path, artifact in (("anime_data.pkl", anime_df), ("anime_similarity.pkl", similarity)):
    with open(out_path, "wb") as f:
        pickle.dump(artifact, f)

100%|█████████████████████████| 20/20 [00:32<00:00,  1.64s/it]


Top 10 recommendations for 'Naruto':

Road to Ninja: Naruto the Movie (Score: 0.39)
Naruto: Shippuden (Score: 0.31)
Boruto: Naruto Next Generations (Score: 0.29)
Boruto: Naruto the Movie (Score: 0.19)
The Last: Naruto the Movie (Score: 0.13)
Solo Leveling Season 2 -Arise from the Shadow- (Score: 0.12)
Fruits Basket (2019) (Score: 0.10)
Gurren Lagann (Score: 0.10)
Ninja Kamui (Score: 0.10)
Demon Slayer -Kimetsu no Yaiba- The Movie: Mugen Train (Score: 0.09)


In [2]:
# Show the column names of the assembled frame as a quick schema check.
print(anime_df.columns)


Index(['id', 'title', 'romaji', 'native', 'description', 'genres', 'tags',
       'average_score', 'popularity', 'episodes', 'duration', 'start_date',
       'trailer_url', 'cover_image', 'studios', 'anilist_url',
       'combined_features'],
      dtype='object')


In [3]:

import requests
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
from bs4 import BeautifulSoup
from fuzzywuzzy import process

def clean_description(text):
    """Strip markup from ``text`` and return the plain text.

    Falsy input (``None``, ``''``) yields ``''``; anything else is parsed
    with BeautifulSoup's ``'html.parser'`` and flattened via ``get_text()``.
    """
    plain = ''
    if text:
        plain = BeautifulSoup(text, 'html.parser').get_text()
    return plain

def fetch_anime_page(page=1, per_page=50, timeout=30):
    """Fetch one page of anime from the AniList GraphQL endpoint.

    Results are requested with ``type: ANIME`` sorted by ``POPULARITY_DESC``.

    Args:
        page: Page number, sent as the ``$page`` GraphQL variable.
        per_page: Page size, sent as the ``$perPage`` GraphQL variable.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, so one stalled
            connection would hang the whole crawl.

    Returns:
        The decoded JSON response as a dict. An empty dict is returned (and
        a message printed) when the request fails, times out, comes back
        with an HTTP error status, or the body is not valid JSON.
    """
    url = 'https://graphql.anilist.co'
    query = '''
    query ($page: Int, $perPage: Int) {
      Page(page: $page, perPage: $perPage) {
        pageInfo {
          total
          currentPage
          lastPage
          hasNextPage
        }
        media(type: ANIME, sort: POPULARITY_DESC) {
          id
          title {
            romaji
            english
            native
          }
          trailer {
            id
            site
          }
          description(asHtml: false)
          genres
          tags {
            name
            isGeneralSpoiler
          }
          averageScore
          popularity
          episodes
          duration
          startDate {
            year
            month
            day
          }
          coverImage {
            large
          }
          studios(isMain: true) {
            nodes {
              name
            }
          }
        }
      }
    }
    '''
    variables = {'page': page, 'perPage': per_page}
    try:
        response = requests.post(
            url,
            json={'query': query, 'variables': variables},
            timeout=timeout,
        )
        response.raise_for_status()
        return response.json()
    # ValueError covers a non-JSON body on requests versions whose decode
    # error does not derive from RequestException.
    except (requests.RequestException, ValueError) as e:
        print(f"Error fetching page {page}: {e}")
        return {}

# Crawl up to `max_pages` pages of the most popular anime and flatten each
# record into one dict per title, skipping records that have no title at all.
anime_list = []
max_pages = 20
for page in tqdm(range(1, max_pages + 1)):
    data = fetch_anime_page(page)
    # fetch_anime_page returns {} on failure; a response can also carry
    # `data: null`. Stop cleanly in both cases instead of raising.
    page_data = (data.get('data') or {}).get('Page')
    if not page_data:
        break
    media = page_data.get('media') or []
    for anime in media:
        # The API returns requested keys with an explicit null when a value
        # is unknown, so `.get(key, default)` never falls back to its
        # default. Use `or` to substitute the fallback for null as well.
        titles = anime.get('title') or {}
        title = titles.get('english') or titles.get('romaji') or titles.get('native')
        if not title:
            continue
        start = anime.get('startDate') or {}
        trailer = anime.get('trailer') or {}
        cover = anime.get('coverImage') or {}
        studio_nodes = (anime.get('studios') or {}).get('nodes') or []
        year = start.get('year')
        anime_list.append({
            'id': anime['id'],
            'title': title,
            'romaji': titles.get('romaji'),
            'native': titles.get('native'),
            'description': clean_description(anime.get('description')),
            'genres': ', '.join(anime.get('genres') or []),
            'tags': ', '.join([tag['name'] for tag in anime.get('tags') or [] if not tag.get('isGeneralSpoiler')]),
            'average_score': anime.get('averageScore'),
            'popularity': anime.get('popularity'),
            'episodes': anime.get('episodes'),
            'duration': anime.get('duration'),
            # Unknown year -> None rather than a malformed date string;
            # missing month/day default to 1.
            'start_date': f"{year}-{start.get('month') or 1}-{start.get('day') or 1}" if year else None,
            'trailer_url': f"https://www.youtube.com/watch?v={trailer['id']}" if trailer.get('site') == 'youtube' else None,
            'cover_image': cover.get('large'),
            'studios': ', '.join([studio['name'] for studio in studio_nodes]),
            'anilist_url': f"https://anilist.co/anime/{anime['id']}"
        })
    # Do not keep requesting pages once the API reports there are none left.
    if not (page_data.get('pageInfo') or {}).get('hasNextPage', True):
        break

# Assemble the metadata frame in one chain: drop untitled rows, then derive
# the text blob used for vectorisation and a lower-cased title for matching.
anime_df = (
    pd.DataFrame(anime_list)
    .dropna(subset=['title'])
    .assign(
        combined_features=lambda frame: frame['genres'] + ' ' + frame['tags'] + ' ' + frame['description'].fillna(''),
        title_lower=lambda frame: frame['title'].str.lower(),
    )
)


100%|█████████████████████████| 20/20 [00:33<00:00,  1.66s/it]


In [4]:

# Vectorise each title's combined genre/tag/description text with TF-IDF,
# using scikit-learn's built-in English stop-word list.
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(anime_df['combined_features'])
# Cosine similarity of the TF-IDF rows against themselves; recommend_anime
# reads one row of this matrix per query.
similarity = cosine_similarity(tfidf_matrix)


In [5]:

def recommend_anime(title, top_n=10, weight_popularity=0.3):
    """Print recommendations ranked by a blend of similarity and popularity.

    The query is lower-cased and fuzzy-matched against
    ``anime_df['title_lower']``. A match scoring below 80 is treated as
    "not found" and only a suggestion is printed. Every other title is then
    scored as::

        similarity * (1 - weight_popularity) + popularity / max_popularity * weight_popularity

    Args:
        title: Free-text title to look up.
        top_n: Maximum number of recommendations to print.
        weight_popularity: Share of the final score taken from normalised
            popularity; the remainder comes from content similarity.

    Returns:
        None. Results are written to stdout.
    """
    title = title.lower()
    titles = anime_df['title_lower'].tolist()
    # extractOne returns None when there is nothing to match against.
    best_match = process.extractOne(title, titles) if titles else None
    if best_match is None:
        print(f"Anime '{title}' not found.")
        return
    if best_match[1] < 80:
        print(f"Anime '{title}' not found. Did you mean '{best_match[0]}'?")
        return

    # Row position, not index label: `similarity` is indexed positionally.
    pos = titles.index(best_match[0])

    # Normalise popularity once for the whole column. Missing values count as
    # zero (`nan or 0` would have stayed NaN), and an all-zero column is
    # guarded against division by zero.
    popularity = pd.to_numeric(anime_df['popularity'], errors='coerce').fillna(0).to_numpy(dtype=float)
    max_popularity = popularity.max()
    if max_popularity > 0:
        popularity_norm = popularity / max_popularity
    else:
        popularity_norm = np.zeros_like(popularity)

    blended = (
        np.asarray(similarity[pos], dtype=float) * (1 - weight_popularity)
        + popularity_norm * weight_popularity
    )

    # Once popularity is mixed in, the query title is no longer guaranteed to
    # rank first, so drop it by position instead of slicing off element 0.
    # A stable sort keeps the original row order among equal scores.
    ranked = [i for i in np.argsort(-blended, kind='stable') if i != pos][:top_n]

    print(f"Top {top_n} recommendations for '{anime_df['title'].iloc[pos]}':\n")
    for i in ranked:
        print(f"{anime_df['title'].iloc[i]} (Score: {blended[i]:.2f})")
