# Anime Recommender System

In [1]:
import pandas as pd
import numpy as np
import requests
import time

In [61]:
%%time

# GraphQL query: one page of anime `media` entries plus the paging metadata
# (`hasNextPage`) that tells the loop below when to stop.
# NOTE: `description` is selected twice inside `media`; the string is left
# untouched because identical duplicate selections are merged by GraphQL.
query = '''
query($page: Int, $perPage: Int){
    Page(page: $page, perPage: $perPage) {
    pageInfo{
      total
      currentPage
      lastPage
      hasNextPage
      perPage
    }
    media(type:ANIME){
      id
      title {
        romaji
        english
      }
      genres
      format
      description
      startDate {
        year
        month
        day
      }
      endDate {
        year
        month
        day
      }
      episodes
      description
      season
      seasonYear
      duration
      coverImage {
        large
        medium
      }
      averageScore
      meanScore
      relations {
        nodes {
          id
          title {
            romaji
            english
          }
        }
      }
    }
  }
}
'''

url = 'https://graphql.anilist.co'
api_output_anime = []
page = 1
atLastPage = False


def _fuzzy_date_to_iso(fuzzy_date):
    """Turn a ``{'year', 'month', 'day'}`` dict into a ``'YYYY-MM-DD'`` string.

    Returns None when the dict is missing/empty, when any component is None,
    or when a component falls outside 1900-9999 / 1-12 / 1-31, so a partially
    known date never produces a malformed string. The day is not checked
    against the month (e.g. 31 February is not rejected).
    """
    if not fuzzy_date:
        return None
    year = fuzzy_date.get('year')
    month = fuzzy_date.get('month')
    day = fuzzy_date.get('day')
    if year is None or month is None or day is None:
        return None
    if not (1900 <= year <= 9999 and 1 <= month <= 12 and 1 <= day <= 31):
        return None
    return f"{year}-{month:02d}-{day:02d}"


# Walk the paginated result set until the API reports there is no next page.
while not atLastPage:
    variables = {
        'page': page,
        'perPage': 50,
    }

    # A 0.667 s pause per request keeps the loop at roughly 90 requests/minute.
    time.sleep(0.667)
    response = requests.post(
        url,
        json={'query': query, 'variables': variables},
        timeout=30,  # do not hang the notebook forever on a stalled connection
    )
    # Surface HTTP failures as a clear error instead of a KeyError/TypeError
    # further down when the expected payload is missing.
    response.raise_for_status()
    data = response.json()
    if not data.get('data'):
        raise RuntimeError(
            f"API returned no data for page {page}: {data.get('errors')}"
        )

    page_data = data['data']['Page']
    for anime in page_data['media']:
        genres = anime['genres'] or []

        # to skip hentai titles (tested on the genre list, before it is
        # collapsed into a single string)
        if "Hentai" in genres:
            continue

        # Replace the nested date dicts by ISO strings (or None if incomplete).
        anime['startDate'] = _fuzzy_date_to_iso(anime['startDate'])
        anime['endDate'] = _fuzzy_date_to_iso(anime['endDate'])

        # Flatten the nested title / cover objects into plain columns.
        title = anime.pop('title') or {}
        cover = anime.pop('coverImage') or {}
        anime['title_romaji'] = title.get('romaji')
        anime['title_english'] = title.get('english')
        anime['coverImage_large'] = cover.get('large')
        anime['coverImage_medium'] = cover.get('medium')
        anime['genres'] = ", ".join(genres)

        api_output_anime.append(anime)

    print(f"Page {page}, total collated: {len(api_output_anime)} anime titles")

    # `page` is only advanced when another page exists, so it ends on the
    # number of the last page fetched.
    atLastPage = not page_data['pageInfo']['hasNextPage']
    if not atLastPage:
        page += 1

print("Completed")

Page 1, total collated: 50 anime titles
Page 2, total collated: 100 anime titles
Page 3, total collated: 150 anime titles
Page 4, total collated: 200 anime titles
Page 5, total collated: 250 anime titles
Page 6, total collated: 300 anime titles
Page 7, total collated: 350 anime titles
Page 8, total collated: 400 anime titles
Page 9, total collated: 450 anime titles
Page 10, total collated: 500 anime titles
Page 11, total collated: 550 anime titles
Page 12, total collated: 600 anime titles
Page 13, total collated: 650 anime titles
Page 14, total collated: 700 anime titles
Page 15, total collated: 750 anime titles
Page 16, total collated: 800 anime titles
Page 17, total collated: 850 anime titles
Page 18, total collated: 900 anime titles
Page 19, total collated: 950 anime titles
Page 20, total collated: 1000 anime titles
Page 21, total collated: 1050 anime titles
Page 22, total collated: 1100 anime titles
Page 23, total collated: 1150 anime titles
Page 24, total collated: 1200 anime titl

In [62]:
# Turn the collected anime records (a list of dicts) into a table, report its
# dimensions and show the first five rows.
anime_db = pd.DataFrame(data=api_output_anime)
print(anime_db.shape)
anime_db.head(n=5)

(17187, 17)


Unnamed: 0,id,genres,format,description,startDate,endDate,episodes,season,seasonYear,duration,averageScore,meanScore,relations,title_romaji,title_english,coverImage_large,coverImage_medium
0,1,"Action, Adventure, Drama, Sci-Fi",TV,"Enter a world in the distant future, where Bou...",1998-04-03,1999-04-24,26.0,SPRING,1998.0,24.0,86.0,86.0,"{'nodes': [{'id': 5, 'title': {'romaji': 'Cowb...",Cowboy Bebop,Cowboy Bebop,https://s4.anilist.co/file/anilistcdn/media/an...,https://s4.anilist.co/file/anilistcdn/media/an...
1,5,"Action, Drama, Mystery, Sci-Fi",MOVIE,"As the Cowboy Bebop crew travels the stars, th...",2001-09-01,2001-09-01,1.0,SUMMER,2001.0,115.0,82.0,82.0,"{'nodes': [{'id': 1, 'title': {'romaji': 'Cowb...",Cowboy Bebop: Tengoku no Tobira,Cowboy Bebop: The Movie - Knockin' on Heaven's...,https://s4.anilist.co/file/anilistcdn/media/an...,https://s4.anilist.co/file/anilistcdn/media/an...
2,6,"Action, Adventure, Comedy, Drama, Sci-Fi",TV,Trigun takes place in the distant future on a ...,1998-04-01,1998-09-30,26.0,SPRING,1998.0,24.0,79.0,79.0,"{'nodes': [{'id': 30703, 'title': {'romaji': '...",Trigun,Trigun,https://s4.anilist.co/file/anilistcdn/media/an...,https://s4.anilist.co/file/anilistcdn/media/an...
3,7,"Action, Drama, Mystery, Supernatural",TV,Robin Sena is a powerful craft user drafted in...,2002-07-02,2002-12-24,26.0,SUMMER,2002.0,25.0,68.0,68.0,{'nodes': []},Witch Hunter ROBIN,Witch Hunter ROBIN,https://s4.anilist.co/file/anilistcdn/media/an...,https://s4.anilist.co/file/anilistcdn/media/an...
4,8,"Adventure, Fantasy, Supernatural",TV,It is the dark century and the people are suff...,2004-09-30,2005-09-29,52.0,FALL,2004.0,23.0,62.0,64.0,"{'nodes': [{'id': 1123, 'title': {'romaji': 'B...",Bouken Ou Beet,Beet the Vandel Buster,https://s4.anilist.co/file/anilistcdn/media/an...,https://s4.anilist.co/file/anilistcdn/media/an...


In [63]:
# Write the anime table to disk as UTF-8 CSV, leaving out the DataFrame index.
anime_db.to_csv(
    "./anime_db_18.csv",
    encoding="utf-8",
    index=False,
)

In [54]:
%%time

# GraphQL query: one page of users with their score format and, for every
# score value they have used, the ids of the anime that received that score.
query = '''
query ($page: Int, $perPage: Int) {
    Page (page: $page, perPage: $perPage) {
        pageInfo {
            total
            currentPage
            lastPage
            hasNextPage
            perPage
        }
        users{
            id
            name
            mediaListOptions {
                scoreFormat
            }
            statistics {
                anime {
                    scores {
                        score
                        mediaIds
                    }
                    
                }
            }
        }
    }
}
'''

url = 'https://graphql.anilist.co'
api_output_users = []

# Factors that bring the integer score formats onto a 100-point maximum so
# every user's scores are on the same scale:
#   POINT_10 is a 1-10 integer scale  -> x10
#   POINT_5  is a 1-5 integer scale   -> x20
# Formats not listed here are left unscaled.
SCORE_SCALE = {"POINT_10": 10, "POINT_5": 20}

# Draw 300 distinct page numbers from the first 30000 user pages.
# max page as of 10 Feb 2022 is 30582
# NOTE(review): no seed is set in the visible cells, so every run samples a
# different set of pages - confirm whether that is intended.
page_numbers = list(range(1, 30001))
pages = list(np.random.choice(page_numbers, size=300, replace=False))

for idx, page in enumerate(pages):
    variables = {
        # np.random.choice yields numpy integers; cast to a plain int so the
        # request payload can be JSON-encoded.
        'page': int(page),
        'perPage': 50,
    }

    # A 0.667 s pause per request keeps the loop at roughly 90 requests/minute.
    time.sleep(0.667)
    response = requests.post(
        url,
        json={'query': query, 'variables': variables},
        timeout=30,  # do not hang the notebook forever on a stalled connection
    )
    # Surface HTTP failures as a clear error instead of a KeyError/TypeError
    # further down when the expected payload is missing.
    response.raise_for_status()
    data = response.json()
    if not data.get('data'):
        raise RuntimeError(
            f"API returned no data for users page {page}: {data.get('errors')}"
        )

    for user in data['data']['Page']['users']:
        user['score_format'] = user['mediaListOptions']['scoreFormat']
        scores = user['statistics']['anime']['scores']

        # exclude users that did not rate any anime, or are on a 3-point scale
        # The 3 point scale in AniList is equivalent to rating the anime with a smiley face, neutral face or sad face
        # Difficult to quantify the faces so we will exclude it in the data collection
        if len(scores) == 0 or user['score_format'] == "POINT_3":
            continue

        multiplier = SCORE_SCALE.get(user['score_format'])
        if multiplier is not None:
            for score in scores:
                score['score'] *= multiplier

        # Keep only the flat list of {score, mediaIds} buckets on the record.
        user['statistics'] = scores
        user.pop('mediaListOptions')
        api_output_users.append(user)

    print(f"#{idx+1}: Users page {page}, total collated: {len(api_output_users)} users")

#1: Users page 12477, total collated: 23 users
#2: Users page 22037, total collated: 38 users
#3: Users page 21174, total collated: 45 users
#4: Users page 25244, total collated: 64 users
#5: Users page 13504, total collated: 89 users
#6: Users page 18988, total collated: 102 users
#7: Users page 26910, total collated: 117 users
#8: Users page 4028, total collated: 135 users
#9: Users page 16159, total collated: 142 users
#10: Users page 20393, total collated: 163 users
#11: Users page 27594, total collated: 173 users
#12: Users page 23577, total collated: 185 users
#13: Users page 20998, total collated: 199 users
#14: Users page 20090, total collated: 222 users
#15: Users page 1358, total collated: 226 users
#16: Users page 19034, total collated: 232 users
#17: Users page 2881, total collated: 248 users
#18: Users page 17423, total collated: 267 users
#19: Users page 27605, total collated: 279 users
#20: Users page 1621, total collated: 292 users
#21: Users page 5006, total collated: 

In [55]:
# One row per collected user record (list of dicts -> DataFrame).
user_db = pd.DataFrame(data=api_output_users)

In [56]:
def split_scores(stat_row):
    """Flatten score buckets into a ``{media_id: score}`` mapping.

    ``stat_row`` is an iterable of dicts, each carrying a ``'score'`` value
    and a ``'mediaIds'`` iterable. Every id in ``'mediaIds'`` is mapped to
    that bucket's score; if an id occurs in more than one bucket, the score
    of the later bucket is kept.
    """
    return {
        media_id: bucket['score']
        for bucket in stat_row
        for media_id in bucket['mediaIds']
    }

In [57]:
# Expand each user's score buckets into one column per anime id.
#
# The 'statistics' cell may hold either the nested API object
# ({'anime': {'scores': [...]}}) or the already-flattened list of buckets
# that the collection cell stores, so both shapes are accepted here;
# indexing a list with ['anime'] would raise a TypeError.
#
# The column check makes the cell safe to re-run: once 'statistics' has been
# expanded and dropped there is nothing left to do.
if 'statistics' in user_db.columns:
    score_maps = (
        user_db['statistics']
        .apply(lambda stats: stats['anime']['scores'] if isinstance(stats, dict) else stats)
        .apply(split_scores)
    )
    user_db = user_db.drop(columns=['statistics']).join(pd.json_normalize(score_maps))

In [59]:
# Report the dimensions of the user table, then show its first five rows.
print(user_db.shape)
user_db.head(n=5)

(5097, 6483)


Unnamed: 0,id,name,score_format,21459,20755,6547,14719,15809,98659,21450,...,21806,107875,12863,107202,97707,114745,21478,119484,10419,1065
0,690982,HaradaShin02,POINT_10_DECIMAL,75.0,75.0,75.0,75.0,75.0,75.0,75.0,...,,,,,,,,,,
1,690986,Jozen15,POINT_10_DECIMAL,85.0,,,,,,,...,,,,,,,,,,
2,690989,Ser88,POINT_10_DECIMAL,60.0,,60.0,70.0,80.0,90.0,40.0,...,,,,,,,,,,
3,690991,Daniel2620,POINT_5,,,,,,,,...,,,,,,,,,,
4,690992,Sanderson,POINT_10,,,,,,,,...,,,,,,,,,,


In [60]:
# Write the user table to disk as UTF-8 CSV, leaving out the DataFrame index.
user_db.to_csv(
    "./user_list.csv",
    encoding="utf-8",
    index=False,
)

In [None]:
# Alternative user collection: instead of keeping the score buckets, every
# user record gets one "score_<value>" field holding a comma-separated string
# of the anime ids that received that score.
# NOTE(review): this cell rebinds `api_output_users`, replacing the list built
# by the earlier user-collection cell.
query = '''
query ($page: Int, $perPage: Int) {
    Page (page: $page, perPage: $perPage) {
        pageInfo {
            total
            currentPage
            lastPage
            hasNextPage
            perPage
        }
        users{
            id
            name
            mediaListOptions {
                scoreFormat
            }
            statistics {
                anime {
                    scores {
                        score
                        mediaIds
                    }
                    
                }
            }
        }
    }
}
'''

url = 'https://graphql.anilist.co'
api_output_users = []

# Factors that bring the integer score formats onto a 100-point maximum;
# formats not listed here are left unscaled.
SCORE_SCALE = {"POINT_10": 10, "POINT_5": 20}

# Only two fixed pages are fetched here. The random 300-page sample that used
# to be computed first was overwritten by this assignment before being used.
# max page as of 10 Feb 2022 is 30582
pages = [5, 6]

for idx, page in enumerate(pages):
    variables = {
        'page': page,
        'perPage': 50,
    }

    # A 0.667 s pause per request keeps the loop at roughly 90 requests/minute.
    time.sleep(0.667)
    response = requests.post(
        url,
        json={'query': query, 'variables': variables},
        timeout=30,  # do not hang the notebook forever on a stalled connection
    )
    # Surface HTTP failures as a clear error instead of a KeyError/TypeError
    # further down when the expected payload is missing.
    response.raise_for_status()
    data = response.json()
    if not data.get('data'):
        raise RuntimeError(
            f"API returned no data for users page {page}: {data.get('errors')}"
        )

    for user in data['data']['Page']['users']:
        user['score_format'] = user['mediaListOptions']['scoreFormat']
        scores = user['statistics']['anime']['scores']

        # Skip users without any scores and users on the 3-point scale.
        if len(scores) == 0 or user['score_format'] == "POINT_3":
            continue

        multiplier = SCORE_SCALE.get(user['score_format'])
        if multiplier is not None:
            for score in scores:
                score['score'] *= multiplier

        # One "score_<value>" entry per bucket, ids joined into a single string.
        for score in scores:
            media_ids = [str(media_id) for media_id in score['mediaIds']]
            user[f"score_{score['score']}"] = ", ".join(media_ids)

        user.pop('mediaListOptions')
        user.pop('statistics')
        api_output_users.append(user)

    print(f"#{idx+1}: Users page {page}, collated {len(api_output_users)} users")