# Anime Recommender System

In [1]:
import pandas as pd
import numpy as np
import requests
import time
from tqdm import tqdm

In [17]:
#%%time

# Crawl the AniList anime catalogue through the GraphQL API, 50 titles per
# request, and flatten every record into a dict that maps onto one DataFrame
# row. Titles tagged with the "Hentai" genre are collected in a separate list
# so that 18+ content can be included or excluded later.

query = '''
query($page: Int, $perPage: Int){
    Page(page: $page, perPage: $perPage) {
    pageInfo{
      total
      currentPage
      lastPage
      hasNextPage
      perPage
    }
    media(type:ANIME){
      id
      title {
        romaji
        english
      }
      status
      genres
      format
      description
      startDate {
        year
        month
        day
      }
      endDate {
        year
        month
        day
      }
      episodes
      season
      seasonYear
      duration
      coverImage {
        large
        medium
      }
      averageScore
      meanScore
      popularity
      relations {
        edges {
            node {
                id
                type
            }
            relationType
        }
        nodes {
          id
          title {
            romaji
            english
          }
        }
      }
      siteUrl
    }
  }
}
'''

url = 'https://graphql.anilist.co'
api_output_anime = []
api_output_anime_18only = []
page = 1
atLastPage = False


def _format_date(parts):
    """Collapse a {'year', 'month', 'day'} mapping into a 'YYYY-MM-DD' string.

    A component is dropped when it is None or fails its bound check
    (year >= 1900, month <= 12, day <= 31). The date is only kept when the
    assembled text is exactly 10 characters long; otherwise None is returned.
    """
    year, month, day = parts['year'], parts['month'], parts['day']
    year_text = f"{year}" if year is not None and year >= 1900 else ""
    month_text = f"{month:02d}" if month is not None and month <= 12 else ""
    day_text = f"{day:02d}" if day is not None and day <= 31 else ""
    iso_date = f"{year_text}-{month_text}-{day_text}"
    return iso_date if len(iso_date) == 10 else None


while not atLastPage:
    variables = {
        'page': page,
        'perPage': 50,
    }

    # Fixed pause before every request (presumably to stay inside the API's
    # rate limit -- confirm against the current AniList limits).
    time.sleep(0.667)
    response = requests.post(url, json={'query': query, 'variables': variables})
    # Fail with the real HTTP error instead of an opaque KeyError/TypeError
    # when the payload of an error response is indexed below.
    response.raise_for_status()
    data = response.json()

    for anime in data['data']['Page']['media']:
        anime['startDate'] = _format_date(anime['startDate'])
        anime['endDate'] = _format_date(anime['endDate'])

        # Replace the nested title / cover-image objects with flat columns.
        # The new keys are appended in this order, which fixes the column
        # order of the resulting DataFrame.
        title = anime.pop('title')
        cover_image = anime.pop('coverImage')
        anime['title_romaji'] = title['romaji']
        anime['title_english'] = title['english']
        anime['coverImage_large'] = cover_image['large']
        anime['coverImage_medium'] = cover_image['medium']

        # Keep only related *anime* entries, ignoring the loose
        # "CHARACTER" and "OTHER" relation types.
        related_anime_ids = [
            str(edge['node']['id'])
            for edge in anime['relations']['edges']
            if edge['relationType'] not in ("CHARACTER", "OTHER")
            and edge['node']['type'] == "ANIME"
        ]
        anime['relations'] = ", ".join(related_anime_ids)

        # Decide on the genre *list* before it is flattened to text, so the
        # check is an exact genre match rather than a substring match.
        is_adult_title = "Hentai" in anime['genres']
        anime['genres'] = ", ".join(anime['genres'])

        # separating 18+ titles into another dataframe to allow user to include 18+ titles if desired
        if is_adult_title:
            api_output_anime_18only.append(anime)
        else:
            api_output_anime.append(anime)

    print(f"Page {page}, total collated: {len(api_output_anime)} anime titles (excludes 18+), {len(api_output_anime_18only)} anime titles (18+)")

    if data['data']['Page']['pageInfo']['hasNextPage']:
        page += 1
    else:
        atLastPage = True

print("Completed")

Page 1, total collated: 50 anime titles (excludes 18+), 0 anime titles (18+)
Page 2, total collated: 100 anime titles (excludes 18+), 0 anime titles (18+)
Page 3, total collated: 150 anime titles (excludes 18+), 0 anime titles (18+)
Page 4, total collated: 192 anime titles (excludes 18+), 8 anime titles (18+)
Page 5, total collated: 242 anime titles (excludes 18+), 8 anime titles (18+)
Page 6, total collated: 290 anime titles (excludes 18+), 10 anime titles (18+)
Page 7, total collated: 339 anime titles (excludes 18+), 11 anime titles (18+)
Page 8, total collated: 389 anime titles (excludes 18+), 11 anime titles (18+)
Page 9, total collated: 439 anime titles (excludes 18+), 11 anime titles (18+)
Page 10, total collated: 489 anime titles (excludes 18+), 11 anime titles (18+)
Page 11, total collated: 539 anime titles (excludes 18+), 11 anime titles (18+)
Page 12, total collated: 589 anime titles (excludes 18+), 11 anime titles (18+)
Page 13, total collated: 637 anime titles (excludes 18+

In [18]:
# Tabulate the non-18+ anime records gathered by the crawl loop (one row per
# title) and show the shape plus the first rows as a sanity check.
anime_db = pd.DataFrame(api_output_anime)
print(anime_db.shape)
anime_db.head()

(15954, 20)


Unnamed: 0,id,status,genres,format,description,startDate,endDate,episodes,season,seasonYear,duration,averageScore,meanScore,popularity,relations,siteUrl,title_romaji,title_english,coverImage_large,coverImage_medium
0,1,FINISHED,"Action, Adventure, Drama, Sci-Fi",TV,"Enter a world in the distant future, where Bou...",1998-04-03,1999-04-24,26.0,SPRING,1998.0,24.0,86.0,86.0,249940,"5, 17205, 4037",https://anilist.co/anime/1,Cowboy Bebop,Cowboy Bebop,https://s4.anilist.co/file/anilistcdn/media/an...,https://s4.anilist.co/file/anilistcdn/media/an...
1,5,FINISHED,"Action, Drama, Mystery, Sci-Fi",MOVIE,"As the Cowboy Bebop crew travels the stars, th...",2001-09-01,2001-09-01,1.0,SUMMER,2001.0,115.0,82.0,82.0,47081,1,https://anilist.co/anime/5,Cowboy Bebop: Tengoku no Tobira,Cowboy Bebop: The Movie - Knockin' on Heaven's...,https://s4.anilist.co/file/anilistcdn/media/an...,https://s4.anilist.co/file/anilistcdn/media/an...
2,6,FINISHED,"Action, Adventure, Comedy, Drama, Sci-Fi",TV,Vash the Stampede is a wanted man with a habit...,1998-04-01,1998-09-30,26.0,SPRING,1998.0,24.0,79.0,79.0,81725,4106,https://anilist.co/anime/6,TRIGUN,Trigun,https://s4.anilist.co/file/anilistcdn/media/an...,https://s4.anilist.co/file/anilistcdn/media/an...
3,7,FINISHED,"Action, Drama, Mystery, Supernatural",TV,Robin Sena is a powerful craft user drafted in...,2002-07-02,2002-12-24,26.0,SUMMER,2002.0,25.0,68.0,68.0,11850,,https://anilist.co/anime/7,Witch Hunter ROBIN,Witch Hunter ROBIN,https://s4.anilist.co/file/anilistcdn/media/an...,https://s4.anilist.co/file/anilistcdn/media/an...
4,8,FINISHED,"Adventure, Fantasy, Supernatural",TV,It is the dark century and the people are suff...,2004-09-30,2005-09-29,52.0,FALL,2004.0,23.0,62.0,64.0,1683,1123,https://anilist.co/anime/8,Bouken Ou Beet,Beet the Vandel Buster,https://s4.anilist.co/file/anilistcdn/media/an...,https://s4.anilist.co/file/anilistcdn/media/an...


In [19]:
# Same tabulation for the records the crawl loop routed to the 18+ list
# (titles carrying the "Hentai" genre).
anime_db_18 = pd.DataFrame(api_output_anime_18only)
print(anime_db_18.shape)
anime_db_18.head()

(1432, 20)


Unnamed: 0,id,status,genres,format,description,startDate,endDate,episodes,season,seasonYear,duration,averageScore,meanScore,popularity,relations,siteUrl,title_romaji,title_english,coverImage_large,coverImage_medium
0,188,FINISHED,"Comedy, Hentai, Mystery, Supernatural",OVA,Masquerade is the story surrounding the Hiraga...,1998-09-25,1999-03-25,4.0,FALL,1998.0,30.0,51.0,54.0,448,4561,https://anilist.co/anime/188,Gosenzo San'e,Masquerade,https://s4.anilist.co/file/anilistcdn/media/an...,https://s4.anilist.co/file/anilistcdn/media/an...
1,203,FINISHED,"Adventure, Fantasy, Hentai",OVA,Based on the erotic game by Elf.<br>\n<br>\nTh...,1999-08-25,2000-11-25,5.0,SUMMER,1999.0,30.0,57.0,59.0,954,"2185, 125574",https://anilist.co/anime/203,Words Worth,,https://s4.anilist.co/file/anilistcdn/media/an...,https://s4.anilist.co/file/anilistcdn/media/an...
2,211,FINISHED,Hentai,OVA,"Episode 1:<br>\n""There is no need for relation...",2001-09-25,2001-12-21,2.0,FALL,2001.0,30.0,52.0,55.0,483,,https://anilist.co/anime/211,Pure Mail,,https://s4.anilist.co/file/anilistcdn/media/an...,https://s4.anilist.co/file/anilistcdn/media/an...
3,213,FINISHED,"Comedy, Hentai, Romance",OVA,Yusuke Kinoshita is forced by his father to wo...,1997-10-24,1998-04-24,3.0,FALL,1997.0,29.0,48.0,51.0,451,"214, 215",https://anilist.co/anime/213,Pia Carrot e Youkoso!!,Welcome to Pia Carrot,https://s4.anilist.co/file/anilistcdn/media/an...,https://s4.anilist.co/file/anilistcdn/media/an...
4,214,FINISHED,"Comedy, Drama, Hentai, Romance",OVA,Yususke (from the first Pia Carrot series) ret...,1998-10-23,1999-04-23,3.0,FALL,1998.0,30.0,47.0,50.0,324,"213, 215",https://anilist.co/anime/214,Pia Carrot e Youkoso!! 2,Welcome To Pia Carrot 2,https://s4.anilist.co/file/anilistcdn/media/an...,https://s4.anilist.co/file/anilistcdn/media/an...


In [20]:
# Persist both catalogues as UTF-8 CSV files, without the pandas index column.
anime_db.to_csv("../data/anime_db.csv",index=False,encoding="utf-8")
anime_db_18.to_csv("../data/anime_db_18.csv",index=False,encoding="utf-8")

In [113]:
%%time

# Collect a random sample of AniList users: 1,000 user-list pages of 50 users
# each are fetched, users without usable ratings are dropped, scores are
# rescaled towards a 0-100 range, and the result is stored twice:
#   * api_output_users      - one dict per retained user (nested score groups)
#   * api_output_users_dict - long format, one (userId, mediaId, rating) triple
#                             per rated title

query = '''
query ($page: Int, $perPage: Int) {
    Page (page: $page, perPage: $perPage) {
        pageInfo {
            total
            currentPage
            lastPage
            hasNextPage
            perPage
        }
        users{
            id
            name
            mediaListOptions {
                scoreFormat
            }
            statistics {
                anime {
                    count
                    scores {
                        score
                        mediaIds
                    }
                }
            }
            siteUrl
        }
    }
}
'''

url = 'https://graphql.anilist.co'
api_output_users = []
api_output_users_dict = {'userId':[], 'mediaId':[],'rating':[]}

#max page as of 10 Feb 2022 is 30582
# NOTE: no random seed is set in this cell, so every run draws a different
# sample of pages.
page_numbers = list(range(1,30001))
pages = list(np.random.choice(page_numbers,size=1000,replace=False))

# Score values used to guess which scale a user's raw scores are on.
point_10 = [6,7,8,9]
point_5 = [1,2,3,4]

# to include my own anilist account in the output: page 12841 is always
# fetched first, and the sample size stays at 1,000 pages either way.
if 12841 in pages:
    pages.remove(12841)
else:
    pages.pop()
pages = [12841] + pages

for page in tqdm(pages):
    variables = {
        'page': int(page),
        'perPage': 50,
    }

    # Fixed pause before every request (presumably to stay inside the API's
    # rate limit -- confirm against the current AniList limits).
    time.sleep(0.667)
    response = requests.post(url, json={'query': query, 'variables': variables})
    # Fail with the real HTTP error instead of an opaque KeyError/TypeError
    # when the payload of an error response is indexed below.
    response.raise_for_status()
    data = response.json()

    for user in data['data']['Page']['users']:
        user['score_format'] = user['mediaListOptions']['scoreFormat']
        score_groups = user['statistics']['anime']['scores']

        # exclude users that did not rate any anime, or are on a 3-point scale
        # The 3 point scale in AniList is equivalent to rating the anime with a smiley face, neutral face or sad face
        # Difficult to quantify the faces so we will exclude it in the data collection
        if len(score_groups) == 0 or user['score_format'] == "POINT_3":
            continue

        # Skip users whose anime statistics count is below 10.
        if user['statistics']['anime']['count'] < 10:
            continue

        # Infer the scale from the score values themselves rather than from
        # score_format: any score of 6-9 means "out of 10" (x10); failing
        # that, any score of 1-4 means "out of 5" (x20); otherwise the scores
        # are left untouched. Scores of exactly 5 or 10 trigger neither rule.
        user_scores = [group['score'] for group in score_groups]
        if any(score in point_10 for score in user_scores):
            multiplier = 10
        elif any(score in point_5 for score in user_scores):
            multiplier = 20
        else:
            multiplier = 1

        if multiplier > 1:
            for group in score_groups:
                group['score'] *= multiplier

        # generating the data in the format taken in by recommender
        for group in score_groups:
            for mediaId in group['mediaIds']:
                api_output_users_dict['userId'].append(user['id'])
                api_output_users_dict['mediaId'].append(mediaId)
                api_output_users_dict['rating'].append(group['score'])

        # Flatten the record: keep only the score groups under 'statistics'
        # and drop the now-redundant nested options object.
        user['statistics'] = score_groups
        user.pop('mediaListOptions')
        api_output_users.append(user)

100%|██████████| 1000/1000 [40:59<00:00,  2.46s/it]

Wall time: 40min 59s





In [114]:
# Long-format ratings table built from the sampling loop's dict of lists:
# one row per (userId, mediaId, rating) triple.
users_db_lightfm = pd.DataFrame(api_output_users_dict)
print(users_db_lightfm.shape)
users_db_lightfm.head()

(937799, 3)


Unnamed: 0,userId,mediaId,rating
0,710080,1535,70
1,710080,21459,70
2,710080,113415,70
3,710080,11757,70
4,710080,5114,70


In [116]:
# Persist the long-format ratings as UTF-8 CSV, without the pandas index column.
users_db_lightfm.to_csv("../data/user_list_lightfm.csv",index=False,encoding="utf-8")

In [117]:
# One row per retained user; at this point the 'statistics' column still holds
# each user's list of {'score', 'mediaIds'} groups.
user_db = pd.DataFrame(api_output_users)

In [118]:
# Keep a pickled copy of the user table while 'statistics' is still nested,
# i.e. before it is expanded into one column per media id.
user_db.to_pickle("../data/user_db.pkl")

In [119]:
def split_scores(stat_row):
    """Invert a user's score groups into a flat ``{media_id: score}`` mapping.

    ``stat_row`` is an iterable of dicts, each with a ``'score'`` value and a
    ``'mediaIds'`` list naming the titles that received that score. If a media
    id appears in more than one group, the score of the last group wins.
    """
    return {
        media_id: group['score']
        for group in stat_row
        for media_id in group['mediaIds']
    }

In [120]:
# Widen the user table: split_scores turns every user's nested score groups
# into a {media_id: score} dict, json_normalize spreads those dicts into one
# column per media id, and the nested source column is discarded.
score_maps = user_db['statistics'].apply(split_scores)
user_db = user_db.drop(columns=['statistics']).join(pd.json_normalize(score_maps))

In [121]:
# Sanity-check the widened table: one row per user, with the user fields
# followed by one column per media id.
print(user_db.shape)
user_db.head()

(14358, 9032)


Unnamed: 0,id,name,siteUrl,score_format,1535,21459,113415,11757,5114,20958,...,12141,6492,6000,314,1966,8993,109931,7305,6885,16718
0,710080,idkhowtoplay,https://anilist.co/user/710080,POINT_10_DECIMAL,70.0,70.0,70.0,70.0,70.0,70.0,...,,,,,,,,,,
1,710090,minteu,https://anilist.co/user/710090,POINT_10_DECIMAL,,,100.0,,,,...,,,,,,,,,,
2,710095,Jeremiah27,https://anilist.co/user/710095,POINT_10_DECIMAL,90.0,,,,,,...,,,,,,,,,,
3,710096,offworldshadow,https://anilist.co/user/710096,POINT_10_DECIMAL,,70.0,,,,,...,,,,,,,,,,
4,710098,mathiassassin,https://anilist.co/user/710098,POINT_100,95.0,80.0,95.0,,95.0,90.0,...,,,,,,,,,,


In [122]:
# Persist the wide user x media table as UTF-8 CSV, without the pandas index column.
user_db.to_csv("../data/user_list.csv",index=False,encoding="utf-8")

In [2]:
def save_progress(user_list,num):
    """Flatten a batch of collected users and checkpoint it to disk.

    Parameters
    ----------
    user_list : list of dict
        User records whose ``'statistics'`` entry is a list of
        ``{'score', 'mediaIds'}`` groups.
    num : int
        Checkpoint number; it becomes part of the file name
        ``../data/user_list_{num}.csv``.

    The nested score groups are expanded with ``split_scores`` into one column
    per media id before writing. The file is written as UTF-8 without the
    pandas index, matching every other CSV export in this notebook, so that
    reloading it does not produce a spurious unnamed first column.
    """
    df = pd.DataFrame(user_list)
    score_maps = df['statistics'].apply(split_scores)
    df = df.drop(columns=['statistics']).join(pd.json_normalize(score_maps))
    df.to_csv(f"../data/user_list_{num}.csv",index=False,encoding="utf-8")
    print(f"user_list_{num}.csv saved")

In [3]:
def split_scores(stat_row):
    """Invert a user's score groups into a flat ``{media_id: score}`` mapping.

    ``stat_row`` is an iterable of dicts, each with a ``'score'`` value and a
    ``'mediaIds'`` list naming the titles that received that score. If a media
    id appears in more than one group, the score of the last group wins.
    """
    return {
        media_id: group['score']
        for group in stat_row
        for media_id in group['mediaIds']
    }

In [None]:
%%time

# Crawl every page of AniList users (50 per page), keep users with at least 10
# rated titles on a numeric scale, rescale 5- and 10-point scores to 0-100,
# and checkpoint the collected users to CSV every 6,500 pages.

query = '''
query ($page: Int, $perPage: Int) {
    Page (page: $page, perPage: $perPage) {
        pageInfo {
            total
            currentPage
            lastPage
            hasNextPage
            perPage
        }
        users{
            id
            name
            mediaListOptions {
                scoreFormat
            }
            statistics {
                anime {
                    scores {
                        score
                        mediaIds
                    }
                }
            }
            siteUrl
        }
    }
}
'''

url = 'https://graphql.anilist.co'
api_output_users_full = []
atLastPage = False
page = 1
num = 0
is_retry = False
retries = 0
#max page as of 10 Feb 2022 is 30582
#max page as of 27 Feb 2022 is 30956

while not atLastPage:
    variables = {
        'page': page,
        'perPage': 50,
    }

    # Back-off handling for a failed previous attempt: give up after more
    # than 50 consecutive failures, and take a longer pause on every 10th one.
    if is_retry:
        print(f"API response code {response.status_code}: retrying attempt #{retries}")

        if retries > 50:
            raise ValueError("Retried more than 50 times")
        if retries % 10 == 0:
            print(f"Retried {retries} times, pausing 30 seconds before trying again")
            time.sleep(30)

    # Fixed pause before every request (presumably to stay inside the API's
    # rate limit -- confirm against the current AniList limits).
    time.sleep(0.667)
    response = requests.post(url, json={'query': query, 'variables': variables})

    if response.status_code == 200:
        retries = 0
        is_retry = False
        data = response.json()

        for user in data['data']['Page']['users']:
            user['score_format'] = user['mediaListOptions']['scoreFormat']
            score_groups = user['statistics']['anime']['scores']

            # exclude users that did not rate any anime, or are on a 3-point scale
            # The 3 point scale in AniList is equivalent to rating the anime with a smiley face, neutral face or sad face
            # Difficult to quantify the faces so we will exclude it in the data collection
            if len(score_groups) == 0 or user['score_format'] == "POINT_3":
                continue

            # Total number of rated titles = media ids summed over all score groups.
            ratings_count = sum(len(group['mediaIds']) for group in score_groups)
            if ratings_count < 10:
                continue

            # The 10 point scale in Anilist is on a scale of 1 to 10, integers only
            if user['score_format'] == "POINT_10":
                # scaling the score by 10 so that the maximum score is 100
                # to have every user's scores on the same scale
                for group in score_groups:
                    group['score'] *= 10
            # The 5 point scale in AniList is on a scale of 1 to 5, integers only
            elif user['score_format'] == "POINT_5":
                # scaling the score by 20 so that the maximum score is 100
                # to have every user's scores on the same scale
                for group in score_groups:
                    group['score'] *= 20

            # Flatten the record: keep only the score groups under
            # 'statistics' and drop the now-redundant nested options object.
            user['statistics'] = score_groups
            user.pop('mediaListOptions')
            api_output_users_full.append(user)

        print(f"Users page {page}, total collated: {len(api_output_users_full)} users")

        # Checkpoint: write the users gathered so far to a numbered CSV and
        # start a fresh in-memory list. After the loop ends the list therefore
        # only holds users collected since the last checkpoint.
        if page % 6500 == 0:
            num += 1
            save_progress(api_output_users_full,num)
            api_output_users_full = []

        if data['data']['Page']['pageInfo']['hasNextPage']:
            page += 1
        else:
            atLastPage = True
    else:
        # Same page is requested again on the next iteration.
        is_retry = True
        retries += 1

print("Data collection of users completed")

Users page 1, total collated: 12 users
Users page 2, total collated: 23 users
Users page 3, total collated: 34 users
Users page 4, total collated: 44 users
Users page 5, total collated: 51 users
Users page 6, total collated: 56 users
Users page 7, total collated: 63 users
Users page 8, total collated: 73 users
Users page 9, total collated: 87 users
Users page 10, total collated: 95 users
Users page 11, total collated: 101 users
Users page 12, total collated: 106 users
Users page 13, total collated: 110 users
Users page 14, total collated: 116 users
Users page 15, total collated: 122 users
Users page 16, total collated: 130 users
Users page 17, total collated: 136 users
Users page 18, total collated: 143 users
Users page 19, total collated: 151 users
Users page 20, total collated: 158 users
Users page 21, total collated: 164 users
Users page 22, total collated: 172 users
Users page 23, total collated: 180 users
Users page 24, total collated: 184 users
Users page 25, total collated: 193 

In [5]:
# Tabulate the users left in memory by the full crawl. NOTE(review): the crawl
# loop empties api_output_users_full after each save_progress checkpoint, so
# this frame only contains users collected since the last checkpoint.
user_db_full = pd.DataFrame(api_output_users_full)

In [7]:
# Widen the full user table: split_scores turns every user's nested score
# groups into a {media_id: score} dict, json_normalize spreads those dicts into
# one column per media id, and the nested source column is discarded.
score_maps_full = user_db_full['statistics'].apply(split_scores)
user_db_full = user_db_full.drop(columns=['statistics']).join(pd.json_normalize(score_maps_full))

In [8]:
# Sanity-check the widened table: one row per user, with the user fields
# followed by one column per media id.
print(user_db_full.shape)
user_db_full.head()

(127635, 13504)


Unnamed: 0,id,name,score_format,10620,849,225,12477,1575,8460,6880,...,145209,145354,111859,138565,128675,135605,139354,135801,140111,113138
0,2,matchai,POINT_10_DECIMAL,80.0,80.0,80.0,80.0,80.0,80.0,80.0,...,,,,,,,,,,
1,5,kaotikmynd,POINT_5,,100.0,,,80.0,,,...,,,,,,,,,,
2,7,jamiejakov,POINT_10_DECIMAL,70.0,70.0,,,90.0,,,...,,,,,,,,,,
3,12,JesterOW,POINT_10,70.0,80.0,,,80.0,,,...,,,,,,,,,,
4,19,rayanp,POINT_10,,,,,,,,...,,,,,,,,,,


In [9]:
# Persist the wide user x media table as UTF-8 CSV, without the pandas index column.
user_db_full.to_csv("../data/user_list_full.csv",index=False,encoding="utf-8")

In [19]:
# Tiny hand-made ratings table with userId / mediaId / rating columns: two
# users who each gave the same three titles the scores 6, 7 and 8
# (presumably a scratch fixture for experimenting -- confirm intended use).
d = {
    'userId': [1] * 3 + [2] * 3,
    'mediaId': [1, 2, 3] * 2,
    'rating': [6, 7, 8] * 2,
}
df = pd.DataFrame(data=d)
df

Unnamed: 0,userId,mediaId,rating
0,1,1,6
1,1,2,7
2,1,3,8
3,2,1,6
4,2,2,7
5,2,3,8
