In [1]:
from jikanpy import Jikan
import numpy as np
import pandas as pd
import requests
import time

In [2]:
jikan = Jikan(session=requests.Session())

### Get Anime IDs

In [3]:
def get_ani_id(pages):
    """Requests animeIDs from top Anime pages and appends them to a single list"""
    ani_list = []
    
    for j in range(1, pages+1):
        try:
            search_result = jikan.top(type='anime', page=j)
            time.sleep(4)
            for i in search_result['top']:
                ani_list.append(i['mal_id'])
            time.sleep(4)
        except:
            pass
    
    return ani_list

In [4]:
ani_list = get_ani_id(5)
# time.sleep(4)

### Get Anime Details

In [5]:
def get_ani_details(ani_list):
    """Takes in list of animeIDs and returns the list that contains the title, rating, and genres for each anime"""
    total_list = []
    looped_id = set({})
    
    for id in ani_list:
        try:
            details = []
            genres = []

            anime_result = jikan.anime(id)
            time.sleep(4)
            
            details.append(anime_result['title'].strip())
            details.append(anime_result['rating'].strip())

            for g in anime_result['genres']:
                genres.append(g['name'].strip())

            details.append(genres)
            
            looped_id.add(anime_result['mal_id'])
            total_list.append(details)
        except:
            pass
        
    return total_list, looped_id

In [6]:
det_list, rem_list = get_ani_details(ani_list)

In [7]:
def get_ani_details2(ani_list, current_list, remainder_set):
    """Returns items that get_ani_details missed due to HTTP error"""
    remainder_list = list(remainder_set)
    
    diff_list = np.setdiff1d(ani_list, remainder_list)
    new_list, new_set = get_ani_details(list(diff_list))
    
    current_list += new_list
    remainder_list += list(new_set)
    
    while list(diff_list):
        diff_list = np.setdiff1d(ani_list, remainder_list)
        new_list, new_set = get_ani_details(list(diff_list))
        
        current_list += new_list
        remainder_list += list(new_set)
        
        if not list(diff_list):
            print('break')
        else:
            print('pass')
        
    return current_list, remainder_list

In [8]:
det2_list, rem2_list = get_ani_details2(ani_list, det_list, rem_list)

pass
pass
pass
pass
pass
break


In [9]:
det_df = pd.DataFrame(det2_list, columns=['Anime', 'Rating', 'Genres'])

In [10]:
det_df.head()

Unnamed: 0,Anime,Rating,Genres
0,Fullmetal Alchemist: Brotherhood,R - 17+ (violence & profanity),"[Action, Military, Adventure, Comedy, Drama, M..."
1,Steins;Gate,PG-13 - Teens 13 or older,"[Thriller, Sci-Fi]"
2,Hunter x Hunter (2011),PG-13 - Teens 13 or older,"[Action, Adventure, Fantasy, Shounen, Super Po..."
3,Gintama°,PG-13 - Teens 13 or older,"[Action, Comedy, Historical, Parody, Samurai, ..."
4,Koe no Katachi,PG-13 - Teens 13 or older,"[Drama, School, Shounen]"


In [11]:
ani = pd.DataFrame(ani_list, columns=['ID'])
len(ani['ID'].unique())

250

In [12]:
# should match len(ani['ID'].unique())
len(det_df['Anime'].unique())

250

In [13]:
# det_df.to_csv('details_df.csv', index=False)

### Get Anime Reviews

In [14]:
def ani_rev(ani_list):
    """Takes in list of animeIDs and returns the list that contains the reviews for each anime, with the anime title and user who reviewed"""
    total_rev = []
    looped_id = set({})
    
    for i in ani_list:
        try:
            anime_result = jikan.anime(i)
            time.sleep(4)
            anime_rev = jikan.anime(i, extension='reviews')
            time.sleep(4)
            for j in anime_rev['reviews']:
                user_rev = []
                
                user_rev.append(anime_result['title'].strip())
                user_rev.append(j['reviewer']['username'])
                user_rev.append(j['reviewer']['scores']['overall'])
                
                total_rev.append(user_rev)
                looped_id.add(anime_result['mal_id'])

            time.sleep(10)
        except:
            pass
    
    return total_rev, looped_id

In [15]:
list_1, list_2 = ani_rev(ani_list)

In [16]:
def ani_rev2(ani_list, current_list, remainder_set):
    """Returns items that ani_rev missed due to HTTP error"""
    remainder_list = list(remainder_set)
    
    diff_list = np.setdiff1d(ani_list, remainder_list)
    new_list, new_set = ani_rev(list(diff_list))
    
    current_list += new_list
    remainder_list += list(new_set)
    
    while new_set:
        diff_list = np.setdiff1d(ani_list, remainder_list)
        new_list, new_set = ani_rev(list(diff_list))
        
        current_list += new_list
        remainder_list += list(new_set)
        
        if not new_set:
            print('break')
        else:
            print('pass')
        
    return current_list, remainder_list

In [17]:
list_3, list_4 = ani_rev2(ani_list, list_1, list_2)

pass
pass
pass
break


In [32]:
rate_df = pd.DataFrame(list_3, columns=['Anime', 'User', 'Score'])

In [33]:
rate_df.head()

Unnamed: 0,Anime,User,Score
0,Fullmetal Alchemist: Brotherhood,tazillo,10
1,Fullmetal Alchemist: Brotherhood,Archaeon,9
2,Fullmetal Alchemist: Brotherhood,ChristopherKClaw,7
3,Fullmetal Alchemist: Brotherhood,literaturenerd,7
4,Fullmetal Alchemist: Brotherhood,M0nkeyD_Luffy,5


In [34]:
# should match len(ani['ID'].unique()), if not move to next section
len(rate_df['Anime'].unique())

249

In [35]:
len(ani['ID'].unique())

250

In [45]:
# rate_df.to_csv('rate_df.csv', index=False)

### Remove shows with No Reviews

In [39]:
null_list = []

for show in list(det_df['Anime'].unique()):
    if show not in list(rate_df['Anime'].unique()):
        null_list.append(show)
        
null_list

['Gintama: Yorinuki Gintama-san on Theater 2D']

In [40]:
for null_show in null_list:
    det2_df = det_df[det_df['Anime'] != null_show]

In [44]:
# should match len(rate_df['Anime'].unique())
len(det2_df['Anime'].unique())

249

In [46]:
# det2_df.to_csv('det_df.csv', index=False)