In [1]:
import time
import requests
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from IPython.display import Image
import math

In [2]:
def get_the_info(link):
    """
    Parse one ranking page and collect the per-anime columns shown on it.

    Parameters
    ----------
    link : response object exposing ``.content`` (e.g. the result of ``requests.get(url)``)

    Returns
    -------
    tuple
        ``(ranks, titles, scores, formats, date, images)``; the first five are lists of
        strings and ``images`` is a list of ``IPython.display.Image`` objects built from each
        thumbnail's ``data-src`` attribute. The images are not used by the DataFrame yet.
    """
    soup = BeautifulSoup(link.content, 'lxml')

    def texts_of(selector):
        # Text content of every element matched by the CSS selector, in document order.
        return [node.text for node in soup.select(selector)]

    ranks = texts_of('td[class="rank ac"]>span')

    thumbnails = soup.select('td[class="title al va-t word-break"]>a>img')
    images = [Image(url=thumb['data-src']) for thumb in thumbnails]

    titles = texts_of('div[class="di-ib clearfix"]>h3>a')
    scores = texts_of('td[class="score ac fs14"]>div>span')

    # Each information block is cut on runs of eight spaces; piece 1 is used as the
    # format and piece 2 as the airing date.
    info_blocks = texts_of('div[class="detail"]>div[class="information di-ib mt4"]')
    formats = [block.split('        ')[1].strip() for block in info_blocks]
    date = [block.split('        ')[2].strip() for block in info_blocks]

    return ranks, titles, scores, formats, date, images

In [3]:
def _labelled_value(rows, label):
    """
    Return the value of the first sidebar row whose text starts with a newline followed
    by ``label`` (third newline-separated piece, stripped), or None when no row matches
    or the matching row has no such piece.
    """
    prefix = '\n' + label
    for row in rows:
        if row.text.startswith(prefix):
            pieces = row.text.split('\n')
            return pieces[2].strip() if len(pieces) > 2 else None
    return None


def get_more_info(link, delay=2, timeout=30):
    """
    Follow every anime link on a ranking page and scrape its detail page.

    Parameters
    ----------
    link : response object exposing ``.content`` (e.g. the result of ``requests.get(url)``)
    delay : seconds to sleep before each detail-page request (default 2, as before)
    timeout : seconds to wait for each detail page before giving up (default 30)

    Returns
    -------
    tuple
        ``(links, studio, genres, source, rating, recommendation, viewers, details)``.
        Every list has exactly one entry per link, so they can be used as DataFrame
        columns side by side. A value that is missing on a page is recorded as ``None``
        (``genres`` and ``details`` get an empty list) instead of raising or silently
        shifting the remaining rows.

    Raises
    ------
    requests.HTTPError
        If a detail page answers with an error status; without this check an error page
        would be recorded as a row of ``None`` values.
    """
    page = BeautifulSoup(link.content, 'lxml')
    anchors = page.select('h3[class="hoverinfo_trigger fl-l fs14 fw-b anime_ranking_h3"]>a')
    links = [anchor['href'] for anchor in anchors]

    studio = []
    genres = []
    source = []
    rating = []
    recommendation = []
    viewers = []
    details = []

    for url in links:
        time.sleep(delay)
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        subpage = BeautifulSoup(response.content, 'lxml')

        # Producer and genre links live in the same sidebar rows; select them once.
        sidebar_links = [(a.get('href', ''), a.text)
                         for a in subpage.select('div[class="spaceit_pad"]>a')]

        # The last '/anime/producer' link on the page is taken as the studio. The list is
        # rebuilt per page so a page without such links cannot inherit the previous
        # anime's studio.
        producers = [text for href, text in sidebar_links
                     if href.startswith('/anime/producer')]
        studio.append(producers[-1] if producers else None)

        genres.append([text for href, text in sidebar_links
                       if href.startswith('/anime/genre')])

        sidebar_rows = subpage.select('div[class="spaceit_pad"]')
        source.append(_labelled_value(sidebar_rows, 'Source'))
        rating.append(_labelled_value(sidebar_rows, 'Rating'))

        # Only the first recommendation / member count on the page is kept.
        recs = subpage.select('a[class="link bg-center"]>span')
        recommendation.append(recs[0].text if recs else None)

        members = subpage.select('span[class="numbers members"]>strong')
        viewers.append(members[0].text if members else None)

        # Texts of the related-entries label cells (presumably labels such as
        # 'Prequel:' / 'Sequel:' — confirm against the page markup).
        details.append([cell.text
                        for cell in subpage.select('td[class="ar fw-n borderClass"]')])

    return links, studio, genres, source, rating, recommendation, viewers, details

In [5]:
def put_it_in(info,more_info):
    """
    Combine the outputs of get_the_info and get_more_info into one pandas DataFrame.

    ``info`` supplies Rank, Title, Score, Format and Date (positions 0-4);
    ``more_info`` supplies Viewers, Studio, Genres, Source, Rating, Recommendation,
    Links and Details (positions 6, 1, 2, 3, 4, 5, 0 and 7 respectively).
    Columns are added one at a time in exactly that order.
    """
    # (column name, source tuple, position inside that tuple), in output column order.
    layout = (
        ('Rank', info, 0),
        ('Title', info, 1),
        ('Score', info, 2),
        ('Format', info, 3),
        ('Date', info, 4),
        ('Viewers', more_info, 6),
        ('Studio', more_info, 1),
        ('Genres', more_info, 2),
        ('Source', more_info, 3),
        ('Rating', more_info, 4),
        ('Recommendation', more_info, 5),
        ('Links', more_info, 0),
        ('Details', more_info, 7),
    )
    frame = pd.DataFrame()
    for column, origin, position in layout:
        frame[column] = origin[position]
    return frame

In [6]:
def popular_animes(number):
    """
    Scrape the first ``number`` anime of the by-popularity ranking.

    The first listing page is always fetched; further pages are requested with
    ``&limit=<offset>`` for offsets 50, 100, ... below ``number`` (presumably 50 entries
    per listing page — confirm against the site).

    Side effects
    ------------
    * Sets the module-level ``anilinks`` to the 'Links' column of every scraped row
      (before truncation to ``number``), with a fresh 0-based index.
    * Writes the returned table to 'popular_anime.csv'.

    Returns
    -------
    pandas.DataFrame
        The first ``number`` scraped rows, without the 'Links' column, re-indexed from 0.
    """
    global anilinks
    url = 'https://myanimelist.net/topanime.php?type=bypopularity'

    # One URL per listing page; the range is empty when a single page is enough, so no
    # separate single-page branch is needed.
    page_urls = [url] + [url + '&limit=' + str(offset) for offset in range(50, number, 50)]

    # Collect the per-page frames and concatenate once instead of re-copying the growing
    # table on every iteration.
    frames = []
    for page_url in page_urls:
        page = requests.get(page_url)
        frames.append(put_it_in(get_the_info(page), get_more_info(page)))
    animes = pd.concat(frames)

    anilinks = animes.pop('Links').reset_index(drop=True)
    animes = animes[:number].reset_index(drop=True)
    animes.to_csv('popular_anime.csv', index=False, sep=',')
    return animes

In [46]:
# Scrape the 1000 most popular anime. NOTE: slow and network-bound — popular_animes requests
# one listing page per 50 requested entries and get_more_info sleeps 2 seconds before every
# detail-page request. Also writes 'popular_anime.csv' and sets the global `anilinks`.
test=popular_animes(1000)

In [47]:
# NOTE(review): this binds a second name to the same DataFrame object rather than copying
# it, so columns later added through `test2` also appear on `test`. Use `test.copy()` if an
# untouched copy of the scraped data is wanted.
test2=test

In [48]:
def _split_format(text):
    """
    Split a Format string of the shape '<media> (<episodes> ...' into (media, episodes).

    The media type is everything before the first '(' and the episode count is the first
    whitespace-delimited token after it (presumably strings like 'TV (25 eps)' — confirm
    against the scraped data). Multi-word media types are kept whole instead of raising
    IndexError; episodes is None when there is no parenthesised part.
    """
    media, paren, rest = text.partition('(')
    if not paren:
        return text.strip(), None
    tokens = rest.split()
    return media.strip(), (tokens[0] if tokens else None)


# 1 when the anime's related-entry labels contain exactly 'Prequel:' / 'Sequel:', else 0.
test2['Prequel']=[1 if 'Prequel:' in labels else 0 for labels in test['Details']]
test2['Sequel']=[1 if 'Sequel:' in labels else 0 for labels in test['Details']]

# One 0/1 indicator column per genre. The genre names are sorted so the column order
# (and therefore the CSV layout) is the same on every run; iterating a bare set of
# strings can give a different order per interpreter session.
idvgenres=sorted({genre for genres in test['Genres'] for genre in genres})
for genre in idvgenres:
    test2[genre]=[1 if genre in genres else 0 for genres in test2['Genres']]

# Break the Format column into media type and episode count.
format_parts=[_split_format(fmt) for fmt in test2['Format']]
test2['Media']=[media for media, episodes in format_parts]
test2['Episodes']=[episodes for media, episodes in format_parts]

test2.to_csv('popular_anime_v2.csv',index=False,sep=',')