In [11]:
import re
import requests
import pandas as pd

from tqdm import tqdm_notebook
from bs4 import BeautifulSoup

## Functions 

In [12]:
def get_dict_ratings_anime(anime, ratings, genres, anime_img, anime_link, episodes):
    
    df_ratings = pd.DataFrame(ratings)    

    dict_ratings = {
        'anime' : anime,
        'anime_img' : anime_img,
        'anime_url' : anime_link,
        'episodes' : episodes,
        'votes' : df_ratings['rate_votes'].sum(),
        'weight' : df_ratings['rate_weight'].sum(),
        'rate' : round(df_ratings['rate_weight'].sum() / df_ratings['rate_votes'].sum(), 2),
    }
    
    # For each rating
    for i in range(1,6):
        dict_ratings[f'rate_{i}'] = df_ratings[df_ratings['rate_class'] == i]['rate_votes'].values[0]
        
    return {**genres, **dict_ratings}

## Crawling sitemap to get urls for each anime

In [13]:
url = 'https://www.crunchyroll.com/sitemap'
request = requests.get(url)
soup = BeautifulSoup(request.content)

sitemap = soup.findAll('loc')

languages_symbols = ['en-gb', 'es', 'es-es', 'pt-br', 'pt-pt', 'fr', 'de', 'ar', 'it', 'ru']
languages_symbols_url = [ f'https://www.crunchyroll.com/{i}' for i in languages_symbols]

anime_links = []
for url in tqdm_notebook(sitemap):
    request = requests.get(url.text)
    soup = BeautifulSoup(request.content)
    
    for link in soup.findAll('loc'):
        link = link.text

        # Looks only urls with len greater than https://www.crunchyroll.com/
        if len(link) > 28:
            if link.startswith('https://www.crunchyroll.com/') and 'forumtopic' not in link:
                if not link in languages_symbols_url:
                    re_search_tabs = re.search(r'(https://www\.crunchyroll\.com/.*)/', link)
                    if not re_search_tabs:
                        anime_links.append(link)
           
                    
anime_links = list(set(anime_links))

HBox(children=(IntProgress(value=0, max=66), HTML(value='')))




## Write list containing urls for animes

In [14]:
with open("../data/animes_list_urls.txt","w+") as f:
    f.write('\n'.join(anime_links))

## Crawling animes to get ratings and genres

In [15]:
anime_ratings = []
for anime_link in tqdm_notebook(anime_links):
    request = requests.get(anime_link)
    soup = BeautifulSoup(request.content)
    rating = soup.find('ul', {'class' : 'rating-histogram'})
    
    anime = soup.find('div', {'id' : 'showview-content-header'}).find('span').text
    anime_img = soup.find('img', {'class' : 'poster xsmall-margin-bottom'})['src']
    content_videos = soup.find('div', {'id' : 'showview_content_videos'})
    episodes = len(content_videos.findAll('a', {'class' : 'episode'}))
    
    ratings = []
    for rate in rating.findAll('li'):
        rate_class = rate.find('div', {'class' : 'left num strong'})
        rate_votes = rate.find(lambda tag: tag.name == 'div' and tag['class'] == ['left'])
        rate_votes = re.search(r'(\d+)', rate_votes.text)
        if rate_class:
            rate_class = int(rate_class.text)
            if rate_votes:
                rate_votes = int(rate_votes.group(1))
            else : 
                rate_votes = 0
                
            ratings.append({'rate_class' : rate_class, 'rate_votes' : rate_votes,
                            'rate_weight' : rate_class * rate_votes})
    
    genres = {}
    for link in soup.findAll(lambda tag: tag.name == 'a' and tag.get('class') == ['text-link']):
        if 'genres' in link['href']:
            genres[f'genre_{link.text}'] = 1
    
    anime_ratings.append(get_dict_ratings_anime(anime, ratings, genres, anime_img, anime_link, episodes))

HBox(children=(IntProgress(value=0, max=1255), HTML(value='')))

  if sys.path[0] == '':





## Write csv with ratings and genres

In [16]:
df_ratings = pd.DataFrame(anime_ratings).fillna(0)

# Sort columns
cols_genres = [col for col in df_ratings.columns if col.startswith('genre_')]
cols_genres.sort()

df_ratings = df_ratings[['anime', 'anime_url', 'anime_img', 'votes', 'weight', 'rate',
                         'rate_1', 'rate_2', 'rate_3', 'rate_4', 'rate_5'] + cols_genres]

df_ratings.sort_values(['votes', 'rate'], ascending=False).to_csv('../data/ratings.csv', index=False)

In [17]:
df_ratings

Unnamed: 0,anime,anime_url,anime_img,votes,weight,rate,rate_1,rate_2,rate_3,rate_4,...,genre_romance,genre_sci-fi,genre_seinen,genre_sgdrama,genre_shojo,genre_shonen,genre_slice of life,genre_sports,genre_supernatural,genre_thriller
0,Kabaneri of the Iron Fortress,https://www.crunchyroll.com/kabaneri-of-the-ir...,https://img1.ak.crunchyroll.com/i/spire4/b280b...,49,203,4.14,3,1,6,15,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
1,Kiitaro's yokai picture diary,https://www.crunchyroll.com/kiitaros-yokai-pic...,https://img1.ak.crunchyroll.com/i/spire3/4d785...,8,34,4.25,1,0,0,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,JK-Meshi!,https://www.crunchyroll.com/jk-meshi,https://img1.ak.crunchyroll.com/i/spire2/8a323...,22,52,2.36,11,1,4,3,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,Rainy Cocoa sideG,https://www.crunchyroll.com/rainy-cocoa-sideg,https://img1.ak.crunchyroll.com/i/spire2/c0ffa...,8,15,1.88,4,2,1,1,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,Mischievous Kiss - The Movie,https://www.crunchyroll.com/mischievous-kiss-t...,https://img1.ak.crunchyroll.com/i/spire4/20606...,0,0,0.00,0,0,0,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1250,Hinako Note,https://www.crunchyroll.com/hinako-note,https://img1.ak.crunchyroll.com/i/spire2/174d1...,29,114,3.93,0,3,5,12,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1251,Masamune-kun's Revenge,https://www.crunchyroll.com/masamune-kuns-revenge,https://img1.ak.crunchyroll.com/i/spire4/e2ee0...,186,671,3.61,19,26,35,35,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1252,Kono Aozora ni Yakusoku wo,https://www.crunchyroll.com/kono-aozora-ni-yak...,https://img1.ak.crunchyroll.com/i/spire3/e36ef...,1119,5143,4.60,37,14,59,144,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1253,Sword Art Online Alternative: Gun Gale Online,https://www.crunchyroll.com/sword-art-online-a...,https://img1.ak.crunchyroll.com/i/spire4/88fd7...,182,678,3.73,26,13,25,39,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
