# Parse downloaded pages

### Importing libraries

In [407]:
import requests
from tqdm import tqdm
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
import os
import re
from natsort import natsorted

# Iterating over the folder

Define the functions that scrape the anime pages (incomplete)

* anime_info: a list that stores the information of one anime, in the order of the attributes list (the columns of the dataframe)

Then each anime_info is stored inside __list_of_anime__, a list that collects all the anime_info lists. That list is then passed to a dataframe, which is used to create the tsv

In [476]:
def _parse_single_date(text):
    """Parse one date fragment, trying the known layouts from most to least specific.

    Accepted layouts (commas are ignored): "Mon day Year", "Mon Year", "Year".
    Raises ValueError when the text matches none of them.
    """
    # Drop commas and collapse whitespace/newlines so strptime sees a clean string
    cleaned = " ".join(text.replace(",", " ").split())
    # %b = abbreviated month name
    for fmt in ("%b %d %Y", "%b %Y", "%Y"):
        try:
            return datetime.strptime(cleaned, fmt)
        except ValueError:
            continue
    raise ValueError("unrecognised date: %r" % (text,))


def parse_time(dates):
    """Split an "Aired" string into its release and end dates.

    Input: ``dates``, a string in one of these formats
    (surrounding whitespace and newlines are ignored):
        1)    Month day, Year to Month day, Year
        2)    Month day, Year
        3)    Year to Year
    The less precise layouts "Month Year" and "Year" are also accepted
    on either side.

    Returns a ``(release, end)`` tuple of ``datetime`` objects. ``end`` is
    ``None`` when there is no end part or when it cannot be parsed
    (for example "?" for something still airing).

    Raises ValueError when the release part cannot be parsed.
    """
    # Split on the standalone word "to" only, never on letters inside another word
    parts = [part.strip() for part in re.split(r"\bto\b", dates, maxsplit=1)]

    release = _parse_single_date(parts[0])

    end = None
    if len(parts) == 2:
        # not every anime has an end date (for example: still airing)
        try:
            end = _parse_single_date(parts[1])
        except ValueError:
            end = None

    return release, end

The `scrabbing_anime` functions extract the __INFORMATION__ section of the anime page plus the __TITLE__.

I will divide the scraping into parts:
* scrape 1: Title, Type, nEpisode, Release / End date
* scrape 2: Score, Ranked, Popularity, Members
* scrape 3: Anime characters and voices, synopsis
* scrape 4

In [477]:
def scrabbing_anime1(soup, anime_info):
    """Append title, type, episode count, release date and end date to ``anime_info``.

    ``soup`` is the parsed anime page; ``anime_info`` is extended in place with
    ``(title, type, n_episodes, release_date, end_date)`` and also returned.
    Any field that is not found on the page is stored as ``None``.

    The title lookup is not guarded: a page without the title ``<h1>`` raises
    AttributeError.
    """
    # Anime Title
    # <h1 class="title-name h1_bold_none"><strong>Fullmetal Alchemist: Brotherhood</strong></h1>
    title = str(soup.find("h1", attrs={"class": "title-name h1_bold_none"}).string)

    # Defaults, so a page without an Information section (or with a missing
    # row) yields None values instead of an unbound-variable error
    anime_type = None
    n_episodes = None
    release_date = None
    end_date = None

    # <h2>Information</h2>: there are many h2 tags, so only the ones
    # immediately followed by a div are considered
    for h2 in soup.select('h2:has(+div)'):
        if h2.text != "Information":
            continue
        # Only the next 4 <div> rows after Information are inspected;
        # "Status" (the 3rd one) is deliberately ignored
        for inform in h2.find_all_next("div", attrs={"class": "spaceit_pad"}, limit=4):
            contents = inform.contents
            if len(contents) < 2:
                # no label element in this row
                continue
            label = contents[1].string
            if label == "Type:":
                anime_type = inform.get_text(separator=" ", strip=True).split()[-1]
            elif label == "Episodes:":
                n_episodes = inform.get_text(separator=" ", strip=True).split()[-1]
            elif label == "Aired:":
                # the start/end date is not always stored:
                # the text after the label can be missing or unparsable
                try:
                    release_date, end_date = parse_time(contents[2])
                except (IndexError, ValueError, TypeError, AttributeError):
                    release_date = None
                    end_date = None

    # save on anime info
    anime_info.extend((title, anime_type, n_episodes, release_date, end_date))
    return anime_info

In [478]:
# scrabbing:
# Score, Ranked, Popularity, Members
def _strong_int(soup, css_class):
    """Return the integer inside the <strong> child of the span with ``css_class``, or None."""
    try:
        text = soup.find("span", {"class": css_class}).find("strong").contents[0]
        # values look like "2,676,066" or "#3": drop the decorations before converting
        return int(text.replace(",", "").replace("#", ""))
    except (AttributeError, IndexError, TypeError, ValueError):
        # tag missing, empty, or not a number (e.g. "N/A")
        return None


def scrabbing_anime2(soup, anime_info):
    """Append members, score, scoring users, rank and popularity to ``anime_info``.

    ``soup`` is the parsed anime page; ``anime_info`` is extended in place with
    ``(members, score, users, rank, popularity)`` and also returned.
    Any value that is missing or not numeric is stored as ``None``.
    """
    # animeNumMembers
    members = _strong_int(soup, "numbers members")

    # animeScore and the number of users who scored come from the same <div>
    score_tag = soup.find("div", attrs={"class": "fl-l score", "data-title": "score"})
    try:
        # is a number
        score = float(score_tag.contents[0].string)
    except (AttributeError, IndexError, TypeError, ValueError):
        # is N/A (or the tag is missing)
        score = None
    try:
        # only the first token of the attribute is used, e.g. "1,622,384 users" -> "1,622,384"
        users = int(score_tag.get("data-user").replace(",", "").split()[0])
    except (AttributeError, IndexError, TypeError, ValueError):
        users = None

    # animeRank (None when the anime has a rank of N/A)
    rank = _strong_int(soup, "numbers ranked")
    # animePopularity
    popularity = _strong_int(soup, "numbers popularity")

    # save on anime info
    anime_info.extend((members, score, users, rank, popularity))

    return anime_info

In [479]:
# Anime characters and voices, synopsis, related anime, staff
def scrabbing_anime3(soup, anime_info):
    """Append synopsis, related titles, characters, voices and staff to ``anime_info``.

    ``soup`` is the parsed anime page; ``anime_info`` is extended in place with
    ``(synopsis, related, characters, voices, staff)`` and also returned.

    * synopsis: text of the description paragraph, or None when it is absent
    * related: unique texts of the related-anime cells, in page order
      (an empty list when there are none)
    * characters / voices: lists of strings, or None when the page has no
      characters block
    * staff: a one-element list holding the list of ``[name, role]`` pairs,
      or None when the staff block is absent or malformed
    """
    # The first "detail-characters-list" div holds characters and voices, the
    # second one the staff; neither always exists
    detail_lists = soup.find_all("div", {"class": "detail-characters-list clearfix"})

    # Characters and voices
    try:
        cast_block = detail_lists[0]
    except IndexError:
        characters = None
        voices = None
    else:
        characters = [
            h3.get_text()
            for h3 in cast_block.find_all("h3", {"class": "h3_characters_voice_actors"})
        ]
        voices = [
            td.get_text().replace("\n", "")
            for td in cast_block.find_all("td", {"class": "va-t ar pl4 pr4"})
        ]

    # synopsis (description)
    synopsis_tag = soup.find("p", attrs={"itemprop": "description"})
    synopsis = str(synopsis_tag.text) if synopsis_tag is not None else None

    # related anime: keep only unique values; dict.fromkeys preserves the page
    # order so the output is the same on every run
    related_cells = soup.find_all("td", {"width": "100%", "class": "borderClass"})
    related = list(dict.fromkeys(cell.get_text() for cell in related_cells))

    # Staff: every second <td> (odd positions) carries the name and the role
    try:
        pairs = []
        for cell in detail_lists[1].find_all("td")[1::2]:
            person = cell.contents[1].contents[0]
            role = cell.find_all("small")[0].contents[0]
            pairs.append([person, role])
        # NOTE(review): the pairs are wrapped in an outer one-element list,
        # as the existing output does — confirm the extra nesting is wanted
        staff = [pairs]
    except (IndexError, AttributeError):
        staff = None

    # save on anime info
    anime_info.extend((synopsis, related, characters, voices, staff))
    return anime_info

# Try one anime

In [480]:
# Column names, in the same order in which the scraping functions append values
attrs = ["animeTitle", "animeType", "animeNumEpisode","releaseDate","endDate","animeNumMembers","animeScore","animeUsers","animeRank",
         "animePopularity","animeDescription","animeRelated","animeCharacters","animeVoices","animeStaff"]

list_of_anime = []
# Another sample page: ./Folder_with_page/page43/anime_2110.html
html_name = r"./Folder_with_page/page1/anime_1.html"

with open(html_name, "r",  encoding='utf-8') as fp:
    soup = BeautifulSoup(fp, "html.parser")

# Each scraping function appends its fields to anime_info in place,
# so the return value does not need to be combined with anything
anime_info = []
scrabbing_anime1(soup, anime_info)
scrabbing_anime2(soup, anime_info)
scrabbing_anime3(soup, anime_info)


# scraping of this anime is finished
list_of_anime.append(anime_info)

# Creating the DataFrame
df = pd.DataFrame(list_of_anime, columns = attrs)
# change the text attributes to the pandas string dtype
str_cols = ["animeTitle", "animeType", "animeDescription"]
df[str_cols] = df[str_cols].astype("string")
# Creating the tsv file
df.to_csv('anime1.tsv', index = False, sep = "\t")

In [481]:
# Display the collected rows to inspect the parsed values
list_of_anime

[['Fullmetal Alchemist: Brotherhood',
  'TV',
  '64',
  datetime.datetime(2009, 4, 5, 0, 0),
  datetime.datetime(2010, 7, 4, 0, 0),
  2676066,
  9.16,
  1622384,
  1,
  3,
  'After a horrific alchemy experiment goes wrong in the Elric household, brothers Edward and Alphonse are left in a catastrophic new reality. Ignoring the alchemical principle banning human transmutation, the boys attempted to bring their recently deceased mother back to life. Instead, they suffered brutal personal loss: Alphonse\'s body disintegrated while Edward lost a leg and then sacrificed an arm to keep Alphonse\'s soul in the physical realm by binding it to a hulking suit of armor.\n\n\nThe brothers are rescued by their neighbor Pinako Rockbell and her granddaughter Winry. Known as a bio-mechanical engineering prodigy, Winry creates prosthetic limbs for Edward by utilizing "automail," a tough, versatile metal used in robots and combat armor. After years of training, the Elric brothers set off on a quest to re

In [482]:
# Check the dtype pandas assigned to each column
df.dtypes

animeTitle                  string
animeType                   string
animeNumEpisode             object
releaseDate         datetime64[ns]
endDate             datetime64[ns]
animeNumMembers              int64
animeScore                 float64
animeUsers                   int64
animeRank                    int64
animePopularity              int64
animeDescription            string
animeRelated                object
animeCharacters             object
animeVoices                 object
animeStaff                  object
dtype: object

In [483]:
# Display the one-row DataFrame built from the sample page
df

Unnamed: 0,animeTitle,animeType,animeNumEpisode,releaseDate,endDate,animeNumMembers,animeScore,animeUsers,animeRank,animePopularity,animeDescription,animeRelated,animeCharacters,animeVoices,animeStaff
0,Fullmetal Alchemist: Brotherhood,TV,64,2009-04-05,2010-07-04,2676066,9.16,1622384,1,3,After a horrific alchemy experiment goes wrong...,[Fullmetal Alchemist: Brotherhood - 4-Koma The...,"[Elric, Edward, Elric, Alphonse, Mustang, Roy,...","[Park, RomiJapanese, Kugimiya, RieJapanese, Mi...","[[[Cook, Justin, Producer], [Yonai, Noritomo, ..."


# For all the anime

In [488]:
# Column names, in the same order in which the scraping functions append values
attrs = ["animeTitle", "animeType", "animeNumEpisode","releaseDate","endDate","animeNumMembers","animeScore","animeUsers","animeRank",
         "animePopularity","animeDescription","animeRelated","animeCharacters","animeVoices","animeStaff"]
# Columns converted to the pandas string dtype
str_cols = ["animeTitle", "animeType", "animeDescription"]

# Make sure the output folder exists before writing the per-anime files
os.makedirs("./tsv_anime", exist_ok=True)

list_of_anime = []
# Pages are numbered from 1.
# NOTE(review): only pages 1 to 30 are processed here, although the original
# comment mentioned pages 1 to 130 — confirm the intended range
for page in tqdm(range(1, 31)):
    folder = "./Folder_with_page/page"+str(page)
    for anime in natsorted(os.listdir(folder)):
        # skip anything that is not a downloaded page (hidden files, checkpoints, ...)
        if not anime.endswith(".html"):
            continue
        with open(folder + "/" + anime, "r",  encoding='utf-8') as fp:
            soup = BeautifulSoup(fp, "html.parser")
        # Each scraping function appends its fields to anime_info in place
        anime_info = []
        scrabbing_anime1(soup, anime_info)
        scrabbing_anime2(soup, anime_info)
        scrabbing_anime3(soup, anime_info)

        # Creating the one-row DataFrame
        df = pd.DataFrame([anime_info], columns = attrs)
        # change the text attributes to the pandas string dtype
        df[str_cols] = df[str_cols].astype("string")
        # Creating the tsv file, named after the page file without its extension
        name = os.path.splitext(anime)[0]
        df.to_csv("./tsv_anime/"+name+".tsv", index = False, sep = "\t")
        # keep every entry to check them afterwards
        list_of_anime.append(anime_info)
        

100%|██████████████████████████████████████████████████████████████████████████████████| 30/30 [03:34<00:00,  7.14s/it]


# Optional

In [490]:
# Gather every parsed row into a single table
df_total = pd.DataFrame(data=list_of_anime, columns=attrs)
# Dump the whole table as one tab-separated file
df_total.to_csv("./anime_totale.tsv", sep="\t", index=False)
    