In [44]:
import os
from bs4 import BeautifulSoup
import requests
import time
from datetime import datetime
import json
from zipfile import ZipFile
import pandas as pd
import shutil
import zipfile
import random

In [45]:
BASE_PATH = "data/anime_rec"
HTML_PATH = BASE_PATH + "/html"
USER_PATH = BASE_PATH + "/users"

In [46]:
os.makedirs(BASE_PATH, exist_ok=True)
os.makedirs(HTML_PATH, exist_ok=True)
os.makedirs(USER_PATH, exist_ok=True)

In [47]:
clubs_path = os.path.join(BASE_PATH, "clubs.txt")
users_list_path = os.path.join(BASE_PATH, "users_list.txt")
users_path = os.path.join(BASE_PATH, "users.csv")
revised_clubs_path = os.path.join(BASE_PATH, "revised_clubs.txt")
last_revised_user_path = os.path.join(BASE_PATH, "_last_revised_users.txt")

## Procedure

1. Fetch `myanimelist` web, API for club IDs. Store all IDs in a text file.
2. Get users per each club ID.
3. Get anime list per user
4. Download anime's data
5. Other processing steps

In [48]:
class Request:
    def __init__(self, base_url):
        self.base_url = base_url

    @staticmethod
    def parse_params(params: dict[str, str]) -> str:
        return "&".join([f"{x}={y}" for x, y in params.items()])

    def get_endpoint_url(self, endpoint: str, params: dict[str, str]) -> str:
        return f"{self.base_url}/{endpoint}?{Request.parse_params(params)}"

    def request(self, endpoint, **kwargs):
        try:
            url = self.get_endpoint_url(endpoint, kwargs)
            response = requests.get(url)
            response.raise_for_status()
            return response
        except KeyboardInterrupt:
            raise Error
        except:
            return None

anime_list_cli = Request("https://myanimelist.net")
jikan_cli = Request("https://api.jikan.moe/v4")

## Club ID

In [49]:
USER_COUNT_THRESHOLD = 1000000
club_ids = set()
possible_users = 0
page = 1


def get_number(string: str) -> int:
    return int(string.strip().replace(",", ""))


def get_club_ids_by_page(page: int):
    global possible_users
    """
    Fetch the URL for clubs. Add club id to `club_ids` set if the club has more than 30 members.
    """
    response = anime_list_cli.request("clubs.php", p=page)
    soup = BeautifulSoup(response.content, "html.parser")
    rows = soup.find_all("tr", {"class": "table-data"})
    for row in rows:
        club_id = get_number(row.find("a", class_="fw-b").get("href").split("=")[-1])
        member_count = get_number(row.find("td", class_="ac").text)
        if club_id not in club_ids and member_count > 30:
            possible_users += member_count
            club_ids.add(club_id)


while possible_users < USER_COUNT_THRESHOLD:
    print(f"\rPage: {page}", end="")
    time.sleep(3)
    get_club_ids_by_page(page)
    page += 1

Page: 23

In [50]:
with open(clubs_path, "w") as f:
    for club_id in club_ids:
        f.write(f"{club_id}\n")

## Get usernames in each club

In [51]:
with open(clubs_path) as f:
    clubs_id = [x.strip() for x in f.readlines()]
len(clubs_id)

726

### Get usernames

In [52]:
# Fetch `jikan` API to get users' information.
# Write data to `users.txt`
revised_clubs = set()
users = set()

for idx, club_id in enumerate(clubs_id):
    if idx + 1 > 10:
        break

    if club_id in revised_clubs:
        continue

    time.sleep(5)
    endpoint = f"clubs/{club_id}/members"
    response = jikan_cli.request(endpoint)
    if response is None:
        continue

    with open(users_list_path, mode="a", encoding="utf-8") as f:
        for user in map(lambda x: x["username"], response.json()["data"]):
            if user not in users and user != "":
                f.write(user + "\n")
                users.add(user)

    revised_clubs.add(club_id)
    with open(revised_clubs_path, "a", encoding="utf-8") as f:
        f.write(club_id + "\n")

    print(f"\r{idx+1}/{len(clubs_id)}", end="")

10/726

In [54]:
with open(clubs_path) as f:
    clubs_id = [x.strip() for x in f.readlines()]

with open(users_list_path, encoding="UTF-8") as f:
    users = set([x.strip() for x in f.readlines()])

with open(revised_clubs_path, encoding="UTF-8") as f:
    revised_clubs = set([int(x.strip()) for x in f.readlines()])

len(clubs_id), len(users), len(revised_clubs)

(726, 335, 10)

In [55]:
with open(users_list_path, encoding="UTF-8") as f:
    users = list(set([x.strip() for x in f.readlines()]))[1:]
    random.shuffle(users)

with open(users_path, "w", encoding="UTF-8") as f:
    f.write("user_id,username\n")
    for i, user in enumerate(users):
        f.write(f"{i},{user}\n")

## Get anime list per user

In [56]:
with open(users_path, "r", encoding="UTF-8") as file:
    file.readline()
    users = [x.strip().split(",") for x in file.readlines()]
    users = [(int(x[0]), x[1]) for x in users]

last_revised_user = -1
if os.path.exists(last_revised_user_path):
    with open(last_revised_user_path, "r", encoding="UTF-8") as file:
        last_revised_user = int(file.readline())

len(users), last_revised_user

(334, -1)

In [57]:
def record_anime_for_user(data: list[dict[str, str]], user_id: str):
    if len(data) > 0:
        with open(os.path.join(USER_PATH, f"{user_id}.csv"), "w") as f:
            f.write("anime_id,score,watching_status,watched_episodes\n")
            for a, b, c, d in data:
                f.write(f"{a},{b},{c},{d}" + "\n")


def store_last_processed_user(user_id):
    with open(last_revised_user_path, "w", encoding="UTF-8") as f:
        f.write(f"{user_id}\n")


for idx, (user_id, username) in enumerate(users):
    if idx + 1 > 5:
        break

    if user_id <= last_revised_user:
        continue

    all_animes = []
    offset = 0
    while True:
        time.sleep(5)
        endpoint = f"animelist/{username}/load.json"
        response = anime_list_cli.request(endpoint, offset=offset)
        if response is None:
            continue

        data = response.json()
        for anime in data:
            all_animes.append((anime["anime_id"], anime["score"], anime["status"], anime["num_watched_episodes"]))

        offset += 300
        if len(data) < 300:
            break

    record_anime_for_user(all_animes, user_id)

    store_last_processed_user(user_id)
    print(f"\rUser {idx + 1}", end="")

User 5

## Download anime HTML

In [59]:
unique_anime = set()
folder = os.listdir(USER_PATH)
for user_file in folder:
    if not user_file.endswith(".csv"):
        continue
    with open(f"{USER_PATH}/{user_file}", "r") as file:
        file.readline()
        for line in file:
            anime = line.strip().split(",")[0]
            unique_anime.add(anime)
len(unique_anime)

1946

In [60]:
MAX = 7  # MAX SECOND TO WAIT PER REQUEST
MIN = 4  # MIN SECONDS TO WAIT PER REQUEST


def sleep():
    time_to_sleep = random.random() * (MAX - MIN) + MIN
    time.sleep(time_to_sleep)


def get_link_by_text(soup, anime_id, text):
    links = list(filter(lambda x: anime_id in x["href"], soup.find_all("a", string=text)))
    return links[0]["href"]


def save(path, data):
    with open(path, "w", encoding="UTF-8") as file:
        file.write(data)


def save_link(link, anime_id, name):
    sleep()
    path = f"{HTML_PATH}/{anime_id}/{name}.html"
    data = requests.get(link)
    soup = BeautifulSoup(data.text, "html.parser")
    soup.script.decompose()
    save(path, soup.prettify())
    return soup


def save_reviews(link, anime_id):
    page = 1
    while True:
        sleep()
        actual_link = f"{link}?p={page}"
        data = requests.get(actual_link)
        soup = BeautifulSoup(data.text, "html.parser")
        reviews = soup.find_all("a", string="Overall Rating")
        if len(reviews) == 0:
            break

        path = f"{HTML_PATH}/{anime_id}/reviews_{page}.html"
        soup.script.decompose()
        save(path, soup.prettify())
        page += 1


def scrape_anime(anime_id):
    path = f"{HTML_PATH}/{anime_id}"
    os.makedirs(path, exist_ok=True)
    sleep()
    data = requests.get(f"https://myanimelist.net/anime/{anime_id}")

    anime_info = data.text
    soup = BeautifulSoup(anime_info, "html.parser")
    soup.script.decompose()
    save(f"{HTML_PATH}/{anime_id}/details.html", soup.prettify())

    link_review = get_link_by_text(soup, anime_id, "Reviews")
    link_recomendations = get_link_by_text(soup, anime_id, "Recommendations")
    link_stats = get_link_by_text(soup, anime_id, "Stats")
    link_staff = get_link_by_text(soup, anime_id, "Characters & Staff")
    link_pictures = get_link_by_text(soup, anime_id, "Pictures")

    save_link(link_pictures, anime_id, "pictures")
    save_link(link_staff, anime_id, "staff")
    save_link(link_stats, anime_id, "stats")
    save_link(link_recomendations, anime_id, "recomendations")
    save_reviews(link_review, anime_id)


def zipdir(path, ziph):
    # ziph is zipfile handle
    for root, dirs, files in os.walk(path):
        for file in files:
            ziph.write(
                os.path.join(root, file),
                os.path.relpath(os.path.join(root, file), path),
            )

In [61]:
# Scrape data for anime by its ID.
# Store data in a temporary folder, which contains all html files of the anime.
# Compress the whole folder into a zip file.
# Finally, remove the temporary folder.

for i, anime_id in enumerate(unique_anime):
    if i + 1 > 3:
        break

    if os.path.isfile(f"{HTML_PATH}/{anime_id}.zip"):
        continue

    print(f"\r{i+1}/{len(unique_anime)}", end="")

    try:
        scrape_anime(anime_id)
    except KeyboardInterrupt:
        break
    except:
        time.sleep(120)
        continue

    path = f"{HTML_PATH}/{anime_id}"
    zipf = zipfile.ZipFile(f"{path}.zip", "w", zipfile.ZIP_DEFLATED)
    zipdir(path, zipf)
    zipf.close()

    shutil.rmtree(path)

3/1946

In [62]:
def extract_zip(input_zip: str):
    """
    :param input_zip: Path to the file.
    :type input_zip: str
    :return: Dict containing data of the anime. 
    :rtype: dict[str, str]
    """
    input_zip = ZipFile(input_zip)
    return {name: input_zip.read(name) for name in input_zip.namelist()}

KEYS = ['MAL_ID', 'Name', 'Score', 'Genders', 'English name', 'Japanese name', 'Type', 'Episodes',
        'Aired', 'Premiered', 'Producers', 'Licensors', 'Studios', 'Source', 'Duration', 'Rating',
        'Ranked', 'Popularity', 'Members', 'Favorites', 'Watching', 'Completed', 'On-Hold', 'Dropped',
        'Plan to Watch', 'Score-10', 'Score-9', 'Score-8', 'Score-7', 'Score-6', 'Score-5', 'Score-4',
        'Score-3', 'Score-2', 'Score-1']

In [69]:
def get_name(info):
    try:
        return info.find("h1", {"class": "title-name h1_bold_none"}).text.strip()
    except: 
        return ""

def get_english_name(info):
    try:
        span = info.find_all("span", {"class": "dark_text"})
        return span.parent.text.strip()
    except:
        return ""

def get_table(a_soup):
    try:
        return a_soup.find("div", {"class": "po-r js-statistics-info di-ib"})
    except:
        return ""

def get_score(stats):
    try:
        score = stats.find("span", {"itemprop": "ratingValue"})
        if score is None:
            return "Unknown"
        return score.text.strip()
    except:
        return ""

def get_gender(sum_info):
    try:
        text = ", ".join(
            [x.text.strip() for x in sum_info.find_all("span", {"itemprop": "genre"})]
        )
        return text
    except:
        return ""

def get_description(sum_info):
    try:
        return sum_info.find("td", {"class": "borderClass", "width": "225"})
    except:
        return ""

def get_all_stats(soup):
    try:
        return soup.find("div", {"id": "horiznav_nav"}).parent.find_all(
            "div", {"class": "spaceit_pad"}
        )
    except:
        return ""

def get_info_anime(anime_id):
    path = os.path.join(HTML_PATH, f"{anime_id}.zip")
    data = extract_zip(path)
    anime_info = data["stats.html"].decode()
    soup = BeautifulSoup(anime_info, "html.parser")

    stats = get_table(soup)
    description = get_description(soup)
    anime_info = {key: "Unknown" for key in KEYS}

    anime_info["MAL_ID"] = anime_id
    anime_info["Name"] = get_name(soup)
    anime_info["Score"] = get_score(stats)
    anime_info["Genders"] = get_gender(description)

    for d in description.find_all("span", {"class": "dark_text"}):
        information = [x.strip().replace(" ", " ") for x in d.parent.text.split(":")]
        category, value = information[0], ":".join(information[1:])
        value.replace("\t", "")

        if category in ["Broadcast", "Synonyms", "Genres", "Score", "Status"]:
            continue

        if category in ["Ranked"]:
            value = value.split("\n")[0]
        if category in ["Producers", "Licensors", "Studios"]:
            value = ", ".join([x.strip() for x in value.split(",")])
        if category in ["Ranked", "Popularity"]:
            value = value.replace("#", "")
        if category in ["Members", "Favorites"]:
            value = value.replace(",", "")
        if category in ["English", "Japanese"]:
            category += " name"

        anime_info[category] = value.strip()

    # Stats (Watching, Completed, On-Hold, Dropped, Plan to Watch)
    for d in get_all_stats(soup)[:5]:
        category, value = [x.strip().replace(" ", " ") for x in d.text.split(":")]
        value = value.replace(",", "")
        anime_info[category] = value

    # Stast votes per score
    for d in get_all_stats(soup)[6:]:
        score = d.parent.parent.find("td", {"class": "score-label"}).text.strip()
        value = [x.strip().replace(" ", " ") for x in d.text.split("%")][1].strip(
            "(votes)"
        )
        label = f"Score-{score}"
        anime_info[label] = value.strip()

    unknown_list = ["?", "None found, add some", "None", "N/A", "Not available"]
    for key, value in anime_info.items():
        if str(value) in unknown_list:
            anime_info[key] = "Unknown"
    return anime_info

In [70]:
os.listdir(HTML_PATH)

['8311.zip', '151.zip', '10805.zip', '.ipynb_checkpoints']

In [72]:
get_info_anime(10805)

{'MAL_ID': 10805,
 'Name': 'Kami nomi zo Shiru Sekai: 4-nin to Idol',
 'Score': '',
 'Genders': 'Comedy, Supernatural, Music, Shounen',
 'English name': 'The World God Only Knows:Four Girls and an Idol',
 'Japanese name': '神のみぞ知るセカイ\u3000４人とアイドル',
 'Type': 'OVA',
 'Episodes': '1',
 'Aired': 'Sep 16, 2011',
 'Premiered': 'Unknown',
 'Producers': 'Geneon Universal Entertainment, Shogakukan-Shueisha Productions, Shogakukan Music & Digital Entertainment',
 'Licensors': 'Sentai Filmworks',
 'Studios': 'Manglobe',
 'Source': 'Manga',
 'Duration': '24 min.',
 'Rating': 'PG-13 - Teens 13 or older',
 'Ranked': '2938',
 'Popularity': '2089',
 'Members': '110009',
 'Favorites': '89',
 'Watching': '1501',
 'Completed': '90951',
 'On-Hold': '714',
 'Dropped': '492',
 'Plan to Watch': '16351',
 'Score-10': '4550',
 'Score-9': '6188',
 'Score-8': '15436',
 'Score-7': '22446',
 'Score-6': '9750',
 'Score-5': '3598',
 'Score-4': '977',
 'Score-3': '323',
 'Score-2': '113',
 'Score-1': '137',
 'Theme': 

In [73]:
anime_revised = set()
anime_tsv_path = os.path.join(BASE_PATH, "anime.tsv")
exist_file = os.path.exists(anime_tsv_path)
actual_data = pd.DataFrame()
if exist_file:
    actual_data = pd.read_csv(anime_tsv_path, sep="\t")
    anime_revised = list(actual_data.MAL_ID.unique())

actual_data.head()
total_data = []
zips = os.listdir(HTML_PATH)
for i, anime in enumerate(zips):
    if not ".zip" in anime:
        continue

    anime_id = int(anime.strip(".zip"))
    if int(anime_id) in anime_revised:
        continue

    print(f"{i+1}/{len(zips)} ({anime_id})")

    anime_id = anime.strip(".zip")
    total_data.append(get_info_anime(anime_id))

if len(total_data):
    df = pd.DataFrame.from_dict(total_data)
    df["MAL_ID"] = pd.to_numeric(df["MAL_ID"])
    df = df.sort_values(by="MAL_ID").reset_index(drop=True)

    if exist_file:
        df = (
            pd.concat([actual_data, df]).sort_values(by="MAL_ID").reset_index(drop=True)
        )

else:
    df = actual_data

pd.set_option("display.max_columns", None)
df.head()

1/4 (8311)
2/4 (151)
3/4 (10805)


Unnamed: 0,MAL_ID,Name,Score,Genders,English name,Japanese name,Type,Episodes,Aired,Premiered,Producers,Licensors,Studios,Source,Duration,Rating,Ranked,Popularity,Members,Favorites,Watching,Completed,On-Hold,Dropped,Plan to Watch,Score-10,Score-9,Score-8,Score-7,Score-6,Score-5,Score-4,Score-3,Score-2,Score-1,German,Spanish,French,Demographic,Theme
0,151,Re: Cutie Honey,,"Action, Comedy, Girls Love, Sci-Fi, Ecchi",Unknown,Re:キューティーハニー,OVA,3,"Jul 24, 2004 to Sep 25, 2004",Unknown,Toei Video,Discotek Media,"Gainax, Toei Animation",Manga,45 min. per ep.,R+ - Mild Nudity,3779,4178,32154,194,1428,15030,816,748,14132,942,1295,3026,3596,1896,840,311,123,57,79,,,,,
1,8311,Tegamibachi Reverse,,"Adventure, Fantasy, Shounen",Tegami Bachi:Letter Bee Reverse,テガミバチ REVERSE,TV,25,"Oct 3, 2010 to Mar 26, 2011",Fall 2010,"Pierrot, TV Tokyo",Sentai Filmworks,Pierrot Plus,Manga,20 min. per ep.,PG-13 - Teens 13 or older,1277,2872,65596,213,3218,36712,2529,1941,21196,3310,4575,8216,7116,2792,1053,362,144,73,69,Tegami Bachi:Letter Bee Reverse,Tegami Bachi Letter Bee Temporada 2,Letter Bee Reverse,Shounen\n \n\n Shounen,
2,10805,Kami nomi zo Shiru Sekai: 4-nin to Idol,,"Comedy, Supernatural, Music, Shounen",The World God Only Knows:Four Girls and an Idol,神のみぞ知るセカイ　４人とアイドル,OVA,1,"Sep 16, 2011",Unknown,"Geneon Universal Entertainment, Shogakukan-Shu...",Sentai Filmworks,Manglobe,Manga,24 min.,PG-13 - Teens 13 or older,2938,2089,110009,89,1501,90951,714,492,16351,4550,6188,15436,22446,9750,3598,977,323,113,137,,,,Shounen\n \n\n Shounen,Music\n \n\n Music


In [74]:
df.to_csv(f"{BASE_PATH}/anime.tsv", index=False, sep="\t", encoding="UTF-8")

## Rating complete

In [75]:
rating_complete_path = os.path.join(BASE_PATH, "rating_complete.csv")
if not os.path.exists(rating_complete_path):
    with open(rating_complete_path, "w", encoding="UTF-8") as file:
        file.write("user_id,anime_id,rating\n")

In [76]:
unique_anime = set()
all_users = sorted(
    [user_file for user_file in os.listdir(USER_PATH) if user_file.endswith(".csv")],
    key=lambda x: int(x.split(".")[0])
)

with open(rating_complete_path, "a") as f1:
    for i, user_file in enumerate(all_users):
        if not user_file.endswith(".csv"):
            continue

        print(f"\r{i+1}/{len(all_users)}", end="")

        user_id = user_file.split(".")[0]
        with open(f"{USER_PATH}/{user_file}", "r") as file:
            file.readline()
            for line in file:
                anime_id, score, watching_status, _ = line.strip().split(",")
                if int(watching_status) == 2 and (score) != 0:
                    f1.write(f"{user_id},{anime_id},{score}\n")

5/5