In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

from pathlib import Path
# Directory layout used by this notebook, all relative to the shared data folder
data = Path("..") / "data"
plot_path = data.joinpath("plots")
map_path = data.joinpath("graph3_map")
stat_path = map_path.joinpath("stats")

In [2]:
# Get the list of all the studios.
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() surfaces HTTP errors here instead of letting an error page
# be parsed into an empty studio list.
url = "https://myanimelist.net/anime/producer"
r = requests.get(url, timeout=30)
r.raise_for_status()
soup = BeautifulSoup(r.content, "html.parser")
# Each producer link is wrapped in a <div class="genre-list al">.
studio_list = soup.find_all("div", {"class": "genre-list al"})

In [3]:
# Inspect the raw scrape result: a list of <div> elements, each wrapping one <a> link
studio_list

[<div class="genre-list al"><a class="genre-name-link" href="/anime/producer/441/8bit">8bit (57)</a></div>,
 <div class="genre-list al"><a class="genre-name-link" href="/anime/producer/1185/81_Produce">81 Produce (27)</a></div>,
 <div class="genre-list al"><a class="genre-name-link" href="/anime/producer/1286/10Gauge">10Gauge (9)</a></div>,
 <div class="genre-list al"><a class="genre-name-link" href="/anime/producer/2495/8PAN">8PAN (8)</a></div>,
 <div class="genre-list al"><a class="genre-name-link" href="/anime/producer/2503/7doc">7doc (7)</a></div>,
 <div class="genre-list al"><a class="genre-name-link" href="/anime/producer/2264/5_Inc">5 Inc. (2)</a></div>,
 <div class="genre-list al"><a class="genre-name-link" href="/anime/producer/2137/1IN">1IN (2)</a></div>,
 <div class="genre-list al"><a class="genre-name-link" href="/anime/producer/2531/_Youth_Film_Studio"> Youth Film Studio (1)</a></div>,
 <div class="genre-list al"><a class="genre-name-link" href="/anime/producer/1838/33_Col

In [4]:
# Get only the <a> tags.
# `studio_list` is rebound by this cell, so on a second run its items are
# already <a> tags; those are kept as-is instead of being searched for a nested
# <a>, which would raise IndexError.
studio_list = [x if x.name == "a" else x.find_all("a")[0] for x in studio_list]
studio_list, len(studio_list)

([<a class="genre-name-link" href="/anime/producer/441/8bit">8bit (57)</a>,
  <a class="genre-name-link" href="/anime/producer/1185/81_Produce">81 Produce (27)</a>,
  <a class="genre-name-link" href="/anime/producer/1286/10Gauge">10Gauge (9)</a>,
  <a class="genre-name-link" href="/anime/producer/2495/8PAN">8PAN (8)</a>,
  <a class="genre-name-link" href="/anime/producer/2503/7doc">7doc (7)</a>,
  <a class="genre-name-link" href="/anime/producer/2264/5_Inc">5 Inc. (2)</a>,
  <a class="genre-name-link" href="/anime/producer/2137/1IN">1IN (2)</a>,
  <a class="genre-name-link" href="/anime/producer/2531/_Youth_Film_Studio"> Youth Film Studio (1)</a>,
  <a class="genre-name-link" href="/anime/producer/1838/33_Collective">33 Collective (1)</a>,
  <a class="genre-name-link" href="/anime/producer/17/Aniplex">Aniplex (578)</a>,
  <a class="genre-name-link" href="/anime/producer/48/AIC">AIC (257)</a>,
  <a class="genre-name-link" href="/anime/producer/56/A-1_Pictures">A-1 Pictures (238)</a>,
  

In [5]:
# Get the studio id and name
studio_id = []
studio_name = []
for studio in studio_list:
    # hrefs look like /anime/producer/<id>/<slug>, so the id is the 4th piece
    studio_id.append(studio["href"].split("/")[3])
    # The link text is of the form "STUDIO_NAME (NUM_ANIMES)". Cut at the LAST
    # "(" so that a name which itself contains parentheses is kept whole.
    studio_name.append(studio.text.rsplit("(", 1)[0].strip())

# Get the studio logo (one CDN URL per studio id, in the same order)
studio_logo = [
    f"https://cdn.myanimelist.net/images/company/{sid}.png" for sid in studio_id
]

In [6]:
# Assemble one row per studio and save the table as CSV in the data folder
studio_columns = {"id": studio_id, "studio": studio_name, "logo_url": studio_logo}
studios = pd.DataFrame(studio_columns)
studios.to_csv(data / "studios.csv", index=False)

In [7]:
# Display the studio logo of Topcraft
from IPython.display import Image
# Select the logo URL with a single .loc lookup (row mask + column) rather than
# chained indexing; .iloc[0] takes the first matching row.
topcraft_logo = studios.loc[studios["studio"] == "Topcraft", "logo_url"].iloc[0]
Image(url=topcraft_logo)

In [8]:
# Sort studio_name alphabetically, ignoring case (names are compared lowercased)
studios_mal = sorted(studio_name, key=lambda x: x.lower())

# Write one studio per line. The encoding is pinned to UTF-8 so that names with
# non-ASCII characters are written the same way on every platform instead of
# depending on the locale's default encoding.
with open(data / "studios_mal.txt", "w", encoding="utf-8") as f:
    for studio in studios_mal:
        f.write(studio + "\n")

In [9]:
# Load studio_num_animes.csv from the stats directory
studio_num_animes_csv = stat_path / "studio_num_animes.csv"
studio_num_animes = pd.read_csv(studio_num_animes_csv)

In [10]:
# Sort studio_num_animes["studio"] alphabetically, ignoring case (names are
# compared lowercased)
studios_db = sorted(studio_num_animes["studio"], key=lambda x: x.lower())

# Write the list of studios in a file, one per line. The encoding is pinned to
# UTF-8 so that names with non-ASCII characters are written the same way on
# every platform instead of depending on the locale's default encoding.
with open(data / "studios_db.txt", "w", encoding="utf-8") as f:
    for studio in studios_db:
        f.write(f"{studio}\n")