In [1]:
import requests 
import json 
import pandas as pd 
import pyarrow 
import time
import sys, os



# Request manga data from the AniList GraphQL API and save it into the data/ folder

In [2]:
# Set the working directory to the project root so that relative paths used
# later in the notebook (e.g. "data/...") resolve the same way no matter where
# the notebook was launched from.
desired_dir = "manga_anilist_pipeline"  # project folder name; not used elsewhere in this cell

# The root can be overridden with the MANGA_PIPELINE_ROOT environment variable
# so the notebook also runs on machines other than the author's. When the
# variable is unset (or empty) the original hard-coded location is used.
file_path = os.path.abspath(
    os.environ.get("MANGA_PIPELINE_ROOT")
    or '/Users/duartedasilva/Desktop/Data Work/Data Projects/manga_anilist_pipeline'
)
quoted_path = repr(file_path)  # repr() form of the path; not used elsewhere in this cell

# Raises FileNotFoundError if the directory does not exist.
os.chdir(file_path)
os.getcwd()



'/Users/duartedasilva/Desktop/Data Work/Data Projects/manga_anilist_pipeline'

In [3]:
# Endpoint of the AniList GraphQL API; every request in this notebook is POSTed here.

url = "https://graphql.anilist.co"

# GraphQL query for one page of MANGA media, sorted by SCORE_DESC.
# `$page` and `$perPage` are query variables supplied with each request.
# For every title it selects: id, the romaji/english/native titles, format,
# popularity, averageScore, genres, start and end dates (year/month/day),
# status, chapter and volume counts, and the staff nodes (full and native
# name plus primaryOccupations).
# `pageInfo` additionally reports `hasNextPage` and `currentPage`.
# NOTE: the text inside the triple quotes is sent to the server verbatim.
query = """
query ($page: Int, $perPage: Int) {
  Page(page: $page, perPage: $perPage) {
    media(type: MANGA, sort: SCORE_DESC) {
      id
      title {
        romaji
        english
        native
      }
      format
      popularity
      averageScore
      genres
      startDate {
        year
        month
        day
      }
      endDate {
        year
        month
        day
      }
      status
      chapters
      volumes
      staff {
        nodes {
          name {
            full
            native
          }
          primaryOccupations
        }
      }
    }
    pageInfo {
      hasNextPage
      currentPage
    }
  }
}
"""

In [4]:
all_manga = []  # one flat dict per manga, accumulated across all requested pages

per_page = 50
max_pages = 24        # pages 1..24 -> at most 24 * 50 = 1200 titles
request_timeout = 30  # seconds; without it a stalled connection would block the cell forever


def _manga_row(m):
    """Flatten one `media` object from the API response into a row dict.

    A nullable field can come back as an explicit JSON null, in which case
    `dict.get(key, default)` returns None (the key exists) rather than the
    default. Nested lookups therefore fall back with `or` so that `.get` is
    never called on None.

    Parameters
    ----------
    m : dict
        One element of `data["data"]["Page"]["media"]`.

    Returns
    -------
    dict
        Row with the keys manga_id, title_romaji, title_english, title_native,
        format, popularity, average_score, genres, start_date, end_date,
        status, chapters, volumes and staff_names.

    Raises
    ------
    KeyError
        If the media object has no "id".
    """
    staff = m.get("staff")
    staff_nodes = (staff.get("nodes") or []) if isinstance(staff, dict) else []

    # Build a clean list of staff dictionaries, keeping only entries that
    # actually carry a full name.
    staff_compact = []
    for s in staff_nodes:
        if not isinstance(s, dict):
            continue
        full_name = (s.get("name") or {}).get("full")
        if full_name:
            staff_compact.append({
                "name": full_name,
                "primaryOccupations": s.get("primaryOccupations", []),
            })

    title = m.get("title") or {}

    return {
        "manga_id": m["id"],
        "title_romaji": title.get("romaji"),
        "title_english": title.get("english"),
        "title_native": title.get("native"),
        "format": m.get("format"),
        "popularity": m.get("popularity"),
        "average_score": m.get("averageScore"),
        "genres": m.get("genres", []),
        "start_date": m.get("startDate", {}),
        "end_date": m.get("endDate", {}),
        "status": m.get("status"),
        "chapters": m.get("chapters"),
        "volumes": m.get("volumes"),
        "staff_names": staff_compact,
    }


for page in range(1, max_pages + 1):
    variables = {"page": page, "perPage": per_page}
    resp = requests.post(
        url,
        json={"query": query, "variables": variables},
        timeout=request_timeout,
    )

    # Respect the API's rate limit
    time.sleep(1)

    # Check if the request succeeded
    if resp.status_code != 200:
        raise Exception(f"Query failed: {resp.status_code}, {resp.text}")

    data = resp.json()

    # A GraphQL server may answer HTTP 200 with an "errors" list and no usable
    # data; fail with a readable message instead of a TypeError on None.
    page_data = (data.get("data") or {}).get("Page")
    if page_data is None:
        raise Exception(f"Query returned no page data: {data.get('errors')}")

    # ----- Insert Requested Data -----
    items = page_data.get("media") or []

    # Append one row per manga
    all_manga.extend(_manga_row(m) for m in items if isinstance(m, dict))

    # Stop early once the API explicitly reports that no further page exists.
    page_info = page_data.get("pageInfo") or {}
    if page_info.get("hasNextPage") is False:
        break

In [5]:
# Sanity check: how many manga rows the fetch loop collected.
manga_count = len(all_manga)
print(manga_count)

1200


In [6]:
# Turn the collected row dicts into a DataFrame (one row per manga).
df = pd.DataFrame(data=all_manga)

# Show the first ten rows followed by the column index.
preview = df.head(10)
print(preview, df.columns)



   manga_id                                       title_romaji  \
0     30002                                            Berserk   
1     31706           JoJo no Kimyou na Bouken: Steel Ball Run   
2     30656                                           Vagabond   
3     30013                                          ONE PIECE   
4     64053  Umineko no Naku Koro ni Chiru Episode 8: Twili...   
5     30001                                            MONSTER   
6     30642                                       Vinland Saga   
7     30051                                          SLAM DUNK   
8     74489                                    Houseki no Kuni   
9     30025                           Hagane no Renkinjutsushi   

                                       title_english  \
0                                            Berserk   
1    JoJo's Bizarre Adventure: Part 7–Steel Ball Run   
2                                           Vagabond   
3                                          One Pi

In [7]:
# Persist the manga table as Parquet. The path is relative to the current
# working directory.
output_path = "data/top_manga2.parquet"

# Make sure the target folder exists so the write does not fail on a fresh
# checkout where "data/" has not been created yet.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

df.to_parquet(output_path, index=False)