In [85]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

# URL to crawl
url = "https://mangacodex.com/oricon_weekly/2008/04_Abril.php"

# Send a request to the website
response = requests.get(url)
response.raise_for_status()  # Raise an error for failed requests

# Parse the HTML content
soup = BeautifulSoup(response.text, "html.parser")

# Extract all URLs
urls = [a["href"] for a in soup.find_all("a", href=True)]

# Convert relative URLs to absolute URLs
from urllib.parse import urljoin
url_2008 = [urljoin(url, href) for href in urls]

url_2008

['https://mangacodex.com/journal.php',
 'https://mangacodex.com/infographics.php',
 'https://mangacodex.com/sales_evo.php',
 'https://mangacodex.com/top_manga.php',
 'https://mangacodex.com/monthly_july.php',
 'https://mangacodex.com/oricon_weekly/2008/04_abril.php',
 'https://mangacodex.com/oricon_search.php',
 'https://mangacodex.com/oricon_weekly/2008/04_Abril.php',
 'https://mangacodex.com/oricon_monthly/2009/08_Agosto.php',
 'https://mangacodex.com/oricon_yearly.php',
 'https://mangacodex.com/oricon_weekly/2008/04_Abril.php',
 'https://mangacodex.com/oricon_weekly/2009/01_NovDec.php',
 'https://mangacodex.com/oricon_weekly/2010/01_NovDec.php',
 'https://mangacodex.com/oricon_weekly/2011/01_NovDec.php',
 'https://mangacodex.com/oricon_weekly/2012/01_NovDec.php',
 'https://mangacodex.com/oricon_weekly/2013/01_NovDec.php',
 'https://mangacodex.com/oricon_weekly/2014/01_NovDec.php',
 'https://mangacodex.com/oricon_weekly/2015/01_NovDec.php',
 'https://mangacodex.com/oricon_weekly/2016

In [87]:
# Define the years to scrape
years = range(2009, 2024)  # 2009 to 2023

# Headers to mimic a real browser
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}

# Dictionary to store links by year
all_links = {}

all_links[2008] = url_2008

for year in years:
    # Generate the URL for the given year
    url = f"https://mangacodex.com/oricon_weekly/{year}/01_NovDec.php"
    try:
        # Request the page
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()  # Raise an error if request fails

        # Parse HTML
        soup = BeautifulSoup(response.text, "html.parser")

        # Extract all URLs
        urls = set()
        for a_tag in soup.find_all("a", href=True):
            full_url = urljoin(url, a_tag["href"])  # Convert relative to absolute URLs
            urls.add(full_url)

        # Store results in the dictionary
        all_links[year] = list(urls)

        print(f"✅ {year} - Collected {len(urls)} links")
    
    except requests.exceptions.RequestException as e:
        print(f"❌ Failed to fetch {year}: {e}")

✅ 2009 - Collected 35 links
✅ 2010 - Collected 35 links
✅ 2011 - Collected 35 links
✅ 2012 - Collected 35 links
✅ 2013 - Collected 35 links
✅ 2014 - Collected 35 links
✅ 2015 - Collected 35 links
✅ 2016 - Collected 35 links
✅ 2017 - Collected 35 links
✅ 2018 - Collected 35 links
✅ 2019 - Collected 35 links
✅ 2020 - Collected 35 links
✅ 2021 - Collected 35 links
✅ 2022 - Collected 35 links
✅ 2023 - Collected 35 links


In [89]:
years = range(2008, 2023)

# Create a dictionary that filters URLs by year while ensuring "weekly" is in the URL
url_dict = {
    year: sorted(set(url for url in urls if str(year) in url and "weekly" in url.lower()))
    for year, urls in all_links.items()
}

# Display cleaned URL dictionary
url_dict

{2008: ['https://mangacodex.com/oricon_weekly/2008/04_Abril.php',
  'https://mangacodex.com/oricon_weekly/2008/04_abril.php',
  'https://mangacodex.com/oricon_weekly/2008/05_May.php',
  'https://mangacodex.com/oricon_weekly/2008/06_Junio.php',
  'https://mangacodex.com/oricon_weekly/2008/07_Julio.php',
  'https://mangacodex.com/oricon_weekly/2008/08_Agosto.php',
  'https://mangacodex.com/oricon_weekly/2008/09_Septiembre.php',
  'https://mangacodex.com/oricon_weekly/2008/10_Octubre.php',
  'https://mangacodex.com/oricon_weekly/2008/11_Noviembre.php'],
 2009: ['https://mangacodex.com/oricon_weekly/2009/01_NovDec.php',
  'https://mangacodex.com/oricon_weekly/2009/02_Enero.php',
  'https://mangacodex.com/oricon_weekly/2009/03_Febrero.php',
  'https://mangacodex.com/oricon_weekly/2009/04_Marzo.php',
  'https://mangacodex.com/oricon_weekly/2009/05_Abril.php',
  'https://mangacodex.com/oricon_weekly/2009/06_Mayo.php',
  'https://mangacodex.com/oricon_weekly/2009/07_Junio.php',
  'https://mang

In [101]:
# Function to extract the first number (month) from the URL
def extract_month_number(url):
    match = re.search(r"/\d{4}/(\d{2})_", url)  # Extracts the first two-digit number after the year
    if match:
        return match.group(1)  # Returns month as "01", "02", etc.
    return None  # Return None if no match is found

# Dictionary to store cleaned data
cleaned_url_dict = {}

# List to store structured data
structured_data = []

for year, urls in url_dict.items():
    # Remove duplicates while preserving case
    normalized_urls = {url.lower(): url for url in urls}  # Normalize case for uniqueness
    unique_urls = sorted(normalized_urls.values())  # Sort URLs for consistency

    # Store cleaned URLs
    cleaned_url_dict[year] = unique_urls  

    # Extract structured data
    for url in unique_urls:
        month_num = extract_month_number(url)
        structured_data.append([year, url, month_num])

# Convert to DataFrame
df = pd.DataFrame(structured_data, columns=["year", "url", "month"])

df

Unnamed: 0,year,url,month
0,2008,https://mangacodex.com/oricon_weekly/2008/04_a...,04
1,2008,https://mangacodex.com/oricon_weekly/2008/05_M...,05
2,2008,https://mangacodex.com/oricon_weekly/2008/06_J...,06
3,2008,https://mangacodex.com/oricon_weekly/2008/07_J...,07
4,2008,https://mangacodex.com/oricon_weekly/2008/08_A...,08
...,...,...,...
183,2023,https://mangacodex.com/oricon_weekly/2023/08_J...,08
184,2023,https://mangacodex.com/oricon_weekly/2023/09_A...,09
185,2023,https://mangacodex.com/oricon_weekly/2023/10_S...,10
186,2023,https://mangacodex.com/oricon_weekly/2023/11_O...,11


In [103]:
# Save cleaned data to CSV
df.to_csv("cleaned_oricon_urls.csv", index=False)