In [None]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import os

# Scrape every chart page listed in the URL index CSV and write one CSV of
# combined chart tables per (year, month) row.

# Seconds to wait on the server before giving up on a page. Without a
# timeout, requests.get() can block indefinitely on a stalled connection.
REQUEST_TIMEOUT = 30

# Folder that receives the per-month CSV files.
folder_name = "oricon_charts"


def _fetch_html(url):
    """Download *url* and return its decoded HTML, or None on failure.

    Network errors, timeouts and non-200 responses are reported on stdout
    and turned into None so the caller can carry on with the next URL
    instead of aborting the whole run.
    """
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        print(f"Failed to retrieve {url}. Error: {exc}")
        return None

    if response.status_code != 200:
        print(f"Failed to retrieve {url}. Status code: {response.status_code}")
        return None

    # When the Content-Type header carries no charset, requests decodes
    # text/* bodies as ISO-8859-1, which garbles non-Latin pages. Only in
    # that case, switch to the encoding detected from the body itself.
    content_type = response.headers.get("Content-Type", "")
    if "charset" not in content_type.lower():
        response.encoding = response.apparent_encoding

    return response.text


def _table_to_frame(table, date_range):
    """Convert one parsed <table> into a DataFrame tagged with *date_range*.

    Column names come from every <th> in the table; data rows come from the
    <td> cells of every <tr> after the first (the header row). Rows without
    any <td> are dropped. Returns None, after printing a message, when the
    headers and cells cannot be assembled into a DataFrame.
    """
    headers = [th.get_text(strip=True) for th in table.find_all('th')]

    cell_rows = (
        [td.get_text(strip=True) for td in tr.find_all('td')]
        for tr in table.find_all('tr')[1:]  # Skip the header row
    )
    rows = [cells for cells in cell_rows if cells]

    try:
        df = pd.DataFrame(rows, columns=headers)
        df.insert(0, 'Date Range', date_range)
    except ValueError as exc:
        # Header/cell count mismatch or a clashing 'Date Range' column:
        # skip this one table rather than losing the whole crawl.
        print(f"Skipping malformed table for {date_range}: {exc}")
        return None
    return df


def _parse_chart_tables(html):
    """Return a list of DataFrames, one per chart table found in *html*.

    Looks at the direct child <div>s of every <section class="table_grid">.
    A child contributes a table only if it contains a <div> with a style
    attribute (whose text is used as the date range) and a
    <table class="table">.
    """
    soup = BeautifulSoup(html, 'html.parser')
    frames = []

    for section in soup.find_all('section', class_='table_grid'):
        for div in section.find_all('div', recursive=False):
            date_range_div = div.find('div', style=True)
            if not date_range_div:
                continue

            table = div.find('table', class_='table')
            if not table:
                continue

            frame = _table_to_frame(table, date_range_div.get_text(strip=True))
            if frame is not None:
                frames.append(frame)

    return frames


# Load the CSV containing URLs, years, and months
df_urls = pd.read_csv("cleaned_oricon_urls.csv")

# Process each listed page independently so one bad page never stops the rest
for url, year, month in zip(df_urls["url"], df_urls["year"], df_urls["month"]):
    print(f"Processing: {url} for {year}-{month}")

    html = _fetch_html(url)
    if html is None:
        continue  # failure already reported by _fetch_html

    df_list = _parse_chart_tables(html)
    if not df_list:
        print(f"No data found for {year}-{month}")
        continue

    # Combine all tables of this page into a single DataFrame
    final_df = pd.concat(df_list, ignore_index=True)

    # Create the output folder on demand (only once there is data to save)
    os.makedirs(folder_name, exist_ok=True)

    filename = os.path.join(folder_name, f"oricon_charts_{year}_{month}.csv")
    final_df.to_csv(filename, index=False)
    print(f"Saved {filename}")