In [26]:
import pandas as pd
from rymscraper import rymscraper,RymUrl
from typing import List, Dict
import logging
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.by import By
import logging
import re
import time
import difflib
from tqdm import tqdm
from bs4 import BeautifulSoup, NavigableString, element
from selenium.webdriver.common.by import By
from typing import List
from rapidfuzz import fuzz, process

In [20]:


# Module-level logger named after this module; used by the browser class and
# the scraping functions defined in the following cells.
logger = logging.getLogger(__name__)


class RymBrowser(webdriver.Firefox):
    """Firefox WebDriver with helpers for loading rateyourmusic pages.

    On top of the regular Selenium driver it dismisses the cookie/consent
    popups, expands collapsed sections, and detects the "IP blocked" and
    rate-limit (security verification) pages.

    Parameters:
        headless: Start Firefox without a visible window when true.
    """

    def __init__(self, headless=True):
        logger.debug("Starting Selenium Browser : headless = %s", headless)
        self.options = Options()
        if headless:
            # The ``Options.headless`` setter is deprecated in Selenium 4 and
            # emits a DeprecationWarning; the command-line flag is the
            # supported way to request headless mode.
            self.options.add_argument("-headless")

        webdriver.Firefox.__init__(self, options=self.options)

    def restart(self):
        """Quits the current browser session and starts a new one with the same options."""
        self.quit()
        webdriver.Firefox.__init__(self, options=self.options)

    def get_url(self, url):
        """Loads ``url`` and prepares the page for scraping.

        After each load the consent popups are dismissed and collapsed
        sections are expanded. If the rate-limit form is detected the browser
        is restarted and the page is loaded again, until it loads normally.

        Parameters:
            url: Page to load; converted with ``str()`` before use.

        Raises:
            SystemExit: If the site reports that the IP is blocked. The
                browser is quit before raising.
        """
        logger.debug("get_url(browser, %s)", url)
        while True:
            self.get(str(url))
            self._click_consent_buttons()
            self._expand_collapsed_sections()

            # Test if IP is banned.
            if self.is_ip_banned():
                logger.error(
                    "IP banned from rym. Can't do any requests to the website. Exiting."
                )
                self.quit()
                # Raise SystemExit directly: the ``exit()`` helper is meant for
                # interactive shells and is not a reliable way to stop
                # execution from library code or from a notebook kernel.
                raise SystemExit("IP banned from rym.")

            # Test if browser is rate-limited.
            if not self.is_rate_limited():
                return
            logger.error("Rate-limit detected. Restarting browser.")
            self.restart()

    def _click_consent_buttons(self):
        """Clicks the cookie-bar and consent-popup buttons when they are present."""
        class_to_click_on = [
            "as-oil__btn-optin",  # cookie bar
            "fc-cta-consent",  # consent popup
            # "ad-close-button",  # advertisement banner
        ]
        for class_name in class_to_click_on:
            buttons = self.find_elements(By.CLASS_NAME, class_name)
            if buttons:
                buttons[0].click()
                logger.debug(f"{class_name} found. Clicking on it.")

    def _expand_collapsed_sections(self):
        """Scrolls to and clicks every ``disco_expand_section_link`` element.

        Best effort: any error while expanding is logged at debug level and
        the remaining links are skipped.
        """
        try:
            for link in self.find_elements(By.CLASS_NAME, "disco_expand_section_link"):
                # Bring the link into view first so that the click lands on it.
                self.execute_script("arguments[0].scrollIntoView(true);", link)
                link.click()
                time.sleep(0.2)
        except Exception as e:
            logger.debug('No "Show all" links found : %s.', e)

    def get_soup(self):
        """Returns the current page source parsed by BeautifulSoup with the lxml parser."""
        return BeautifulSoup(self.page_source, "lxml")

    def is_ip_banned(self):
        """Returns True when the current page title is exactly "IP blocked".

        A page without a ``<title>`` element is treated as not banned.
        """
        title = self.get_soup().title
        logger.debug("soup.title : %s", title)
        return title is not None and title.text.strip() == "IP blocked"

    def is_rate_limited(self):
        """Returns True when the page contains the ``sec_verify`` form."""
        return self.get_soup().find("form", {"id": "sec_verify"}) is not None


In [27]:
def get_chart_row_infos(row: "element.Tag") -> dict:
    """Returns a dict containing infos from a chart row.

    Every field is extracted independently: when one of them cannot be read
    the error is logged and that field is set to the string "NA", so a
    partially broken row still yields the remaining fields.

    Parameters:
        row: Parsed chart row (the element wrapping one chart item).

    Returns:
        dict with the keys "Rank", "Artist", "Album", "Date", "Genres",
        "RYM Rating", "Ratings" and "Reviews", in that order. All values are
        strings.
    """
    dict_row = {}

    def _field(name, getter):
        # Run one extractor; on any failure log it and fall back to "NA".
        try:
            return getter()
        except Exception as e:
            logger.error("Error when fetching %s: %s", name, e)
            return "NA"

    def _rank():
        return row.get("id").replace("pos", "")

    def _artist():
        artist_div = row.find(
            "div",
            {"class": "page_charts_section_charts_item_credited_links_primary"},
        )
        romanized = artist_div.find("span", {"class": "ui_name_locale_language"})
        original = artist_div.find("span", {"class": "ui_name_locale_original"})

        if romanized is not None and original is not None:
            name = f"{romanized.text} [{original.text}]"
        elif romanized is not None:
            # Romanized name without an original-script counterpart: keep the
            # name that is available instead of discarding the artist.
            name = romanized.text
        elif original is not None:
            name = original.text
        else:
            name = artist_div.text
        return name.replace("\n", "")

    def _album():
        album = row.find(
            "div",
            {"class": "page_charts_section_charts_item_title"},
        ).text.replace("\n", "")
        logger.debug("%s - %s - %s", dict_row["Rank"], dict_row["Artist"], album)
        return album

    def _date():
        date_div = row.find("div", {"class": "page_charts_section_charts_item_date"})
        return date_div.find_all("span")[0].text.replace("\n", "").strip()

    def _genres():
        genres_div = row.find(
            "div", {"class": "page_charts_section_charts_item_genres_primary"}
        )
        return ", ".join(x.text for x in genres_div.find_all("a", {"class": "genre"}))

    def _rating():
        return row.find(
            "span", {"class": "page_charts_section_charts_item_details_average_num"}
        ).text

    def _full_span(index):
        # The "full" spans hold the counts: the first one is read as the
        # number of ratings, the second one as the number of reviews.
        span = row.find_all("span", {"class": "full"})[index]
        return span.text.replace("\n", "").replace(" ", "")

    dict_row["Rank"] = _field("Rank", _rank)
    dict_row["Artist"] = _field("Artist", _artist)
    dict_row["Album"] = _field("Album", _album)
    dict_row["Date"] = _field("Date", _date)
    dict_row["Genres"] = _field("Genres", _genres)
    dict_row["RYM Rating"] = _field("RYM Rating", _rating)
    dict_row["Ratings"] = _field("Ratings", lambda: _full_span(0))
    dict_row["Reviews"] = _field("Reviews", lambda: _full_span(1))
    return dict_row



In [38]:
def get_chart_infos(max_page: int = None, genre: str = "city-pop") -> List[Dict]:
    """Returns a list of dicts containing chart infos.

    Scrapes the all-time top-album chart of one genre on rateyourmusic.com,
    starting at page 1. Each page is loaded once. Scraping stops when
    ``max_page`` has been processed, when a page has no "next" pagination
    link, when the chart container or its rows are missing, or when an error
    occurs; the rows collected up to that point are returned.

    Parameters:
        max_page: The max number of pages to extract from the chart.
            ``None`` (default) means no limit.
        genre: Genre slug inserted into the chart url (``g:<genre>``).

    Returns:
        list_rows: List of dicts for each rows from the chart, as produced by
        get_chart_row_infos.

    """
    browser = RymBrowser(headless=True)
    list_rows = []
    page = 1
    try:
        while True:
            url = f"https://rateyourmusic.com/charts/top/album/all-time/g:{genre}/{page}/"
            try:
                browser.get_url(url)
                logger.debug("Extracting chart rows for url %s", url)
                soup = browser.get_soup()

                # table containing albums
                if not soup.find("sections", {"id": "page_sections_charts"}):
                    logger.warning("Table class mbgen not found")
                    break
                logger.debug("Table containing chart elements found")
                table = soup.find("section", {"id": "page_charts_section_charts"})
                rows = table.find_all(
                    "div", {"class": "page_section_charts_item_wrapper"}
                )
                if len(rows) == 0:
                    logger.debug("No rows extracted. Exiting")
                    break
                for row in rows:
                    # don't parse ads
                    if not row.find("script"):
                        list_rows.append(get_chart_row_infos(row))

                # link to the next page
                has_next_page = soup.find("a", {"class": "ui_pagination_next"}) is not None
                # Release the parsed tree before the next page is loaded.
                soup.decompose()
                if not has_next_page:
                    logger.debug("No next page found. Exiting.")
                    break
                logger.debug("Next page found")
                if max_page and page == max_page:
                    break
                # The next iteration loads the new page; loading it here as
                # well would request every page twice.
                page += 1
            except Exception as e:
                logger.error("Error scraping page %s : %s", url, e)
                break
    finally:
        # Always shut the browser down so no Firefox/geckodriver process is
        # left running, including when scraping aborts.
        try:
            browser.quit()
        except Exception as e:
            logger.debug("Error while closing the browser : %s", e)

    return list_rows

In [40]:
# Scrape up to 30 chart pages (slow: drives a real browser).
chart_infos = get_chart_infos(max_page=30)
# Keep only the columns used by the analysis below. Passing ``columns`` to the
# constructor yields an independent frame (safe to assign into later) and
# still works when the scrape returned no rows.
df = pd.DataFrame(chart_infos, columns=['Rank', 'Artist', 'Album', 'RYM Rating', 'Ratings'])

  self.options.headless = True


In [41]:
# Save the selected chart columns to CSV with the pandas defaults
# (the index is written as the first column).
df.to_csv('allcitypop4.csv')

In [47]:
# Possible orderings, kept for reference:
#   df.sort_values(by=['Ratings'], ascending=False)     -> sort by popularity
#   df.sort_values(by=['RYM Rating'], ascending=False)  -> sort by rating
# The scraper stores the placeholder "NA" when a rating could not be read;
# coerce such values to NaN instead of letting the conversion raise.
df['RYM Rating'] = pd.to_numeric(df['RYM Rating'], errors='coerce').astype(float)

In [48]:
# Strip the thousands separators and convert to numbers. Going through
# ``astype(str)`` keeps this cell re-runnable once the column is numeric, and
# ``errors='coerce'`` turns the scraper's "NA" placeholder into NaN (the
# column stays integer-typed when every value parses).
df['Ratings'] = pd.to_numeric(
    df['Ratings'].astype(str).str.replace(',', '', regex=False),
    errors='coerce',
)

In [49]:
# 90th percentile of the average rating: albums rated above this value are in
# the top 10% by rating.
df['RYM Rating'].quantile(q=0.9)

3.63

In [51]:
# Median (50th percentile) of the 'Ratings' column.
df['Ratings'].quantile(q=0.5)

17.0

In [52]:
# Albums ordered from the highest to the lowest average rating.
sorted_df = df.sort_values('RYM Rating', ascending=False)

In [53]:
# Save the rating-sorted frame to CSV with the pandas defaults.
sorted_df.to_csv('allcitypop5.csv')

In [54]:
# Preview the first rows; 'RYM Rating' and 'Ratings' are numeric at this point.
print(df.head())

  Rank                      Artist                Album  RYM Rating  Ratings
0    1    Tatsuro Yamashita [山下達郎]         Ride on Time        3.89     2444
1    2    Tatsuro Yamashita [山下達郎]              For You        3.83     4231
2    3                   Anri [杏里]             Timely!!        3.79     3385
3    4   Masayoshi Takanaka [高中正義]           Seychelles        3.76     1222
4    5   Masayoshi Takanaka [高中正義]  The Rainbow Goblins        3.73     1170


In [55]:
# Save again using the 'utf-8-sig' codec, which prepends a UTF-8 byte-order
# mark. NOTE(review): presumably so spreadsheet tools detect the encoding of
# the non-ASCII artist names - confirm.
df.to_csv('CITYPOP.csv',encoding='utf-8-sig')

In [56]:
# Same export for the rating-sorted frame, also written with the 'utf-8-sig'
# codec (UTF-8 with a leading byte-order mark).
sorted_df.to_csv('CITYPOPSORTED.csv',encoding='utf-8-sig')