In [300]:
import requests
import mwparserfromhell
import csv
from bs4 import BeautifulSoup
import re
import validators

In [301]:
def fetch_pages(category_title):
    """Return every member of a category from the SNPedia MediaWiki API.

    The API returns category members in batches (``cmlimit=max`` only raises
    the batch size), so this keeps requesting until the response no longer
    carries a continuation token.

    Args:
        category_title: Category name passed as ``cmtitle``
            (e.g. ``"Category:Is_a_snp"``).

    Returns:
        A list of the ``categorymembers`` entries from all batches, in the
        order the API returned them.

    Raises:
        requests.HTTPError: If the API answers with an HTTP error status.
        KeyError: If a response lacks the ``query``/``categorymembers`` keys.
    """
    url = "https://bots.snpedia.com/api.php"
    params = {
        "action": "query",
        "list": "categorymembers",
        "cmtitle": category_title,
        "cmlimit": "max",
        "format": "json",
    }

    pages = []
    while True:
        response = requests.get(url, params=params)
        response.raise_for_status()
        data = response.json()

        pages.extend(data["query"]["categorymembers"])

        # Newer MediaWiki versions report the continuation under "continue";
        # older ones nest it under "query-continue" keyed by the list name.
        continuation = data.get("continue") or data.get(
            "query-continue", {}
        ).get("categorymembers")
        if not continuation:
            break

        # Feed the continuation values (e.g. "cmcontinue") back into the
        # next request so it resumes where this batch ended.
        params.update(continuation)

    return pages

In [302]:
def fetch_and_process_page_content(page_title):
    """Fetch the content of ``page_title`` and print it to stdout.

    NOTE(review): relies on ``fetch_page_content_by_title``, which is not
    defined in the visible part of this file -- confirm it exists elsewhere.
    """
    page_content = fetch_page_content_by_title(page_title)
    # Placeholder for real processing (extracting information, writing to a
    # file, ...); for now the content is only echoed.
    print(f"Content of page {page_title}: {page_content}\n")

In [303]:
def fetch_html(url):
    """Download ``url`` and return the response body decoded as UTF-8 text."""
    raw_body = requests.get(url).content
    return raw_body.decode("utf-8")

In [304]:
def extract_rs_text(html):
    """Return the first paragraph/heading text that starts with ``"rs"``.

    Scans ``<p>`` and ``<h1>``-``<h6>`` elements in document order and
    returns the stripped text of the first one beginning with ``"rs"``,
    or ``None`` when no such element exists.
    """
    document = BeautifulSoup(html, "html.parser")
    stripped_texts = (
        element.get_text().strip()
        for element in document.find_all(["p", "h1", "h2", "h3", "h4", "h5", "h6"])
    )
    # next() with a default stops at the first match, like the loop-and-break form.
    return next(
        (candidate for candidate in stripped_texts if candidate.startswith("rs")),
        None,
    )

In [305]:
def is_valid(entry):
    """Tell whether a table cell holds usable text.

    An entry is rejected when it is empty/``None``, starts with ``"23and"``,
    or is exactly ``"common in clinvar"`` (case-sensitive).
    """
    if not entry:
        return False
    is_placeholder = entry.startswith("23and") or entry == "common in clinvar"
    return not is_placeholder

In [306]:
# Placeholder written wherever a page or table cell has no usable text.
_NO_DATA = "No data on this polymorphism"


def _extract_explanation(soup):
    """Return the page's joined ``<p>`` text, or the no-data placeholder."""
    content_div = soup.find('div', {'id': 'mw-content-text'})
    if content_div is None:
        return _NO_DATA

    # Prefer an ancestor carrying the 'mw-content-ltr' class; fall back to
    # the 'mw-content-text' div itself when no such ancestor exists.
    parent_div = content_div.find_parent(
        lambda element: 'mw-content-ltr' in element.get('class', [])
    )
    target_div = content_div if parent_div is None else parent_div

    text = ' '.join(
        p.get_text(separator=' ', strip=True).strip()
        for p in target_div.find_all('p')
    )

    # Pages whose body text mentions haplogroups are treated as having no data.
    if 'haplogroups' in text.lower():
        return _NO_DATA
    return text.strip()


def _extract_table_rows(soup):
    """Return the normalised data rows of the first matching table."""
    table = soup.find('table', {'class': ['sortable', 'smwtable', 'jquery-tablesorter']})
    if table is None:
        return []

    rows = []
    for tr in table.find_all('tr')[1:]:  # Skipping the header row
        row = [td.text.strip() for td in tr.find_all('td')]

        # Ensure row has at least 4 columns
        row.extend([""] * (4 - len(row)))

        if not row[1]:  # Mag entry
            row[1] = "0"
        if not is_valid(row[2].strip()):  # Summary entry
            row[2] = _NO_DATA
        if not is_valid(row[3].strip()):  # Explanation entry
            row[3] = _NO_DATA
        rows.append(row)
    return rows


def visit_rs_link(title, url, csv_writer):
    """Fetch an SNP page and extract its explanation text and table rows.

    Args:
        title: Unused; kept so existing callers keep working.
        url: Address of the page to download. Its last ``/``-separated
            segment is returned as the SNP name.
        csv_writer: Unused; kept so existing callers keep working.

    Returns:
        A tuple ``(explanation, rows, snp_name)`` where ``explanation`` is the
        joined paragraph text (or ``"No data on this polymorphism"``),
        ``rows`` is a list of cell-text lists padded to at least four
        columns, and ``snp_name`` is the last path segment of ``url``.
    """
    # Visit the URL and fetch the HTML content
    response = requests.get(url)
    soup = BeautifulSoup(response.content, "html.parser")

    # Extract the SNP name from the URL
    snp_name = url.split('/')[-1]

    explanation = _extract_explanation(soup)
    rows = _extract_table_rows(soup)

    return explanation, rows, snp_name

# Export script: for every member of the "Is_a_snp" category, follow the
# page's rs link and write one CSV line per table row found there.
pages = fetch_pages("Category:Is_a_snp")
page_titles = [member["title"] for member in pages]

with open('snp_data.csv', 'w', newline='', encoding='utf-8') as csvfile:
    csv_writer = csv.writer(csvfile)
    # Header row
    csv_writer.writerow(['Polymorphism', 'Genome', 'Magnitude', 'Summary', 'Explanation'])

    for title in page_titles:
        url = "https://www.snpedia.com/index.php/" + title.replace(' ', '_')
        rs_link = fetch_rs_link(url)

        # Pages without a usable rs link contribute nothing to the CSV.
        if not rs_link.startswith("http"):
            continue

        explanation, rows, snp_name = visit_rs_link(title, rs_link, csv_writer)
        # An empty (or whitespace-only) explanation falls back to the placeholder.
        explanation = explanation.strip() or "No data on this polymorphism"

        for row in rows:
            # Only the first three table columns are written; the page-level
            # explanation fills the last CSV column.
            new_row = [snp_name, *row[:3], explanation]
            csv_writer.writerow(new_row)

KeyboardInterrupt: 

In [None]:
def fetch_rs_link(url):
    """Return the absolute URL of the first link on a page whose href contains ``"Rs"``.

    Args:
        url: Address of the page to download and scan.

    Returns:
        The link resolved against ``https://www.snpedia.com``, or an empty
        string when the page has no ``<a>`` whose ``href`` contains ``"Rs"``.
    """
    # Local import keeps this block self-contained without touching the
    # file-level import list.
    from urllib.parse import urljoin

    response = requests.get(url)
    soup = BeautifulSoup(response.content, "html.parser")
    rs_link_element = soup.select_one('a[href*="Rs"]')

    if rs_link_element is None:
        return ""

    # urljoin gives the same result as plain concatenation for root-relative
    # hrefs ("/index.php/Rs..."), but also copes with hrefs that are already
    # absolute or protocol-relative, which concatenation would corrupt.
    return urljoin("https://www.snpedia.com", rs_link_element['href'])

# Diagnostic script: list the rs link found for every member of the
# "Is_a_snp" category.
pages = fetch_pages("Category:Is_a_snp")
page_titles = [member["title"] for member in pages]

for title in page_titles:
    # Page address: the title with spaces replaced by underscores.
    url = "https://www.snpedia.com/index.php/" + title.replace(' ', '_')

    # Look up the rs link on that page (empty string when none is found).
    rs_link = fetch_rs_link(url)

    # Report the title alongside its link.
    print(f"{title}: {rs_link}")