In [2]:
import requests
from bs4 import BeautifulSoup
import re

In [3]:
def load_webpage(url, timeout=30):
    """Download a web page and return its raw body.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before the request is
            abandoned. Passed straight to ``requests.get``.

    Returns:
        The response body (``response.content``) when the server answers
        with status 200. ``None`` for any other status code, or when
        ``requests`` raises a ``RequestException`` (including a timeout).
        A short status message is printed in every case.
    """
    try:
        # Always bound the wait: without a timeout requests blocks
        # indefinitely on a server that never answers.
        response = requests.get(url, timeout=timeout)

        # Only an exact 200 counts as success
        if response.status_code == 200:
            print("Webpage loaded successfully!")

            return response.content
        else:
            print(f"Failed to load webpage. Status code: {response.status_code}")
            return None
    except requests.exceptions.RequestException as e:
        print(f"An error occurred: {e}")
        return None

In [4]:
def extract_mmu_entries(html):
    """Collect the ``/entry/mmu:...`` links found on ``<area>`` tags.

    Args:
        html: Markup handed to BeautifulSoup's ``html.parser``.

    Returns:
        A list with one ``/entry/mmu:<n>[+mmu:<n>...]`` string per ``<area>``
        tag whose ``href`` contains such a path, in document order.
        Duplicates are kept; tags without a matching ``href`` are skipped.
    """
    # One or more "mmu:<digits>" ids joined by "+", right after "/entry/"
    entry_href = re.compile(r'/entry/(mmu:\d+(?:\+mmu:\d+)*)')

    area_tags = BeautifulSoup(html, 'html.parser').find_all('area', href=True)
    hits = (entry_href.search(tag['href']) for tag in area_tags)

    # Rebuild the path from the captured ids so anything preceding
    # "/entry/" in the original href (host, query prefix) is dropped.
    return ['/entry/' + hit.group(1) for hit in hits if hit]


In [5]:
def get_uniprot_info(accession, timeout=30):
    """Fetch the ``entryType`` field of a UniProtKB entry.

    Args:
        accession: UniProtKB accession, inserted into the REST URL as-is.
        timeout: Seconds to wait for the UniProt server before the request
            is abandoned. Passed straight to ``requests.get``.

    Returns:
        The value stored under ``"entryType"`` in the JSON response (or
        ``None`` when that key is absent) if the server answers with status
        200; otherwise the string ``"Error: <status> - <reason>"``.

    Raises:
        requests.exceptions.RequestException: Network failures and timeouts
            are not caught here and propagate to the caller.
    """
    url = f"https://rest.uniprot.org/uniprotkb/{accession}"
    headers = {"Accept": "application/json"}
    # Bound the wait: without a timeout requests blocks indefinitely on a
    # server that never answers.
    response = requests.get(url, headers=headers, timeout=timeout)

    if response.status_code != 200:
        return f"Error: {response.status_code} - {response.reason}"

    # The review status of the entry is reported in this field.
    return response.json().get("entryType")

In [6]:
def get_uniprot_entry_status(accession, timeout=30):
    """Return the subcellular locations of a UniProtKB entry that is not unreviewed.

    Args:
        accession: UniProtKB accession, inserted into the REST URL as-is.
        timeout: Seconds to wait for the UniProt server before the request
            is abandoned. Passed straight to ``requests.get``.

    Returns:
        * A list of location names (possibly empty) taken from every
          ``SUBCELLULAR LOCATION`` comment, when the entry's ``entryType``
          does not contain "unreviewed" (case-insensitive). An entry with
          no ``entryType`` at all is handled the same way.
        * The string ``"unreviewed"`` when ``entryType`` contains
          "unreviewed".
        * The string ``"Error: <status> - <reason>"`` when the server does
          not answer with status 200.

    Raises:
        requests.exceptions.RequestException: Network failures and timeouts
            are not caught here and propagate to the caller.
    """
    url = f"https://rest.uniprot.org/uniprotkb/{accession}"
    headers = {"Accept": "application/json"}
    # Bound the wait: without a timeout requests blocks indefinitely on a
    # server that never answers.
    response = requests.get(url, headers=headers, timeout=timeout)

    if response.status_code != 200:
        return f"Error: {response.status_code} - {response.reason}"

    data = response.json()

    # A missing or null entryType must not crash the .lower() call below.
    entry_type = data.get("entryType") or ""
    if 'unreviewed' in entry_type.lower():
        return "unreviewed"

    locations = []
    # "or []" / "or {}" also cover keys that are present but set to null.
    for comment in data.get("comments") or []:
        if comment.get("commentType") != "SUBCELLULAR LOCATION":
            continue
        for subcellular_location in comment.get("subcellularLocations") or []:
            location = (subcellular_location.get("location") or {}).get("value")
            if location:
                locations.append(location)
    return locations

In [7]:
def convert_text(text):
    """Abbreviate subcellular-location keywords found in ``text``.

    Each keyword pattern is replaced, case-insensitively and wherever it
    occurs (including inside longer words), by its short code; all other
    characters are left untouched.

    Args:
        text: String to abbreviate.

    Returns:
        ``text`` with every keyword occurrence replaced by its code, e.g.
        ``"Mitochondrial matrix"`` becomes ``"Mi Mat"``.
    """
    # Define the mapping of keyword patterns to their replacements
    conversion_map = {
        r'cytoplasm(?:ic)?': 'Cy',
        r'lysosome': 'Ly',
        # Regex alternation is ordered: 'al' must come before 'a', otherwise
        # "mitochondrial" matches as "mitochondria" and a stray "l" is left
        # behind ("Mil").
        r'mitochondri(?:al|a|ol|on)': 'Mi',
        r'nucle(?:us|osome|olus)': 'N',
        r'membrane': 'Mem',
        r'peroxisome': 'P',
        r'golgi apparatus': 'GA',
        r'matrix': 'Mat'
    }

    # Patterns are applied one after another, in the order listed above
    for pattern, replacement in conversion_map.items():
        text = re.sub(pattern, replacement, text, flags=re.IGNORECASE)

    return text

In [8]:
def extract_kegg_pathway_name(html):
    """Read the pathway name out of a page's ``<title>``.

    Args:
        html: Markup handed to BeautifulSoup's ``html.parser``.

    Returns:
        The text that follows the last ``KEGG PATHWAY:`` marker in the
        title, stripped of surrounding whitespace and cut at the first
        ``" - "`` separator. ``"unknown_pathway"`` when the page has no
        title or the title lacks the marker.
    """
    marker = 'KEGG PATHWAY:'
    title = BeautifulSoup(html, 'html.parser').find('title')

    if not title or marker not in title.text:
        return "unknown_pathway"

    # Keep what follows the last marker, then drop the " - ..." suffix
    after_marker = title.text.rpartition(marker)[2].strip()
    return after_marker.partition(' - ')[0]
