In [6]:
import requests
import pandas as pd

def fetch_variants(uniprot_id):
    url = f"https://www.ebi.ac.uk/proteins/api/variation/{uniprot_id}"
    headers = {"Accept": "application/json"}
    r = requests.get(url, headers=headers)
    r.raise_for_status()
    data = r.json()

    rows = []
    for feature in data.get("features", []):
        pos = feature.get("begin")
        for xref in feature.get("xrefs", []):
            if xref.get("name") in {"dbSNP", "ExAC", "ESP", "gnomAD", "TOPMed", "1000Genomes"}:
                rsid = xref.get("id")
                rows.append({
                    "uniprot_id": uniprot_id,
                    "rsid": rsid,
                    "protein_pos": pos
                })
    return rows

In [14]:
df = pd.read_csv("/media/leon/DISK2/icig/done/uniprot/tf_snp_df.tsv", sep="\t")
id_map = df.groupby("Entry")["refsnp_id"].apply(list).to_dict()

all_rows = []
for uid, rsids in id_map.items():
    rows = fetch_variants(uid)
    filtered = [r for r in rows if r["rsid"] in rsids]
    all_rows.extend(filtered)

df = pd.DataFrame(all_rows)
df = df.drop_duplicates()
df

     uniprot_id        rsid protein_pos
0        A2RRD8  rs28462670           8
1        A2RRD8  rs28462670           8
2        A2RRD8  rs28462670           8
3        A6NFI3   rs2253869         156
4        A6NFI3   rs2253869         156
...         ...         ...         ...
2800     Q9Y6Q3   rs2282076           7
2801     Q9Y6Q3   rs2282076           7
2802     Q9Y6Q3   rs2282076           7
2803     Q9Y6Q3   rs2282076           7
2804     Q9Y6Q3   rs2282076           7

[2805 rows x 3 columns]


In [19]:
df.to_csv("/media/leon/DISK2/icig/done/uniprot/snp_positions.tsv", sep="\t", index=False)

In [20]:
import re
import time
from bs4 import BeautifulSoup

# Load TSV
df = pd.read_csv("snp_positions.tsv", sep="\t")

def fetch_domains(uniprot_id):
    url = f"https://web.expasy.org/cgi-bin/protparam/protparam?{uniprot_id}"
    r = requests.get(url)
    if r.status_code != 200:
        print(f"Failed {uniprot_id}, status {r.status_code}")
        return []

    soup = BeautifulSoup(r.text, "html.parser")
    pre = soup.find("pre")
    if not pre:
        return []

    domains = []
    for line in pre.get_text().splitlines():
        # Example line: "FT   DOMAIN         74-145  KRAB"
        m = re.match(r"FT\s+(\w+)\s+(\d+)-(\d+)\s+(.+)", line.strip())
        if m:
            feature, start, end, name = m.groups()
            if feature in ["CHAIN", "COMPBIAS"]:
                continue
            domains.append({
                "feature": feature,
                "start": int(start),
                "end": int(end),
                "name": name.strip()
            })
    return domains

def map_position_to_domain(position, domains):
    matches = []
    for d in domains:
        if d["start"] <= position <= d["end"]:
            matches.append(f"{d['feature']}|{d['name']}")
    return "; ".join(matches) if matches else "NA"

domain_cache = {}  # cache results to avoid duplicate requests

results = []
for i, row in df.iterrows():
    uniprot_id = row["uniprot_id"]
    pos = int(row["protein_pos"])

    if uniprot_id not in domain_cache:
        domain_cache[uniprot_id] = fetch_domains(uniprot_id)
        time.sleep(0.5)  # polite delay

    domains = domain_cache[uniprot_id]
    mapped = map_position_to_domain(pos, domains)

    results.append(mapped)
    print(uniprot_id, "done", i + 1, "/", df.shape[0])

df["Domain"] = results

# Save new table
df.sort_values(by='Domain', inplace=True)
df.to_csv("uniprot_isoforms_with_domains_2.tsv", sep="\t", index=False)

A2RRD8 done 1 / 394
A6NFI3 done 2 / 394
A6NFI3 done 3 / 394
A6NFI3 done 4 / 394
A6NHJ4 done 5 / 394
A6NHJ4 done 6 / 394
A6NHJ4 done 7 / 394
A6NHJ4 done 8 / 394
A6NHJ4 done 9 / 394
A6NHJ4 done 10 / 394
A6NK53 done 11 / 394
A6NK53 done 12 / 394
A6NK53 done 13 / 394
A8MQ14 done 14 / 394
A8MQ14 done 15 / 394
A8MQ14 done 16 / 394
B2RXF5 done 17 / 394
B2RXF5 done 18 / 394
B2RXF5 done 19 / 394
B4DU55 done 20 / 394
B4DU55 done 21 / 394
B4DX44 done 22 / 394
B7Z6K7 done 23 / 394
B7Z6K7 done 24 / 394
B7Z6K7 done 25 / 394
B7Z6K7 done 26 / 394
B7Z6K7 done 27 / 394
B7Z6K7 done 28 / 394
B7Z6K7 done 29 / 394
O14901 done 30 / 394
O14948 done 31 / 394
O14978 done 32 / 394
O15015 done 33 / 394
O15015 done 34 / 394
O15015 done 35 / 394
O15015 done 36 / 394
O15209 done 37 / 394
O43361 done 38 / 394
O75123 done 39 / 394
O75132 done 40 / 394
O75437 done 41 / 394
O75626 done 42 / 394
O75820 done 43 / 394
O94983 done 44 / 394
O95231 done 45 / 394
O95365 done 46 / 394
O95780 done 47 / 394
O95780 done 48 / 394
O