In [1]:
import pandas as pd

# Load TFs file
df_tfs = pd.read_csv('transcription_factors.csv')
df_tfs.head()

Unnamed: 0,transcription_factors
0,NUPR1
1,E2F4
2,TCF3
3,ZFP36
4,CBX5


In [2]:
# Sort TFs
df_tfs = df_tfs.sort_values('transcription_factors', ascending=True).reset_index(drop=True)
df_tfs.head()

Unnamed: 0,transcription_factors
0,ARNT
1,CBX5
2,CREM
3,DACH1
4,E2F4


In [3]:
import requests, sys

uniprot_ids = []
for tf in df_tfs['transcription_factors']:

  params = {
    "query": f"{tf} AND reviewed:true AND gene_exact:{tf}",
    "fields": [
      "accession",
      "protein_name"
    ]
  }
  headers = {
    "accept": "application/json"
  }
  base_url = "https://rest.uniprot.org/uniprotkb/search?query=(organism_id:9606)"

  response = requests.get(base_url, headers=headers, params=params)
  if not response.ok:
    response.raise_for_status()
    sys.exit()

  data = response.json()
  accs = data["results"][0]["primaryAccession"]
  uniprot_ids.append(accs)
  print(f"Gene: {tf} --> UniProt ID: {accs}")

Gene: ARNT --> UniProt ID: P27540
Gene: CBX5 --> UniProt ID: P45973
Gene: CREM --> UniProt ID: Q03060
Gene: DACH1 --> UniProt ID: Q9UI36
Gene: E2F4 --> UniProt ID: Q16254
Gene: EPAS1 --> UniProt ID: Q99814
Gene: ETV3 --> UniProt ID: P41162
Gene: HOXD3 --> UniProt ID: P31249
Gene: KLF11 --> UniProt ID: O14901
Gene: LEF1 --> UniProt ID: Q9UJU2
Gene: MXD1 --> UniProt ID: Q05195
Gene: NOTCH1 --> UniProt ID: P46531
Gene: NUPR1 --> UniProt ID: O60356
Gene: RBL1 --> UniProt ID: P28749
Gene: SIX2 --> UniProt ID: Q9NPC8
Gene: SMARCB1 --> UniProt ID: Q12824
Gene: TCF3 --> UniProt ID: Q9HCS4
Gene: TRIM24 --> UniProt ID: O15164
Gene: TRPS1 --> UniProt ID: Q9UHF7
Gene: ZFP36 --> UniProt ID: P26651


In [4]:
df_tfs['UniProt_ID'] = uniprot_ids
df_tfs.head()

Unnamed: 0,transcription_factors,UniProt_ID
0,ARNT,P27540
1,CBX5,P45973
2,CREM,Q03060
3,DACH1,Q9UI36
4,E2F4,Q16254


In [5]:
# Extract 3D structure files from AlphaFold

import os
os.makedirs("pdbs", exist_ok=True)

af_ids = []
for id in df_tfs['UniProt_ID']:

    af_url = f"https://alphafold.ebi.ac.uk/api/prediction/{id}?key=AIzaSyCeurAJz7ZGjPQUtEaerUkBZ3TaBkXrY94"

    response = requests.get(af_url)
    if not response.ok:
        response.raise_for_status()
        sys.exit()

    data = response.json()
    af_id = data[0]["entryId"]
    af_ids.append(af_id)
    print(f"UniProt ID: {id} --> AlphaFold ID: {af_id}")

    pdb_url = data[0]["pdbUrl"]
    output_path = os.path.join("pdbs", f"{af_id}.pdb")

    pdb_resp = requests.get(pdb_url)
    if pdb_resp.ok:
        with open(output_path, "wb") as f:
            f.write(pdb_resp.content)
        print(f"Downloaded to {output_path}")
    else:
        print(f"Failed to download PDB for {af_id}: HTTP {pdb_resp.status_code}")

UniProt ID: P27540 --> AlphaFold ID: AF-P27540-F1
Downloaded to pdbs/AF-P27540-F1.pdb
UniProt ID: P45973 --> AlphaFold ID: AF-P45973-F1
Downloaded to pdbs/AF-P45973-F1.pdb
UniProt ID: Q03060 --> AlphaFold ID: AF-Q03060-F1
Downloaded to pdbs/AF-Q03060-F1.pdb
UniProt ID: Q9UI36 --> AlphaFold ID: AF-Q9UI36-F1
Downloaded to pdbs/AF-Q9UI36-F1.pdb
UniProt ID: Q16254 --> AlphaFold ID: AF-Q16254-F1
Downloaded to pdbs/AF-Q16254-F1.pdb
UniProt ID: Q99814 --> AlphaFold ID: AF-Q99814-F1
Downloaded to pdbs/AF-Q99814-F1.pdb
UniProt ID: P41162 --> AlphaFold ID: AF-P41162-F1
Downloaded to pdbs/AF-P41162-F1.pdb
UniProt ID: P31249 --> AlphaFold ID: AF-P31249-F1
Downloaded to pdbs/AF-P31249-F1.pdb
UniProt ID: O14901 --> AlphaFold ID: AF-O14901-F1
Downloaded to pdbs/AF-O14901-F1.pdb
UniProt ID: Q9UJU2 --> AlphaFold ID: AF-Q9UJU2-F1
Downloaded to pdbs/AF-Q9UJU2-F1.pdb
UniProt ID: Q05195 --> AlphaFold ID: AF-Q05195-F1
Downloaded to pdbs/AF-Q05195-F1.pdb
UniProt ID: P46531 --> AlphaFold ID: AF-P46531-F1
Down

In [6]:
# Add the AF Ids to the dataframe

df_tfs['AlphaFold ID'] = af_ids
df_tfs.head()

Unnamed: 0,transcription_factors,UniProt_ID,AlphaFold ID
0,ARNT,P27540,AF-P27540-F1
1,CBX5,P45973,AF-P45973-F1
2,CREM,Q03060,AF-Q03060-F1
3,DACH1,Q9UI36,AF-Q9UI36-F1
4,E2F4,Q16254,AF-Q16254-F1
