In [122]:
import os
import pandas as pd

from Bio import ExPASy, SwissProt
import subprocess

In [123]:
# Work from the project folder so the relative file names used in the
# following cells resolve; the listing is shown as the cell output.
project_dir = "/home/enriquead/Documents/selflinks"
os.chdir(project_dir)
os.listdir()

['Endogenous_TREX_2_PPI_FDR_Links_xiFDR2.1.5.2.csv', 'prot', 'selflinks.ipynb']

In [124]:
# Load the link-level CSV export, report its row count, and preview the
# first rows as the cell output.
links_csv = "Endogenous_TREX_2_PPI_FDR_Links_xiFDR2.1.5.2.csv"
df = pd.read_csv(links_csv)
n_rows = len(df)
print(f'N rows: {n_rows}')
df.head()

N rows: 3125


Unnamed: 0,SearchIDs,LinkID,PeptidePairIDs,PSMIDs,Protein1,Description1,Decoy1,Protein2,Description2,Decoy2,...,top pep fdr,top psm fdr,Unnamed: 87,PPI id,LinkWindow1,LinkWindow2,TM_OM_Mods,TM_OM_ModWindow,TM_OM_RepresentiveProtein,TM_OM_RepresentiveModSite
0,17415;17416;17417,1785,1276;1244;6088;5963;6053;5899;5556;4554;5462;4...,29420428341;29420371428;29421477995;2943913218...,Q8NI27,THOC2_HUMAN THO complex subunit 2 OS=Homo sapi...,False,Q8NI27,THOC2_HUMAN THO complex subunit 2 OS=Homo sapi...,False,...,0.0,0.0,,194,SSSSIGSASKSDESSTEETDKSRERSQCGVKAVNKASSTTP,VASVQNGPGGGPSSSSIGSASKSDESSTEETDKSRERSQCG,,,,
1,17415;17416;17417,892,4305;4558;5063;5720;5685;5823;5185;4725;3311;4...,29421511135;29421511185;29421459897;2942183689...,P22626,ROA2_HUMAN Heterogeneous nuclear ribonucleopro...,False,P22626,ROA2_HUMAN Heterogeneous nuclear ribonucleopro...,False,...,0.0,0.0,,201,RGFGFVTFDDHDPVDKIVLQKYHTINGHNAEVRKALSRQEM,RQSGKKRGFGFVTFDDHDPVDKIVLQKYHTINGHNAEVRKA,,,,
2,17415;17416;17417,500,5956;5715;5471;5506;5317;5038;4822;3089;2914;2...,29421025728;29421315186;29421371613;2943914294...,Q8NI27,THOC2_HUMAN THO complex subunit 2 OS=Homo sapi...,False,Q8NI27,THOC2_HUMAN THO complex subunit 2 OS=Homo sapi...,False,...,0.0,0.0,,194,KEKTPATTPEARVLGKDGKEKPKEERPNKDEKARETKERTP,KEKKEKTPATTPEARVLGKDGKEKPKEERPNKDEKARETKE,,,,
3,17415;17417,2543,5880;5572;6296;6292;1403;6235;6205;6158;6057;5...,29443183429;29421059502;29421003473;2942135130...,Q6I9Y2,THOC7_HUMAN THO complex subunit 7 homolog OS=H...,False,Q6I9Y2,THOC7_HUMAN THO complex subunit 7 homolog OS=H...,False,...,0.0,0.0,,233,FHVLLSTIHELQQTLENDEKLSEVEEAQEASMETDPKP...,ELEHLSHIKESVEDKLELRRKQFHVLLSTIHELQQTLENDE,,,,
4,17415;17416;17417,1896,1378;3223;6288;6223;6190;6154;6124;5855;5208;4...,29443584828;29421635711;29420546161;2942158846...,P09651,ROA1_HUMAN Heterogeneous nuclear ribonucleopro...,False,P09651,ROA1_HUMAN Heterogeneous nuclear ribonucleopro...,False,...,0.0,0.0,,614,AKPRNQGGYGGSSSSSSYGSGRRF.................,GGNFGGRSSGPYGGGGQYFAKPRNQGGYGGSSSSSSYGSGR,,,,


We are only interested in self-links, so we keep only the links where Protein1 is equal to Protein2.

In [125]:
# Keep only rows whose two link ends sit on the same protein, then collect
# the distinct proteins that carry at least one such self-link.
is_self_link = df['Protein1'].eq(df['Protein2'])
df = df.loc[is_self_link]
prot = df['Protein1'].unique()
n_proteins, n_self_links = len(prot), len(df)
print(f'There are {n_proteins} unique proteins and {n_self_links} self-links')

There are 144 unique proteins and 2885 self-links


In [126]:
# Sanity check: number of self-links for the first five proteins.
for i in prot[0:5]:
    n_links = int((df.Protein1 == i).sum())
    print(f'There are {n_links} links on protein {i}')

There are 306 links on protein Q8NI27
There are 33 links on protein P22626
There are 25 links on protein Q6I9Y2
There are 29 links on protein P09651
There are 75 links on protein Q86V81


In [127]:
def get_alphafold_download_link(uniprot_id, version=2):
    """Build the AlphaFold DB download URL for a protein's F1 PDB model.

    Parameters
    ----------
    uniprot_id : str
        UniProt accession, inserted verbatim into the URL.
    version : int or str, optional
        Model version number placed after ``model_v`` in the file name.
        Defaults to 2, which reproduces the URL this notebook has always
        used. NOTE(review): AlphaFold DB publishes newer model versions and
        may stop serving old ones -- confirm which version is wanted.

    Returns
    -------
    str
        URL of the form
        ``https://alphafold.ebi.ac.uk/files/AF-<id>-F1-model_v<version>.pdb``.
    """
    link_pattern = 'https://alphafold.ebi.ac.uk/files/AF-{}-F1-model_v{}.pdb'
    return link_pattern.format(uniprot_id, version)

def download_alphafold_prediction(uniprot_id, gene_name):
    """Download the AlphaFold model for ``uniprot_id`` with ``wget``.

    The file is saved as ``<gene_name>.pdb`` relative to the current working
    directory.

    Returns
    -------
    subprocess.CompletedProcess
        The finished ``wget`` call; its ``returncode`` attribute is 0 when
        the command succeeded. A failing download does not raise here.
    """
    output_file = f'{gene_name}.pdb'
    wget_args = ['wget', get_alphafold_download_link(uniprot_id), '-O', output_file]
    return subprocess.run(wget_args)


In [128]:
# For every protein with self-links: write its crosslink list, fetch the
# UniProt record and the AlphaFold model, score the model with xlms-tools,
# and finally merge all per-protein score tables into one TSV.
os.chdir("/home/enriquead/Documents/selflinks/prot")
# Remember the output root so the loop always returns here, even after an error.
prot_root = os.getcwd()
score_frames = []  # collected per protein, concatenated once after the loop
for i in prot:
    df_here = df[df.Protein1==i]

    # Crosslink table written for xlms-tools: res1|chain1|res2|chain2.
    # Both ends are placed on chain "A".
    df2 = pd.DataFrame({
        'res1': df_here["fromSite"],
        'chain1': "A",
        'res2': df_here["ToSite"],
        'chain2': "A",
    })

    # exist_ok makes the cell re-runnable; a plain mkdir raises
    # FileExistsError on the second run.
    os.makedirs(i, exist_ok=True)
    os.chdir(i)
    try:
        df2.to_csv(f"{i}-selflink.txt", sep="|", index=False, header=False)

        handle = ExPASy.get_sprot_raw(i)
        try:
            data = SwissProt.read(handle)
        finally:
            handle.close()

        accession = data.accessions[0]
        # Fall back to the accession when the record carries no gene 'Name'
        # (the original lookup would raise on such a record).
        gene_entry = data.gene_name[0] if data.gene_name else {}
        gene = gene_entry.get('Name') or accession

        download = download_alphafold_prediction(accession, gene)
        if download.returncode != 0:
            raise RuntimeError(
                f'Download of the AlphaFold model for {accession} ({gene}) failed '
                f'with exit code {download.returncode}'
            )

        print(gene)
        print(accession)

        # Argument lists instead of shell strings: no quoting problems if a
        # name contains shell metacharacters. check=True stops the run on a
        # scoring failure, so a stale xlms_out_scores.tsv left by an earlier
        # run is never read.
        subprocess.run(
            ['xlms-tools', '-m', 'score', '-l', f'{i}-selflink.txt',
             f'{gene}.pdb', '--name', 'xlms_out'],
            check=True,
        )
        # Best-effort clean-up of two display commands in the ChimeraX script;
        # a sed failure is ignored, as before.
        subprocess.run(['sed', '-i', '/graphics silhouettes true/d', 'xlms_out.cxc'])
        subprocess.run(['sed', '-i', '/cartoon style modeHelix tube/d', 'xlms_out.cxc'])

        score_frames.append(pd.read_csv("xlms_out_scores.tsv", sep="\t"))
    finally:
        os.chdir(prot_root)

all_scores = pd.concat(score_frames) if score_frames else pd.DataFrame()
all_scores.to_csv("All_prot_scores.tsv", sep="\t", index=False)
    

--2024-02-22 12:30:01--  https://alphafold.ebi.ac.uk/files/AF-Q8NI27-F1-model_v2.pdb
Resolving alphafold.ebi.ac.uk (alphafold.ebi.ac.uk)... 34.149.152.8
Connecting to alphafold.ebi.ac.uk (alphafold.ebi.ac.uk)|34.149.152.8|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [application/octet-stream]
Saving to: 'THOC2.pdb'

     0K .......... .......... .......... .......... .......... 2.55M
    50K .......... .......... .......... .......... .......... 2.74M
   100K .......... .......... .......... .......... .......... 2.79M
   150K .......... .......... .......... .......... .......... 5.44M
   200K .......... .......... .......... .......... .......... 4.45M
   250K .......... .......... .......... .......... .......... 7.22M
   300K .......... .......... .......... .......... .......... 2.99M
   350K .......... .......... .......... .......... .......... 6.96M
   400K .......... .......... .......... .......... .......... 7.23M
   450K .......... .

THOC2
Q8NI27

[1mxlms-tools[0m: a software suite for modeling protein structures
            with crosslinking mass spectrometry data

Scoring models...
MODEL: THOC2.pdb
-- Building space-filling model: 0.053s
-- Calculating residue depths: 0.486s
-- Computing XLP/MP scores: 0.801s

***

BEST SCORING MODEL: THOC2.pdb
XLP/MP scores are in xlms_out_scores.tsv
Open xlms_out.cxc with ChimeraX to visualize the models with crosslinks
Total time elapsed: 0.802s


If you use xlms-tools, please cite:
Manalastas-Cantos, K., Adoni, K. R., Pfeifer, M., Märtens, B., Grünewald, K., Thalassinos, K., & Topf, M. (2024). Modeling flexible protein structure with AlphaFold2 and cross-linking mass spectrometry. Molecular & Cellular Proteomics. https://doi.org/10.1016/j.mcpro.2024.100724



--2024-02-22 12:30:03--  https://alphafold.ebi.ac.uk/files/AF-P22626-F1-model_v2.pdb
Resolving alphafold.ebi.ac.uk (alphafold.ebi.ac.uk)... 34.149.152.8
Connecting to alphafold.ebi.ac.uk (alphafold.ebi.ac.uk)|34.149.152.8|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 220400 (215K) [application/octet-stream]
Saving to: 'HNRNPA2B1.pdb'

     0K .......... .......... .......... .......... .......... 23% 2.51M 0s
    50K .......... .......... .......... .......... .......... 46% 2.80M 0s
   100K .......... .......... .......... .......... .......... 69% 2.83M 0s
   150K .......... .......... .......... .......... .......... 92% 6.61M 0s
   200K .......... .....                                      100% 6.14M=0.06s

2024-02-22 12:30:04 (3.29 MB/s) - 'HNRNPA2B1.pdb' saved [220400/220400]



HNRNPA2B1
P22626

[1mxlms-tools[0m: a software suite for modeling protein structures
            with crosslinking mass spectrometry data

Scoring models...
MODEL: HNRNPA2B1.pdb
-- Building space-filling model: 0.012s
-- Calculating residue depths: 0.201s
-- Computing XLP/MP scores: 0.261s

***

BEST SCORING MODEL: HNRNPA2B1.pdb
XLP/MP scores are in xlms_out_scores.tsv
Open xlms_out.cxc with ChimeraX to visualize the models with crosslinks
Total time elapsed: 0.261s


If you use xlms-tools, please cite:
Manalastas-Cantos, K., Adoni, K. R., Pfeifer, M., Märtens, B., Grünewald, K., Thalassinos, K., & Topf, M. (2024). Modeling flexible protein structure with AlphaFold2 and cross-linking mass spectrometry. Molecular & Cellular Proteomics. https://doi.org/10.1016/j.mcpro.2024.100724



--2024-02-22 12:30:05--  https://alphafold.ebi.ac.uk/files/AF-Q6I9Y2-F1-model_v2.pdb
Resolving alphafold.ebi.ac.uk (alphafold.ebi.ac.uk)... 34.149.152.8
Connecting to alphafold.ebi.ac.uk (alphafold.ebi.ac.uk)|34.149.152.8|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 139886 (137K) [application/octet-stream]
Saving to: 'THOC7.pdb'

     0K .......... .......... .......... .......... .......... 36% 2.53M 0s
    50K .......... .......... .......... .......... .......... 73% 2.81M 0s
   100K .......... .......... .......... ......               100% 2.42M=0.05s

2024-02-22 12:30:05 (2.59 MB/s) - 'THOC7.pdb' saved [139886/139886]



THOC7
Q6I9Y2

[1mxlms-tools[0m: a software suite for modeling protein structures
            with crosslinking mass spectrometry data

Scoring models...
MODEL: THOC7.pdb
-- Building space-filling model: 0.008s
-- Calculating residue depths: 0.174s
-- Computing XLP/MP scores: 0.207s

***

BEST SCORING MODEL: THOC7.pdb
XLP/MP scores are in xlms_out_scores.tsv
Open xlms_out.cxc with ChimeraX to visualize the models with crosslinks
Total time elapsed: 0.207s


If you use xlms-tools, please cite:
Manalastas-Cantos, K., Adoni, K. R., Pfeifer, M., Märtens, B., Grünewald, K., Thalassinos, K., & Topf, M. (2024). Modeling flexible protein structure with AlphaFold2 and cross-linking mass spectrometry. Molecular & Cellular Proteomics. https://doi.org/10.1016/j.mcpro.2024.100724



--2024-02-22 12:30:06--  https://alphafold.ebi.ac.uk/files/AF-P09651-F1-model_v2.pdb
Resolving alphafold.ebi.ac.uk (alphafold.ebi.ac.uk)... 34.149.152.8
Connecting to alphafold.ebi.ac.uk (alphafold.ebi.ac.uk)|34.149.152.8|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 227852 (223K) [application/octet-stream]
Saving to: 'HNRNPA1.pdb'

     0K .......... .......... .......... .......... .......... 22% 2.73M 0s
    50K .......... .......... .......... .......... .......... 44% 2.77M 0s
   100K .......... .......... .......... .......... .......... 67% 2.89M 0s
   150K .......... .......... .......... .......... .......... 89% 6.62M 0s
   200K .......... .......... ..                              100% 5.48M=0.06s

2024-02-22 12:30:06 (3.41 MB/s) - 'HNRNPA1.pdb' saved [227852/227852]



HNRNPA1
P09651

[1mxlms-tools[0m: a software suite for modeling protein structures
            with crosslinking mass spectrometry data

Scoring models...
MODEL: HNRNPA1.pdb
-- Building space-filling model: 0.013s
-- Calculating residue depths: 0.189s
-- Computing XLP/MP scores: 0.247s

***

BEST SCORING MODEL: HNRNPA1.pdb
XLP/MP scores are in xlms_out_scores.tsv
Open xlms_out.cxc with ChimeraX to visualize the models with crosslinks
Total time elapsed: 0.248s


If you use xlms-tools, please cite:
Manalastas-Cantos, K., Adoni, K. R., Pfeifer, M., Märtens, B., Grünewald, K., Thalassinos, K., & Topf, M. (2024). Modeling flexible protein structure with AlphaFold2 and cross-linking mass spectrometry. Molecular & Cellular Proteomics. https://doi.org/10.1016/j.mcpro.2024.100724



--2024-02-22 12:30:07--  https://alphafold.ebi.ac.uk/files/AF-Q86V81-F1-model_v2.pdb
Resolving alphafold.ebi.ac.uk (alphafold.ebi.ac.uk)... 34.149.152.8
Connecting to alphafold.ebi.ac.uk (alphafold.ebi.ac.uk)|34.149.152.8|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 158435 (155K) [application/octet-stream]
Saving to: 'ALYREF.pdb'

     0K .......... .......... .......... .......... .......... 32% 2.62M 0s
    50K .......... .......... .......... .......... .......... 64% 2.19M 0s
   100K .......... .......... .......... .......... .......... 96% 2.92M 0s
   150K ....                                                  100% 5.99M=0.06s

2024-02-22 12:30:07 (2.59 MB/s) - 'ALYREF.pdb' saved [158435/158435]



ALYREF
Q86V81

[1mxlms-tools[0m: a software suite for modeling protein structures
            with crosslinking mass spectrometry data

Scoring models...
MODEL: ALYREF.pdb
-- Building space-filling model: 0.008s
-- Calculating residue depths: 0.162s
-- Computing XLP/MP scores: 0.199s

***

BEST SCORING MODEL: ALYREF.pdb
XLP/MP scores are in xlms_out_scores.tsv
Open xlms_out.cxc with ChimeraX to visualize the models with crosslinks
Total time elapsed: 0.199s


If you use xlms-tools, please cite:
Manalastas-Cantos, K., Adoni, K. R., Pfeifer, M., Märtens, B., Grünewald, K., Thalassinos, K., & Topf, M. (2024). Modeling flexible protein structure with AlphaFold2 and cross-linking mass spectrometry. Molecular & Cellular Proteomics. https://doi.org/10.1016/j.mcpro.2024.100724



In [129]:
# Used in case of ";" within "ToSite".
# Disabled variant of the crosslink export: it splits every "ToSite" entry on
# ";" and repeats the matching "fromSite" value once per split piece, so each
# from/to pair is written on its own row of "<protein>-selflink.csv".
# The code is wrapped in a string literal, so nothing here is executed; the
# cell merely displays the string.

'''for i in prot[0:5]:
    df_here = df[df.Protein1==i]
    df2=pd.DataFrame()

    b2=[prot_group.split(';') for prot_group in df_here["ToSite"]]
    b1=[len(i)*[df_here["fromSite"].to_list()[n]] for n,i in enumerate(b2)]

    df2['res1']=[prot_list for prot_lists in b1 for prot_list in prot_lists]
    df2['chain1']="A"
    df2['res2']=[prot_list for prot_lists in b2 for prot_list in prot_lists]
    df2['chain2']="A"
    df2.to_csv(f"{i}-selflink.csv", sep="|", index=False, header=False)'''

'for i in prot[0:5]:\n    df_here = df[df.Protein1==i]\n    df2=pd.DataFrame()\n\n    b2=[prot_group.split(\';\') for prot_group in df_here["ToSite"]]\n    b1=[len(i)*[df_here["fromSite"].to_list()[n]] for n,i in enumerate(b2)]\n\n    df2[\'res1\']=[prot_list for prot_lists in b1 for prot_list in prot_lists]\n    df2[\'chain1\']="A"\n    df2[\'res2\']=[prot_list for prot_lists in b2 for prot_list in prot_lists]\n    df2[\'chain2\']="A"\n    df2.to_csv(f"{i}-selflink.csv", sep="|", index=False, header=False)'