In [28]:
import io
import requests
import pandas as pd

def get_uniprot(prot_list, search_fields=['accession', 'id', 'protein_name', 'gene_names', 'go', 'go_f' ,'go_f', 'go_c', 'go_p', 'cc_interaction']):
    """ Get data from Uniprot for a list of proteins

    Args:
        prot_list (list): list of protein IDs
        search_fields (list): list of fields to search for

    Returns:
        pandas.DataFrame: DataFrame with the results
    
    Example:
        >>> uniprot_list = ["P40925", "P40926"]
        >>> df = get_uniprot(uniprot_list)
    """

    base_url = 'https://rest.uniprot.org/uniprotkb/stream'
    fields = "%2C".join(search_fields)
    query_parts = ["%28accession%3A" + id + "%29" for id in prot_list]
    query = "+OR+".join(query_parts)
    query = "%28" + query + "%29"
    format_type = 'tsv'
    
    # full url
    url = f'{base_url}?fields={fields}&format={format_type}&query={query}'
    
    results = requests.get(url)
    results.raise_for_status()
    
    df = pd.read_csv(io.StringIO(results.text), sep='\t')
    return df

In [29]:
# Example usage:
uniprot_list = ["P40925", "P40926"]
df = get_uniprot(uniprot_list)
df

Unnamed: 0,Entry,Entry Name,Protein names,Gene Names,Gene Ontology (GO),Gene Ontology (molecular function),Gene Ontology (molecular function).1,Gene Ontology (cellular component),Gene Ontology (biological process),Interacts with
0,P40925,MDHC_HUMAN,"Malate dehydrogenase, cytoplasmic (EC 1.1.1.37...",MDH1 MDHA,centrosome [GO:0005813]; cytoplasm [GO:0005737...,diiodophenylpyruvate reductase activity [GO:00...,diiodophenylpyruvate reductase activity [GO:00...,centrosome [GO:0005813]; cytoplasm [GO:0005737...,gluconeogenesis [GO:0006094]; malate metabolic...,P54274
1,P40926,MDHM_HUMAN,"Malate dehydrogenase, mitochondrial (EC 1.1.1.37)",MDH2,cytoplasm [GO:0005737]; cytosol [GO:0005829]; ...,L-malate dehydrogenase activity [GO:0030060]; ...,L-malate dehydrogenase activity [GO:0030060]; ...,cytoplasm [GO:0005737]; cytosol [GO:0005829]; ...,aerobic respiration [GO:0009060]; gluconeogene...,


In [35]:
accession_list = ["P49411",
"P11586",
"Q9P2E9-1",
"Q9H307",
"Q92616",
"Q9NX58",
"Q99873",
"O43493-1",
"Q10567",
"O43169",
"O00170",
"O75477",
"Q15165-1"
]

to_list = x.var_names.tolist()

df = get_uniprot(accession_list, search_fields = ['accession','protein_name','id','gene_names'])

df

Unnamed: 0,Entry,Protein names,Entry Name,Gene Names
0,O00170,AH receptor-interacting protein (AIP) (Aryl-hy...,AIP_HUMAN,AIP XAP2
1,O43169,Cytochrome b5 type B (Cytochrome b5 outer mito...,CYB5B_HUMAN,CYB5B CYB5M OMB5
2,O75477,Erlin-1 (Endoplasmic reticulum lipid raft-asso...,ERLN1_HUMAN,ERLIN1 C10orf69 KE04 KEO4 SPFH1
3,P11586,"C-1-tetrahydrofolate synthase, cytoplasmic (C1...",C1TC_HUMAN,MTHFD1 MTHFC MTHFD
4,P49411,"Elongation factor Tu, mitochondrial (EF-Tu) (P43)",EFTU_HUMAN,TUFM
5,Q10567,AP-1 complex subunit beta-1 (Adaptor protein c...,AP1B1_HUMAN,AP1B1 ADTB1 BAM22 CLAPB2
6,Q92616,Stalled ribosome sensor GCN1 (GCN1 eIF-2-alpha...,GCN1_HUMAN,GCN1 GCN1L1 KIAA0219
7,Q99873,Protein arginine N-methyltransferase 1 (EC 2.1...,ANM1_HUMAN,PRMT1 HMT2 HRMT1L2 IR1B4
8,Q9H307,Pinin (140 kDa nuclear and cell adhesion-relat...,PININ_HUMAN,PNN DRS MEMA
9,Q9NX58,Cell growth-regulating nucleolar protein,LYAR_HUMAN,LYAR PNAS-5


In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import KNNImputer
import import_functions
import importlib
import gseapy as gp
# from matplotlib_venn import venn2, venn3

importlib.reload(import_functions)

<module 'import_functions' from 'g:\\Users\\srpang\\Dropbox (Personal)\\4. Caltech Work\\Research\\Chou-Roukes Group\\scviz_git\\dev\\import_functions.py'>

In [5]:
sample_name = 'windy_proteins'
df = pd.read_excel('windy_proteins.xlsx')
abundance_col = [col for col in df.columns if "Abundance:" in col]
data = df[abundance_col]
data.index = df['Accession']

data

Unnamed: 0_level_0,"Abundance: F93: Sample, 912_R155H_selleck_myc","Abundance: F94: Sample, 912_R155H_selleck_myc","Abundance: F95: Sample, 912_R155H_selleck_myc","Abundance: F96: Sample, 932_L229F_selleck_myc","Abundance: F97: Sample, 932_L229F_selleck_myc","Abundance: F98: Sample, 932_L229F_selleck_myc","Abundance: F99: Sample, 933_K251R_selleck_myc","Abundance: F100: Sample, 933_K251R_selleck_myc","Abundance: F113: Sample, 933_K251R_selleck_myc","Abundance: F110: Sample, 934_R256G_selleck_myc","Abundance: F111: Sample, 934_R256G_selleck_myc","Abundance: F112: Sample, 934_R256G_selleck_myc","Abundance: F107: Sample, 942_WT_selleck_myc","Abundance: F108: Sample, 942_WT_selleck_myc","Abundance: F109: Sample, 942_WT_selleck_myc"
Accession,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
P35579,8.643036e+08,1.395312e+09,4.173330e+09,44908876.0,6.500436e+08,1.420351e+09,2.454647e+08,6.090529e+08,6.199970e+08,1.443528e+09,1.733447e+09,8.452460e+08,1.250538e+09,5.932175e+08,5.522489e+08
U00002,1.106318e+08,2.944802e+07,2.956049e+08,,,,,,,,,,,,
U00023,2.871359e+06,1.686397e+06,8.005790e+06,,2.535884e+06,,9.360203e+08,5.854144e+08,3.851648e+08,,,,,2.882335e+06,
U00024,,,,,,,,,,4.223449e+06,8.069055e+06,9.039224e+06,,,
U00022,,,,107201703.5,3.227875e+08,3.730005e+08,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Q96IF1,,,,,,,,,,,,,,,
Q9BVI0,,,,,,,,,,,,,,,
P51610,,,,,,,,,,,,,,,
Q92747,,,,,,,,,,,,,,,


In [6]:
df_peptide = pd.read_excel('windy_peptides.xlsx')
df_peptide = df_peptide[df_peptide['Master Protein Accessions'].str.contains('U000')]
df_peptide = df_peptide[abundance_col]

df_peptide

Unnamed: 0,"Abundance: F93: Sample, 912_R155H_selleck_myc","Abundance: F94: Sample, 912_R155H_selleck_myc","Abundance: F95: Sample, 912_R155H_selleck_myc","Abundance: F96: Sample, 932_L229F_selleck_myc","Abundance: F97: Sample, 932_L229F_selleck_myc","Abundance: F98: Sample, 932_L229F_selleck_myc","Abundance: F99: Sample, 933_K251R_selleck_myc","Abundance: F100: Sample, 933_K251R_selleck_myc","Abundance: F113: Sample, 933_K251R_selleck_myc","Abundance: F110: Sample, 934_R256G_selleck_myc","Abundance: F111: Sample, 934_R256G_selleck_myc","Abundance: F112: Sample, 934_R256G_selleck_myc","Abundance: F107: Sample, 942_WT_selleck_myc","Abundance: F108: Sample, 942_WT_selleck_myc","Abundance: F109: Sample, 942_WT_selleck_myc"
166,2.348370e+07,,3.272001e+08,1.110369e+06,2.268687e+08,2.246688e+08,375488824.0,492043152.0,3.325676e+08,302252544.0,384299160.0,348899176.0,4.272131e+08,3.183970e+08,2.680555e+08
283,7.178946e+06,10466466.0,3.055862e+07,,,4.728290e+06,6654563.5,21160156.0,2.629684e+06,4166662.0,11474726.0,7447425.5,2.279380e+07,1.578168e+07,1.294692e+07
284,8.894014e+05,,,5.921109e+06,,,,41228824.0,8.190450e+06,12501115.0,90295895.0,18464904.0,9.987897e+07,3.159075e+07,7.333070e+07
285,8.226335e+07,108208631.5,3.590826e+08,2.924426e+05,8.572110e+07,8.021946e+07,248477785.0,264619464.5,1.477150e+08,99137321.0,90241130.0,119596492.0,9.361442e+07,1.240780e+08,3.239026e+07
286,8.456583e+07,5956713.5,2.756393e+08,,,1.692957e+06,92680543.5,11501035.0,2.411552e+08,43046102.5,7549501.5,8368591.0,3.794006e+07,1.074411e+07,3.156084e+06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30046,2.069364e+07,9567928.0,1.035252e+07,,1.061962e+08,,260364944.0,,2.472851e+08,91529512.0,262839968.0,93449720.0,1.120373e+08,,
30589,2.077487e+08,210401200.0,2.110277e+08,,1.857582e+06,2.984704e+08,545610307.5,529360448.0,2.521118e+08,332684154.0,274142735.5,518694421.5,,4.725831e+08,5.542678e+05
31238,,,,,2.438165e+07,,,145072192.0,,,,,7.849360e+07,,
31726,6.971832e+06,12471447.0,1.951913e+07,,9.813667e+06,1.401151e+07,21747390.0,43131868.0,2.587749e+07,11940443.0,15783661.0,18640800.0,2.057111e+07,1.808130e+07,1.194629e+07


In [4]:
df_peptide[abundance_col]

Unnamed: 0,"Abundance: F93: Sample, 912_R155H_selleck_myc","Abundance: F94: Sample, 912_R155H_selleck_myc","Abundance: F95: Sample, 912_R155H_selleck_myc","Abundance: F96: Sample, 932_L229F_selleck_myc","Abundance: F97: Sample, 932_L229F_selleck_myc","Abundance: F98: Sample, 932_L229F_selleck_myc","Abundance: F99: Sample, 933_K251R_selleck_myc","Abundance: F100: Sample, 933_K251R_selleck_myc","Abundance: F113: Sample, 933_K251R_selleck_myc","Abundance: F110: Sample, 934_R256G_selleck_myc","Abundance: F111: Sample, 934_R256G_selleck_myc","Abundance: F112: Sample, 934_R256G_selleck_myc","Abundance: F107: Sample, 942_WT_selleck_myc","Abundance: F108: Sample, 942_WT_selleck_myc","Abundance: F109: Sample, 942_WT_selleck_myc"
166,2.348370e+07,,3.272001e+08,1.110369e+06,2.268687e+08,2.246688e+08,375488824.0,492043152.0,3.325676e+08,302252544.0,384299160.0,348899176.0,4.272131e+08,3.183970e+08,2.680555e+08
283,7.178946e+06,10466466.0,3.055862e+07,,,4.728290e+06,6654563.5,21160156.0,2.629684e+06,4166662.0,11474726.0,7447425.5,2.279380e+07,1.578168e+07,1.294692e+07
284,8.894014e+05,,,5.921109e+06,,,,41228824.0,8.190450e+06,12501115.0,90295895.0,18464904.0,9.987897e+07,3.159075e+07,7.333070e+07
285,8.226335e+07,108208631.5,3.590826e+08,2.924426e+05,8.572110e+07,8.021946e+07,248477785.0,264619464.5,1.477150e+08,99137321.0,90241130.0,119596492.0,9.361442e+07,1.240780e+08,3.239026e+07
286,8.456583e+07,5956713.5,2.756393e+08,,,1.692957e+06,92680543.5,11501035.0,2.411552e+08,43046102.5,7549501.5,8368591.0,3.794006e+07,1.074411e+07,3.156084e+06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30046,2.069364e+07,9567928.0,1.035252e+07,,1.061962e+08,,260364944.0,,2.472851e+08,91529512.0,262839968.0,93449720.0,1.120373e+08,,
30589,2.077487e+08,210401200.0,2.110277e+08,,1.857582e+06,2.984704e+08,545610307.5,529360448.0,2.521118e+08,332684154.0,274142735.5,518694421.5,,4.725831e+08,5.542678e+05
31238,,,,,2.438165e+07,,,145072192.0,,,,,7.849360e+07,,
31726,6.971832e+06,12471447.0,1.951913e+07,,9.813667e+06,1.401151e+07,21747390.0,43131868.0,2.587749e+07,11940443.0,15783661.0,18640800.0,2.057111e+07,1.808130e+07,1.194629e+07


In [7]:
norm_factor = df_peptide[abundance_col].mean()
norm_factor = norm_factor.divide(norm_factor.mean())
print(norm_factor)
data = data.divide(norm_factor)
data.to_csv('normalized.csv')


Abundance: F93: Sample, 912_R155H_selleck_myc     0.863385
Abundance: F94: Sample, 912_R155H_selleck_myc     0.697731
Abundance: F95: Sample, 912_R155H_selleck_myc     0.871064
Abundance: F96: Sample, 932_L229F_selleck_myc     0.207558
Abundance: F97: Sample, 932_L229F_selleck_myc     0.754949
Abundance: F98: Sample, 932_L229F_selleck_myc     0.913681
Abundance: F99: Sample, 933_K251R_selleck_myc     1.727992
Abundance: F100: Sample, 933_K251R_selleck_myc    1.574356
Abundance: F113: Sample, 933_K251R_selleck_myc    1.190413
Abundance: F110: Sample, 934_R256G_selleck_myc    0.957734
Abundance: F111: Sample, 934_R256G_selleck_myc    1.175580
Abundance: F112: Sample, 934_R256G_selleck_myc    1.249104
Abundance: F107: Sample, 942_WT_selleck_myc       1.088082
Abundance: F108: Sample, 942_WT_selleck_myc       1.144041
Abundance: F109: Sample, 942_WT_selleck_myc       0.584330
dtype: float64


In [8]:
data = data[data.isna().sum(axis=1) < 5]
# data_reset = data.reset_index().rename(columns={'Accession': 'Proteins'})
data_reset = data.reset_index()
# 3. Reshape Data into Long Form
df_long = data_reset.melt(id_vars = ['Accession'], var_name='Sample', value_name='Abundance')

In [9]:
df_long['sample_id'] = df_long['Sample'].str.extract(r'.*,\s(.*?)$')
df_long['File'] = df_long['Sample']
df.index = df['Accession']


In [10]:
u2g = df['Gene Symbol'].transpose().to_dict()
print(u2g)


{'P35579': 'MYH9', 'U00002': nan, 'U00023': nan, 'U00024': nan, 'U00022': nan, 'U00031': nan, 'U00025': nan, 'O00571': 'DDX3X', 'P49327': 'FASN', 'P35580': 'MYH10', 'P31943': 'HNRNPH1', 'P0DMV9': 'HSPA1B', 'Q92499': 'DDX1', 'Q9BUJ2': 'HNRNPUL1', 'P11940': 'PABPC1', 'P19338': 'NCL', 'P11142': 'HSPA8', 'Q15233': 'NONO', 'P22626': 'HNRNPA2B1', 'P14618': 'PKM', 'O15027': 'SEC16A', 'P68371': 'TUBB4B', 'P07437': 'TUBB', 'P60709': 'ACTB', 'Q13310': 'PABPC4', 'Q9BVA1': 'TUBB2B', 'Q13885': 'TUBB2A', 'P68363': 'TUBA1B', 'P31942': 'HNRNPH3', 'Q9Y3I0': 'RTCB', 'Q9BQE3': 'TUBA1C', 'Q92841': 'DDX17', 'P33176': 'KIF5B', 'P09651': 'HNRNPA1', 'Q8WWM7': 'ATXN2L', 'Q13283': 'G3BP1', 'P23246': 'SFPQ', 'Q07666': 'KHDRBS1', 'Q9UDY2': 'TJP2', 'Q07157': 'TJP1', 'P06748': 'NPM1', 'P11021': 'HSPA5', 'O60506': 'SYNCRIP', 'P68133': 'ACTA1', 'P17844': 'DDX5', 'P61978': 'HNRNPK', 'P14866': 'HNRNPL', 'P04350': 'TUBB4A', 'Q9H361': 'PABPC3', 'P78332': 'RBM6', 'P04264': 'KRT1', 'Q7Z406': 'MYH14', 'P36578': 'RPL4', 'Q08

In [11]:
df = df_long
df['Abundance'] = df['Abundance'].apply(lambda x: np.log2(x))
df['sampleid'] = df['Sample'].str.extract(r'.*,\s(.*?)$')
df['sample_id'] = df['sampleid'].apply(lambda x: x.split('_')[1])
df['Sample'] = df['sample_id']
sample_dict = df[['sampleid','sample_id']].drop_duplicates().set_index('sampleid')['sample_id'].to_dict()
df['Gene Symbol'] = df['Accession'].map(u2g)

print(sample_dict)

{'912_R155H_selleck_myc': 'R155H', '932_L229F_selleck_myc': 'L229F', '933_K251R_selleck_myc': 'K251R', '934_R256G_selleck_myc': 'R256G', '942_WT_selleck_myc': 'WT'}


In [12]:
poi = pd.read_csv('cofactor.csv')
poi['Gene'] = poi['To'].apply(lambda x: x.strip())
poi_list = poi['Gene'].tolist()
print(poi_list)


['UBXN1', 'UBXN2B', 'NSFL1C', 'FAF1', 'FAF2', 'UBXN4', 'UBXN6', 'UBXN7', 'UBXN8', 'ASPSCR1', 'UBXN10', 'YOD1', 'NPLOC4', 'VCPIP1', 'PLAA', 'NGLY1', 'SELENOS', 'AMFR', 'SVIP', 'ZFAND2B', 'ANKZF1', 'SYVN1', 'ATXN3', 'UBE4B', 'RHBDD1', 'UFD1', 'DERL1', 'DERL2', 'SPRTN']


In [13]:
Upregulated = []
Downregulated = []
de_list = []
comparisons = [['L229F','WT'],['R256G','WT'],['R155H','WT'],['K251R','WT']]
cat_list = ['Function','Process','Component','KEGG','RCTM','SMART','InterPro','WikiPathways']

In [43]:
pairs = comparisons[0]

In [14]:
cells = f'{pairs[0]}_vs_{pairs[1]}'
df_de = import_functions.de_analysis(df[df['Sample'].isin(pairs)],pairs[0],pairs[1])
df_de.dropna(subset = ['log2fc','pvalue'],inplace = True)
df_de['hit'] = 'Insignificant'
df_de.loc[df_de['pvalue'] > 0.05,'hit'] = 'Insignificant'
df_de.loc[(df_de['pvalue'] <= 0.05) & (df_de['log2fc'] > 0),'hit'] = 'Upregulated'
df_de.loc[(df_de['pvalue'] <= 0.05) & (df_de['log2fc'] < 0),'hit'] = 'Downregulated'
df_de['Gene Symbol'] = df_de.index.map(u2g)
print(df_de['Gene Symbol'].head())

NameError: name 'pairs' is not defined

In [None]:
poi_list = [gene for gene in poi_list if gene in df_de['Gene Symbol'].unique()]
print(poi_list)

['NSFL1C', 'FAF2', 'UBXN8', 'ASPSCR1', 'NPLOC4', 'SELENOS', 'AMFR', 'ANKZF1', 'SYVN1', 'RHBDD1', 'UFD1', 'DERL1']


In [None]:
up_list = df_de[df_de['hit'] == 'Upregulated'].sort_values(by = 'log2fc',ascending = False).head(5)['Gene Symbol'].tolist()
down_list = df_de[df_de['hit'] == 'Downregulated'].sort_values(by = 'log2fc',ascending = True).head(5)['Gene Symbol'].tolist()

In [19]:
hh_list = up_list + down_list
title = ' '.join(cells.split('_'))
fig,uniup,unidown = import_functions.volcano_plot(df_de,f'de_{sample_name}_{cells}',fc_column = 'log2fc',p_column = 'pvalue',fc_cutoff = [0,0],title = title,labels = poi_list,label_sig = True,label_column = 'Gene Symbol',highlight_label = True)
plt.close(fig)
fig,uniup,unidown = import_functions.volcano_plot(df_de,f'de_{sample_name}_{cells}_all_label',fc_column = 'log2fc',p_column = 'pvalue',fc_cutoff = [0,0],title = title,labels = poi_list,label_sig = False,label_column = 'Gene Symbol',highlight_label = True)
plt.close(fig)
fig,uniup,unidown = import_functions.volcano_plot(df_de,f'de_{sample_name}_{cells}_top5',fc_column = 'log2fc',p_column = 'pvalue',fc_cutoff = [0,0],title = title,labels = hh_list,label_sig = True,label_column = 'Gene Symbol',highlight_label = False)
plt.close(fig)

checking input ...	done
-log10 transformation of the p-value ...	done
select up/down regulated proteins ...	done
setting ylim and xlim for plotting ...	done
define figure and rc ...	done
plotting scatter plot ...	done
add gridline ...	done
add labels ...	done
save the figure ...	done
checking input ...	done
-log10 transformation of the p-value ...	done
select up/down regulated proteins ...	done
setting ylim and xlim for plotting ...	done
define figure and rc ...	done
plotting scatter plot ...	done
add gridline ...	done
add labels ...	done
save the figure ...	done
checking input ...	done
-log10 transformation of the p-value ...	done
select up/down regulated proteins ...	done
setting ylim and xlim for plotting ...	done
define figure and rc ...	done
plotting scatter plot ...	done
add gridline ...	done
add labels ...	done
save the figure ...	done


In [None]:
plt.show(fig)

In [53]:
genehit

Index(['O00139', 'O14579', 'O14654', 'O15160', 'O15226', 'O43852', 'O60841',
       'O75525', 'O76021', 'P08237', 'P08559', 'P08670', 'P14618', 'P18077',
       'P19525', 'P24752', 'P25705', 'P26641', 'P27708', 'P30050', 'P33176',
       'P33992', 'P35268', 'P35269', 'P37840', 'P39656', 'P46776', 'P46782',
       'P46783', 'P48444', 'P49207', 'P51532', 'P51572', 'P53597', 'P55735',
       'P62277', 'P62633', 'P78527', 'P82650', 'P98175', 'Q00059', 'Q01813',
       'Q02790', 'Q08378', 'Q12849', 'Q13247', 'Q13428', 'Q13526', 'Q13643',
       'Q13885', 'Q14244', 'Q14247', 'Q14258', 'Q14739', 'Q15366', 'Q15388',
       'Q15650', 'Q5JTH9', 'Q6P087', 'Q86U70', 'Q8IYB3', 'Q8IZ69', 'Q8N1G2',
       'Q8N1G4', 'Q8NG11', 'Q8TEB9', 'Q96CW1', 'Q96KG9', 'Q96QC0', 'Q99459',
       'Q9BQ39', 'Q9BTV4', 'Q9BY44', 'Q9BY77', 'Q9GZT3', 'Q9H6S0', 'Q9NYL9',
       'Q9P2J5', 'Q9P2K5', 'Q9UBU9', 'Q9UDY2', 'Q9UHB9', 'Q9UKD2', 'Q9UL18',
       'Q9ULX6', 'Q9UN81', 'Q9UPQ9', 'Q9UQE7', 'Q9Y450', 'Q9Y5B9', 'Q9Y5Q8']

In [None]:
universe = df_de.index.unique()

genedown = df_de[df_de['hit'] == 'Downregulated'].index
geneup = df_de[df_de['hit'] == 'Upregulated'].index
genehit = df_de[df_de['hit'] != 'Insignificant'].index

In [51]:
anno_hit = import_functions.get_string_annotation(genehit,universe)

[{"category": "COMPARTMENTS", "term": "GOCC:0005622", "number_of_genes": 85, "number_of_genes_in_background": 11202, "ncbiTaxonId": 9606, "inputGenes": ["Q02790", "Q9H6S0", "Q08378", "P33992", "Q9Y5B9", "P19525", "Q13247", "Q13526", "Q12849", "Q15650", "O14579", "P48444", "P27708", "P24752", "Q96KG9", "O60841", "Q96CW1", "Q9BTV4", "P33176", "Q9NYL9", "Q9UHB9", "P78527", "Q9P2K5", "P14618", "Q14258", "Q8IYB3", "P98175", "P26641", "P37840", "Q14739", "P08237", "P46776", "P35268", "O75525", "Q15366", "Q9UQE7", "P30050", "Q15388", "Q9Y450", "Q99459", "Q9Y5Q8", "O15160", "Q9UL18", "Q8N1G2", "Q9BQ39", "Q96QC0", "Q14247", "Q13428", "P08559", "Q13885", "Q01813", "P55735", "Q6P087", "Q8TEB9", "P53597", "Q9P2J5", "P35269", "P49207", "Q9ULX6", "P25705", "O00139", "P51572", "Q86U70", "P51532", "Q8NG11", "Q9BY77", "P39656", "Q9UPQ9", "P62633", "Q9BY44", "P82650", "P18077", "Q00059", "P62277", "Q9UBU9", "O43852", "Q9UDY2", "O15226", "P08670", "Q5JTH9", "Q9GZT3", "O76021", "P46782", "P46783", "Q14244

In [61]:
anno_hit = get_string_annotation(accession_list,universe)


[]


In [68]:
anno_hit

In [70]:
get_string_id(accession_list)

['9606.ENSP00000322439',
 '9606.ENSP00000216605',
 '9606.ENSP00000216832',
 '9606.ENSP00000300648',
 '9606.ENSP00000345917',
 '9606.ENSP00000406162',
 '9606.ENSP00000350199',
 '9606.ENSP00000308430',
 '9606.ENSP00000279146',
 '9606.ENSP00000410964']

In [65]:
"%0d".join(universe)

'A0A075B6R9%0dA0A0B4J1U7%0dA0A0B4J1Y9%0dA2NJV5%0dC9JLW8%0dE9PRG8%0dO00124%0dO00139%0dO00178%0dO00330%0dO00425%0dO00571%0dO14579%0dO14617%0dO14654%0dO14950%0dO14979%0dO15014%0dO15027%0dO15042%0dO15160%0dO15226%0dO15234%0dO43143%0dO43148%0dO43251%0dO43347%0dO43390%0dO43422%0dO43684%0dO43776%0dO43795%0dO43823%0dO43852%0dO43896%0dO60333%0dO60506%0dO60716%0dO60841%0dO75151%0dO75369%0dO75380%0dO75477%0dO75494%0dO75525%0dO75533%0dO75534%0dO75643%0dO75934%0dO75955%0dO76021%0dO76094%0dO94832%0dO94842%0dO94905%0dO95218%0dO95248%0dO95429%0dO95793%0dO95816%0dO95985%0dO96019%0dP00338%0dP00367%0dP01624%0dP02768%0dP04264%0dP04406%0dP04843%0dP05141%0dP05198%0dP05387%0dP05388%0dP05455%0dP06312%0dP06576%0dP06730%0dP06733%0dP06748%0dP06753%0dP07195%0dP07437%0dP07737%0dP07814%0dP07900%0dP07910%0dP08237%0dP08238%0dP08243%0dP08559%0dP08621%0dP08670%0dP08708%0dP08865%0dP09012%0dP09622%0dP09651%0dP09661%0dP09874%0dP0DMV9%0dP0DP25%0dP10515%0dP10599%0dP10809%0dP11021%0dP11142%0dP11177%0dP11388%0dP11586%0dP11940

In [None]:
for row in data:

    term = row["term"]
    preferred_names = ",".join(row["preferredNames"])
    fdr = float(row["fdr"])
    description = row["description"]
    category = row["category"]

    if category == "Process" and fdr < 0.01:

        ## print significant GO Process annotations

        print("\t".join([term, preferred_names, str(fdr), description]))

In [1]:
def get_string_annotation(gene,universe,species = 9606):
    string_api_url = "https://version-11-5.string-db.org/api"
    output_format = "json"
    method = "enrichment"

    # gene_string = get_string_id(gene, name = 'Hits')
    gene_universe = get_string_id(universe, name = 'Universe')

    params = {
        "identifiers" : "%0d".join(gene),
        "background_string_identifiers": "%0d".join(gene_universe),
        "species" : species
    }
    
    request_url = "/".join([string_api_url, output_format, method])
    results = requests.post(request_url, data=params)
    print(results.text)
    try:
        annotation = pd.read_json(results.text)
    except:
        annotation = pd.DataFrame()
    return annotation

def get_string_network(gene,comparison,species = 9606):
    string_api_url = "https://version-11-5.string-db.org/api"
    output_format = "highres_image"
    method = "network"
    
    if len(gene) <= 10:
        hide_label = 0
    else:
        hide_label = 1
    
    params = {
        "identifiers" : "%0d".join(get_string_id(gene)),
        "species" : species,
        "required_score" : 700,
        "hide_disconnected_nodes" : hide_label,
        "block_structure_pics_in_bubbles" : 1,
        "flat_node_design" : 1,
        "center_node_labels" : 1
    }
    
    request_url = "/".join([string_api_url, output_format, method])
    response = requests.post(request_url, data=params)
    
    with open(f'{comparison}.png','wb') as file:
        file.write(response.content)
    
    return True

def get_string_id(gene,species = 9606,name = None):
    string_api_url = "https://version-11-5.string-db.org/api"
    output_format = "tsv-no-header"
    method = "get_string_ids"
    
    if name:
        print(f"Converting STRING id for {name}...")

    params = {
        "identifiers" : "\r".join(gene),
        "species" : species, 
        "limit" : 1, 
        "echo_query" : 1,
    }
    
    request_url = "/".join([string_api_url, output_format, method])
    results = requests.post(request_url, data=params)
    s_id = []
    for line in results.text.strip().split("\n"):
        l = line.split("\t")
        try:
            string_id = l[2]
            s_id.append(string_id)
        except:
            continue
    
    return s_id

In [24]:
import_functions.get_string_network(geneup,f'Upregulated_{sample_name}_{cells}_interactome')

True

In [2]:
anno_down = get_string_annotation(genedown,universe)

NameError: name 'genedown' is not defined

In [60]:
anno_down