In [10]:
import pandas as pd
import numpy as np
import csv 

In [3]:
PATH = "./disgenet/"
filename = "./disgenet/curated_gene_disease_associations.tsv"

In [None]:
df = pd.read_csv(filename, sep = '\t')
df.head()

In [None]:
print(len(df), df.columns)

#find row associated to disease Malignant Mesothelioma with id C0345967

The columns in the files are:
* geneId 		-> NCBI Entrez Gene Identifier
* geneSymbol	-> Official Gene Symbol
* DSI		-> The Disease Specificity Index for the gene
* DPI		-> The Disease Pleiotropy Index for the gene
* diseaseId 	-> UMLS concept unique identifier
* diseaseName 	-> Name of the disease	
* diseaseType  	-> The DisGeNET disease type: disease, phenotype and group
* diseaseClass	-> The MeSH disease class(es)
* diseaseSemanticType	-> The UMLS Semantic Type(s) of the disease
* score		-> DisGENET score for the Gene-Disease association
* EI		-> The Evidence Index for the Gene-Disease association
* YearInitial	-> First time that the Gene-Disease association was reported
* YearFinal	-> Last time that the Gene-Disease association was reported
* NofPmids	-> Total number of publications reporting the Gene-Disease association
* NofSnps		-> Total number of SNPs associated to the Gene-Disease association
* source		-> Original source reporting the Gene-Disease association

In [None]:
target = df.loc[df['diseaseName'] == 'Malignant mesothelioma']
target.to_csv(PATH+"malignant_mesothelioma_curated_genes.tsv", sep = '\t')

In [4]:
t = pd.read_csv(PATH+"malignant_mesothelioma_curated_genes.tsv", sep = '\t')

In [5]:
t = t.drop('Unnamed: 0', axis = 1)
t.head()

Unnamed: 0,geneId,geneSymbol,DSI,DPI,diseaseId,diseaseName,diseaseType,diseaseClass,diseaseSemanticType,score,EI,YearInitial,YearFinal,NofPmids,NofSnps,source
0,10,NAT2,0.466,0.828,C0345967,Malignant mesothelioma,disease,C04;C08,Neoplastic Process,0.34,0.5,1995.0,2009.0,1,0,CTD_human
1,142,PARP1,0.432,0.862,C0345967,Malignant mesothelioma,disease,C04;C08,Neoplastic Process,0.32,1.0,2011.0,2011.0,1,0,CTD_human
2,302,ANXA2,0.485,0.793,C0345967,Malignant mesothelioma,disease,C04;C08,Neoplastic Process,0.3,,2010.0,2010.0,1,0,CTD_human
3,335,APOA1,0.463,0.759,C0345967,Malignant mesothelioma,disease,C04;C08,Neoplastic Process,0.3,,2013.0,2013.0,1,0,CTD_human
4,596,BCL2,0.312,0.862,C0345967,Malignant mesothelioma,disease,C04;C08,Neoplastic Process,0.31,1.0,2006.0,2010.0,1,0,CTD_human


### Explore the DisGeNet dataset, find the disease of interest and get the list of human genes involved.

In [11]:
curated = pd.read_csv("./disgenet/browser_source_genes_summary_CURATED.tsv", sep = '\t')
curated.head()            

Unnamed: 0,Gene,Gene_id,UniProt,Gene_Full_Name,Protein_Class,pLI,DSI_g,DPI_g,N_diseases,N_SNPs
0,NAT2,10,P11245,N-acetyltransferase 2,transferase,4e-06,0.466,0.828,37,1
1,PARP1,142,P09874,poly(ADP-ribose) polymerase 1,,0.000552,0.432,0.862,66,2
2,ANXA2,302,P07355,annexin A2,,0.000482,0.485,0.793,26,0
3,APOA1,335,P02647,apolipoprotein A1,,0.000549,0.463,0.759,54,15
4,BCL2,596,P10415,"BCL2, apoptosis regulator",signaling molecule,0.56705,0.312,0.862,137,8


In [12]:
#now let's save the gene symbols, entrez ID and names in arrays
geneSymbol=[]
geneID=[]
geneName=[]
for i in range(len(curated)):
    geneSymbol.append(curated['Gene'][i])
    geneID.append(curated['Gene_id'][i])
    geneName.append(curated['Gene_Full_Name'][i])    
#we check on HGNC to see if we need to change genes name

b) For all genes in the seed gene list, collect the following basic information from the Uniprot:

* official (primary) gene symbol (check if the symbols are updated and approved on the HGNC website; report any issue/lack of data/potential misinterpretation)

* Uniprot AC, alphanumeric ‘accession number’ (a.k.a. ’Uniprot entry’)

* protein name (the main one only, do not report the aliases)

* Entrez Gene ID (a.k.a. ‘GeneID’) very brief description of its function (keep it very short, i.e. max 20 words)

* notes related to the above information, if any and if relevant

Store the data gathered in a table in an easily accessible format of your choice (csv, tab,
excel, etc).

In [None]:
print(geneSymbol)

In [None]:
#fino a COL12A1 approved

In [None]:
# printing original list  
print("The original list is : " + str(geneSymbol))
  
# using join() 
# avoiding printing last comma 
print("The formatted output is : ") 
print(', '.join(geneSymbol)) 

# deleted quotes to pass the list into site
#https://www.genenames.org/tools/multi-symbol-checker/

# all gene symbols were approved by HGNC 

In [13]:
uniprotAC=[]
for i in range(len(curated)):
    uniprotAC.append(curated['UniProt'][i])

In [None]:
results = pd.DataFrame(list(zip(geneSymbol,geneName,geneID,uniprotAC)), columns=['Symbol','Name','ID','UniprotAC'])

In [None]:
results.to_csv('mesothelioma-curated-genes.csv')
results.head()

### Exercise 1.2

For each seed gene, collect all binary protein interactions from two different PPI sources:
* Biogrid Human, latest release available
* IID Integrated Interactions Database (experimental data only, all tissues, unless stated otherwise in further instruction)

Note: once you got the list of the proteins interacting with at least one seed gene, you must
also retrieve and include in your interactome the interactions among these non-seed
proteins

In [14]:
#open biogrid DB
biogrid=pd.read_csv('BIOGRID-ALL-3.5.179.tab2.txt', sep='\t')

  interactivity=interactivity, compiler=compiler, result=result)


In [15]:
biogrid.head()

Unnamed: 0,#BioGRID Interaction ID,Entrez Gene Interactor A,Entrez Gene Interactor B,BioGRID ID Interactor A,BioGRID ID Interactor B,Systematic Name Interactor A,Systematic Name Interactor B,Official Symbol Interactor A,Official Symbol Interactor B,Synonyms Interactor A,...,Pubmed ID,Organism Interactor A,Organism Interactor B,Throughput,Score,Modification,Phenotypes,Qualifications,Tags,Source Database
0,103,6416,2318,112315,108607,-,-,MAP2K4,FLNC,JNKK|JNKK1|MAPKK4|MEK4|MKK4|PRKMK4|SAPKK-1|SAP...,...,9006895,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID
1,117,84665,88,124185,106603,-,-,MYPN,ACTN2,CMD1DD|CMH22|MYOP|RCM4,...,11309420,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID
2,183,90,2339,106605,108625,-,-,ACVR1,FNTA,ACTRI|ACVR1A|ACVRLK2|ALK2|FOP|SKR1|TSRI,...,8599089,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID
3,278,2624,5371,108894,111384,-,-,GATA2,PML,DCML|IMD21|MONOMAC|NFE1B,...,10938104,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID
4,418,6118,6774,112038,112651,RP4-547C9.3,-,RPA2,STAT3,REPA2|RP-A p32|RP-A p34|RPA32,...,10875894,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID


In [16]:
len(biogrid)

1730436

In [17]:
#let's select what we want from the biogrid dataset.
#select only human PPI
biogrid.columns

Index(['#BioGRID Interaction ID', 'Entrez Gene Interactor A',
       'Entrez Gene Interactor B', 'BioGRID ID Interactor A',
       'BioGRID ID Interactor B', 'Systematic Name Interactor A',
       'Systematic Name Interactor B', 'Official Symbol Interactor A',
       'Official Symbol Interactor B', 'Synonyms Interactor A',
       'Synonyms Interactor B', 'Experimental System',
       'Experimental System Type', 'Author', 'Pubmed ID',
       'Organism Interactor A', 'Organism Interactor B', 'Throughput', 'Score',
       'Modification', 'Phenotypes', 'Qualifications', 'Tags',
       'Source Database'],
      dtype='object')

In [None]:
#I took the interactions for NAT2 gene in humans 
example = pd.read_csv('./biogrid/BIOGRID-GENE-106528-3.5.179.tab2.txt', sep='\t')
example.head()

In [18]:
#select only human genes
biogrid_human=biogrid.loc[(biogrid['Organism Interactor A']==9606) & (biogrid['Organism Interactor B']==9606)]
biogrid_human

Unnamed: 0,#BioGRID Interaction ID,Entrez Gene Interactor A,Entrez Gene Interactor B,BioGRID ID Interactor A,BioGRID ID Interactor B,Systematic Name Interactor A,Systematic Name Interactor B,Official Symbol Interactor A,Official Symbol Interactor B,Synonyms Interactor A,...,Pubmed ID,Organism Interactor A,Organism Interactor B,Throughput,Score,Modification,Phenotypes,Qualifications,Tags,Source Database
0,103,6416,2318,112315,108607,-,-,MAP2K4,FLNC,JNKK|JNKK1|MAPKK4|MEK4|MKK4|PRKMK4|SAPKK-1|SAP...,...,9006895,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID
1,117,84665,88,124185,106603,-,-,MYPN,ACTN2,CMD1DD|CMH22|MYOP|RCM4,...,11309420,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID
2,183,90,2339,106605,108625,-,-,ACVR1,FNTA,ACTRI|ACVR1A|ACVRLK2|ALK2|FOP|SKR1|TSRI,...,8599089,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID
3,278,2624,5371,108894,111384,-,-,GATA2,PML,DCML|IMD21|MONOMAC|NFE1B,...,10938104,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID
4,418,6118,6774,112038,112651,RP4-547C9.3,-,RPA2,STAT3,REPA2|RP-A p32|RP-A p34|RPA32,...,10875894,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1730424,2621674,84376,10749,124068,115972,-,-,HOOK3,KIF1C,HK3,...,28718761,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID
1730425,2621675,2059,50618,108373,119098,-,-,EPS8,ITSN2,DFNB102,...,22449706,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID
1730426,2621676,2059,50618,108373,119098,-,-,EPS8,ITSN2,DFNB102,...,22449706,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID
1730427,2621677,50618,2059,119098,108373,-,-,ITSN2,EPS8,PRO2015|SH3D1B|SH3P18|SWA|SWAP,...,22449706,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID


In [None]:
# look for the genes which interacts with at least one seed genes
biogrid_seed_genes = biogrid_human.loc[(biogrid_human['Official Symbol Interactor A'].isin(geneSymbol)) | (biogrid_human['Official Symbol Interactor B'].isin(geneSymbol))]
biogrid_seed_genes

## from our seed interaction let's search non-seeds genes that interacts with at least one seed gene

In [None]:
#Interactor A is not a seed gene but Interactor B is a seed gene
non_seed_df_A = biogrid_seed_genes.loc[~(biogrid_seed_genes['Official Symbol Interactor A'].isin(geneSymbol))]
seed_B = non_seed_df_A.loc[(non_seed_df_A ['Official Symbol Interactor B'].isin(geneSymbol))]

In [None]:
# interactor B is not a seed genes but Interactor A is a seed gene
non_seed_df_B = biogrid_seed_genes.loc[~(biogrid_seed_genes['Official Symbol Interactor B'].isin(geneSymbol))]
non_seed_df_B

In [None]:
# Reset Index, otherwise they don't work with list
non_seed_df_A=non_seed_df_A.reset_index(drop=True)
non_seed_df_B=non_seed_df_B.reset_index(drop=True)

In [None]:
non_seed_df_B

In [None]:
non_seed_df_A = biogrid_seed_genes.loc[~(biogrid_seed_genes['Official Symbol Interactor A'].isin(geneSymbol))]
non_seed_df_B = biogrid_seed_genes.loc[~(biogrid_seed_genes['Official Symbol Interactor B'].isin(geneSymbol))]
non_seed_df_A=non_seed_df_A.reset_index(drop=True)
non_seed_df_B=non_seed_df_B.reset_index(drop=True)
#build a list with only non seed genes that interacts with at least one seed gene
non_seed_list=[]
for i in range(0, len(non_seed_df_A)):
    non_seed_list.append(non_seed_df_A['Official Symbol Interactor A'][i])
for i in range(0, len(non_seed_df_B)):
    non_seed_list.append(non_seed_df_B['Official Symbol Interactor B'][i])
#drop duplicates
non_seed_list = list(dict.fromkeys(non_seed_list))

In [None]:
for i in range(0, len(non_seed_df_B)):
    non_seed_list.append(non_seed_df_B['Official Symbol Interactor B'][i])

In [None]:
#drop duplicates
non_seed_list = list(dict.fromkeys(non_seed_list))

In [None]:
# Verify Correctness
for i in range(0, len(geneSymbol)):
    for j in range(0, len(non_seed_list)):
        if(geneSymbol[i] == non_seed_list[j]):
            print("Something is wrong")
print("Ok")

In [None]:
print(len(non_seed_list))

In [None]:
#now search for non seed interactions from the human DB
#Return positions of non-seed genes that interacts with a non seed gene but both interacts with at least one seed gene
biogrid_human = biogrid_human.reset_index(drop=True)
#create a list of index of the original matrix with non seed genes interacting each others
list_of_idx=[]
#fill the list
for i in range(len(biogrid_human)):
    if biogrid_human['Official Symbol Interactor A'][i] in non_seed_list and biogrid_human['Official Symbol Interactor B'][i] in non_seed_list:
        list_of_idx.append(i)

In [None]:
#biogrid_seed_genes
biogrid_non_seed = biogrid_human.loc[list_of_idx]
biogrid_non_seed = biogrid_non_seed.reset_index(drop=True)
biogrid_non_seed

In [None]:
#make a table in which interactor 1 is seed and interactor 2 can be seed or non-seed
biogrid_seed_df = biogrid_seed_genes.loc[(biogrid_seed_genes['Official Symbol Interactor A'].isin(geneSymbol))]
interactome = pd.concat([biogrid_seed_df, biogrid_non_seed], ignore_index=True)
interactome.to_csv("interactome-biogrid.txt", sep='\t')

In [None]:
interactome = pd.read_csv("interactome-biogrid.txt", sep= '\t')
interactome = interactome.drop(['Unnamed: 0'], axis = 1)
interactome

In [None]:
iid = pd.read_csv('human_annotated_PPIs.txt', sep='\t')
iid

In [None]:
#select only rows with evidence type exp, drop some columns.
iid = iid.loc[iid['evidence type'] == 'exp']
icols = ['uniprot1', 'uniprot2', 'symbol1', 'symbol2' , 'evidence type', 'cancer']
iid = iid.reset_index(drop=True)
iid = iid[['uniprot1', 'uniprot2', 'symbol1', 'symbol2' , 'evidence type', 'cancer']]

In [None]:
iid

In [None]:
iid.to_csv('iid.txt', sep = '\t')

In [None]:
#SAME PROCEDURE. 

In [None]:
iid_seed_genes = iid.loc[(iid['symbol1'].isin(geneSymbol)) | (iid['symbol2'].isin(geneSymbol))]
iid_seed_genes

In [None]:
non_seed_1 = iid_seed_genes.loc[~(iid_seed_genes['symbol1'].isin(geneSymbol))]
# interactor 1 is not a seed genes and interacts with a seed gene
#B is a seed gene
non_seed_1

In [None]:
non_seed_2 = iid_seed_genes.loc[~(iid_seed_genes['symbol2'].isin(geneSymbol))]
# interactor 1 is not a seed genes and interacts with a seed gene
#B is a seed gene
non_seed_2

In [None]:
non_seed_1=non_seed_1.reset_index(drop=True)
non_seed_2=non_seed_2.reset_index(drop=True)

In [None]:
#build a list with only non seed genes that interacts with at least one seed gene
non_seed=[]
#non_seed_df_A = biogrid_seed_genes.loc[~(biogrid_seed_genes['Official Symbol Interactor A'].isin(geneSymbol))]
for i in range(0, len(non_seed_1)):
    non_seed.append(non_seed_1['symbol1'][i])
for i in range(0, len(non_seed_2)):
    non_seed.append(non_seed_2['symbol2'][i])
#drop duplicates
non_seed = list(dict.fromkeys(non_seed))

In [None]:
len(non_seed)

In [None]:
iid=iid.reset_index(drop=True)
#non_seed_df_A = biogrid_seed_genes.loc[~(biogrid_seed_genes['Official Symbol Interactor A'].isin(geneSymbol))]
list_of_idx=[]
#fill the list
for i in range(len(iid_seed_genes)):
    if iid['symbol1'][i] in non_seed and iid['symbol2'][i] in non_seed:
        list_of_idx.append(i)

In [None]:
#biogrid_seed_genes
iid_non_seed = iid.loc[list_of_idx]
iid_non_seed = iid_non_seed.reset_index(drop=True)
iid_non_seed

In [None]:
iid_seed_df = iid_seed_genes.loc[(iid_seed_genes['symbol1'].isin(geneSymbol))]
interactome2 = pd.concat([iid_seed_df, iid_non_seed], ignore_index=True)
interactome2.to_csv("interactome-iid.txt", sep='\t')

In [None]:
interactome2 = pd.read_csv("interactome-iid.txt", sep='\t')

In [None]:
interactome2 = interactome2.drop(['Unnamed: 0'], axis = 1)

In [None]:
interactome2

In [None]:
interactome

### Summarize the main results in a table reporting:
* no. of seed genes found in each different DBs (some seed genes may be missing in the DBs);
* total no. of interacting proteins, including seed genes, for each DB;
* total no. of interactions found in each DB.

In [None]:
len(geneSymbol)

In [None]:
seed_B = non_seed_df_A.loc[(non_seed_df_A ['Official Symbol Interactor B'].isin(geneSymbol))]
seed_A = non_seed_df_B.loc[(non_seed_df_B ['Official Symbol Interactor A'].isin(geneSymbol))]

In [None]:
def find_unique_genes(dataframe, column_name):
    genes_found = []
    for index in range(len(dataframe)):
        if dataframe[column_name][index] not in genes_found:
            genes_found.append(dataframe[column_name][index])
    return genes_found

In [None]:
# Genes from Biogrid
la = find_unique_genes(seed_B, 'Official Symbol Interactor B')
lb = find_unique_genes(seed_A, 'Official Symbol Interactor A')
l_tot = la + lb
#drop duplicates
total_genes = list(dict.fromkeys(l_tot))
len(total_genes)
missing_gene = []
for index in range(len(geneSymbol)):
    if geneSymbol[index] not in total_genes:
        missing_gene.append(geneSymbol[index])
print("Genes missing in Biogrid\n", missing_gene)

In [None]:
# Genes from IID 
seed1 = non_seed_2.loc[(non_seed_2['symbol1'].isin(geneSymbol))]
seed2 = non_seed_1.loc[(non_seed_1['symbol2'].isin(geneSymbol))]
# interactor 1 is not a seed genes and interacts with a seed gene
#B is a seed gene
l1 = find_unique_genes(seed1, 'symbol1')
l2 = find_unique_genes(seed2, 'symbol2')
l = l1+l2
#drop duplicates
i_genes = list(dict.fromkeys(l))
iid_missing = []
for index in range(len(geneSymbol)):
    if geneSymbol[index] not in i_genes:
        iid_missing.append(geneSymbol[index])
print("Genes missing from IID DB \n", iid_missing)

total no. of interacting proteins, including seed genes, for each DB;

In [None]:
interactors= []
for index in range(len(biogrid_human)):
    if biogrid_human['Official Symbol Interactor A'][index] not in interactors:
        interactors.append(biogrid_human['Official Symbol Interactor A'][index])
    if biogrid_human['Official Symbol Interactor B'][index] not in interactors:
        interactors.append(biogrid_human['Official Symbol Interactor B'][index])

In [None]:
len(interactors)

In [None]:
def count_interactors(dataframe, column1, column2):
    interactors= []
    for index in range(len(dataframe)):
        if dataframe[column1][index] not in interactors:
            interactors.append(dataframe[column1][index])
        if dataframe[column2][index] not in interactors:
            interactors.append(dataframe[column2][index])
    return len(interactors)

In [None]:
iid_len = count_interactors(iid, 'symbol1', 'symbol2')

In [None]:
print(iid_len)

In [None]:
len(iid)

In [None]:
# total no. of interactions found in each DB.

In [None]:
countbio = biogrid_human['#BioGRID Interaction ID'].nunique()
print(countbio)

In [None]:
uniquesyms = []
for index in range(0, len(iid)):
    sym1 = iid['symbol1'][index]
    sym2 = iid['symbol2'][index]
    uniquesyms.append(sym1+sym2)
#drop duplicates

In [None]:
uniquesyms = list(dict.fromkeys(uniquesyms))
print(len(uniquesyms))

### Build and store three tables:

* seed genes interactome: interactions that involve seed genes only, from all DBs, in the format:
  interactor A gene symbol, interactor B gene symbol, interactor A Uniprot AC, interactor B
  Uniprot AC, database source
* union interactome: all proteins interacting with at least one seed gene, from all DBs, same format as above.
* intersection interactome: all proteins interacting with at least one seed gene confirmed by both DBs, in the       format: interactor A gene symbol, interactor B gene symbol, interactor A Uniprot AC, interactor B Uniprot AC

Always check that interactors are both human (i.e. organism ID is always 9606, Homo
Sapiens)

In [19]:
biogrid_human = biogrid_human.reset_index(drop=True)

In [23]:
uniprot_human = pd.read_csv("./uniprot/HUMAN_9606_idmapping.dat", sep = '\t')

In [24]:
uniprot_human

Unnamed: 0,P31946,UniProtKB-ID,1433B_HUMAN
0,P31946,Gene_Name,YWHAB
1,P31946,GI,4507949
2,P31946,GI,377656702
3,P31946,GI,67464628
4,P31946,GI,1345590
...,...,...,...
6288915,A0A0K0L4Y8,EMBL,KF833265
6288916,A0A0K0L4Y8,EMBL-CDS,AIS72444.1
6288917,A0A0K0L4Y8,NCBI_TaxID,9606
6288918,A0A0K0L4Y8,ChiTaRS,PNPLA2


In [None]:
#save a list of all the symbols in order to search their uniprot
sym_to_fix=[]
sym_to_fix.extend(biogrid_human['Official Symbol Interactor A'])
sym_to_fix.extend(biogrid_human['Official Symbol Interactor B'])

#remove duplicates
sym_to_fix=list(set(sym_to_fix))

# using join() 
# avoiding printing last comma 
print("The formatted output is : ") 
print(', '.join(sym_to_fix)) 
##print in order to search on uniprot.com

In [26]:
#upload the uniprot fixing file
unigene = pd.read_csv("./uniprot/geneid-uniprot.tab", sep = '\t')
unigene

Unnamed: 0,yourlist:M202001096746803381A1F0E0DB47453E0216320D5745D00,Entry,Entry name,Status,Protein names,Gene names,Organism,Annotation
0,SLC6A1,P30531,SC6A1_HUMAN,reviewed,Sodium- and chloride-dependent GABA transporte...,SLC6A1 GABATR GABT1 GAT1,Homo sapiens (Human),5 out of 5
1,PRSS48,Q7RTY5,PRS48_HUMAN,reviewed,Serine protease 48 (EC 3.4.21.-) (Epidermis-sp...,PRSS48 ESSPL,Homo sapiens (Human),4 out of 5
2,DSN1,Q9H410,DSN1_HUMAN,reviewed,Kinetochore-associated protein DSN1 homolog,DSN1 C20orf172 MIS13,Homo sapiens (Human),5 out of 5
3,RPL32,P62910,RL32_HUMAN,reviewed,60S ribosomal protein L32 (Large ribosomal sub...,RPL32 PP9932,Homo sapiens (Human),5 out of 5
4,IFT122,Q9HBG6,IF122_HUMAN,reviewed,Intraflagellar transport protein 122 homolog (...,IFT122 SPG WDR10 WDR140,Homo sapiens (Human),5 out of 5
...,...,...,...,...,...,...,...,...
16933,"MMAA,SLC7A5P2",Q9GIP4,LAT1L_HUMAN,reviewed,Putative L-type amino acid transporter 1-like ...,SLC7A5P2 IMAA MMAA,Homo sapiens (Human),3 out of 5
16934,SNAPC1,Q16533,SNPC1_HUMAN,reviewed,snRNA-activating protein complex subunit 1 (SN...,SNAPC1 SNAP43,Homo sapiens (Human),4 out of 5
16935,DOCK5,Q9H7D0,DOCK5_HUMAN,reviewed,Dedicator of cytokinesis protein 5,DOCK5,Homo sapiens (Human),5 out of 5
16936,RNF19B,Q6ZMZ0,RN19B_HUMAN,reviewed,E3 ubiquitin-protein ligase RNF19B (EC 2.3.2.3...,RNF19B IBRDC3 NKLAM,Homo sapiens (Human),5 out of 5


In [27]:
#create a dictionary that maps symbol with its uniprot
unigene=unigene.rename(columns={"yourlist:M202001096746803381A1F0E0DB47453E0216320D5745D00": "symbol"})
unigene=pd.Series(unigene.Entry.values, index=unigene.symbol).to_dict()
biogrid_human['UniprotAC interactor A']= biogrid_human['Official Symbol Interactor A'].map(unigene)
biogrid_human['UniprotAC interactor B']= biogrid_human['Official Symbol Interactor B'].map(unigene)
biogrid_human

Unnamed: 0,#BioGRID Interaction ID,Entrez Gene Interactor A,Entrez Gene Interactor B,BioGRID ID Interactor A,BioGRID ID Interactor B,Systematic Name Interactor A,Systematic Name Interactor B,Official Symbol Interactor A,Official Symbol Interactor B,Synonyms Interactor A,...,Organism Interactor B,Throughput,Score,Modification,Phenotypes,Qualifications,Tags,Source Database,UniprotAC interactor A,UniprotAC interactor B
0,103,6416,2318,112315,108607,-,-,MAP2K4,FLNC,JNKK|JNKK1|MAPKK4|MEK4|MKK4|PRKMK4|SAPKK-1|SAP...,...,9606,Low Throughput,-,-,-,-,-,BIOGRID,P45985,Q14315
1,117,84665,88,124185,106603,-,-,MYPN,ACTN2,CMD1DD|CMH22|MYOP|RCM4,...,9606,Low Throughput,-,-,-,-,-,BIOGRID,Q86TC9,P35609
2,183,90,2339,106605,108625,-,-,ACVR1,FNTA,ACTRI|ACVR1A|ACVRLK2|ALK2|FOP|SKR1|TSRI,...,9606,Low Throughput,-,-,-,-,-,BIOGRID,Q04771,P49354
3,278,2624,5371,108894,111384,-,-,GATA2,PML,DCML|IMD21|MONOMAC|NFE1B,...,9606,Low Throughput,-,-,-,-,-,BIOGRID,P23769,P29590
4,418,6118,6774,112038,112651,RP4-547C9.3,-,RPA2,STAT3,REPA2|RP-A p32|RP-A p34|RPA32,...,9606,Low Throughput,-,-,-,-,-,BIOGRID,P15927,P40763
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
485375,2621674,84376,10749,124068,115972,-,-,HOOK3,KIF1C,HK3,...,9606,Low Throughput,-,-,-,-,-,BIOGRID,Q86VS8,O43896
485376,2621675,2059,50618,108373,119098,-,-,EPS8,ITSN2,DFNB102,...,9606,Low Throughput,-,-,-,-,-,BIOGRID,Q12929,Q9NZM3
485377,2621676,2059,50618,108373,119098,-,-,EPS8,ITSN2,DFNB102,...,9606,Low Throughput,-,-,-,-,-,BIOGRID,Q12929,Q9NZM3
485378,2621677,50618,2059,119098,108373,-,-,ITSN2,EPS8,PRO2015|SH3D1B|SH3P18|SWA|SWAP,...,9606,Low Throughput,-,-,-,-,-,BIOGRID,Q9NZM3,Q12929


In [28]:
iid_human = pd.read_csv('iid.txt', sep = '\t')
iid_human.drop(['Unnamed: 0'], axis = 1)

Unnamed: 0,uniprot1,uniprot2,symbol1,symbol2,evidence type,cancer
0,Q9NUX5,Q9NVM4,POT1,PRMT7,exp,0
1,P62633,Q99729,CNBP,HNRNPAB,exp,0
2,O43715,O95817,TRIAP1,BAG3,exp,0
3,Q13285,Q9UKM9,NR5A1,RALY,exp,0
4,P60174,Q9H0W5,TPI1,CCDC8,exp,0
...,...,...,...,...,...,...
272487,O60341,Q8NEZ2,KDM1A,VPS37A,exp,0
272488,Q00987,Q8TAQ2,MDM2,SMARCC2,exp,0
272489,P22314,Q5VTR2,UBA1,RNF20,exp,0
272490,Q86X19,Q96J84,TMEM17,KIRREL1,exp,0


In [106]:
#SEED GENES INTERACTOME
def build_first_table(biogrid_human, iid_human):
    db1 = 'Biogrid Human'
    db2 = 'Integrated Interactions Database experimental data'
    t = pd.DataFrame(columns=['interactorA', 'interactorB', 
                                        'interactorA_Uniprot_AC', 'interactorB_Uniprot_AC', 'db_source'])
    for i in range(len(biogrid_human)):
        sa = biogrid_human['Official Symbol Interactor A'][i]
        sb = biogrid_human['Official Symbol Interactor B'][i]
        uniprota = biogrid_human['UniprotAC interactor A'][i]
        uniprotb = biogrid_human['UniprotAC interactor B'][i]
        if sa in geneSymbol and sb in geneSymbol:
            t = t.append({'interactorA':sa, 'interactorB':sb, 
                          'interactorA_Uniprot_AC':uniprota, 'interactorB_Uniprot_AC':uniprotb, 'db_source': db1}
                         , ignore_index=True)
    for i in range(len(iid_human)):
        sa = iid_human['symbol1'][i]
        sb = iid_human['symbol2'][i]
        uniprota = iid_human['uniprot1'][i]
        uniprotb = iid_human['uniprot2'][i]
        if sa in geneSymbol and sb in geneSymbol:
            t = t.append({'interactorA':sa, 'interactorB':sb, 
                          'interactorA_Uniprot_AC':uniprota, 'interactorB_Uniprot_AC':uniprotb, 'db_source': db2}
                         , ignore_index=True)
    t.to_csv("seed_genes_interactome.tsv", sep = '\t')

In [107]:
biogrid_human=biogrid_human.reset_index(drop=True)
iid_human=iid_human.reset_index(drop=True)
build_first_table(biogrid_human, iid_human)

In [108]:
interactome_seed = pd.read_csv("seed_genes_interactome.tsv", sep = '\t')
interactome_seed.drop(['Unnamed: 0'], axis = 1)

Unnamed: 0,interactorA,interactorB,interactorA_Uniprot_AC,interactorB_Uniprot_AC,db_source
0,ILK,DDX3X,Q13418,O00571,Biogrid Human
1,ILK,COPG1,Q13418,Q9Y678,Biogrid Human
2,CDKN2A,CDK6,Q8N726,Q00534,Biogrid Human
3,RXRA,CTNNB1,P19793,P35222,Biogrid Human
4,MUC1,CTNNB1,P15941,P35222,Biogrid Human
...,...,...,...,...,...
475,TXN,CDKN1A,P10599,P38936,Integrated Interactions Database experimental ...
476,TRAF7,TRAF7,Q6Q0C0,Q6Q0C0,Integrated Interactions Database experimental ...
477,RXRA,IL12B,P19793,P29460,Integrated Interactions Database experimental ...
478,C9,FN1,P02748,P02751,Integrated Interactions Database experimental ...


union interactome: all proteins interacting with at least one seed gene, from all DBs, same format as above.

In [114]:
biogrid_human.to_csv("new_biogrid.tsv", sep = '\t')

In [33]:
def build_union_interactome(biogrid_human, iid_human):
    db1 = 'Biogrid Human'
    db2 = 'Integrated Interactions Database experimental data'
    t = pd.DataFrame(columns=['interactorA', 'interactorB', 
                                        'interactorA_Uniprot_AC', 'interactorB_Uniprot_AC', 'db_source'])
    for i in range(len(biogrid_human)):
        sa = biogrid_human['Official Symbol Interactor A'][i]
        sb = biogrid_human['Official Symbol Interactor B'][i]
        uniprota = biogrid_human['UniprotAC interactor A'][i]
        uniprotb = biogrid_human['UniprotAC interactor B'][i]
        if sa in geneSymbol or sb in geneSymbol:
            t = t.append({'interactorA':sa, 'interactorB':sb, 
                          'interactorA_Uniprot_AC':uniprota, 'interactorB_Uniprot_AC':uniprotb, 'db_source': db1}
                         , ignore_index=True)
    for i in range(len(iid_human)):
        sa = iid_human['symbol1'][i]
        sb = iid_human['symbol2'][i]
        uniprota = iid_human['uniprot1'][i]
        uniprotb = iid_human['uniprot2'][i]
        if sa in geneSymbol or sb in geneSymbol:
            t = t.append({'interactorA':sa, 'interactorB':sb, 
                          'interactorA_Uniprot_AC':uniprota, 'interactorB_Uniprot_AC':uniprotb, 'db_source': db2}
                         , ignore_index=True)
    t.to_csv("union_interactome.tsv", sep = '\t')

In [100]:
# consider also non/seed-non/seed interactions
def build_nonseed_dataframes(biogrid_human, iid_human, union_human):
    non_seed_df_A = union_human.loc[~(union_human['interactorA'].isin(geneSymbol))]
    non_seed_df_B = union_human.loc[~(union_human['interactorB'].isin(geneSymbol))]
    non_seed_df_A=non_seed_df_A.reset_index(drop=True)
    non_seed_df_B=non_seed_df_B.reset_index(drop=True)
    
    #build a list with only non seed genes that interacts with at least one seed gene
    non_seed_list=[]
    for i in range(0, len(non_seed_df_A)):
        non_seed_list.append(non_seed_df_A['interactorA'][i])
    for i in range(0, len(non_seed_df_B)):
        non_seed_list.append(non_seed_df_B['interactorB'][i])
    #drop duplicates
    non_seed_list = list(dict.fromkeys(non_seed_list))
    list_of_idx=[]
    #fill the list
    for i in range(len(biogrid_human)):
        if biogrid_human['Official Symbol Interactor A'][i] in non_seed_list and biogrid_human['Official Symbol Interactor B'][i] in non_seed_list:
            list_of_idx.append(i)

    biogrid_non_seed = biogrid_human.loc[list_of_idx]
    biogrid_non_seed = biogrid_non_seed.reset_index(drop=True)
    biogrid_non_seed.to_csv("biogrid_union_non_seed.tsv", sep = '\t')

    #same procedure for iid
    list_of_idx=[]
    #fill the list
    for i in range(len(iid_human)):
        if iid_human['symbol1'][i] in non_seed_list and iid_human['symbol2'][i] in non_seed_list:
            list_of_idx.append(i)

    iid_non_seed = iid_human.loc[list_of_idx]
    iid_non_seed = iid_non_seed.reset_index(drop=True)                  
    iid_non_seed.to_csv("iid_union_non_seed.tsv", sep = '\t')

In [101]:
def merge_non_seed_dataframes(biogrid_human, iid_human, union_human):
    db1 = 'Biogrid Human'
    db2 = 'Integrated Interactions Database experimental data'
    union_dict = {}
    
    for i in range(len(iid_human)):
        sa = iid_human['symbol1'][i]
        sb = iid_human['symbol2'][i]
        uniprota = iid_human['uniprot1'][i]
        uniprotb = iid_human['uniprot2'][i]
        '''union_human = union_human.append({'interactorA':sa, 'interactorB':sb, 
                          'interactorA_Uniprot_AC':uniprota, 'interactorB_Uniprot_AC':uniprotb, 'db_source': db2}
                         , ignore_index=True)'''
        
        union_dict[i] = {"interactorA": sa, 'interactorB':sb, 
                          'interactorA_Uniprot_AC':uniprota, 'interactorB_Uniprot_AC':uniprotb, 'db_source': db2}

    df = pd.DataFrame.from_dict(union_dict, "index")  
    new = pd.concat([union_human, df], ignore_index=True)   
        
    print("done with iid", len(union_human))
    
    union_dict = {} 
    for i in range(len(biogrid_human)):
        sa = biogrid_human['Official Symbol Interactor A'][i]
        sb = biogrid_human['Official Symbol Interactor B'][i]
        uniprota = biogrid_human['UniprotAC interactor A'][i]
        uniprotb = biogrid_human['UniprotAC interactor B'][i]
        '''union_human = union_human.append({'interactorA':sa, 'interactorB':sb, 
                          'interactorA_Uniprot_AC':uniprota, 'interactorB_Uniprot_AC':uniprotb, 'db_source': db1}
                         , ignore_index=True)'''
        
        union_dict[i] = {"interactorA": sa, 'interactorB':sb, 
                          'interactorA_Uniprot_AC':uniprota, 'interactorB_Uniprot_AC':uniprotb, 'db_source': db1}

    df = pd.DataFrame.from_dict(union_dict, "index")  
    new = pd.concat([new, df], ignore_index=True)

    # important to set the 'orient' parameter to "index" to make the keys as rows
    

    # drop duplicates for interactorA and interactorB 
    new = new.drop_duplicates(subset = ['interactorA', 'interactorB'], keep='first')
    new.to_csv("union_interactome_extended.tsv", sep = '\t')

In [102]:
biogrid_human=biogrid_human.reset_index(drop=True)
iid_human=iid_human.reset_index(drop=True)
#build_union_interactome(biogrid_human, iid_human)

In [103]:
union_human = pd.read_csv("union_interactome.tsv", sep = '\t')
biogrid_human=biogrid_human.reset_index(drop=True)
iid_human=iid_human.reset_index(drop=True)
union_human=union_human.reset_index(drop=True)
build_nonseed_dataframes(biogrid_human, iid_human, union_human)

nsbio = pd.read_csv("biogrid_union_non_seed.tsv", sep = '\t')
nsiid = pd.read_csv("iid_union_non_seed.tsv", sep = '\t')
nsiid = nsiid.reset_index(drop=True)
nsbio = nsbio.reset_index(drop=True)

merge_non_seed_dataframes(nsbio, nsiid, union_human)
#unionext = pd.read_csv("union_interactome_extended.tsv", sep = '\t')
#union_interactome_extended(biogrid_human, iid_human, union_human)
#union_extended = pd.read_csv("union_extended.tsv", sep = '\t') 

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




done with iid 25039


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




In [104]:
#drop duplicates
new = pd.read_csv("union_interactome_extended.tsv", sep = '\t')
new = new.drop(['Unnamed: 0', 'Unnamed: 0.1'], axis = 1)
new

Unnamed: 0,db_source,interactorA,interactorA_Uniprot_AC,interactorB,interactorB_Uniprot_AC
0,Biogrid Human,MAGI1,,CTNNB1,P35222
1,Biogrid Human,TSG101,Q99816,CDKN1A,P38936
2,Biogrid Human,TXN,P10599,COL1A1,P02452
3,Biogrid Human,DCN,P07585,EGFR,P00533
4,Biogrid Human,HTT,P42858,SETD2,Q9BYW2
...,...,...,...,...,...
267148,Biogrid Human,NINL,Q9Y2I6,MAML1,Q92585
267149,Biogrid Human,NINL,Q9Y2I6,BRCA2,P51587
267150,Biogrid Human,NINL,Q9Y2I6,NKAP,Q8N5F7
267151,Biogrid Human,EPS8,Q12929,ITSN2,Q9NZM3


In [105]:
new.to_csv("union_interactome_extended.tsv", sep = '\t')

In [40]:
unionint = pd.read_csv("union_interactome.tsv", sep = '\t')
unionint

Unnamed: 0.1,Unnamed: 0,interactorA,interactorB,interactorA_Uniprot_AC,interactorB_Uniprot_AC,db_source
0,0,MAGI1,CTNNB1,,P35222,Biogrid Human
1,1,TSG101,CDKN1A,Q99816,P38936,Biogrid Human
2,2,TXN,COL1A1,P10599,P02452,Biogrid Human
3,3,DCN,EGFR,P07585,P00533,Biogrid Human
4,4,HTT,SETD2,P42858,Q9BYW2,Biogrid Human
...,...,...,...,...,...,...
25034,25034,LTA,ANKRD40,P01374,Q6AI12,Integrated Interactions Database experimental ...
25035,25035,FN1,TARDBP,P02751,Q13148,Integrated Interactions Database experimental ...
25036,25036,EGFR,PSMD2,P00533,Q13200,Integrated Interactions Database experimental ...
25037,25037,TNF,AGO2,P01375,Q9UKV8,Integrated Interactions Database experimental ...


### intersection interactome: all proteins interacting with at least one seed gene confirmed by both DBs, in the       format: interactor A gene symbol, interactor B gene symbol, interactor A Uniprot AC, interactor B Uniprot AC


In [109]:
def build_intersection_interactome(biogrid_human, iid_human):
    db1 = 'Biogrid Human'
    db2 = 'Integrated Interactions Database experimental data'
    union = pd.read_csv("union_interactome.tsv", sep = '\t')
    
    union_biogrid = union.loc[(union['db_source'] == 'Biogrid Human')]
    union_biogrid = union_biogrid.drop(['Unnamed: 0', 'db_source'], axis = 1)
    
    union_iid = union.loc[(union['db_source'] == 'Integrated Interactions Database experimental data')]
    union_iid = union_iid.drop(['Unnamed: 0', 'db_source'], axis = 1)
    
    intersect = pd.merge(union_biogrid, union_iid)
    intersect.dropna(inplace=True)
    intersect.to_csv("intersection_interactome.tsv", sep = '\t')

In [110]:
biogrid_human=biogrid_human.reset_index(drop=True)
iid_human=iid_human.reset_index(drop=True)
build_intersection_interactome(biogrid_human, iid_human)

In [111]:
intersect = pd.read_csv("intersection_interactome.tsv", sep = '\t')
intersect.drop(['Unnamed: 0'], axis = 1)

Unnamed: 0,interactorA,interactorB,interactorA_Uniprot_AC,interactorB_Uniprot_AC
0,APOA1,PDE1A,P02647,P54750
1,TXN,NCF4,P10599,Q15080
2,VIM,TCHP,P08670,Q9BT92
3,ABLIM1,VIM,O14639,P08670
4,VIM,NIF3L1,P08670,Q9GZT8
...,...,...,...,...
3734,TP53,EGLN3,P04637,Q9H6Z9
3735,KIT,PLCE1,P10721,Q9P212
3736,TP53,DNAJB1,P04637,P25685
3737,PHB,AGO2,P35232,Q9UKV8


### Enrichment analysis
* Using the service Enrichr, find, report in tables and save related charts (8 charts in total) of the overrepresented GO categories (limit to the first 10 for each main category, BP, MF, CL) and the the overrepresented pathways (KEGG 2019 Human) for:
    * the seed genes,
    * the union interactome genes

In [None]:
unionint = pd.read_csv("union_interactome.tsv", sep = '\t')
unionint = unionint.drop(['Unnamed: 0'], axis = 1)

interactome_seed = pd.read_csv("seed_genes_interactome.tsv", sep = '\t')
interactome_seed = interactome_seed.drop(['Unnamed: 0'], axis = 1)

In [None]:
interactome_seed

In [None]:
#save a list of all the symbols in order to search their uniprot
def print_list(dataframe, column1, column2):
    sym_to_fix=[]
    sym_to_fix.extend(dataframe[column1])
    sym_to_fix.extend(dataframe[column2])
    #remove duplicates
    sym_to_fix=list(set(sym_to_fix))
    # using join() 
    # avoiding printing last comma 
    for gene in sym_to_fix:
        print(gene)

In [None]:
print_list(interactome_seed, 'interactorA', 'interactorB')
#Print this list and put the list to https://amp.pharm.mssm.edu/Enrichr Enrichr. Than download KEGG HUMAN 2019 and Ontologies tables

In [None]:
sym_to_fix=[]
sym_to_fix.extend(unionint['interactorA'])
sym_to_fix.extend(unionint['interactorB'])
#remove duplicates
sym_to_fix=list(set(sym_to_fix))
# using join() 
# avoiding printing last comma 
#print(', '.join(sym_to_fix)) 

# deleted quotes to pass the list into site
#https://www.genenames.org/tools/multi-symbol-checker/

In [None]:
approvedsym = pd.read_csv("approved-symbols.csv", sep = ',')

In [None]:
approvedsym.head()

Now, let's open the downloaded kegg and go tables and save only the first 10 entries

In [None]:
kegg_seed = pd.read_table("enrichr/kegg_human/seed/KEGG_2019_Human_table.txt")
kegg_seed = kegg_seed[:10]
kegg_seed.to_csv("enrichr/kegg_human/seed/KEGG_2019_Human_table.txt")

In [None]:
kegg_union = pd.read_table("enrichr/kegg_human/union/KEGG_2019_Human_table.txt")
kegg_union = kegg_union[:10]
kegg_union.to_csv("enrichr/kegg_human/union/KEGG_2019_Human_table_top10.txt")

In [None]:
go_bp_seed = pd.read_table("enrichr/ontologies/seed/GO_Biological_Process_2018_table.txt")
go_bp_seed = go_bp_seed[:10]
go_bp_seed.to_csv("enrichr/ontologies/seed/GO_Biological_Process_2018_table.txt")

go_mf_seed = pd.read_table("enrichr/ontologies/seed/GO_Molecular_Function_2018_table.txt")
go_mf_seed = go_mf_seed[:10]
go_mf_seed.to_csv("enrichr/ontologies/seed/GO_Molecular_Function_2018_table.txt")

go_cc_seed = pd.read_csv("enrichr/ontologies/seed/GO_Cellular_Component_2018_table.txt")
go_cc_seed = go_cc_seed[:10]
go_cc_seed.to_csv("enrichr/ontologies/seed/GO_Cellular_Component_2018_table.txt")

In [None]:
go_cc_seed = pd.read_csv("enrichr/ontologies/seed/GO_Cellular_Component_2018_table.csv")

In [None]:
go_cc_seed

In [None]:
go_bp_union = pd.read_table("enrichr/ontologies/union/GO_Biological_Process_2018_table.txt")
go_bp_union = go_bp_union[:10]
go_bp_union.to_csv("enrichr/ontologies/union/GO_Biological_Process_2018_table.txt")

go_mf_union = pd.read_table("enrichr/ontologies/union/GO_Molecular_Function_2018_table.txt")
go_mf_union = go_mf_union[:10]
go_mf_union.to_csv("enrichr/ontologies/union/GO_Molecular_Function_2018_table.txt")

In [52]:
go_cc_union = pd.read_csv("enrichr/ontologies/union/GO_Cellular_Component_2018_table.csv")

In [53]:
go_cc_union

Unnamed: 0,Term,Overlap,P-value,Adjusted P-value,Old P-value,Old Adjusted P-value,Odds Ratio,Combined Score,Genes,Unnamed: 9
0,focal adhesion (GO:0005925),246/356,2.0307521217463597E-63,9.057154e-61,0.0,0,2.582737,372.829565,RPL4;CYFIP1;RPL5;MDC1;NCKAP1;RPL30;TES;RPL3;RP...,
1,nuclear body (GO:0016604),312/618,2.8626122970109088E-37,6.383625e-35,0.0,0,1.886953,158.77564,RB1;MDC1;EHMT2;GPATCH2;RAPH1;HNRNPU;TESK2;MKI6...,
2,nuclear chromosome part (GO:0044454),223/392,6.3003903982783755E-37,9.36658e-35,0.0,0,2.126248,177.233449,SPI1;SMARCB1;TRRAP;EHMT2;CCAR2;ACTB;KAT5;ZMIZ2...,
3,chromatin (GO:0000785),182/296,1.7710370351351232E-36,1.974706e-34,0.0,0,2.298131,189.185561,RB1;SMARCB1;SPI1;MRE11;KDM1A;EHMT2;DSCC1;JMJD1...,
4,nucleolus (GO:0005730),329/676,3.2856822475828046E-35,2.930829e-33,0.0,0,1.819048,144.434085,RPL4;EIF4A2;RPL5;JPT1;RPL3;SMARCB1;POP1;EIF4A3...,
5,cytosolic part (GO:0044445),114/159,2.5530499276119735E-32,1.897767e-30,0.0,0,2.679802,194.943389,RPL4;RPL5;RPL30;RPL3;RPL31;RPL10L;AHR;RPL8;RPL...,
6,cytosolic ribosome (GO:0022626),89/124,1.3972500602057802E-25,8.902479e-24,0.0,0,2.682646,153.528155,RPL4;RPL5;RPL30;RPL3;DDX3X;RPL31;RPL10L;RPLP0;...,
7,cytoskeleton (GO:0005856),244/520,1.6517829255469964E-23,9.20869e-22,0.0,0,1.753806,92.000452,ZNF174;SLC4A1;ACTB;MYLK;RASSF1;BAIAP2L1;MPRIP;...,
8,chromosome,telomeric region (GO:0000781),86/124,4.1069950000000004e-23,2.035244e-21,0,0.0,2.59222,133.6205419871832,FEN1;MRE11;DCLRE1C;KDM1A;MCM7;PINX1;NAT10;BRCA...
9,nuclear speck (GO:0016607),159/296,4.484949237649398E-23,2.000287e-21,0.0,0,2.007708,103.314082,EHMT2;GPATCH2;FAM107A;JADE1;HNRNPU;NR3C1;SMC4;...,


In [54]:
go_cc_union.columns

Index(['Term', 'Overlap', 'P-value', 'Adjusted P-value', 'Old P-value',
       'Old Adjusted P-value', 'Odds Ratio', 'Combined Score', 'Genes',
       'Unnamed: 9'],
      dtype='object')