<a href="https://colab.research.google.com/github/geovalexis/TFG/blob/main/Methods.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data retrievement


## Proteome reference Dataset

In [5]:
# Quest for Orthologs (QfO) dataset
!ls drive/MyDrive/TFG/QFO_2018/

# QfO species and statistics
# We got this information from README file of the QfO release 
# It was formatted into a tabulated file by the following command: 
# cat QfO_statistics.txt | tr -s ' ' | cut -f1-6 -d" " | tr ' ' '\t' > QfO_statistics.tsv
!ls drive/MyDrive/TFG/QFO_2018/QfO_statistics.tsv

# Human dataset
!pip install biopython
from Bio import SeqIO
import pandas as pd
records = list(SeqIO.parse("drive/MyDrive/TFG/QFO_2018/UP000005640_9606.xml", "seqxml"))
records_ids = [record.id for record in records]
df_records_ids = pd.DataFrame(records_ids)
df_records_ids.to_csv("drive/MyDrive/TFG/human_uniprotIDs.tsv", sep="\t", index=False, header=False)
!head drive/MyDrive/TFG/human_uniprotIDs.tsv



QfO_statistics.tsv  UP000005640_9606.fasta     UP000005640_9606.idmapping
README		    UP000005640_9606.gene2acc  UP000005640_9606.xml
drive/MyDrive/TFG/QFO_2018/QfO_statistics.tsv
Collecting biopython
[?25l  Downloading https://files.pythonhosted.org/packages/76/02/8b606c4aa92ff61b5eda71d23b499ab1de57d5e818be33f77b01a6f435a8/biopython-1.78-cp36-cp36m-manylinux1_x86_64.whl (2.3MB)
[K     |████████████████████████████████| 2.3MB 4.7MB/s 
Installing collected packages: biopython
Successfully installed biopython-1.78
A0A024R161
A0A024R1R8
A0A075B6F4
A0A075B6H5
A0A075B6H7
A0A075B6H8
A0A075B6H9
A0A075B6I0
A0A075B6I1
A0A075B6I3


In [25]:
# Get species taxIDs from QfO

!pip install --upgrade ete3
from ete3 import NCBITaxa
import pandas as pd

QfO_reference_taxa = pd.read_csv("drive/MyDrive/TFG/QFO_2018/QfO_statistics.tsv", sep="\t")
QfO_reference_species = [getSpecie(QfO_reference_taxa.Tax_ID[i]) for i in range(QfO_reference_taxa.Tax_ID.size)]
QfO_reference_species = pd.DataFrame(QfO_reference_species, columns=["taxID", "species_name"])
QfO_reference_species.to_csv("drive/MyDrive/TFG/QfO_reference_species.tsv", sep="\t", header=True, index=False)

# Snippet got from mapQfO2MtP.py (Gabaldonlab/qfo-2020 repo) but slightly modified to also output specie's name
def getSpecie(taxID: int):
    """This function searches the corresponding specie taxID for a given strain or subspecie taxID (MUST BE AN INTEGER) in the NCBI database. 
    Args:
        taxID (int): taxID of the taxon of interest.
    Returns:
        specieID (int): taxID of the corresponding specie.
    """
    
    ncbi = NCBITaxa()
    #ncbi.update_taxonomy_database() 
    
    specieID = taxID # If the taxID is already a specie it will return the same taxID
    if ncbi.get_rank([taxID])[taxID] != 'species':
        lineage = ncbi.get_lineage(taxID)
        for j in reversed(lineage): #Reverse because it's faster when it's a strain or subspecie
            if ncbi.get_rank([j])[j] == 'species':
                specieID = j
    species_name = ncbi.get_taxid_translator([specieID])[specieID]
    return specieID, species_name

Requirement already up-to-date: ete3 in /usr/local/lib/python3.6/dist-packages (3.1.2)


## Orthologs retrievement

### From MetaPhors

In [None]:
!ls drive/MyDrive/TFG/QfO_input.tsv #Orthologs input for QfO

drive/MyDrive/TFG/QfO_input.tsv


## Matrix

### MetaPhOrs orthologs between homo sapiens and the other 78 reference species from the Quest for Orthologs


In [None]:
import pandas as pd

mtp_orthologs = pd.read_csv("drive/MyDrive/TFG/QfO_input.tsv", sep="\t", names=["protein1", "protein2"])
qfo_ref_human_proteome = pd.read_csv("drive/MyDrive/TFG/human_uniprotIDs.tsv", sep="\t", names=["protein1"])
print("Total size of the orthologs file from MetaPhOrs:", mtp_orthologs.size)
print("Number of reference proteins (from QfO):", qfo_ref_human_proteome.size)
ref_protein_mtp = pd.merge(mtp_orthologs, qfo_ref_human_proteome, how="inner", on="protein1")
print("Size of the resulting inner join between the two datasets:", ref_protein_mtp.size)
print("Actual number of reference Homo Sapiens proteins within MetaPhOrs:", ref_protein_mtp["protein1"].unique().size)
print("First 10 rows:\n", ref_protein_mtp.head(10))
#ref_protein_mtp.drop_duplicates() # There just around 200 repeated rows but I think that they correspond to the orthologs within the same specie (Homo sapiens in this case) -> diagonal
ref_protein_mtp.to_csv("drive/MyDrive/TFG/human_orthologs_MtP.tsv", sep="\t", index=False, header=False)
pd.DataFrame(ref_protein_mtp["protein1"].unique()).to_csv("drive/MyDrive/TFG/human_reference_proteins_MtP.tsv", sep="\t", index=False, header=False)
#TODO: map taxIDs to proteins 2

Total size of the orthologs file from MetaPhOrs: 7898268
Number of reference proteins (from QfO): 20996
Size of the resulting inner join between the two datasets: 139718
Actual number of reference Homo Sapiens proteins within MetaPhOrs: 5254
First 10 rows:
   protein1    protein2
0   Q8ND71      Q7ZAM9
1   Q8ND71      Q6CDT3
2   Q8ND71      A2ESR8
3   Q8ND71      A2E1H0
4   Q8ND71      A2FTJ3
5   Q8ND71      A2FTJ2
6   Q8ND71      G3S4T9
7   Q8ND71  A0A2I3SJI4
8   Q8ND71      H2R4V7
9   P15170      Q8F6D0


#Phylogenetic Profiling (PP)
