# Resolución de ambigüedades taxonID

Se buscará mejorar la resolución de ambigüedades en el filtrado de GSMs buscando el txid más específico dentro de los campos de GPL_txid y GSM_ch1_characteristics

In [1]:
# Dependencias 
from Bio import Entrez
import GEOparse
import pandas as pd
import re
import os

In [100]:
data = pd.read_csv("../230827FilterFunc/bigprueba/27082023Test.tsv", sep="\t").rename(columns={"taxonID":"TaxId"})
data.TaxId = data.TaxId.astype("str")

In [101]:
rel = pd.read_csv("../230903MultiPrEstable/tests/taxonomy_relations_mini.tsv", sep='\t')
rel.TaxId = rel.TaxId.astype("str")

In [102]:
data = data.merge(rel, on="TaxId")

In [105]:
no_match = data.iloc[[i for i,row in data.iterrows() if row.TaxId != row.GSM_taxid_ch1]]

In [118]:
no_match = no_match[["TaxId", "GSM_taxid_ch1", "GPL_taxid", "GPL_organism", "GSM_characteristics_ch1", "GSM_organism_ch1"]]
no_match

Unnamed: 0,TaxId,GSM_taxid_ch1,GPL_taxid,GPL_organism,GSM_characteristics_ch1,GSM_organism_ch1
0,224308,1718,1718;160488;224308;290633;511145,Corynebacterium glutamicum;Pseudomonas putida ...,genotype: ChrS-Ala245fs,Corynebacterium glutamicum
1,224308,1718,1718;160488;224308;290633;511145,Corynebacterium glutamicum;Pseudomonas putida ...,genotype: ChrS-Ala245fs,Corynebacterium glutamicum
2,224308,1718,1718;160488;224308;290633;511145,Corynebacterium glutamicum;Pseudomonas putida ...,genotype: ChrS-Ala245fs,Corynebacterium glutamicum
3,224308,1423,224308,Bacillus subtilis subsp. subtilis str. 168,"strain: NDmed;genotype: Wild-type, undomestica...",Bacillus subtilis
4,224308,1423,224308,Bacillus subtilis subsp. subtilis str. 168,"strain: NDmed;genotype: Wild-type, undomestica...",Bacillus subtilis
...,...,...,...,...,...,...
13646,1314,160490,1314,Streptococcus pyogenes,,Streptococcus pyogenes M1 GAS
13647,1314,160490,1314,Streptococcus pyogenes,,Streptococcus pyogenes M1 GAS
13648,1314,160490,1314,Streptococcus pyogenes,,Streptococcus pyogenes M1 GAS
13649,1314,160490,1314,Streptococcus pyogenes,,Streptococcus pyogenes M1 GAS


In [123]:
no_match.GSM_organism_ch1.unique()

array(['Corynebacterium glutamicum', 'Bacillus subtilis',
       'Corynebacterium glutamicum ATCC 13032',
       'Gluconobacter oxydans 621H', 'Escherichia coli BW25113',
       'Escherichia coli str. K-12 substr. MG1655', 'Aspergillus niger',
       'Allochromatium vinosum DSM 180', 'Escherichia coli',
       'Escherichia coli APEC O2', 'Homo sapiens',
       'Salmonella enterica subsp. enterica serovar Typhimurium str. 14028S',
       'Escherichia coli K-12', 'Escherichia coli ATCC 25922',
       'Escherichia coli BL21(DE3)',
       'Escherichia coli O157:H7 str. Sakai', 'Escherichia coli O157:H7',
       'Escherichia coli DH1',
       'Escherichia coli str. K-12 substr. MG1655star',
       'Escherichia coli W', 'Enterococcus faecalis',
       'Enterococcus faecalis V583', 'Pseudomonas aeruginosa PAO1',
       'Mycobacterium tuberculosis',
       'Mycobacterium tuberculosis str. Erdman = ATCC 35801',
       'Mycolicibacterium smegmatis', 'Macaca mulatta',
       'Mycobacterium tuberc

In [11]:
rank_score = {"strain" : 0,
              "species" : 1,
              "genus" : 2,
              "family" : 3,
              "order" : 4,
              "class" : 5,
              "phylum" : 6,
              "superkingdom" : 7,
              "no rank" : 8}

In [91]:
def is_same_specie(query_txid, subject_txid, taxonomy_relations):
    try:
        query_specie = taxonomy_relations[taxonomy_relations.TaxId == query_txid].reset_index().loc[0,"TaxIdSpecie"]
        subject_specie = taxonomy_relations[taxonomy_relations.TaxId == subject_txid].reset_index().loc[0,"TaxIdSpecie"]
        
    except: return(False)

    if  query_specie == subject_specie:
        return(True)

    return(False)

def get_most_specific_txid(row, rel):
    """
    Obtiene el txid más específico disponible para aquellos GSMs cuya taxonomy relation es ambigüa
    Primero se obtienen los txids asociados a cada metadato, descartando aquellos que no pertenezcan 
    a la misma especie
    Luego se obtiene el rango al que corresponden esos txids y se selecciona el de más baja jerarquía
    """
    q_id = row.taxonID
    gsm_id = {id for id in row.GSM_taxid_ch1.split(';') if is_same_specie(q_id, id, rel)}
    print(gsm_id)
    gpl_id = {id for id in row.GPL_taxid.split(';') if is_same_specie(q_id, id, rel)}
    print(gpl_id)
    gpl_org_id = {id.reset_index().loc[0, "taxonID"] for orgs in row.GPL_organism 
                  for org in orgs.split(';') 
                  if not (id:=rel[rel.ScientificName == org]).empty and is_same_specie(q_id, id,rel)}
    print(gpl_org_id)
    ch_id1 = {id.reset_index.loc().loc[0,"taxonID"] for orgs in row.GSM_organism_ch1 
              for org in orgs.split(';') 
              if not (id:=rel[rel.ScientificName == org]).empty and is_same_specie(q_id,id,rel)}
    print(ch_id1)
    ch_id2 = set()
    if type(chars := row.GSM_characteristics_ch1) != float: 
       for d in chars.split(';'):
          if 'Strain:' in d or 'strain:' in d:
             ch_id2 = {row.GSM_organism_ch1 + d.split(':')[1]}
    
    if ch_id2 and not (id:=rel[rel.ScientificNameSpecie == ch_id2]).empty:
      ch_id2 = id.reset_index().loc[0,"taxonID"]
    
    print(ch_id2)
    return()
    all_ids = gsm_id | gpl_id | gpl_org_id | ch_id1  
    all_ids = [(rank_score.get(rank, 8),id) for id in all_ids if id and (rank:=rel[rel.taxonID == id].reset_index().loc[0,"Rank"])]
    return(sorted(all_ids)[0][1])
    


In [94]:

{id for id in data.loc[10,"GSM_taxid_ch1"].split(';') if is_same_specie(data.loc[10,"taxonID"], id, rel)}


set()

In [92]:
get_most_specific_txid(data.iloc[3], rel)

set()
set()
set()
set()
{'Bacillus subtilis NDmed'}


()

In [68]:
data["GSM_or"]

taxonID          224308
GSM_taxid_ch1      1423
Name: 3, dtype: object

In [61]:
bool(set())

False

# 25 de septeimbre de 2023
Icorporación del concepto al Código

In [125]:
# Dependencias
from Bio import Entrez
import GEOparse
import pandas as pd
import re
import os

In [124]:
def get_taxonomy_data(id, max_tries=10):

  try:
    handle = Entrez.efetch(db='taxonomy', id=id, rettype='xml')
    record = Entrez.read(handle)
  except:
    if max_tries:
      return(get_taxonomy_data(id, max_tries-1))
    else:
      exit()

  return(record[0])

def get_txid_species(record):
  if record['Rank'] == 'species':
    return (record['TaxId'])
  else:
    lineag = [element['TaxId'] for element in record['LineageEx'] if element['Rank'] == 'species']
    return(lineag[0])

def get_taxonomy_relations(txidList):
  relations = []
  rank_big = ['superkingdom', 'phylum', 'class', 'order', 'family', 'genus']
  for txid in txidList:
    taxonomy_data = get_taxonomy_data(txid)
    row = {key:(value if key != 'LineageEx' else ';'.join([i['TaxId'] for i in value]))
            for key,value in taxonomy_data.items()
            if key in key in ['TaxId','ScientificName','Rank', 'ParentTaxId', 'LineageEx']}
    txid_specie = get_txid_species(taxonomy_data)
    specie_name = ' '.join(taxonomy_data['ScientificName'].split(' ')[0:2])
    if not taxonomy_data['Rank'] in ['superkingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species']:
      final_tag_txid = taxonomy_data['ScientificName'].split(' ')[-1]
    else:
      final_tag_txid = 'None'
    row.update({'TaxIdSpecie': txid_specie, 'ScientificNameSpecie':specie_name, 'FinalTagName':final_tag_txid})
    relations.append(row)
  return(pd.DataFrame.from_dict(relations))

def check_relation_txid(tx_query,tx_subject, metadata_taxo, characteristics):
  rank_big = ['superkingdom', 'phylum', 'class', 'order', 'family', 'genus']
  metadata_query = metadata_taxo[metadata_taxo['TaxId'] == str(tx_query)]
  metadata_subject = metadata_taxo[metadata_taxo['TaxId'] == tx_subject]
  if not list(metadata_query['Rank']) in rank_big and not list(metadata_subject['Rank']) in rank_big:
    if list(metadata_query['Rank']) == list(metadata_subject['Rank']):
      return('associated_same txid explicit')
    else:
      if list(metadata_query['TaxIdSpecie']) == list(metadata_subject['TaxIdSpecie']):
        if type(characteristics) != float and list(metadata_query['FinalTagName'])[0] in characteristics:
          return('associated_same txid implicit')
        else:
          return('not associated_ambiguous relationship')
      else:
        return('not associated_different specie')
  else:
    return('not associated_not prosecutable')

In [134]:
data = pd.read_csv("../230827FilterFunc/bigprueba/27082023Test.tsv", sep="\t").rename(columns={"taxonID":"TaxId"})
data.TaxId = data.TaxId.astype('str')


In [147]:
data[["TaxId", "GPL_organism", "GPL_taxid", "GSM_taxid_ch1", "GSM_organism_ch1","GSM_characteristics_ch1"]]

Unnamed: 0,TaxId,GPL_organism,GPL_taxid,GSM_taxid_ch1,GSM_organism_ch1,GSM_characteristics_ch1
0,224308,Corynebacterium glutamicum;Pseudomonas putida ...,1718;160488;224308;290633;511145,1718,Corynebacterium glutamicum,genotype: ChrS-Ala245fs
1,224308,Corynebacterium glutamicum;Pseudomonas putida ...,1718;160488;224308;290633;511145,1718,Corynebacterium glutamicum,genotype: ChrS-Ala245fs
2,224308,Corynebacterium glutamicum;Pseudomonas putida ...,1718;160488;224308;290633;511145,1718,Corynebacterium glutamicum,genotype: ChrS-Ala245fs
3,224308,Bacillus subtilis subsp. subtilis str. 168,224308,1423,Bacillus subtilis,"strain: NDmed;genotype: Wild-type, undomestica..."
4,224308,Bacillus subtilis subsp. subtilis str. 168,224308,1423,Bacillus subtilis,"strain: NDmed;genotype: Wild-type, undomestica..."
...,...,...,...,...,...,...
12295,1314,Streptococcus pyogenes,1314,1314,Streptococcus pyogenes,
12296,1314,Streptococcus pyogenes,1314,160490,Streptococcus pyogenes M1 GAS,
12297,1314,Streptococcus pyogenes,1314,160490,Streptococcus pyogenes M1 GAS,
12298,1314,Streptococcus pyogenes,1314,160490,Streptococcus pyogenes M1 GAS,


In [137]:
gpl_ids = {id for ids in data.GPL_taxid for id in ids.split(';')}
gsm_ids = {id for ids in data.GSM_taxid_ch1 for id in ids.split(';')}
gpl_orgs = {id for ids in data.GPL_organism for id in ids.split(';')}
gsm_orgs = {id for ids in data.GSM_organism_ch1 for id in ids.split(';')}

In [149]:
data[["GSM_organism_ch1", "GSM_characteristics_ch1"]]

Unnamed: 0,GSM_organism_ch1,GSM_characteristics_ch1
0,Corynebacterium glutamicum,genotype: ChrS-Ala245fs
1,Corynebacterium glutamicum,genotype: ChrS-Ala245fs
2,Corynebacterium glutamicum,genotype: ChrS-Ala245fs
3,Bacillus subtilis,"strain: NDmed;genotype: Wild-type, undomestica..."
4,Bacillus subtilis,"strain: NDmed;genotype: Wild-type, undomestica..."
...,...,...
12295,Streptococcus pyogenes,
12296,Streptococcus pyogenes M1 GAS,
12297,Streptococcus pyogenes M1 GAS,
12298,Streptococcus pyogenes M1 GAS,


# 25 de septiembre de 2023

In [61]:
# Dependencias
from Bio import Entrez
import pandas as pd
Entrez.email = "diegocar@lcg.unam.mx"
Entrez.api_key = "34677fdcfd2f0659a7f9ee05ab6e44704f09"

In [104]:
def get_rank(id):
    try: 
       with Entrez.efetch(db="taxonomy", id=id, retmode="xml") as handle:
          record = Entrez.read(handle)[0]
    except:
       return(get_rank(id))
    return(record['Rank'])

def get_lineage(id):
    try:
      with Entrez.efetch(db="taxonomy", id=id, retmode="xml") as handle:
          record= Entrez.read(handle)[0]
    except:
       return(get_lineage(id))
    return(record['LineageEx'])

def get_taxonomy(txid, max=10):
  try:
    with Entrez.efetch(db='taxonomy', id=txid, rettype='xml') as handle:
      record = Entrez.read(handle)[0]
  except:
     if max: return(get_taxonomy(id, max-1))
     print("Estoy cansado jefe...")
     exit()
  
  rank = record['LineageEx'][-1]['Rank']
  org = record['ScientificName']
  name = record['ScientificName']
  for d in record['LineageEx']:
     if d.get("Rank") == "species":
        org = d.get("ScientificName")
  return((rank,org,name))

def get_scientific_name(id):
   try:
      with Entrez.efetch(db="taxonomy", id=id, retmode="xml") as handle:
         record = Entrez.read(handle)[0]
   except:
       return(get_scientific_name(id))
   return(record['ScientificName'])

def is_same_specie(q_id, s_id, tax_rel):
  q_specie = tax_rel[tax_rel.TaxId == q_id].reset_index().loc[0,"Specie"]
  s_specie = tax_rel[tax_rel.TaxId == s_id].reset_index().loc[0,"Specie"]
  if q_specie == s_specie: return(True)
  return(False)

def get_best_id(metadata, tax_rel):
   ch1_id = metadata.GSM_taxid_ch1
   q_id = metadata.TaxId
   if not is_same_specie(q_id, ch1_id, tax_rel): return(ch1_id)

   ch1_id = {ch1_id}
   q_id = metadata.TaxId
   gpl_ids = {id for id in metadata.GPL_taxid.split(';') if is_same_specie(q_id, id, tax_rel)}
   gsm_ids = {id for id in metadata.GSM_taxid_ch1.split(';') if is_same_specie(q_id, id, tax_rel)}
   

   ids = gpl_ids | gsm_ids | ch1_id

   rank_score = {"subspecies": 0,
                 "strain" : 0,
                 "serotype": 0,
                 "species group": 1,
                 "species" : 1,
                 "genus" : 2,
                 "family" : 3,
                 "order" : 4,
                 "class" : 5,
                 "phylum" : 6,
                 "superkingdom" : 7,
                 "no rank" : 8}

   rank = [(rank_score.get(tax_rel[tax_rel.TaxId == id].reset_index().loc[0,"Ranks"], 8), id) for id in ids]
   return(sorted(rank)[0][1])

In [57]:
data = pd.read_csv("../230827FilterFunc/bigprueba/27082023Test.tsv", sep="\t").rename(columns={"taxonID":"TaxId"})
data.TaxId = data.TaxId.astype("str")
rela = pd.read_csv("./taxonomy_relations_mini.tsv", sep='\t')
rela.TaxId = rela.TaxId.astype("str")
data.shape

(12300, 25)

In [64]:
txid_list = set(data['TaxId'])
txid_ch1 = {id for ids in data.GSM_taxid_ch1 for id in ids.split(';')}
txid_gpl = {id for ids in data.GPL_taxid for id in ids.split(';')}
txid_all = txid_list | txid_gpl | txid_ch1
txids,ranks,names,sn = zip(*[(id, *get_taxonomy(id)) for id in  txid_all])
info = pd.DataFrame.from_dict({"TaxId" : txids, "Ranks" : ranks, "Specie" : names, "ScientificName" : sn})
info.to_csv("relations.tsv", sep='\t', index=False, header=True)

In [102]:
with Entrez.efetch(db="taxonomy", id=1423, retmode="xml") as handle:
    record = Entrez.read(handle)[0]
for d in record['LineageEx']:
    print(d)

{'TaxId': '131567', 'ScientificName': 'cellular organisms', 'Rank': 'no rank'}
{'TaxId': '2', 'ScientificName': 'Bacteria', 'Rank': 'superkingdom'}
{'TaxId': '1783272', 'ScientificName': 'Terrabacteria group', 'Rank': 'clade'}
{'TaxId': '1239', 'ScientificName': 'Bacillota', 'Rank': 'phylum'}
{'TaxId': '91061', 'ScientificName': 'Bacilli', 'Rank': 'class'}
{'TaxId': '1385', 'ScientificName': 'Bacillales', 'Rank': 'order'}
{'TaxId': '186817', 'ScientificName': 'Bacillaceae', 'Rank': 'family'}
{'TaxId': '1386', 'ScientificName': 'Bacillus', 'Rank': 'genus'}
{'TaxId': '653685', 'ScientificName': 'Bacillus subtilis group', 'Rank': 'species group'}


In [107]:
get_best_id(data.iloc[12000], info)

'1280'

In [101]:
info[info.TaxId == "1423"]

Unnamed: 0,TaxId,Ranks,Specie,ScientificName
18,1423,species group,Bacillus subtilis,Bacillus subtilis


In [77]:
info[info.TaxId.isin(["1718", "224308"])]

Unnamed: 0,TaxId,Ranks,Specie,ScientificName
16,224308,subspecies,Bacillus subtilis,Bacillus subtilis subsp. subtilis str. 168
67,1718,genus,Corynebacterium glutamicum,Corynebacterium glutamicum


# 27 de septiembre de 2023
Resolución de ambigüedades taxonómicas


In [1]:
# Dependencias
from Bio import Entrez
import pandas as pd
Entrez.email = "diegocar@lcg.unam.mx"
Entrez.api_key = "34677fdcfd2f0659a7f9ee05ab6e44704f09"



In [143]:
def get_rank(id):
    try: 
       with Entrez.efetch(db="taxonomy", id=id, retmode="xml") as handle:
          record = Entrez.read(handle)[0]
    except:
       return(get_rank(id))
    return(record['Rank'])

def get_lineage(id):
    try:
      with Entrez.efetch(db="taxonomy", id=id, retmode="xml") as handle:
          record= Entrez.read(handle)[0]
    except:
       return(get_lineage(id))
    return(record['LineageEx'])

def get_taxonomy(txid, max=15):
  try:
    with Entrez.efetch(db='taxonomy', id=txid, rettype='xml') as handle:
      record = Entrez.read(handle)[0]
  except:
     if max: return(get_taxonomy(txid, max-1))
     print(f"Request error in getting {txid} taxonomy")
     return(("","",""))
  
  rank = record['LineageEx'][-1]['Rank']
  org = record['ScientificName']
  name = record['ScientificName']
  for d in record['LineageEx']:
     if d.get("Rank") == "species":
        org = d.get("ScientificName")
  return((rank,org,name))

def get_scientific_name(id):
   try:
      with Entrez.efetch(db="taxonomy", id=id, retmode="xml") as handle:
         record = Entrez.read(handle)[0]
   except:
       return(get_scientific_name(id))
   return(record['ScientificName'])

def is_same_specie(q_id, s_id, tax_rel):
   if (q_specie := tax_rel[tax_rel.TaxId == q_id]).empty or (s_specie := tax_rel[tax_rel.TaxId == s_id]).empty:
      print(f"Can´t compare {q_id} with {s_id}")
      return(False)

   q_specie = q_specie.reset_index().loc[0,"Specie"]
   s_specie = s_specie.reset_index().loc[0,"Specie"]

   if q_specie == s_specie: return(True)
   return(False)

def get_best_id(metadata, tax_rel):
   ch1_id = metadata.GSM_taxid_ch1
   q_id = metadata.TaxId
   if not is_same_specie(q_id, ch1_id, tax_rel): return(ch1_id)

   ch1_id = {ch1_id}
   gpl_ids = {id for id in metadata.GPL_taxid.split(';') if is_same_specie(q_id, id, tax_rel)} 
   gsm_ids = {id for id in metadata.GSM_taxid_ch1.split(';') if is_same_specie(q_id, id, tax_rel)} 
   

   ids = gpl_ids | gsm_ids | ch1_id

   rank_score = {"subspecies": 0,
                 "strain" : 0,
                 "serotype": 0,
                 "species group": 1,
                 "species" : 1,
                 "genus" : 2,
                 "family" : 3,
                 "order" : 4,
                 "class" : 5,
                 "phylum" : 6,
                 "superkingdom" : 7,
                 "no rank" : 8}

   rank = [(rank_score.get(tax_rel[tax_rel.TaxId == id].reset_index().loc[0,"Rank"], 8), id) for id in ids]
   return(sorted(rank)[0][1])

In [144]:
def apply_GEO_filters(raw_metadata, rank):
    # Delete superseries
    super_GSEs = [index for index,row in raw_metadata.iterrows() if 'SuperSeries' in row['GSE_relation']]
    filtered = raw_metadata.drop(super_GSEs).reset_index().drop('index', 1)
    print("Deleted SuperSeries")

    # Delete tech != array
    interest_techs = ['in situ oligonucleotide', 'spotted oligonucleotide', 'spotted DNA/cDNA', 'mixed spotted oligonucleotide/cDNA']
    not_array = [index for index,row in filtered.iterrows() if row['GPL_technology'] not in interest_techs]
    filtered = filtered.drop(not_array).reset_index().drop('index', 1)
    print("Deleted !array tech")

    # Delete spurious txIDs
    txid_query = set(filtered['TaxId'])
    txid_ch1 = {id for ids in filtered.GSM_taxid_ch1 for id in ids.split(';')} #############
    txid_gpl = {id for ids in filtered.GPL_taxid for id in ids.split(';')} #####################
    txid_all = txid_query | txid_gpl | txid_ch1
    txids,ranks,specie,name = zip(*[(id, *get_taxonomy(id)) for id in  txid_all])
    tax_rel = pd.DataFrame.from_dict({"TaxId":txids, "Rank":ranks, "Specie":specie, "ScientificName":name})
    tax_rel.TaxId = tax_rel.TaxId.astype("str")
    tax_rel.to_csv(f'taxonomyRelationsFunction2.tsv', sep='\t')
    #tax_rel = pd.read_csv("./taxonomyRelationsFunction2.tsv", sep='\t')
    #tax_rel.TaxId = tax_rel.TaxId.astype("str")
    print("Tax rel constructed")
    
    if rank == "specie":
      spurious = [i for i,r in filtered.iterrows() if not is_same_specie(r.TaxId, r.GSM_taxid_ch1,tax_rel)]
      filtered = filtered.drop(spurious).reset_index().drop('index', 1)
      print("Spurious GSMs deleted")

    elif rank == "sameTaxId":
      sameTaxid = [i for i,r in filtered.iterrows() if r.TaxId == r.GSM_taxid_ch1]
      taxid_expl = filtered.iloc[sameTaxid]
      print("Explicit txid relation GSM saved")
      filtered = filtered.drop(sameTaxid)

      recovered = [i for i,r in filtered.iterrows() if r.TaxId == get_best_id(r, tax_rel)]
      print("Recoverable  GSMs identified")
      taxid_rec = filtered.loc[recovered]

      filtered = pd.concat([taxid_expl, taxid_rec]).reset_index().drop('index', 1)
      
    return(filtered)

In [6]:
data = pd.read_csv("../230827FilterFunc/bigprueba/27082023Test.tsv", sep="\t").rename(columns={"taxonID":"TaxId"})
data.TaxId = data.TaxId.astype("str")
rela = pd.read_csv("./taxonomy_relations_mini.tsv", sep='\t')
rela.TaxId = rela.TaxId.astype("str")
data.shape

(12300, 25)

## Prueba para "specie"

In [146]:
filtered = apply_GEO_filters(data, "specie")

Deleted SuperSeries
Deleted !array tech
Tax rel constructed
Can´t compare 1314 with 1314;9541
Can´t compare 1314 with 1314;9541
Can´t compare 1314 with 1314;9541
Can´t compare 1314 with 1314;9541
Can´t compare 1314 with 1314;9541
Can´t compare 1314 with 1314;9541
Can´t compare 1314 with 1314;9541
Can´t compare 1314 with 1314;9541
Can´t compare 1314 with 1314;9541
Can´t compare 1314 with 1314;9541
Can´t compare 1314 with 1314;9541
Can´t compare 1314 with 1314;9541
Can´t compare 1314 with 1314;9541
Can´t compare 1314 with 1314;9541
Can´t compare 1314 with 1314;9541
Can´t compare 1314 with 1314;9541
Can´t compare 1314 with 1314;9541
Can´t compare 1314 with 1314;9541
Can´t compare 1314 with 1314;9541
Can´t compare 1314 with 1314;9541
Can´t compare 1314 with 1314;9541
Can´t compare 1314 with 1314;9541
Can´t compare 1314 with 1314;9541
Can´t compare 1314 with 1314;9541
Can´t compare 1314 with 1314;9541
Can´t compare 1314 with 1314;9541
Can´t compare 1314 with 1314;9541
Can´t compare 1314 wit

In [11]:
rel = pd.read_csv("./taxonomyRelationsFunction.tsv", sep='\t')
rel.TaxId = rel.TaxId.astype("str")

In [18]:
mega = pd.merge(rel, filtered)[["TaxId", "ScientificName", "Specie", "GSM_taxid_ch1"]]
not_identical_taxids = {row.GSM_taxid_ch1 for i,row in mega.iterrows() if row.TaxId != row.GSM_taxid_ch1}

29

In [19]:
txid,gsm_species = zip(*[(id, get_scientific_name(id)) for id in not_identical_taxids])

In [24]:
mega.merge(pd.DataFrame.from_dict({"GSM_taxid_ch1":txid, "GSM_specie_ch1":gsm_species})).to_csv("notIdenticalTaxIDs.tsv", sep='\t')

## Prueba 1 para "sameTaxid"

In [None]:
apply_GEO_filters(data, "sameTaxId")

In [76]:
tax_rel = pd.read_csv("./taxonomyRelationsFunction2.tsv", sep="\t")
tax_rel.TaxId = tax_rel.TaxId.astype("str")
sameTaxid = [i for i,r in data.iterrows() if r.TaxId == r.GSM_taxid_ch1]
taxid_expl = data.iloc[sameTaxid]
fltrd = data.drop(sameTaxid)

recovered = [i for i,r in fltrd.iterrows() if r.TaxId == get_best_id(r, tax_rel)]
taxid_rec = fltrd.loc[recovered]

fltrd = pd.concat([taxid_expl, taxid_rec]).reset_index().drop('index', 1)

Can´t make txids 511145 588858 comparation
Can´t make txids 511145 588858 comparation
Can´t make txids 511145 588858 comparation
Can´t make txids 511145 588858 comparation
Can´t make txids 511145 588858 comparation
Can´t make txids 511145 588858 comparation
Can´t make txids 511145 588858 comparation
Can´t make txids 511145 588858 comparation
Can´t make txids 511145 83334 comparation
Can´t make txids 511145 879462 comparation
Can´t make txids 511145 879462 comparation
Can´t make txids 511145 879462 comparation
Can´t make txids 511145 879462 comparation
Can´t make txids 511145 879462 comparation
Can´t make txids 511145 879462 comparation
Can´t make txids 511145 879462 comparation
Can´t make txids 511145 879462 comparation
Can´t make txids 511145 879462 comparation
Can´t make txids 511145 879462 comparation
Can´t make txids 511145 879462 comparation
Can´t make txids 511145 879462 comparation
Can´t make txids 511145 879462 comparation
Can´t make txids 511145 879462 comparation
Can´t make t

In [90]:
fltrd.loc[[i for i,r in fltrd.iterrows() if r.GPL_taxid != r.GSM_taxid_ch1], ["TaxId","GPL_taxid", "GSM_taxid_ch1"]].set_index("TaxId").drop_duplicates()

Unnamed: 0_level_0,GPL_taxid,GSM_taxid_ch1
TaxId,Unnamed: 1_level_1,Unnamed: 2_level_1
224308,1423,224308
196627,442;562;1718;160488;224308,196627
196627,1718,196627
196627,442;562;1718;224308,196627
196627,303;442;562;1718;224308,196627
511145,83333,511145
511145,442;562;1718;224308,511145
511145,562,511145
511145,155864;199310;386585;511145,511145
511145,1392;99287;155864;199310;226185;226186;386585;...,511145


In [98]:
tax_rel[tax_rel.TaxId == "588858"]

Unnamed: 0.1,Unnamed: 0,TaxId,Rank,Specie,ScientificName


In [99]:
get_scientific_name(588858)

'Salmonella enterica subsp. enterica serovar Typhimurium str. 14028S'

### WTF
Algunos txids no fueron comparables (archivo txidfilter.txt)

In [104]:
wtf = {id : get_scientific_name(id) for id in ["588858", "83334", "879462", "588858", "9544", "1770", "1314;9541"]}

In [130]:
wtfks = {"588858", "83334", "879462", "588858", "9544", "1770", "1314;9541"}

In [106]:
wtfks

['588858', '83334', '879462', '588858', '9544', '1770', '1314;9541']

In [109]:
filter_set = {id for ids in data.GSM_taxid_ch1 for id in ids.split(';')}
best_set = {id for i,r in data.iterrows() for id in r.GSM_taxid_ch1.split(";")}

In [111]:
print(len(filter_set))
print(len(best_set))
print(len(best_set & filter_set))

76
76
76


In [112]:
filter_set = {id for ids in data.GPL_taxid for id in ids.split(';')}
best_set = {id for i,r in data.iterrows() for id in r.GPL_taxid.split(";")}
print(len(filter_set))
print(len(best_set))
print(len(best_set & filter_set))

86
86
86


In [114]:
for i in [get_taxonomy(id) for id in wtfks]:
    print(i)

('strain', 'Salmonella enterica', 'Salmonella enterica subsp. enterica serovar Typhimurium str. 14028S')
('species', 'Escherichia coli', 'Escherichia coli O157:H7')
('strain', 'Escherichia coli', 'Escherichia coli str. K-12 substr. MG1655star')
('strain', 'Salmonella enterica', 'Salmonella enterica subsp. enterica serovar Typhimurium str. 14028S')
('genus', 'Macaca mulatta', 'Macaca mulatta')
('species', 'Mycobacterium avium', 'Mycobacterium avium subsp. paratuberculosis')
('genus', 'Streptococcus pyogenes', 'Streptococcus pyogenes')


In [128]:
gsm_wtfks={id for i,row in data.iterrows() for id in row.GSM_taxid_ch1.split(';') if id in wtfks}
gpl_wtfks={id for i,row in data.iterrows() for id in row.GPL_taxid.split(';') if id in wtfks}

In [131]:
print(wtfks)
print(gsm_wtfks)
print(gpl_wtfks)

{'9544', '1314;9541', '879462', '83334', '588858', '1770'}
{'9544', '879462', '83334', '588858', '1770'}
{'588858', '879462', '9544'}


## Prueba 2 para "sameTaxid"

In [133]:
# Filtrado para quedarse con tax Ids idénticos
filtrado = apply_GEO_filters(data, "sameTaxId")

Deleted SuperSeries
Deleted !array tech
Request error in getting <built-in function id> taxonomy
Request error in getting <built-in function id> taxonomy
Request error in getting <built-in function id> taxonomy
Tax rel constructed
Explicit txid relation GSM saved
Can´t make txids 1314 1314;9541 comparation
Can´t make txids 1314 1314;9541 comparation
Can´t make txids 1314 1314;9541 comparation
Can´t make txids 1314 1314;9541 comparation
Can´t make txids 1314 1314;9541 comparation
Can´t make txids 1314 1314;9541 comparation
Can´t make txids 1314 1314;9541 comparation
Can´t make txids 1314 1314;9541 comparation
Can´t make txids 1314 1314;9541 comparation
Can´t make txids 1314 1314;9541 comparation
Can´t make txids 1314 1314;9541 comparation
Can´t make txids 1314 1314;9541 comparation
Can´t make txids 1314 1314;9541 comparation
Can´t make txids 1314 1314;9541 comparation
Can´t make txids 1314 1314;9541 comparation
Can´t make txids 1314 1314;9541 comparation
Can´t make txids 1314 1314;9541 

In [151]:
# Filtrado a nivel de especie 
filtered

Unnamed: 0,GSE_title,GSE_geo_accession,GSE_type,GSE_relation,TaxId,GSE_supplementary_file,GPL_title,GPL_geo_accession,GPL_technology,GPL_distribution,...,GSM_type,GSM_geo_accession,GSM_channel_count,GSM_source_name_ch1,GSM_taxid_ch1,GSM_organism_ch1,GSM_characteristics_ch1,GSM_taxid_ch2,GSM_characteristics_ch2,GSM_supplementary_file
0,Temporal transcriptome analysis of Bacillus su...,GSE190460,Expression profiling by array,BioProject: https://www.ncbi.nlm.nih.gov/biopr...,224308,geo/series/GSE190nnn/GSE190460/suppl/,BaSysBio Bacillus subtilis T3 400K array,GPL21981,in situ oligonucleotide,custom-commercial,...,RNA,GSM5724096,1,Bacillus subtilis NDmed,1423,Bacillus subtilis,"strain: NDmed;genotype: Wild-type, undomestica...",,,geo/samples/GSM5724nnn/GSM5724096/suppl/
1,Temporal transcriptome analysis of Bacillus su...,GSE190460,Expression profiling by array,BioProject: https://www.ncbi.nlm.nih.gov/biopr...,224308,geo/series/GSE190nnn/GSE190460/suppl/,BaSysBio Bacillus subtilis T3 400K array,GPL21981,in situ oligonucleotide,custom-commercial,...,RNA,GSM5724097,1,Bacillus subtilis NDmed,1423,Bacillus subtilis,"strain: NDmed;genotype: Wild-type, undomestica...",,,geo/samples/GSM5724nnn/GSM5724097/suppl/
2,Temporal transcriptome analysis of Bacillus su...,GSE190460,Expression profiling by array,BioProject: https://www.ncbi.nlm.nih.gov/biopr...,224308,geo/series/GSE190nnn/GSE190460/suppl/,BaSysBio Bacillus subtilis T3 400K array,GPL21981,in situ oligonucleotide,custom-commercial,...,RNA,GSM5724098,1,Bacillus subtilis NDmed,1423,Bacillus subtilis,"strain: NDmed;genotype: Wild-type, undomestica...",,,geo/samples/GSM5724nnn/GSM5724098/suppl/
3,Temporal transcriptome analysis of Bacillus su...,GSE190460,Expression profiling by array,BioProject: https://www.ncbi.nlm.nih.gov/biopr...,224308,geo/series/GSE190nnn/GSE190460/suppl/,BaSysBio Bacillus subtilis T3 400K array,GPL21981,in situ oligonucleotide,custom-commercial,...,RNA,GSM5724099,1,Bacillus subtilis NDmed,1423,Bacillus subtilis,"strain: NDmed;genotype: Wild-type, undomestica...",,,geo/samples/GSM5724nnn/GSM5724099/suppl/
4,Temporal transcriptome analysis of Bacillus su...,GSE190460,Expression profiling by array,BioProject: https://www.ncbi.nlm.nih.gov/biopr...,224308,geo/series/GSE190nnn/GSE190460/suppl/,BaSysBio Bacillus subtilis T3 400K array,GPL21981,in situ oligonucleotide,custom-commercial,...,RNA,GSM5724100,1,Bacillus subtilis NDmed,1423,Bacillus subtilis,"strain: NDmed;genotype: Wild-type, undomestica...",,,geo/samples/GSM5724nnn/GSM5724100/suppl/
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8890,"M1 and M6 serotypes, Wild-type vs. Mga- GAS",GSE1895,Expression profiling by array,BioProject: https://www.ncbi.nlm.nih.gov/biopr...,1314,geo/series/GSE1nnn/GSE1895/suppl/,"M1, M3, M18 GAS Array",GPL1482,spotted oligonucleotide,non-commercial,...,RNA,GSM33613,2,CsCl total RNA from M1 serotype GAS KSM165L,1314,Streptococcus pyogenes,,1314.0,,geo/samples/GSM33nnn/GSM33613/suppl/
8891,Wild-type and irr mutant Group A Streptococcus...,GSE1550,Expression profiling by array,BioProject: https://www.ncbi.nlm.nih.gov/biopr...,1314,,NIAID Group A Streptococcus,GPL1338,spotted DNA/cDNA,non-commercial,...,RNA,GSM26678,2,wt JRS4,160490,Streptococcus pyogenes M1 GAS,,160490.0,,
8892,Wild-type and irr mutant Group A Streptococcus...,GSE1550,Expression profiling by array,BioProject: https://www.ncbi.nlm.nih.gov/biopr...,1314,,NIAID Group A Streptococcus,GPL1338,spotted DNA/cDNA,non-commercial,...,RNA,GSM26679,2,wt JRS4,160490,Streptococcus pyogenes M1 GAS,,160490.0,,
8893,Wild-type and irr mutant Group A Streptococcus...,GSE1550,Expression profiling by array,BioProject: https://www.ncbi.nlm.nih.gov/biopr...,1314,,NIAID Group A Streptococcus,GPL1338,spotted DNA/cDNA,non-commercial,...,RNA,GSM26680,2,wt JRS4,160490,Streptococcus pyogenes M1 GAS,,160490.0,,


In [152]:
species = set(filtered.GSM_geo_accession)
identical = set(filtrado.GSM_geo_accession)

In [153]:
len(species)

8224

In [154]:
len(identical)

6985

In [155]:
len(identical & species) / len(identical)

1.0