# Collect 16S sequences from NCBI

In [1]:
# https://biopython.org/docs/latest/Tutorial/chapter_entrez.html#sec-entrez-webenv
# Start by importing the required packages, and set an e-mail address so that NCBI knows who is making the requests
# Requires Biopython to be installed

import pandas as pd
from Bio import Entrez
from Bio import SeqIO
Entrez.email = 'clara.nordquist.1217@student.uu.se'

In [2]:
# Read the species to fetch sequences for. Because explicit column names are
# passed, the first row of the file is treated as data, not as a header.
species_csv = '/Users/claranordquist/Documents/Universitetet/HT24/Tillämpad bioinformatik/Applied-bioinformatics/Collect sequences/Test data/Species_to_identify_species_and_genus.csv'
organism_dataset = pd.read_csv(species_csv, names=['species'])

# Plain Python list of the values in the first column
organisms = list(organism_dataset.iloc[:, 0])

In [2]:
# Short hard-coded species list; assigning it here replaces the list read from the CSV file
organisms = [
    'Fannyhessea vaginae',
    'Gardnerella vaginalis',
    'Sneathia vaginalis',
]

In [None]:
# For every organism, search the NCBI nucleotide database for RefSeq 16S
# sequences between 1400 and 1650 bp, then download the first `max_sequences`
# hits. Sequences go to All_given_species_genus.fasta (FASTA format) and one
# comma-separated taxonomy line per record goes to Taxonomy.txt.

# Maximum number of records to download per organism
max_sequences = 2

# Organisms whose search returned no hits
not_found = []

# `with` guarantees both files are closed even if a request raises midway
with open('All_given_species_genus.fasta', 'w') as output_fasta, \
        open('Taxonomy.txt', 'w') as output_taxonomy:
    output_taxonomy.write('ID, Kingdom, Phylum, Class, Order, Family, Genus, Species\n')

    for organism in organisms:
        # Built on a single line with balanced parentheses (the previous
        # multi-line term opened two groups but closed only one).
        search_term = (
            f'({organism}[Organism] AND 16S[All Fields]) '
            'AND bacteria[filter] '
            'AND refseq[filter] AND ("1400"[SLEN] : "1650"[SLEN])'
        )

        # First, search the database. usehistory='y' makes the result
        # available to efetch through WebEnv/QueryKey.
        stream = Entrez.esearch(db='nucleotide', term=search_term, usehistory='y', idtype='acc')
        try:
            search_results = Entrez.read(stream)
        finally:
            stream.close()
        acc_list = search_results['IdList']
        webenv = search_results['WebEnv']
        query_key = search_results['QueryKey']

        # efetch in text mode always yields a str, so a type check on the
        # downloaded data can never detect a missing organism. An empty hit
        # list is the actual "not found" signal; skip the downloads then.
        if not acc_list:
            not_found.append(organism)
            continue

        # Download the sequences and append them to the FASTA output file
        stream = Entrez.efetch(
            db='nucleotide', rettype='fasta', retmode='text', retmax=max_sequences,
            webenv=webenv, query_key=query_key, idtype='acc')
        try:
            data = stream.read()
        finally:
            stream.close()
        output_fasta.write(data)

        # Download the same records in GenBank format to read their taxonomy
        stream = Entrez.efetch(
            db='nucleotide', rettype='gb', retmode='text', retmax=max_sequences,
            webenv=webenv, query_key=query_key, idtype='acc')
        try:
            for record in SeqIO.parse(stream, 'genbank'):
                lineage = record.annotations['taxonomy']
                # Fall back to the queried name if the record has no organism annotation
                species = record.annotations.get('organism', organism)
                # Write plain comma-separated fields to match the header
                # (previously the lineage was written as a Python list repr
                # and the Species column was never filled).
                # NOTE(review): lineage depth can differ between records, so
                # the columns only line up with the header when the lineage
                # has exactly six ranks - confirm for downstream parsing.
                output_taxonomy.write(', '.join([record.id, *lineage, species]) + '\n')
        finally:
            stream.close()

## Download the 16S rRNA sequences

In [None]:
# Define how many sequences to fetch for each species; the output file name
# is given in the open() call below.
max_sequences = 2

# Species for which the search returned no hits
not_found = []

# For each organism in the list, search NCBI (RefSeq) for 16S rRNA genes with a
# length between 1400 and 1650 bp. Save the first `max_sequences` hits in FASTA
# format in the output file. Organisms without any hit are collected in
# `not_found` instead.
with open('All_given_species_genus.fasta', 'w') as output:
    for organism in organisms:
        # Built on a single line with balanced parentheses (the previous
        # multi-line term opened two groups but closed only one).
        search_term = (
            f'({organism}[Organism] AND 16S[All Fields]) '
            'AND bacteria[filter] '
            'AND refseq[filter] AND ("1400"[SLEN] : "1650"[SLEN])'
        )

        # usehistory='y' makes the result available to efetch through
        # WebEnv/QueryKey.
        stream = Entrez.esearch(db='nucleotide', term=search_term, usehistory='y', idtype='acc')
        try:
            search_results = Entrez.read(stream)
        finally:
            stream.close()
        acc_list = search_results['IdList']
        webenv = search_results['WebEnv']
        query_key = search_results['QueryKey']

        # efetch in text mode always yields a str, so a type check on the
        # downloaded data can never detect a missing organism. An empty hit
        # list is the actual "not found" signal; skip the download then.
        if not acc_list:
            not_found.append(organism)
            continue

        stream = Entrez.efetch(
            db='nucleotide', rettype='fasta', retmode='text', retmax=max_sequences,
            webenv=webenv, query_key=query_key, idtype='acc')
        try:
            data = stream.read()
        finally:
            stream.close()
        output.write(data)

['NZ_KQ961074.1', 'NZ_LFWE01000020.1', 'NZ_AEDQ01000028.1']
['NZ_NQOJ01000020.1', 'NZ_JASOME010000020.1', 'NZ_JASOOP010000223.1', 'NZ_JASOOP010000222.1', 'NZ_JASOOP010000221.1', 'NZ_JASOOP010000220.1', 'NZ_JASOOP010000219.1', 'NZ_JASOOP010000218.1', 'NZ_JASOOP010000217.1', 'NZ_JASOOP010000216.1', 'NZ_JASOOP010000215.1', 'NZ_JASOOP010000214.1', 'NZ_JASOOP010000213.1', 'NZ_JASOOP010000212.1', 'NZ_JASOOP010000211.1', 'NZ_JASOOP010000210.1', 'NZ_LSLG01000029.1', 'NZ_LWSP01000263.1', 'NZ_LWSP01000262.1', 'NZ_LWSP01000261.1']
['NZ_JBHLSP010000031.1', 'NZ_JBHLSP010000030.1', 'NZ_JBHMQL010000026.1', 'NZ_JBHMQL010000008.1', 'NZ_JBHMQK010000031.1', 'NZ_JBHMQK010000030.1', 'NZ_JASSPO010000056.1', 'NZ_JASSPO010000055.1']
