In [2]:
import numpy as np
import pandas as pd
#Regular expression
import re

from Bio import SeqIO
from Bio.SeqIO import FastaIO
from Bio import Entrez
from Bio import SeqIO


## Find reference genomes with match

In [3]:
# Load the picked-colony table (label + cluster assignment) from the Excel sheet.
samples = pd.read_excel(
    'SecondRound_16rRNAColonies.xlsx',
    index_col=None,
)
samples.head(1)

Unnamed: 0,label,cluster
0,191123-29-5,1


In [5]:
# The first line of the OTU table has to be deleted by hand before running this cell.
# NOTE(review): the preview row looks shifted by one column relative to the
# header names -- confirm against the raw file.
reads = pd.read_csv(
    './round2_0620/r2_clusters_020520---ssu---otus.csv',
    sep='\t',
).reset_index(drop=True)  # replace whatever index was parsed with a plain 0..n-1 range
reads.head(1)

Unnamed: 0,sample name,cluster id,cluster acc,# sequences,avg seq. ident.,similarity,sequence,reference,classifications
0,39,R2-32-4_27f_16S_o0520_27f,1,100.0,99.09,CGGCAGCCGCGGGGAGCAATCCTGGCGGCGAGTGGCGAACGGGTGA...,JX644205.1.1461,ncbi|132|77133|root;cellular organisms;Bacteri...,silva|132|26462|Bacteria;Proteobacteria;Gammap...


#### Get IDs to match
Names are different in the two datasets; to merge the data I need to modify the labels until they match in both.

In [6]:
# Labels of the picked colonies (first column of the clustering table).
idSamples = samples.iloc[:, 0].values
# Keep only the 2nd and 3rd dash-separated fields, e.g. '191123-29-5' -> '29-5'.
ids = [
    fields[1] + '-' + fields[2]
    for fields in (label.split('-') for label in idSamples)
]
ids[0]

'29-5'

In [7]:
# Rebuild the sample table with the shortened labels next to their cluster numbers.
d = {
    'label': ids,
    'cluster': samples["cluster"].values,
}
dfSam = pd.DataFrame(d)
dfSam.head(1)

Unnamed: 0,label,cluster
0,29-5,1


In [8]:
# Sample names from the sequencing run (second column of the reads table).
idSeq = reads.iloc[:, 1]
# Take the text before the first underscore, then keep its 2nd and 3rd
# dash-separated fields, e.g. 'R2-32-4_27f_16S_o0520_27f' -> '32-4'.
idsSeq = [
    fields[1] + '-' + fields[2]
    for fields in (name.split('_')[0].split('-') for name in idSeq)
]
idsSeq[0]

'32-4'

In [9]:
# Split each '<accession>.<start>.<stop>' entry of the 'sequence' column into
# the genome accession and its integer start/stop positions.
sequence = reads['sequence'].values
parts = [entry.split('.') for entry in sequence]

genomeID = [p[0] for p in parts]
start = [int(p[1]) for p in parts]
end = [int(p[2]) for p in parts]

In [10]:
# Sequencing-side table: shortened label, genome accession, start/stop
# positions, and the reference annotation taken from the reads table.
d = {
    'label': idsSeq,
    'sequence': genomeID,
    'start': start,
    'end': end,
    'reference': reads["reference"].values,
}
dfSeq = pd.DataFrame(d)
dfSeq.head(1)

Unnamed: 0,label,sequence,start,end,reference
0,32-4,JX644205,1,1461,ncbi|132|77133|root;cellular organisms;Bacteri...


In [11]:
# Join picked colonies with their sequencing hits on the shared short label
# (default inner join: only labels present in both tables are kept).
df = dfSam.merge(dfSeq, on='label')
df.head(1)

Unnamed: 0,label,cluster,sequence,start,end,reference
0,25-19,1,AWQW01000275,1,1268,ncbi|132|1352941|root;cellular organisms;Bacte...


In [12]:
# Persist the merged label/cluster/accession table next to the notebook.
df.to_csv(path_or_buf='R2allsequencesIDs.csv')

## Get genome

In [13]:
# Download the FASTA record of every reference accession from NCBI nuccore
# and save it as '<accession>.fasta' in the working directory.
Entrez.email = "dgarcia@eng.au.dk"

# Column 2 of the merged frame is 'sequence' (the accession). Several colonies
# can hit the same reference, so drop duplicates (order preserved) to avoid
# fetching and overwriting the same file repeatedly.
accessions = dict.fromkeys(acc.split('.')[0] for acc in df.iloc[:, 2])

for ref in accessions:
    net_handle = Entrez.efetch(db="nuccore", id=ref, rettype="fasta", retmode="text")
    try:
        # Read everything first so a failed transfer does not leave an empty file.
        fasta_text = net_handle.read()
    finally:
        # Always release the network handle, even if the read raised.
        net_handle.close()
    with open(ref + '.fasta', "w") as out_handle:
        out_handle.write(fasta_text)

### Get sequence to align: 16SrRNA

In [15]:
# Cut the matched region out of each downloaded genome FASTA and write it to
# '<accession>16SrRNA.fasta'. start/end come from the '<accession>.<start>.<stop>'
# identifiers, where start is 1-based (e.g. 'JX644205.1.1461'), so the Python
# slice has to begin at start - 1 to keep the first base.
for g, s, e in zip(genomeID, start, end):
    file = g+'.fasta'
    outfile = g+'16SrRNA.fasta'
    try:
        # Parse before opening the output so a missing or unreadable input
        # does not leave an empty output file behind.
        records = list(SeqIO.parse(file, "fasta"))
    except (OSError, ValueError) as err:
        # Best effort: not every accession in the reads table has a downloaded
        # FASTA; report it and move on instead of silently swallowing everything.
        print("Skipping " + file + ": " + str(err))
        continue
    with open(outfile, 'w') as f:
        for seq_record in records:
            f.write(">"+str(seq_record.id) + "\n")
            f.write(str(seq_record.seq[s-1:e]) + "\n")  # bases start..end, inclusive