In [299]:
import pandas as pd
from Bio import SeqIO

### Store Fasta into Dictionary
# hsp65 and rpoB records are keyed by the full description line,
# 16S records by the record id only. Values are the sequences as plain strings.
hsp_dict = dict(
    (record.description, str(record.seq))
    for record in SeqIO.parse('../Databases/hsp65/hsp65_region.fasta', "fasta")
)
rpo_dict = dict(
    (record.description, str(record.seq))
    for record in SeqIO.parse('../Databases/rpoB/rpoB_region.fasta', "fasta")
)
r16s_dict = dict(
    (record.id, str(record.seq))
    for record in SeqIO.parse('../Databases/16S_rrna/16S_Edited.fasta', "fasta")
)

In [300]:
## HSP
## Convert Fasta Dictionary to Dataframe
df = pd.DataFrame.from_dict(hsp_dict, orient='index')
df.columns = ["Sequence_hsp"]

## Parse IDs and Sequence Name
## The description is split on spaces: word 0 is the record ID and
## words 1-2 are joined with '_' to form the species label.
df['IDs'] = [x.split(' ')[0] for x in df.index]
df['Species'] = ['_'.join(x.split(' ')[1:3]) for x in df.index]

## Filter Multicolumns (labels that contain '|')
## regex=False matches the literal '|' (the former "\|" pattern relied on an
## invalid string escape). .copy() makes both subsets independent frames, so
## the column assignment below does not raise SettingWithCopyWarning.
is_shared = df['Species'].str.contains('|', regex=False)
df2 = df[is_shared].copy()
df = df[~is_shared].copy()
df2['Species'] = [x.split('|') for x in df2['Species']]

## Create Function to Create Duplicate Rows
## Positional snapshots of the shared records (same row order as df2).
shared_sequences = list(df2['Sequence_hsp'])
shared_ids = list(df2['IDs'])
length_of_dataframe = len(df2)
def new_row_species(list_of_list):
    """Expand shared records into one row per species name.

    Parameters
    ----------
    list_of_list : pandas.DataFrame
        Frame with a 'Species' column whose cells are iterables of name
        fragments (in this notebook: the pieces obtained by splitting the
        label on '|').

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns as the input. Every fragment
        produces one row that copies the other columns of its source row,
        has 'Species' set to 'Mycobacterium_' + fragment, and is indexed by
        that name.

    Notes
    -----
    Values are taken from the row being expanded instead of the notebook
    globals ``shared_sequences`` / ``shared_ids``, so the result does not
    depend on which cell assigned those globals last. The input frame is
    left unmodified, and when a name occurs in several source rows each
    occurrence keeps the values of its own source row.
    """
    columns = list(list_of_list.columns)
    species_pos = columns.index('Species')

    new_rows = []
    new_names = []
    for record in list_of_list.itertuples(index=False, name=None):
        for this in record[species_pos]:
            new_name = 'Mycobacterium_' + this
            row = list(record)
            row[species_pos] = new_name
            new_rows.append(row)
            new_names.append(new_name)

    # Build the result once instead of enlarging a frame row by row.
    return pd.DataFrame(new_rows, index=new_names, columns=columns)

df3 = new_row_species(df2)
## Check if Species has more than one record (the first record per species is kept)
## pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
hsp = (
    pd.concat([df, df3], ignore_index=True)
    .drop_duplicates(['Species'], keep='first')
    .set_index('Species')
    .drop('IDs', axis=1)
)
## Remove leading/trailing '-' characters from every sequence
hsp['Sequence_hsp'] = hsp['Sequence_hsp'].str.strip('-')

In [301]:
## rpoB
## Convert Fasta Dictionary to Dataframe
df = pd.DataFrame.from_dict(rpo_dict, orient='index')
df.columns = ["Sequence_rpob"]

## Parse IDs and Sequence Name
## The description is split on spaces: word 0 is the record ID and
## words 1-2 are joined with '_' to form the species label.
df['IDs'] = [x.split(' ')[0] for x in df.index]
df['Species'] = ['_'.join(x.split(' ')[1:3]) for x in df.index]

## Filter Multicolumns (labels that contain '|')
## regex=False matches the literal '|' (the former "\|" pattern relied on an
## invalid string escape). .copy() makes both subsets independent frames, so
## the column assignment below does not raise SettingWithCopyWarning.
is_shared = df['Species'].str.contains('|', regex=False)
df2 = df[is_shared].copy()
df = df[~is_shared].copy()
df2['Species'] = [x.split('|') for x in df2['Species']]

## Create Function to Create Duplicate Rows
## Positional snapshots of the shared records (same row order as df2).
shared_sequences = list(df2['Sequence_rpob'])
shared_ids = list(df2['IDs'])
length_of_dataframe = len(df2)
def new_row_species(list_of_list):
    """Expand shared records into one row per species name.

    Parameters
    ----------
    list_of_list : pandas.DataFrame
        Frame with a 'Species' column whose cells are iterables of name
        fragments (in this notebook: the pieces obtained by splitting the
        label on '|').

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns as the input. Every fragment
        produces one row that copies the other columns of its source row,
        has 'Species' set to 'Mycobacterium_' + fragment, and is indexed by
        that name.

    Notes
    -----
    Values are taken from the row being expanded instead of the notebook
    globals ``shared_sequences`` / ``shared_ids``, so the result does not
    depend on which cell assigned those globals last. The input frame is
    left unmodified, and when a name occurs in several source rows each
    occurrence keeps the values of its own source row.
    """
    columns = list(list_of_list.columns)
    species_pos = columns.index('Species')

    new_rows = []
    new_names = []
    for record in list_of_list.itertuples(index=False, name=None):
        for this in record[species_pos]:
            new_name = 'Mycobacterium_' + this
            row = list(record)
            row[species_pos] = new_name
            new_rows.append(row)
            new_names.append(new_name)

    # Build the result once instead of enlarging a frame row by row.
    return pd.DataFrame(new_rows, index=new_names, columns=columns)

df3 = new_row_species(df2)
## Check if Species has more than one record (the first record per species is kept)
## pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
rpo = (
    pd.concat([df, df3], ignore_index=True)
    .drop_duplicates(['Species'], keep='first')
    .set_index('Species')
    .drop('IDs', axis=1)
)
## Remove leading/trailing '-' characters from every sequence
rpo['Sequence_rpob'] = rpo['Sequence_rpob'].str.strip('-')

In [302]:
## 16s
## Convert Fasta Dictionary to Dataframe
df = pd.DataFrame.from_dict(r16s_dict, orient='index')
df.columns = ["Sequence_16s"]

## Parse IDs and Sequence Name
## The 16S dictionary is keyed by record id, which is used directly as the species label.
df['Species'] = df.index

## Filter Multicolumns (labels that contain '|')
## regex=False matches the literal '|' (the former "\|" pattern relied on an
## invalid string escape). .copy() makes both subsets independent frames, so
## the column assignment below does not raise SettingWithCopyWarning.
is_shared = df['Species'].str.contains('|', regex=False)
df2 = df[is_shared].copy()
df = df[~is_shared].copy()
df2['Species'] = [x.split('|') for x in df2['Species']]

## Create Function to Create Duplicate Rows
## Positional snapshot of the shared records (same row order as df2).
shared_sequences = list(df2['Sequence_16s'])
length_of_dataframe = len(df2)
def new_row_species(list_of_list):
    """Expand shared records into one row per species name.

    Parameters
    ----------
    list_of_list : pandas.DataFrame
        Frame with a 'Species' column whose cells are iterables of name
        fragments (in this notebook: the pieces obtained by splitting the
        label on '|').

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns as the input. Every fragment
        produces one row that copies the other columns of its source row,
        has 'Species' set to 'Mycobacterium_' + fragment, and is indexed by
        that name.

    Notes
    -----
    Values are taken from the row being expanded instead of the notebook
    global ``shared_sequences``, so the result does not depend on which
    cell assigned that global last. The input frame is left unmodified,
    and when a name occurs in several source rows each occurrence keeps
    the values of its own source row.
    """
    columns = list(list_of_list.columns)
    species_pos = columns.index('Species')

    new_rows = []
    new_names = []
    for record in list_of_list.itertuples(index=False, name=None):
        for this in record[species_pos]:
            new_name = 'Mycobacterium_' + this
            row = list(record)
            row[species_pos] = new_name
            new_rows.append(row)
            new_names.append(new_name)

    # Build the result once instead of enlarging a frame row by row.
    return pd.DataFrame(new_rows, index=new_names, columns=columns)

df3 = new_row_species(df2)
## Check if Species has more than one record (the first record per species is kept)
## pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
r16s = (
    pd.concat([df, df3], ignore_index=True)
    .drop_duplicates(['Species'], keep='first')
    .set_index('Species')
)
## Remove leading/trailing '-' characters from every sequence
r16s['Sequence_16s'] = r16s['Sequence_16s'].str.strip('-')

In [303]:
# Keep only the species present in both the rpoB and hsp65 tables
# (column-wise concat with an inner join on the index).
# The 16S table (r16s) is currently not part of the join.
marker_tables = [rpo, hsp]
overlapping_species = pd.concat(marker_tables, axis=1, join='inner')

In [304]:
# Preview the first five rows of the joined table.
overlapping_species.head(n=5)

Unnamed: 0_level_0,Sequence_rpob,Sequence_hsp
Species,Unnamed: 1_level_1,Unnamed: 2_level_1
Mycobacterium_florentinum,GCCCGTGAGGTCCGCGACACCTCGCTGAAGGTGCCGCACGGTGAGT...,GAGGACCCGTACGAGAAGATCGGCGCCGAGCTGGTCAAGGAAGTTG...
Mycobacterium_fortuitum,ATCTTCGGTGAGAAGGCCCGCGAGGTTCGTGACACCTCGCTGAAGG...,GAGGACCCGTACGAGAAGATCGGCGCTGAGCTCGTCAAAGAGGTCG...
Mycobacterium_stomatepiae,ATCTTCGGTGAGAAGGCCCGCGAGGTCCGCGACACCTCGCTGAAGG...,GAGGACCCATACGAGAAGATCGGCGCCGAGCTGGTCAAGGAAGTCG...
Mycobacterium_lentiflavum,ATCTTCGGTGAGAAGGCCCGCGAGGTGCGCGACACCTCGCTGAAGG...,GAGGACCCGTACGAGAAGATCGGCGCCGAGCTGGTCAAGGAAGTTG...
Mycobacterium_kubicae,ATCTTCGGTGAGAAGGCCCGCGAGGTCCGCGACACGTCGCTCAAGG...,GAGGACCCGTACGAGAAGATCGGCGCCGAGCTGGTCAAGGAAGTCG...


In [305]:
# Build one sequence per species: rpoB first, hsp65 appended.
# (The 16S sequence is currently not appended.)
rpob_then_hsp = overlapping_species['Sequence_rpob'] + overlapping_species['Sequence_hsp']
overlapping_species['Concatenated_sequences'] = rpob_then_hsp

# Drop the per-marker columns so only the concatenated column remains.
for marker_column in ('Sequence_rpob', 'Sequence_hsp'):
    del overlapping_species[marker_column]

# Remove leading/trailing '-' characters from the joined sequence.
overlapping_species['Concatenated_sequences'] = overlapping_species['Concatenated_sequences'].str.strip('-')
overlapping_species.head()

Unnamed: 0_level_0,Concatenated_sequences
Species,Unnamed: 1_level_1
Mycobacterium_florentinum,GCCCGTGAGGTCCGCGACACCTCGCTGAAGGTGCCGCACGGTGAGT...
Mycobacterium_fortuitum,ATCTTCGGTGAGAAGGCCCGCGAGGTTCGTGACACCTCGCTGAAGG...
Mycobacterium_stomatepiae,ATCTTCGGTGAGAAGGCCCGCGAGGTCCGCGACACCTCGCTGAAGG...
Mycobacterium_lentiflavum,ATCTTCGGTGAGAAGGCCCGCGAGGTGCGCGACACCTCGCTGAAGG...
Mycobacterium_kubicae,ATCTTCGGTGAGAAGGCCCGCGAGGTCCGCGACACGTCGCTCAAGG...


In [306]:
# Write one FASTA record per species in overlapping_species.
with open("Concatenated_Database_Sequences.fasta", "w") as out_handle:
    # Iterate label/value pairs of the table being written so each header is
    # the species that owns the sequence. The previous code took the name
    # from `df`, a leftover frame from an earlier cell whose index does not
    # correspond to overlapping_species.
    for name, sequence in overlapping_species['Concatenated_sequences'].items():
        # Trim stray bracket/quote/newline characters and leading/trailing '-'.
        sequence = sequence.strip('[').strip(']').strip("'").strip("\n").strip("-")
        out_handle.write(">%s\n%s\n" % (name, sequence))

In [307]:
!sed 's/-//g' Concatenated_Database_Sequences.fasta > Concatenated_Database_Sequences_no_dash_only_two.fasta