In [4]:
from Bio import SeqIO
import pandas as pd

In [5]:
def getKmers(sequence, size=6):
    """Split a sequence into overlapping, lower-cased k-mers.

    A window of ``size`` characters slides over ``sequence`` one position
    at a time; each window is lower-cased and collected in order.

    Args:
        sequence: The string to slice.
        size: Window (k-mer) length. Defaults to 6.

    Returns:
        A list of ``len(sequence) - size + 1`` k-mers, or an empty list
        when the sequence is shorter than ``size``.
    """
    window_count = len(sequence) - size + 1
    kmers = []
    for start in range(window_count):
        kmers.append(sequence[start:start + size].lower())
    return kmers

def get_seq_df(file_name):
    """Read a FASTA file into a DataFrame with one row per record.

    Args:
        file_name: Path of the FASTA file to read.

    Returns:
        A DataFrame with two columns: ``names`` (each record's ``id``) and
        ``sequences`` (each record's sequence converted to ``str``), in
        file order.
    """
    names = []
    sequences = []

    # Parse inside a context manager so the file handle is always closed,
    # even if parsing fails part-way through the file.
    with open(file_name) as handle:
        for record in SeqIO.parse(handle, 'fasta'):
            names.append(record.id)
            sequences.append(str(record.seq))

    # Build the frame in one step instead of adding columns to an empty frame.
    return pd.DataFrame({'names': names, 'sequences': sequences})

def get_hexamers(df, size = 6):
    """Return a copy of ``df`` with k-mer columns derived from ``sequences``.

    Args:
        df: DataFrame containing a ``sequences`` column of strings.
        size: k-mer length passed to ``getKmers``. Defaults to 6; the output
            column names stay ``hexamers`` / ``hex_words`` for any value.

    Returns:
        A new DataFrame (the input is not modified) with two extra columns:
        ``hexamers``, the list of overlapping k-mers for each sequence, and
        ``hex_words``, those k-mers joined by single spaces.
    """
    result = df.copy()

    kmer_lists = result['sequences'].apply(lambda seq: getKmers(seq, size = size))
    result['hexamers'] = kmer_lists

    # One space-separated "sentence" of k-mers per sequence.
    result['hex_words'] = kmer_lists.apply(lambda kmers: ' '.join(kmers))

    return result

In [57]:
# Load every record of the coding-region FASTA file into a DataFrame
# with 'names' (record ids) and 'sequences' columns.
fasta = get_seq_df('sequences_coding_region.fasta')

In [59]:
# The parsing below assumes each record id has the form
# "<accession>:<start>..<stop>", optionally prefixed with "join(" and followed
# by further comma-separated segments; only the first segment is kept.
fasta['coding_region'] = fasta['names'].map(lambda record_id: record_id.split(':')[1].split(',')[0])
fasta['accession'] = fasta['names'].map(lambda record_id: record_id.split(':')[0].replace('join(', ''))

# A '<' on the start or '>' on the stop is stripped before the integer
# conversion (presumably partial-location markers; confirm against the data).
fasta['start_position'] = fasta['coding_region'].map(lambda region: int(region.split('..')[0].replace('<', '')))
fasta['stop_position'] = fasta['coding_region'].map(lambda region: int(region.split('..')[1].replace('>', '')))

In [60]:
# Isolate just the N gene sequences by position: keep records whose start
# coordinate lies strictly between 28000 and 29000.
ngene = fasta.query('start_position > 28000 and start_position < 29000')

In [61]:
# Total sequences, counted as the number of distinct accessions in the filtered set.
ngene['accession'].nunique(dropna=False)

41536

In [62]:
# Add the 'hexamers' (overlapping 6-mers) and 'hex_words' (space-joined 6-mers)
# columns; get_hexamers works on a copy, so the filtered frame is not mutated.
ngene = get_hexamers(ngene)

In [63]:
# Save the k-mer table to disk; with to_csv's defaults the DataFrame index
# is written as the first CSV column.
ngene.to_csv('ngene.csv')