In [357]:
import io
import os
import subprocess
import sys
from math import log2

import numpy as np
import pandas as pd
from Bio import AlignIO
from Bio import SeqIO
from Bio.Align import MultipleSeqAlignment
from Bio.Align.AlignInfo import SummaryInfo
from Bio.Align.Applications import MuscleCommandline

In [358]:
def new_histones(file_flag=None):
    """Open the five histone FASTA files stored under ``new_histones/``.

    Parameters
    ----------
    file_flag : {'r', 'w', None}, optional
        ``'r'`` or ``None`` opens the files for reading; ``'w'`` opens them
        for writing, which truncates any existing content.

    Returns
    -------
    list
        Open file objects in the order H1, H2A, H2B, H3, H4. The caller is
        responsible for closing every one of them.

    Raises
    ------
    ValueError
        If ``file_flag`` is not ``'r'``, ``'w'`` or ``None``.
    OSError
        If one of the files cannot be opened; handles that were opened
        before the failure are closed before the error propagates.
    """
    if file_flag is None or file_flag == 'r':
        mode = 'r'
    elif file_flag == 'w':
        mode = 'w'
    else:
        # Fail loudly: there is nothing sensible to return for an unknown
        # flag, and carrying on would only hit an UnboundLocalError below.
        raise ValueError(
            f"file_flag must be 'r', 'w' or None, got {file_flag!r}"
        )

    files = []
    try:
        for his_type in ('H1', 'H2A', 'H2B', 'H3', 'H4'):
            files.append(open(f'new_histones/{his_type}.fasta', mode))
    except OSError:
        # Do not leak the handles that were opened before the failure.
        for handle in files:
            handle.close()
        raise
    return files

In [359]:
def new_alignments(file_flag=None):
    """Open the five histone alignment FASTA files under ``new_alignments/``.

    Parameters
    ----------
    file_flag : {'r', 'w', None}, optional
        ``'r'`` or ``None`` opens the files for reading; ``'w'`` opens them
        for writing, which truncates any existing content.

    Returns
    -------
    list
        Open file objects in the order H1, H2A, H2B, H3, H4. The caller is
        responsible for closing every one of them.

    Raises
    ------
    ValueError
        If ``file_flag`` is not ``'r'``, ``'w'`` or ``None``.
    OSError
        If one of the files cannot be opened; handles that were opened
        before the failure are closed before the error propagates.
    """
    if file_flag is None or file_flag == 'r':
        mode = 'r'
    elif file_flag == 'w':
        mode = 'w'
    else:
        # Fail loudly: there is nothing sensible to return for an unknown
        # flag, and carrying on would only hit an UnboundLocalError below.
        raise ValueError(
            f"file_flag must be 'r', 'w' or None, got {file_flag!r}"
        )

    files = []
    try:
        for his_type in ('H1', 'H2A', 'H2B', 'H3', 'H4'):
            files.append(open(f'new_alignments/{his_type}.fasta', mode))
    except OSError:
        # Do not leak the handles that were opened before the failure.
        for handle in files:
            handle.close()
        raise
    return files

In [360]:
def muscle_aln(accessions, options=None, debug=False):
    """
    Align sequences by piping them through the ``muscle`` executable.

    The entries of ``accessions`` are joined with newlines and written to
    the standard input of ``muscle``; whatever ``muscle`` prints on standard
    output is parsed as FASTA and wrapped in an alignment object.

    accessions: iterable of strings that together form the muscle input
        (presumably FASTA-formatted records, since the output is parsed as
        FASTA - confirm against callers)
    options: optional sequence of extra command-line arguments for muscle
    debug: when true, print the input sent to muscle and its stderr/stdout
    :return: MultipleSeqAlignment object
    :raises RuntimeError: if muscle exits with a non-zero status
    """
    # muscle is looked up next to the running Python interpreter.
    muscle = os.path.join(os.path.dirname(sys.executable), "muscle")
    extra_args = [] if options is None else list(options)
    process = subprocess.Popen(
        [muscle] + extra_args,
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )

    sequences = "\n".join(accessions)
    aln, error = process.communicate(sequences.encode('utf-8'))
    if debug:
        print(sequences)
        print()
        print("Stderr:")
        print(error.decode('utf-8'))
        print("Stdout:")
        print(aln.decode('utf-8'))

    # A failed run would otherwise surface as a silently empty alignment.
    if process.returncode != 0:
        raise RuntimeError(
            f"muscle exited with status {process.returncode}: "
            f"{error.decode('utf-8', errors='replace')}"
        )

    seqFile = io.StringIO(aln.decode('utf-8'))
    sequences = list(SeqIO.parse(seqFile, "fasta"))  # Not in same order, but does it matter?
    msa = MultipleSeqAlignment(sequences)
    return msa

In [361]:
# Build a MUSCLE alignment and a consensus sequence for every histone type.
# The consensus sequences are collected in cons_list and written to the
# 'consensus' file; the alignments are written under new_alignments/.
his_types = ['H1', 'H2A', 'H2B', 'H3', 'H4']
cons_list = []

# Every handle opened here is tracked so that these very handles are closed
# afterwards; closing the write handles is what flushes the alignments to disk.
open_handles = []
try:
    histone_files = new_histones('r')
    open_handles.extend(histone_files)
    alignment_files = new_alignments('w')
    open_handles.extend(alignment_files)

    with open('consensus', 'w') as cons_file:
        for his_type, file_in, file_out in zip(his_types, histone_files, alignment_files):
            # Drop duplicate records while keeping file order, so the input
            # handed to muscle is the same on every run (a set of strings
            # iterates in a different order in each kernel session).
            records = list(dict.fromkeys(
                record.format('fasta') for record in SeqIO.parse(file_in, 'fasta')
            ))
            alignment = muscle_aln(records)
            cons = SummaryInfo(alignment).dumb_consensus(ambiguous='-', threshold=0.1)
            print(f'>{his_type}\n{cons}', file=cons_file)
            cons_list.append(cons)
            AlignIO.write(alignment, file_out, 'fasta')
finally:
    for handle in open_handles:
        handle.close()

In [351]:
# Display the consensus sequences collected in cons_list (one per histone type).
cons_list

[Seq('M-----EMAD-MEMMMSETAPVAA-AAAAP-MGEEEELPEEDEEEMEEDEEEDR...ESP', SingleLetterAlphabet()),
 Seq('MWLCDWAK----------K--D-N-ST---T----LLKVT-VSL-SP---MMMS...AN-', SingleLetterAlphabet()),
 Seq('MIPGKP----GS---P------A-DV---------MMPEAEKSPA-----E-SK...GCG', SingleLetterAlphabet()),
 Seq('MADDTPIIEEIAEQNESVTRIMQRLKHDMQRVTSVPGFNTSAAGV---------...GGT', SingleLetterAlphabet()),
 Seq('MFDVFGRDKGGKVLDKGRAKRHPKVLRGNIQGITKPAISRLARRSGVKRISGLI...FGG', SingleLetterAlphabet())]