In [1]:
from Bio import SeqIO
from Bio import Entrez
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Alphabet import IUPAC

# from operator import itemgetter #, attrgetter
import copy, os
from math import pow
#import random, sys
import numpy as np
import re
from Bio.Align.Applications import MuscleCommandline
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
%matplotlib inline 

In [2]:
class Basic(object):
    """Small toolbox of logarithm constants, sequence alphabets and a
    quote-stripping helper used by the sequence analysis code.
    """

    def __init__(self):
        # log2(20), computed through a change of base from log10.
        self.log202 = np.log10(20) / np.log10(2)
        self.log2 = np.log10(2)
        self.ln2 = np.log(2)
        self.inv_ln2 = 1./self.ln2
        self.ln2_2 = self.ln2 ** 2
        # NOTE(review): the original comment claimed "=1/LN(2)", but
        # 1/ln(2) is ~1.442695, not this value. The number is kept
        # unchanged; confirm where 1.417199516 comes from.
        self.convShannonTsallis = 1.417199516

    def getDnaNucleotides(self):
        """Return the DNA alphabet as a list of one-letter codes."""
        return ['A','T','G','C']

    def getDnaNucleotideString(self):
        """Return the DNA alphabet as a single string."""
        return 'ATGC'

    def getRnaNucleotides(self):
        """Return the RNA alphabet as a list of one-letter codes."""
        return ['A','U','G','C']

    def getRnaNucleotideString(self):
        """Return the RNA alphabet as a single string."""
        return 'AUGC'

    # Original (inconsistently cased) name, kept so existing callers work.
    getrnaNucleotideString = getRnaNucleotideString

    # in Hydropathy index[95] http://en.wikipedia.org/wiki/Amino_acid
    def getSeqAA(self):
        """Return the 20 amino-acid one-letter codes as a list.

        The order is the one given in the original source (per its note,
        taken from the hydropathy index table on Wikipedia).
        """
        return ['A', 'M', 'C', 'F', 'L', 'V', 'I', 'G', 'T', 'S', 'W', 'Y', 'P', 'H', 'N', 'D', 'E', 'Q', 'K', 'R']

    def getStringAA(self):
        """Return the same amino-acid codes as ``getSeqAA`` as one string."""
        return 'AMCFLVIGTSWYPHNDEQKR'

    def getAaPos(self, string, aa):
        """Return the index of ``aa`` inside ``string`` (``-1`` if absent)."""
        return string.find(aa)

    def tira_aspas(self, stri):
        """Strip quotes from a value ("tira aspas" = "remove quotes").

        Single quotes are removed first, then double quotes are turned into
        single quotes. Values without a usable string ``replace`` method are
        converted with ``str()`` and additionally lose their square brackets.

        Returns the string ``'Null'`` when ``stri`` is ``None``.
        """
        # Identity test: `== None` is evaluated element-wise by NumPy arrays
        # and then fails inside `if` with an ambiguous-truth-value error.
        if stri is None:
            return 'Null'

        try:
            stri = stri.replace("'","")
            stri = stri.replace('"',"'")
        except (AttributeError, TypeError):
            # Not a str (no .replace, or a .replace with another signature):
            # fall back to the textual representation.
            stri = str(stri)
            stri = stri.replace("'","")
            stri = stri.replace('"',"'")
            stri = stri.replace('[',"")
            stri = stri.replace(']',"")

        return stri
    

In [3]:
class MySequence:
    """Load, filter, write and align the FASTA files of one project.

    File names are derived from ``root``, ``prjName`` and the sequence type,
    e.g. ``<root><prjName>_<typeSeq>.fasta``.

    Attributes set by the readers:
        seq_records: list of the records yielded by ``SeqIO.parse``.
        seqs: 2-D NumPy array of single characters when all sequences have
            the same length; otherwise a 1-D object array holding one list
            of characters per sequence.
        nrow, ncol: number of sequences and alignment width (``ncol`` is 1
            when the sequences could not be stacked into a matrix).
    """

    def __init__(self, prjName = "Human_papillomavirus", root = "fasta/"):
        self.NucleotideAlphabet = ["A","C","G","T"]

        self.prjName = prjName
        self.root    = root

        self.isProtein = False

        self.clearSequences()

    def read(self, typeSeq = "L2", pattern = None, numOfRecords=None, showmessage=False):
        """Read the (unaligned) FASTA file for ``typeSeq``.

        Sets ``fileName`` / ``fileNameAln`` and delegates to
        ``readMySequence``. ``pattern`` is formatted with ``%d`` into the
        file name when given. Returns None; inspect ``nrow`` / ``ncol`` /
        ``seq_records`` for the outcome.
        """
        self.typeSeq = typeSeq
        self.pattern = pattern
        self.showmessage = showmessage

        if pattern is None:
            self.fileName    = "%s%s_%s.fasta"%(       self.root, self.prjName, typeSeq)
            self.fileNameAln = "%s%s_%s_aligned.fasta"%(self.root, self.prjName, typeSeq)
        else:
            self.fileName    = "%s%s_%s_type_%d.fasta"%(        self.root, self.prjName, typeSeq, pattern)
            self.fileNameAln = "%s%s_%s_type_%d_aligned.fasta"%(self.root, self.prjName, typeSeq, pattern)

        self.readMySequence(self.fileName, numOfRecords, showmessage)

    def read_aligned(self, typeSeq = "L2", pattern = None, suffix = None, numOfRecords=None, showmessage=False):
        """Read the aligned FASTA file for ``typeSeq``.

        The optional ``suffix`` is inserted before ``_aligned.fasta``.
        Sets ``fileNameAln`` and delegates to ``readMySequence``.
        Returns None.
        """
        self.typeSeq = typeSeq
        self.pattern = pattern
        self.showmessage = showmessage

        if pattern is None:
            if suffix is None:
                self.fileNameAln = "%s%s_%s_aligned.fasta"%(self.root, self.prjName, typeSeq)
            else:
                # Fixed: this branch used the undefined name `sufix`.
                self.fileNameAln = "%s%s_%s_%s_aligned.fasta"%(self.root, self.prjName, typeSeq, suffix)
        else:
            if suffix is None:
                self.fileNameAln = "%s%s_%s_type_%d_aligned.fasta"%(self.root, self.prjName, typeSeq, pattern)
            else:
                self.fileNameAln = "%s%s_%s_type_%d_%s_aligned.fasta"%(self.root, self.prjName, typeSeq, pattern, suffix)

        self.readMySequence(self.fileNameAln, numOfRecords, showmessage)

    def clearSequences(self):
        """Reset the loaded sequences and the matrix dimensions."""
        self.seqs = []
        self.seq_records = []

        self.nrow = 0
        self.ncol = 0

    def readMySequence(self, fileName, numOfRecords=None, showmessage=False):
        """Parse ``fileName`` as FASTA and store its records.

        Args:
            fileName: path of the FASTA file.
            numOfRecords: stop after this many records; None reads all.
            showmessage: print how many records were read.

        Returns:
            True when the sequences were stacked into a 2-D character
            matrix; False when the file is missing, parsing raised
            ``ValueError``, or the sequences have different lengths.
        """
        self.clearSequences()

        if not os.path.isfile(fileName):
            print("Could not find '%s'"%fileName)
            return False

        print("Reading ... '%s'"%fileName)

        try:
            for count, seq_record in enumerate(SeqIO.parse(fileName, "fasta"), start=1):
                self.seq_records.append(seq_record)
                self.seqs.append( list(str(seq_record.seq)) )
                # With numOfRecords=None this never matches, so the whole
                # file is read.
                if count == numOfRecords: break

        except ValueError as err:
            # Report the caught exception, not the ValueError class.
            print("Exception %s: error reading file: '%s'"%(err, fileName) )
            return False

        if (showmessage):
            print("Read %d seqs from '%s'" %(len(self.seq_records), fileName) )

        self.nrow = len(self.seqs)
        lengths = set(len(chars) for chars in self.seqs)

        if len(lengths) != 1:
            # No records, or sequences of different lengths: a rectangular
            # matrix is impossible. Recent NumPy raises on ragged input, so
            # the 1-D object array is built element by element.
            ragged = np.empty(self.nrow, dtype=object)
            for idx, chars in enumerate(self.seqs):
                ragged[idx] = chars
            self.seqs = ragged
            self.ncol = 1

            print("Problems: fasta is not aligned. Impossible convert into matrix")
            return False

        self.seqs = np.array(self.seqs)
        self.ncol = self.seqs.shape[1]

        return True

    def search_term(self, term, verbose = False):
        """Return the loaded records whose description matches ``term``.

        ``term`` is used as a regular expression with ``re.search``. With
        ``verbose`` each hit is printed with a running counter.
        """
        records = []

        for record in self.seq_records:
            if re.search(term, record.description) is not None:
                records.append(record)
                if verbose: print(len(records), "\t", record.description)

        return records

    def write_seqs(self, seq_records, fname):
        """Write ``seq_records`` to ``fname`` in FASTA format.

        Returns True on success, False when ``SeqIO.write`` raises
        ``ValueError``.
        """
        try:
            SeqIO.write(seq_records, fname, "fasta")
            print("%d records saved at %s"%(len(seq_records), fname))

        except ValueError as err:
            print("Could not write - error %s; file: '%s'"%(err, fname) )
            return False

        return True

    # NOTE(review): the default muscleApp is an absolute, machine-specific
    # path; callers should pass their own location.
    def allign_with_muscle(self, in_file, out_file, muscleApp = r"/media/flalix/disk2t/seaview/muscle"):
        """Align ``in_file`` with MUSCLE, writing ``out_file``.

        Nothing is run when ``out_file`` already exists.

        Returns:
            False when the MUSCLE executable is missing or the command
            exits with a non-zero status; True otherwise.
        """
        if not os.path.isfile(muscleApp):
            # Fixed: the path was passed as a second print argument instead
            # of being formatted into the message.
            print("Could not find muscle at '%s'"%muscleApp)
            return False

        if not os.path.isfile(out_file):
            muscle_cline = MuscleCommandline(muscleApp, input=in_file, out=out_file)
            print(muscle_cline)
            status = os.system("%s"%(muscle_cline))
            if status != 0:
                print("muscle exited with status %d"%status)
                return False
        else:
            print("File exists: '%s'"%out_file)

        return True

### Read HLA type A

In [4]:
# Project prefix and folder shared by every file name built below.
prjName, root = "HLA", "fasta/"
mseq1 = MySequence(prjName=prjName, root=root)

In [5]:
# Load every record of the unaligned HLA type A file (no pattern, no record limit).
mseq1.read(typeSeq="A", showmessage=True)

Reading ... 'fasta/HLA_A.fasta'
Read 40531 seqs from 'fasta/HLA_A.fasta'
Problems: fasta is not aligned. Impossible convert into matrix


In [6]:
# Preview the first `maximum` records. The previous counter broke out of
# the loop before printing, so only maximum - 1 records were shown.
maximum = 30

for record in mseq1.seq_records[:maximum]:
    # Extract individual parts of the FASTA record
    identifier = record.id
    description = record.description
    sequence = record.seq

    # Example: adapt to extract features you are interested in
    print('----------------------------------------------------------')
    print('Processing the record {}:'.format(identifier))
    print('Its description is: \n{}'.format(description))
    amount_of_nucleotides = len(sequence)
    print('Its sequence contains {} nucleotides.'.format(amount_of_nucleotides))

----------------------------------------------------------
Processing the record lcl|Z46633.1_cds_CAA86602.1_1:
Its description is: 
lcl|Z46633.1_cds_CAA86602.1_1 [gene=hla-A] [db_xref=GDB:119310,GOA:P01892,HGNC:4931,InterPro:IPR001039,InterPro:IPR003006,InterPro:IPR003597,InterPro:IPR007110,InterPro:IPR010579,InterPro:IPR011161,InterPro:IPR011162,InterPro:IPR013783,PDB:1AKJ,PDB:1AO7,PDB:1AQD,PDB:1B0G,PDB:1B0R,PDB:1BD2,PDB:1DUY,PDB:1DUZ,PDB:1EEY,PDB:1EEZ,PDB:1HHG,PDB:1HHH,PDB:1HHI,PDB:1HHJ,PDB:1HHK,PDB:1HLA,PDB:1I1F,PDB:1I1Y,PDB:1I4F,PDB:1I7R,PDB:1I7T,PDB:1I7U,PDB:1IM3,PDB:1JF1,PDB:1JHT,PDB:1LP9,PDB:1OGA,PDB:1P7Q,PDB:1QEW,PDB:1QR1,PDB:1QRN,PDB:1QSE,PDB:1QSF,PDB:1S8D,PDB:1S9W,PDB:1S9X,PDB:1S9Y,PDB:1T1W,PDB:1T1X,PDB:1T1Y,PDB:1T1Z,PDB:1T20,PDB:1T21,PDB:1T22,PDB:1TVB,PDB:1TVH,PDB:1UR7,PDB:2AV1,PDB:2AV7,PDB:2BNQ,PDB:2BNR,PDB:2C7U,PDB:2CLR,PDB:2F53,PDB:2F54,PDB:2GIT,PDB:2GJ6,PDB:2GT9,PDB:2GTW,PDB:2GTZ,PDB:2GUO,PDB:2J8U,PDB:2JCC,PDB:2P5E,PDB:2P5W,PDB:2PYE,PDB:2UWE,PDB:2V2W,PDB:2V2X,PDB:2VLJ,P

### Filter by search term and align

In [7]:
typeSeq = "A"

# Keep only the records whose description matches "HLA-A" and save them
# next to the source file.
seq_records = mseq1.search_term("HLA-A")
fname = "%s%s_%s_only.fasta"%(mseq1.root, mseq1.prjName, typeSeq)

mseq1.write_seqs(seq_records, fname)

9158 records saved at fasta/HLA_A_only.fasta


True

In [8]:
typeSeq = "A"

# Input is the filtered FASTA; output is its MUSCLE alignment.
in_file, out_file = (
    "%s%s_%s_only.fasta"%(root, prjName, typeSeq),
    "%s%s_%s_only_aligned.fasta"%(root, prjName, typeSeq),
)

mseq1.allign_with_muscle(in_file, out_file, muscleApp = r"../../seaview/muscle")


Could not find muscle at '%s' /media/flalix/5c1ba0b4-f897-451c-9068-ac5e57194590/flalix/seaview/muscle


False

### Cut and Fulfill

In [None]:
# The keyword is `suffix`; the misspelled `sufix` raised
# "TypeError: unexpected keyword argument".
mseq1.read_aligned(typeSeq = "A", pattern = None, numOfRecords=None, suffix = "only", showmessage=True)