# Parsing files is routine in bioinformatics.

### A FASTA file is a descriptive file format that provides genomic sequence data along with a single-line annotation as a header, beginning with the ">" symbol. These may be merged to form massive multi-FASTA files with one or more samples.

URL for more info = https://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=BlastDocs&DOC_TYPE=BlastHelp




### Let's take a look at a multi-FASTA file from a microbiome experiment

In [41]:
!head -n 24 BINF6112.lab.multi.fa

>F475VFN02IOTX6 rank=0009189 x=3446.5 y=780.0 length=372
AGAGTTTGATCCTGGCTCAGATTGAACGCTGGCGGCAGGCTTAACACATGCAAGTCGAGC
GGTAACAGGGGAGGCTTGCTCCTGCTGACGAGCGGCGGACGGGTGAGTAACGCGTAGGAA
TCTGCCTAGTAGAGGGGACGAACATGTGGAAACGACATGCTAATAACCGCCATACGCCCT
AACGGGGGAAAGGGAAGGGGACGGTTTTACGGTAGGCCTTCCGCCTATTAGATGAGCCTG
CGTAAGATTAGCTAGTTGGTAGGGTAAAGGGACCGTACCAAGCGACGATCTTTAACTGTC
TGAGAGGAGTGACCAGTCACACTGGGACGTGAGACACGGCCCACGACTCCTACGGGAGGC
AGCAGGTTGGTT
>F475VFN02HA2JP rank=0015150 x=2879.5 y=1555.0 length=367
AGAGTTTGATCCTGGCTCAGATTGAACGCTGGCGGCAGGCCTAACACATGCAAGTCGAGC
GGCAGCGACAACATTGAACCTTCGGGAGATTTGTTGGGCGGCGAGCGGCGGACGGGTGAG
TAATGCCTGGGAAATTGCCCTGATGTGGGGGATAACCATTGGAACGATGGCTAATACCGC
ATGATAGCTTCGGCTCAAAGAGAGGGACCTTCGGGCCTCTCGCGTCAGGATATGCCCAGG
TGGGATTAGCTAGTTGGTGAGGTAAGGGCTCACCAAGGCGACGATCCCTAGCTGGTCTGA
GAGGATGATCAGCCACACTGGAACTGAGACACGGTCCAGACTCCTACGGGAGGCAGCAGG
GTTGGTT
>F475VFN02IV79U rank=0015459 x=3530.0 y=1872.0 length=377
AGAGTTTGATCCTGGCTCAGGATGAACGCTGGCGGCGTGCTTAACACATGCAAGTCG

In [42]:
!grep '>' BINF6112.lab.multi.fa | wc -l

    6804


In [None]:
class Fasta:
    """A single FASTA record read from a file.

    The header line is expected to look like
    ``>ACCESSION rank=... x=... y=... length=...``.  The value after each
    ``=`` is stored as a string, exactly as written in the file.  Every
    non-header line is appended to ``sequence``, so a multi-FASTA file ends
    up with all sequences concatenated and the metadata of its last header.
    """

    def __init__(self, infile):
        """Set default attribute values, then parse ``infile`` unless it is None.

        Raises whatever ``open`` raises for an unreadable path, and
        ``IndexError`` if a header line has fewer than five fields or a
        field has no ``=``.
        """
        self.mydict = {}
        self.infile = infile
        self.accession = ""
        self.rank = ""
        self.sequence = ""
        self.length = 0
        self.x = 0.0
        self.y = 0.0
        self.GCcontent = None
        if self.infile is not None:
            seq_parts = []
            with open(self.infile, 'r') as fa:
                # Iterate the file lazily instead of loading it with readlines().
                for line in fa:
                    if line.startswith('>'):
                        # Split the header once and pick fields by position.
                        fields = line.split()
                        self.accession = fields[0].lstrip('>')
                        self.rank = fields[1].split('=')[1]
                        self.x = fields[2].split('=')[1]
                        self.y = fields[3].split('=')[1]
                        self.length = fields[4].split('=')[1]
                    else:
                        seq_parts.append(line.rstrip())
            # Join once at the end rather than growing a string per line.
            self.sequence = "".join(seq_parts)

### Let's provide some access methods to the class!

### Access methods should usually return something, in this case the data stored for each instance variable.

In [None]:
    def getSeq(self):
        return self.sequence
    
    def getRank(self):
        return self.rank

    def getLength(self):
        return self.length   
    
    def getAccession(self):
        return self.accession
    
    def getX(self):
        return self.x

    def getY(self):
        return self.y



### Wait, what if we want to generate empty Fasta objects and set the variables later?

### Step 1: We need to adjust our __init__ method to make file parsing optional!

In [None]:
class Fasta:
    """A single FASTA record, either empty or read from a file.

    The header line is expected to look like
    ``>ACCESSION rank=... x=... y=... length=...``.  The value after each
    ``=`` is stored as a string, exactly as written in the file.  Every
    non-header line is appended to ``sequence``, so a multi-FASTA file ends
    up with all sequences concatenated and the metadata of its last header.
    """

    def __init__(self, infile = None):
        """Set default attribute values, then parse ``infile`` if one is given.

        With ``infile`` left as None no file is opened and the object keeps
        its defaults.  Raises whatever ``open`` raises for an unreadable
        path, and ``IndexError`` if a header line has fewer than five fields
        or a field has no ``=``.
        """
        self.infile = infile
        self.accession = ""
        self.rank = ""
        self.sequence = ""
        self.length = 0
        self.x = 0.0
        self.y = 0.0
        self.GCcontent = None
        if self.infile is not None:
            seq_parts = []
            with open(self.infile, 'r') as fa:
                # Iterate the file lazily instead of loading it with readlines().
                for line in fa:
                    if line.startswith('>'):
                        # Split the header once and pick fields by position.
                        fields = line.split()
                        self.accession = fields[0].lstrip('>')
                        self.rank = fields[1].split('=')[1]
                        self.x = fields[2].split('=')[1]
                        self.y = fields[3].split('=')[1]
                        self.length = fields[4].split('=')[1]
                    else:
                        seq_parts.append(line.rstrip())
            # Join once at the end rather than growing a string per line.
            self.sequence = "".join(seq_parts)

### Wait, what if we want to generate empty Fasta objects and set the variables later?

### Step 2: Write setter methods to assign data to the instance variables!

In [None]:
    def setSeq(self,seq):
        self.sequence = self.sequence + line.rstrip()   
        
    def setRank(self,line):
        self.rank = line.split()[1].split('=')[1]
    
    def setLength(self,line):
        self.Length = line.split()[4].split('=')[1] 
        
    def setAccession(self,line):
        self.accession = line.split()[0].lstrip('>')
    
    def setX(self,line):
        self.x = line.split()[2].split('=')[1]
        
    def setY(self,line):
        self.y = line.split()[3].split('=')[1]
    
    def setGC(self):
        if self.GCcontent == None:
            print("Running gcCalculator on " + self.accession)
            self.GCcontent = gcCalculator(self.sequence)
        return self.GCcontent

## How should we make our object look when rendered by the Python interpreter? 

### We can define the special \_\_repr\_\_( ) method to handle this!



In [None]:
    def __repr__(self):
        return "A fasta object for: " + self.accession + " at X = " + self.x + " and Y = " + self.y


### Let's put all of that together to see our whole class.

In [40]:

class Fasta:
    """A single FASTA record, either empty or read from a file.

    The header line is expected to look like
    ``>ACCESSION rank=... x=... y=... length=...``.  Header values are kept
    as the strings found in the file.  Every non-header line is appended to
    ``sequence``, so passing a multi-FASTA file to the constructor yields all
    sequences concatenated and the metadata of the last header.

    The ``set*`` methods that take ``line`` expect a complete header line and
    raise ``IndexError`` if the field they need is missing or has no ``=``.
    """

    def __init__(self, infile=None):
        """Set default attribute values, then parse ``infile`` if one is given."""
        self.infile = infile
        self.accession = ""
        self.rank = ""
        self.sequence = ""
        self.length = 0
        self.x = 0.0
        self.y = 0.0
        self.GCcontent = None
        if self.infile is not None:
            with open(self.infile, 'r') as fa:
                # Iterate the file lazily instead of loading it with readlines().
                for line in fa:
                    if line.startswith('>'):
                        self.setAccession(line)
                        self.setRank(line)
                        self.setX(line)
                        self.setY(line)
                        self.setLength(line)
                    else:
                        self.setSeq(line)

    @staticmethod
    def _headerValue(line, index):
        """Return the text after '=' in whitespace-separated field ``index``."""
        return line.split()[index].split('=')[1]

    def getSeq(self):
        """Return the stored sequence string."""
        return self.sequence

    def setSeq(self, seq):
        """Append ``seq`` to the stored sequence, minus trailing whitespace."""
        # Use the ``seq`` argument itself; reading a name ``line`` here would
        # only work if a global with that name happened to exist.
        self.sequence = self.sequence + seq.rstrip()

    def getRank(self):
        """Return the stored rank."""
        return self.rank

    def setRank(self, line):
        """Store the rank from the second field of header ``line``."""
        self.rank = self._headerValue(line, 1)

    def getLength(self):
        """Return the stored length."""
        return self.length

    def setLength(self, line):
        """Store the length from the fifth field of header ``line``."""
        # Assign to ``self.length`` (lower case) so getLength() sees it.
        self.length = self._headerValue(line, 4)

    def getAccession(self):
        """Return the stored accession."""
        return self.accession

    def setAccession(self, line):
        """Store the first field of header ``line`` without its leading '>'."""
        self.accession = line.split()[0].lstrip('>')

    def getX(self):
        """Return the stored x value."""
        return self.x

    def setX(self, line):
        """Store the x value from the third field of header ``line``."""
        self.x = self._headerValue(line, 2)

    def getY(self):
        """Return the stored y value."""
        return self.y

    def setY(self, line):
        """Store the y value from the fourth field of header ``line``."""
        self.y = self._headerValue(line, 3)

    def __repr__(self):
        """Return a short description with the accession and X/Y values."""
        # format() converts each value to text, so an empty object (whose x
        # and y are still the float defaults) can be displayed as well.
        return "A fasta object for: {} at X = {} and Y = {}".format(
            self.accession, self.x, self.y
        )

    def setGC(self):
        """Compute the GC content once, cache it, and return it.

        The first call prints a progress message and stores the result of
        ``gcCalculator(self.sequence)``; later calls return the cached value.
        """
        if self.GCcontent is None:
            print("Running gcCalculator on " + self.accession)
            self.GCcontent = gcCalculator(self.sequence)
        return self.GCcontent
    
    

## How can we parse a large multi-FASTA and generate a list of Fasta objects?

In [None]:
# Build one Fasta object per header line of the multi-FASTA file.
fastas = []
faObj = None  # record currently being filled; None until the first header

with open("BINF6112.lab.multi.fa", 'r') as mf:
    # Iterate the file lazily rather than loading every line with readlines().
    for line in mf:
        line = line.rstrip()
        if line.startswith(">"):
            # A header starts a new record.  A freshly created Fasta() is
            # always empty, so its metadata is filled in unconditionally.
            faObj = Fasta()
            fastas.append(faObj)
            faObj.setAccession(line)
            faObj.setRank(line)
            faObj.setLength(line)
            faObj.setX(line)
            faObj.setY(line)
        elif line:
            # Non-empty, non-header lines are sequence text for the current
            # record; blank lines carry no sequence and are skipped.
            if faObj is None:
                raise ValueError(
                    "sequence data found before the first '>' header"
                )
            faObj.setSeq(line)

In [None]:
# Report how many Fasta objects the parsing loop created.
print("There were this many fasta objects created: \n    ", len(fastas), sep="")

!echo 'There were this many sequences in the file: ' 
! grep '>' BINF6112.lab.multi.fa | wc -l 

### Great! Now let's write a function to calculate some statistics on each sequence!
### Notice how *self.GCcontent* is always set to None.


### How can we update this for each Fasta object? 

In [None]:
def gcCalculator(seq):
    """Return the GC content of ``seq`` as a percentage string.

    The percentage of characters that are G or C (case-insensitive) is
    formatted with two decimal places, e.g. ``"50.00"``.  An empty sequence
    has no bases to count, so ``"0.00"`` is returned instead of dividing by
    zero.
    """
    if not seq:
        return "0.00"
    # Upper-case once and reuse the result for both counts.
    bases = seq.upper()
    gc_count = bases.count('G') + bases.count('C')
    percentGC = (gc_count / len(seq)) * 100
    return "%.2f" % round(percentGC, 2)

## This setter method is added to the class definition!
#    def setGC(self):
#        if self.GCcontent == None:
#            print("Running gcCalculator on " + self.accession)
#            self.GCcontent = gcCalculator(self.sequence)
#        return self.GCcontent

## The global function gcCalculator (defined above) is used by our setter.

# Compute (or fetch the cached) GC content of every record and print it.
for fasta in fastas:
    print(fasta.setGC())

In [None]:
# Evaluate the first parsed record; as the last expression in the cell it is
# expected to be displayed using Fasta.__repr__.
fastas[0]

In [None]:
# Show the built-in help page generated for the Fasta class.
help(Fasta)