## FASTA Format Handling
- Exploring Data 
- Reading Data 
- Cleaning Data 
- `FASTA` Format Handling Template 

## Exploring Data 

In [1]:
# examine first few lines  
!head ../data/Haemophilus_influenzae.fasta

>CP005967.1 Haemophilus influenzae KR494, complete genome
AACCGAAATTACAGTGCATGGACGCACAAAATCTGATGGTTATCGTGCTGATAGAATTAATTGGAAAAAA
ATTGGTAAAGTCCGAGAGCGTTTATCCATTCCTGTTATTGCTAACGGAGAAATTTGGCATTGGCAAGATG
GTCAAGATTGCTTATCTCAAACAGGTTGTCAGGATTTAATGGTGGGACGAGGTGCATTGAATATTCCGAA
CTTAAGCCATGTTCTGAAATCAAATGCAGAAAAAATGCCTTGGAATGAGATTCAAAAAATCTTGCAAAAA
TATGCGAATGTTGAAAATGAATATGGCAGCGGTTTTTACCATGTGGCACGAATTAAACAATGGTTACGTT
ATTTGAATAAGGAATATGATGAGGCGAACCAAGAGTTTGATAAGATTAAGACTTGCCAAACTGCTGAAGA
TTTGAAATTACGGTTAAATGATAAATAAAAAACCTGCTAATCAGCAGGTTTTCTTTTTCTAAATTATTTA
AAAATTCACCGCACTTTTTAATTTTTTCAGTGCATTAGTTTCTAATTGACGAATACGTTCCGCAGATACA
TTATATTTTGCTGCCAAGTCGTGCAATGTAGCTTTGTTATCATCTAACCAACGAGCTTTGATAATATCCT


In [2]:
# examine last few lines 
!tail ../data/Haemophilus_influenzae.fasta

GTTACCAACATGCTTCTGGCCCAACTTTAAATGATGCCTATAAAGTATTGGGTGTAACTGAGTCCGACGA
GCAAAATACGGTTAAGCGTGCTTATCGTCGTCTAATGAATGAACACCATCCAGATAAACTCGTGGCGAAA
GGTTTACCGCCAGAAATGATGGAAATGGCAAAAGAAAAAACGCAACAGATTCAAGCTGCTTACGATTTAA
TTTGTAAAGCAAAAGGCTGGAAATAGTGCGTGTTATTCTTGCGCCTATGCAAGGGGTTCTTGATCCCTTT
GTACGCCAACTTCTCACTGAAGTGAATGACTACGATTTATGTATAACAGAATTTGTCCGCGTAGTTGATC
AACTTCTTCCTGAAAAAGTATTTTATCGTTTATGCCCTGAATTAAAAAATCAGGGCTTTACTTCTTCTAG
CACGCCTGTGCGAGTGCAGTTGCTAGGGCAGCATCCAGAATACCTTGCTGAAAATGCAATTCGTGCAATC
GAGCTTGGTTCTCATGGCATTGATTTAAATTGTGGTTGCCCTTCTAAAACAGTGAATGGCAGCAATGGCG
GTGCGGCATTATTGAAACAGCCTGAATTGATTTATCGTGCAACTCAAGCCTTACGC



## Reading Data

In [3]:
# open file in read mode 
with open("../data/Haemophilus_influenzae.fasta", "r") as f: 
    # read the entire file (name line and newlines included) into one string 
    seq = f.read() 

In [5]:
len(seq)

1882752

In [6]:
1856176 == 1882752

False

In [7]:
1882752 - 1856176

26576

In [4]:
seq[:200]

'>CP005967.1 Haemophilus influenzae KR494, complete genome\nAACCGAAATTACAGTGCATGGACGCACAAAATCTGATGGTTATCGTGCTGATAGAATTAATTGGAAAAAA\nATTGGTAAAGTCCGAGAGCGTTTATCCATTCCTGTTATTGCTAACGGAGAAATTTGGCATTGGCAAGATG\n'

In [6]:
# inspect specific range of sequence 
seq[200:500] 

'GTCAAGATTGCTTATCTCAAACAGGTTGTCAGGATTTAATGGTGGGACGAGGTGCATTGAATATTCCGAA\nCTTAAGCCATGTTCTGAAATCAAATGCAGAAAAAATGCCTTGGAATGAGATTCAAAAAATCTTGCAAAAA\nTATGCGAATGTTGAAAATGAATATGGCAGCGGTTTTTACCATGTGGCACGAATTAAACAATGGTTACGTT\nATTTGAATAAGGAATATGATGAGGCGAACCAAGAGTTTGATAAGATTAAGACTTGCCAAACTGCTGAAGA\nTTTGAAATTACGGTTA'

In [7]:
# check length of sequence 
len(seq)

1882752

__Note__
- 1882752 is not the actual length of this sequence 
- 1856176 is the actual length of this sequence (as reported by NCBI)

In [8]:
# find the no. of extra characters in sequence 
# present length - actual length[NCBI]
1882752 - 1856176

26576

## Cleaning Data 

In [8]:
# remove name line / info line  
with open("../data/Haemophilus_influenzae.fasta", "r") as f: 
    f.readline()  # consume and discard the name line (first line of the file)
    seq = f.read()  # read everything after the name line: the sequence data

In [9]:
seq[1:200]

'ACCGAAATTACAGTGCATGGACGCACAAAATCTGATGGTTATCGTGCTGATAGAATTAATTGGAAAAAA\nATTGGTAAAGTCCGAGAGCGTTTATCCATTCCTGTTATTGCTAACGGAGAAATTTGGCATTGGCAAGATG\nGTCAAGATTGCTTATCTCAAACAGGTTGTCAGGATTTAATGGTGGGACGAGGTGCATT'

In [10]:
# check length 
len(seq) 

1882694

__Note__
- 1882694 is still not the actual length, which means the data is not clean yet! 
- We have to remove special characters (such as newlines and tabs) from this sequence 

In [11]:
# find the no. of extra characters in sequence again 
# present length - actual length[NCBI]
1882694 - 1856176

26518

In [12]:
# remove newline 
seq = seq.replace("\n", "")

In [13]:
# view bases 2-200 (the slice starts at index 1, so the first base at index 0 is not shown)
seq[1:200]

'ACCGAAATTACAGTGCATGGACGCACAAAATCTGATGGTTATCGTGCTGATAGAATTAATTGGAAAAAAATTGGTAAAGTCCGAGAGCGTTTATCCATTCCTGTTATTGCTAACGGAGAAATTTGGCATTGGCAAGATGGTCAAGATTGCTTATCTCAAACAGGTTGTCAGGATTTAATGGTGGGACGAGGTGCATTGA'

In [14]:
# remove tab 
seq = seq.replace("\t", "")

In [15]:
# view bases 2-200 (the slice starts at index 1, so the first base at index 0 is not shown)
seq[1:200]

'ACCGAAATTACAGTGCATGGACGCACAAAATCTGATGGTTATCGTGCTGATAGAATTAATTGGAAAAAAATTGGTAAAGTCCGAGAGCGTTTATCCATTCCTGTTATTGCTAACGGAGAAATTTGGCATTGGCAAGATGGTCAAGATTGCTTATCTCAAACAGGTTGTCAGGATTTAATGGTGGGACGAGGTGCATTGA'

In [16]:
# check length of the cleaned sequence
len(seq)

1856176

In [17]:
# clean dataset? 
1856176 - 1856176

0

## FASTA Format Handling Template

In [18]:
def readFASTA(inputfile):
    """Read a single-record FASTA file and return its sequence as one string.

    The first line of the file (the name / info line) is discarded, and all
    whitespace (newlines, tabs, spaces) is removed from the remaining text.

    Parameters
    ----------
    inputfile : str or path-like
        Path of the FASTA file to read.

    Returns
    -------
    str
        The sequence characters with no whitespace. An empty string is
        returned when the file is empty or contains only a name line.

    Raises
    ------
    OSError
        If the file cannot be opened (e.g. ``FileNotFoundError``).

    Notes
    -----
    Only the first line is treated as a name line; name lines of any
    further records in the file are not filtered out.
    """
    with open(inputfile, "r") as f:
        # the name line is not part of the sequence: consume and discard it
        f.readline()
        # everything after the name line is sequence data
        raw = f.read()

    # str.split() with no argument splits on any run of whitespace, so joining
    # the pieces drops newlines and tabs as well as stray spaces
    return "".join(raw.split())

In [19]:
# read FASTA 
s = readFASTA("../data/Haemophilus_influenzae.fasta")

In [20]:
len(s) 

1856176

In [21]:
s[1:200]

'ACCGAAATTACAGTGCATGGACGCACAAAATCTGATGGTTATCGTGCTGATAGAATTAATTGGAAAAAAATTGGTAAAGTCCGAGAGCGTTTATCCATTCCTGTTATTGCTAACGGAGAAATTTGGCATTGGCAAGATGGTCAAGATTGCTTATCTCAAACAGGTTGTCAGGATTTAATGGTGGGACGAGGTGCATTGA'