# Sorting a sequence file (FASTA or FASTQ file)

## Context:
- Here, we sort the records of a sequence file by sequence length, from shortest to longest.

### Get Data

Download the orchid sequence data (`ls_orchid.fasta`) from the Biopython example files.

In [2]:
# Fetch the example FASTA file from the Biopython repository.
# -nc (--no-clobber) skips the download when ls_orchid.fasta already exists, so
# re-running this cell does not leave ls_orchid.fasta.1, .2, ... copies behind.
# NOTE: --no-check-certificate turns off TLS certificate verification. It is kept
# because the recorded run could not verify the issuer locally; drop the flag
# wherever the system CA store is up to date.
!wget -nc https://raw.githubusercontent.com/biopython/biopython/master/Doc/examples/ls_orchid.fasta --no-check-certificate

--2023-07-14 01:33:17--  https://raw.githubusercontent.com/biopython/biopython/master/Doc/examples/ls_orchid.fasta
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.111.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
  Unable to locally verify the issuer's authority.
HTTP request sent, awaiting response... 200 OK
Length: 76480 (75K) [text/plain]
Saving to: 'ls_orchid.fasta'

     0K .......... .......... .......... .......... .......... 66% 12.5K 2s
    50K .......... .......... ....                            100% 12.8M=4.0s

2023-07-14 01:33:23 (18.7 KB/s) - 'ls_orchid.fasta' saved [76480/76480]



In [3]:
from Bio import SeqIO

# Materialise the parser's iterator into a list so the records can be
# reused (and sorted) by later cells.
records = [*SeqIO.parse("ls_orchid.fasta", "fasta")]

# Print the length of every sequence, one per line, in file order.
for record in records:
    sequence_length = len(record.seq)
    print(sequence_length)

740
753
748
744
733
718
730
704
740
709
700
726
753
699
658
752
726
765
755
742
762
745
750
731
741
740
727
711
743
727
757
770
767
759
750
788
774
789
688
719
743
737
728
740
696
732
731
735
720
740
629
572
587
700
636
716
592
716
733
626
737
740
574
594
610
730
641
702
733
738
736
732
745
744
738
739
740
745
695
745
743
730
706
744
742
694
712
715
688
784
721
703
744
592


In [4]:
# Reorder the records in place from shortest to longest sequence.
records.sort(key=len)

# Save the reordered records as FASTA; the value returned by SeqIO.write
# is the cell's displayed output.
SeqIO.write(records, "sorted_orchids.fasta", "fasta")

94

In [11]:
# Read the sorted file back and pair each record's length with its id,
# in file order, so the ascending order can be checked by eye.
len_and_ids = [
    (len(entry), entry.id)
    for entry in SeqIO.parse('sorted_orchids.fasta', 'fasta')
]
len_and_ids

[(572, 'gi|2765606|emb|Z78481.1|PIZ78481'),
 (574, 'gi|2765595|emb|Z78470.1|PPZ78470'),
 (587, 'gi|2765605|emb|Z78480.1|PGZ78480'),
 (592, 'gi|2765601|emb|Z78476.1|PGZ78476'),
 (592, 'gi|2765564|emb|Z78439.1|PBZ78439'),
 (594, 'gi|2765594|emb|Z78469.1|PHZ78469'),
 (610, 'gi|2765593|emb|Z78468.1|PAZ78468'),
 (626, 'gi|2765598|emb|Z78473.1|PSZ78473'),
 (629, 'gi|2765607|emb|Z78482.1|PEZ78482'),
 (636, 'gi|2765603|emb|Z78478.1|PVZ78478'),
 (641, 'gi|2765591|emb|Z78466.1|PPZ78466'),
 (658, 'gi|2765643|emb|Z78518.1|CRZ78518'),
 (688, 'gi|2765619|emb|Z78494.1|PNZ78494'),
 (688, 'gi|2765569|emb|Z78444.1|PAZ78444'),
 (694, 'gi|2765572|emb|Z78447.1|PVZ78447'),
 (695, 'gi|2765579|emb|Z78454.1|PFZ78454'),
 (696, 'gi|2765613|emb|Z78488.1|PTZ78488'),
 (699, 'gi|2765644|emb|Z78519.1|CPZ78519'),
 (700, 'gi|2765647|emb|Z78522.1|CMZ78522'),
 (700, 'gi|2765604|emb|Z78479.1|PPZ78479'),
 (702, 'gi|2765590|emb|Z78465.1|PRZ78465'),
 (703, 'gi|2765566|emb|Z78441.1|PSZ78441'),
 (704, 'gi|2765650|emb|Z78525.1|

## Summary: In this tutorial, we downloaded orchid sequence data, sorted the sequences by length in ascending order, and wrote them to a new FASTA file.

# Finish!