In [1]:
# Project description

"""
GeneVecTools
=============

This tool provides functionalities for:
- Reading in a variety of genetic file types
- Vector embedding algorithms
- Byte array encoders
- Clustering and preprocessing steps for compression
- Similarity search tools for FASTA/FASTQ files

Installing
==========
Example files for testing: https://tinyurl.com/cDNALibraryExampleFiles

Install using pip:

.. code-block:: bash

    pip install GeneVecTools
"""




In [5]:
# Importing necessary modules

from GeneVecTools import simSearch, reader, mapper, encoder


In [3]:
# Name of the FASTQ file used throughout the notebook.
# The VecSS cell looks for it under the data directory; the Reader cell passes it as-is.

file = "small_cDNA_Sequences_pbmc_1k_v2_S1_L002_R2_001.fastq"


In [6]:
# Create the VecSS similarity-search object for the FASTQ file in the data directory.
# The path uses a forward slash, which Python accepts on Windows, macOS and Linux;
# the previous "data\{file}" form only worked with a backslash separator and relied
# on "\{" being an unrecognised escape sequence.
# length, encoding and bits are passed to VecSS unchanged; their exact meaning is
# defined by GeneVecTools and is not shown in this notebook.

VECSS = simSearch.VecSS(f=f"data/{file}", length=10000, encoding="one-hot-encoding", bits=8)


In [7]:
# Call readq() on the VecSS object and keep its return value in `sequences`.
# Presumably this parses the FASTQ file given to the constructor — confirm against GeneVecTools.
# Note: the following cells pass VECSS.s (not `sequences`) to embed(), and
# `sequences` is reassigned later in the Reader cell.

sequences = VECSS.readq()


In [8]:
# Embed VECSS.s with the VecSS object's embed() method and print the whole result.
# NOTE(review): VECSS.s is presumably populated by the earlier readq() call — confirm.

embedded = VECSS.embed(VECSS.s)
print("Embedded sequences:", embedded)


Embedded sequences: [11203753160672822283916481069670212061265302112718951608799917160281330817195147528256418800429567777277542616074214947903725099594621357078553992560212803755024090189579436514945964434501619705841275121862388113212431, 22148962078592792280743252114705459791197323233972200059288565495162288653596944601291737789912601077954698646099282084586388537632613142513039297279422475470788724905455504719394077938321766999027213525802869491630607, 11052935522643732614254101818983735031149437724173647858476895531222506728728730752479454929266937880124483909200694322157963688990486734663564473232516063518399761786691670502138224018289563481320425465784843238900740, 11075070173233118035769878691654579796867953777489680029947117164526101542979781045166834740125837714835662827503240714155260035450920025602227914884336534304779777717906033267988492692154056599901295758625161479717889, 442977508924395033579597594681049733559247251685715893491653274250315639030145843013836900131827732

In [9]:
# Run the similarity search and unpack its three return values.
# Going by the labels printed below: D = differences, I = indices, time = time taken.
# NOTE(review): the name `time` would shadow the stdlib module if it were ever
# imported in this notebook.

D, I, time = VECSS.run_search()
print(
    "Similarity search results:\nDifferences:",
    D,
    "\nIndices:",
    I,
    "\nTime taken:",
    time,
)


index.ntotal
100
Similarity search results:
Differences: [[0.0000000e+00 2.1107779e+08 2.4994662e+08 2.5378875e+08]
 [0.0000000e+00 2.5795086e+08 2.6664752e+08 2.7259187e+08]
 [0.0000000e+00 2.4740659e+08 2.5411149e+08 2.6292944e+08]
 [0.0000000e+00 2.2577262e+08 2.7185302e+08 2.7191200e+08]
 [0.0000000e+00 2.0230675e+08 2.0420034e+08 2.3622605e+08]
 [0.0000000e+00 2.8370250e+08 3.2300714e+08 3.2836230e+08]
 [0.0000000e+00 2.0020426e+08 2.5318739e+08 2.8028538e+08]
 [0.0000000e+00 2.3739339e+08 2.4740659e+08 2.5527750e+08]
 [0.0000000e+00 2.6839050e+08 2.7885574e+08 2.7891200e+08]
 [0.0000000e+00 2.1592979e+08 2.3084890e+08 2.4680370e+08]] 
Indices: [[ 0 72 24 87]
 [ 1 79 76 81]
 [ 2  7 70 46]
 [ 3 12 81 76]
 [ 4 71 60 41]
 [ 5 59 41 64]
 [ 6 19 81  4]
 [ 7 87  2 26]
 [ 8 89 40 24]
 [ 9 93 81 24]] 
Time taken: 0:00:00.006960


In [10]:
# Round-trip check: unembed(embed(s)) must compare equal to the original VECSS.s.
# embed() is called again here rather than reusing `embedded` from the earlier cell.

assert VECSS.unembed(VECSS.embed(VECSS.s)) == VECSS.s


In [11]:
# Load the FASTQ file through reader.Reader instead of VecSS.
# read_fastq() returns four values; of these only `mp` is used in the cells shown.
# Its values() are materialised into the list `sequences`, replacing the value
# assigned from readq() earlier.
# NOTE(review): `file` is passed here without the data directory prefix used when
# constructing VecSS — confirm which location is intended.

R = reader.Reader()
mp, count, total_len, quality = R.read_fastq(file)
sequences_dict_items = mp.values()
sequences = [*sequences_dict_items]
print("Extracted sequences:", sequences)


Extracted sequences: ['NCCAAGGTGGGCGGATCACCTAAGGTCAGGAGTTCGAGACCAGCCTGGCCAACATGGAGAAACCCCGTCTCTACTAAAAATACAAAATTAG', 'NGGGTGAGCATGGTGGCTCGTGCCTGTGGTCCCAGCACTTTGGGAGGCTGAGGTGGGCAGCTCACTTGAGGTCAGGAGTTCGAGACCAGCC', 'CCATCACTGTGGTCTTTGGGTTCTGTTTGTTTTATTGTTTTGTTGTTGAGAACTGCCTTGCCTTCTGACACTGTAAGGTGCTCCAGGCTTG', 'TCAAAAGAGGTCACTTCTGGCCAGGCATGGTGGCTCACACCTGTAATCCCAGCACTTTGGGAAGCCAAGGCAGGTGGATCACGAGGTCAGG', 'GCTGCTCATGATTGCTGGTATCGATGACTGCTACACCTCAGCCCGGGGCTGCACTGCCACCCTGGGCAACTTCGCCAAGGCCACCGTTGAA', 'ACAGGCATGAGCCACCGTGCCCAGCCTATATTTCTATTTGACAGTGACGGTGTGGGCTATAGTCATTTGTATCCACCAGCAAATAGAATTA', 'AGACCAGACTCAACACGGAGAAACCCAGTCTCTACTAAAAATACAAAATTAGCCAGGCATGGTGGTGCATGCCTGTAATCTCAGCTACTCG', 'ATCGGCGCTCATGTACTTCATTGTGCGCTCTTTGCGGACAGCAGCCCTGGGCCCCGACAGCATGGGGGGCCCCGTCCCCCGGCAGCGTCTC', 'GTAGAAGTCTTGAGACGGAGGCTGGCCATCCATTCAGCCCTGAGCGTGCTGAGTTCTGTGTTTCTCTGAATAGAGGTGTGGAACCTGAGGG', 'GATAGAGACCATCCTGGCTAACATGGTGAAACCCTGTCTCTACTAAACAAAATACAAAAAATTTGCTGGGTGTGGTGGCGGGCGCCTGTAG', 'TGAGTCCTTCCTGGGCTCTGGGGCCTG

In [12]:
# Cluster the sequences in three steps:
#   1. build a Mapper from `sequences` (the arguments 2 and 3 are passed as-is;
#      what they configure is not shown here — TODO document),
#   2. run mapper.cluster() on the Mapper's `hfs` attribute,
#   3. map the resulting groups back onto `sequences` with mapper.groupings().
# The printed result is a list of clusters, each a list of (sequence, int) tuples;
# the int is presumably the sequence's index in `sequences` — confirm.

mapObj = mapper.Mapper(sequences, 2, 3)
groups_of_similar_kmers = mapper.cluster(mapObj.hfs)
cluster_of_sequences = mapper.groupings(groups_of_similar_kmers, sequences)
print("Clustered sequences:", cluster_of_sequences)


Clustered sequences: [[('CCAAAAGTCATGTACCAATGTACCTGATAAAAAAAAAACAATTAATCAATGATATCAAGTCAATTTAAAACAATAAAATTTTAATTAAAAA', 55), ('CGTTTTTTTGATGAAAAAAAAAAAGAAAAAACAAAAAACAAAAACCCCACAAACAACAGAAAGTTAAAAAACAACAACCCCAAGAAACCCC', 83)], [('AACCTGAAAAGTAGGAAGCATAAGAAAAAAGAAAAGCTAGGAAACAAAAAGCTAAGGGCAAAATGAACAAACTAACAAGAAAATTGGAAGA', 12)], [('AAAAAACACATCAAAAAGCTACTAAAAGGACTGGTGTAATTTAAAAAAAACTAAGGCAGAAGGCTTTTGTAACAGTTAGAAGAATTAGGCA', 54)], [('GAAACCTGACGTTAGAAAGGCTCAACGAGAACAAGCTATCAGGGCTGCTAAGGAAGCAAAAAAGGCGAAGCAAGCATCTAAAAAGACTGCC', 23), ('CCATAGTAGGCCTAAAAGCAGCCACCAATTAAGAAAGCGTTCAAGCTCAACACCCACTACCTAAAAAATCCCAAACATATAACTGAACTCC', 95)], [('AGGGGGGGGGGGGGGGGGGGGTTGTGGATTATTTTTGAAAAAAAGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA', 88)], [('ACAAAATGTACAGATACAGAATTCTGTTCAAGCGAAGAATGATAAAGAGACAGAAGCACAGATATCATGGTATCCTCCATAGGATCATTGA', 75)], [('AAGGACATTGAATTGGTCATGTCACAAGCAAATGTGTCGAGAGCAAAGGCAGTCCGAGCCCTGAAGAACAACAGTAATGATATTGTAAATG', 66), ('GAACCAGAGCTTCTGCCCAACTGTCAACCTTGACAAATTGTGGACTTTGGTCAG

In [13]:
# Encode every sequence with an Encoder constructed with the argument 4
# (what 4 configures is not shown here — TODO confirm) and print the result.
# The same encoder_instance is reused by the cluster encode/decode cells below.

encoder_instance = encoder.Encoder(4)
encoded_sequences = encoder_instance.encode_sequences(sequences)
print("Encoded sequences:", encoded_sequences)


Encoded sequences: [5881409623665803581558545801802826637208516500130564821122688002574028144497948559240726960001689556271793231, 10004394407413077045344608303280955879052203303070261615812410114111081023631831635413797231502745052045713967, 4855413878470845053905210764890259966026658175194522644288777091124669795160173631799006946152061391731955780, 5066379109335506145595535234280682755908844744216772993971656908108429381329205644047597104878110800609839169, 19981574232017093563538886404033080916306650350624875655124029601162917839229259644120716225385801402631332162, 18949177865413378613539602313444532902746787690165603044655043547753287311973078365036074499535931162587441224, 5295977813340894782018942531500423239017435631891116988028332278167684302765768294667157080730596888283465768, 9578304578059814396747555236073631998839334279125124428172520154145454069286891232858083060916893287308207128, 50136624857154415191626382536863149433257486237336220130432608498436192527738355277955065

In [14]:
# Encode the clustered sequences (the input is cluster_of_sequences, not the
# per-sequence encoding from the previous cell) and print the result.
# The printed value is a list of bytes objects — presumably one per cluster; confirm.

encoded_clusters_compressed = encoder_instance.encode_clusters(cluster_of_sequences)
print("Compressed clusters:", encoded_clusters_compressed)


Compressed clusters: [b'\x02\x00\x00\x007\x00\x00\x00S\x00\x00\x00..D\x88\x88\x12\x84!\x81D\x88!\x81D!\x18\x88\x88\x88\x88\x88\x84\x18\x81\x18\x84\x18\x82\x81A\x88\x12\x84\x18\x11\x88\x88\x84\x18\x88\x88\x11\x11\x88\x11\x88\x88\x08$\x11\x11\x11!\x18\x82\x88\x88\x88\x88\x88\x82\x88\x88H\x88\x88\x88\x84\x88\x88DDH\x88H\x88\x84\x82\x88\x12\x81\x88\x88H\x88\x84HD\x84(\x88HD\x04', b'\x01\x00\x00\x00\x0c\x00\x00\x00.\x88D!\x88\x88\x12(\x82(\x84\x81(\x88\x88\x88\x82\x88(\x14(\x82\x88\x84\x88\x88B\x81("\x84\x88\x18\x82H\x88H\x81H\x88\x82\x88\x18!\x82(\x08', b'\x01\x00\x00\x006\x00\x00\x00.\x88\x88\x88\x84\x84A\x88\x88(\x14H\x81\x88(\x82\x14"!\x81\x18\x11\x88\x88\x88\x88\x14\x88"\x84\x82(B\x11\x11\x12\x88\x84\x12\x81\x82(\x88\x11(B\x08', b'\x02\x00\x00\x00\x17\x00\x00\x00_\x00\x00\x00..\x82\x88D!H\x12\x81\x82\x88"\x14\x84H\x82\x82H\x88B\x81A("\x14B\x81(\x82(\x84\x88\x88(B\x82(\x84(\x84A\x81\x88\x88\x82\x14B\x04D\x18(\x81"D\x81\x88(\x84B\x84D\x88\x11\x88\x82\x88B\x12A\x88BA\x88\x84D\x84\x14H\x14

In [15]:
# Decode the encoded clusters back with the same Encoder instance and print them.
# The result is compared against cluster_of_sequences in the next cell.

decoded_clusters_compressed = encoder_instance.decode_clusters(encoded_clusters_compressed)
print("Decompressed clusters:", decoded_clusters_compressed)


Decompressed clusters: [[('CCAAAAGTCATGTACCAATGTACCTGATAAAAAAAAAACAATTAATCAATGATATCAAGTCAATTTAAAACAATAAAATTTTAATTAAAAA', 55), ('CGTTTTTTTGATGAAAAAAAAAAAGAAAAAACAAAAAACAAAAACCCCACAAACAACAGAAAGTTAAAAAACAACAACCCCAAGAAACCCC', 83)], [('AACCTGAAAAGTAGGAAGCATAAGAAAAAAGAAAAGCTAGGAAACAAAAAGCTAAGGGCAAAATGAACAAACTAACAAGAAAATTGGAAGA', 12)], [('AAAAAACACATCAAAAAGCTACTAAAAGGACTGGTGTAATTTAAAAAAAACTAAGGCAGAAGGCTTTTGTAACAGTTAGAAGAATTAGGCA', 54)], [('GAAACCTGACGTTAGAAAGGCTCAACGAGAACAAGCTATCAGGGCTGCTAAGGAAGCAAAAAAGGCGAAGCAAGCATCTAAAAAGACTGCC', 23), ('CCATAGTAGGCCTAAAAGCAGCCACCAATTAAGAAAGCGTTCAAGCTCAACACCCACTACCTAAAAAATCCCAAACATATAACTGAACTCC', 95)], [('AGGGGGGGGGGGGGGGGGGGGTTGTGGATTATTTTTGAAAAAAAGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA', 88)], [('ACAAAATGTACAGATACAGAATTCTGTTCAAGCGAAGAATGATAAAGAGACAGAAGCACAGATATCATGGTATCCTCCATAGGATCATTGA', 75)], [('AAGGACATTGAATTGGTCATGTCACAAGCAAATGTGTCGAGAGCAAAGGCAGTCCGAGCCCTGAAGAACAACAGTAATGATATTGTAAATG', 66), ('GAACCAGAGCTTCTGCCCAACTGTCAACCTTGACAAATTGTGGACTTTGGTC

In [16]:
# Round-trip check: decoding the encoded clusters must reproduce the original
# clusters exactly (compared with ==).

assert cluster_of_sequences == decoded_clusters_compressed
