In [None]:
# Project description

"""
GeneVecTools
=============

This tool provides functionality for:
- Reading in a variety of genetic file types
- Vector embedding algorithms
- Byte array encoders
- Clustering and preprocessing steps for compression
- Similarity search tools for FASTA/FASTQ files

Installing
==========
Example test files: https://tinyurl.com/cDNALibraryExampleFiles

Install using pip:

.. code-block:: bash

    pip install GeneVecTools
"""


In [None]:
# Importing necessary modules

from GeneVecTools import simSearch, reader, mapper, encoder


In [None]:
# Path of the FASTQ file holding the genetic sequences. The same value is
# passed to simSearch.VecSS (as `f`) and to Reader.read_fastq further down.
# It is a relative path, so it is resolved against the kernel's current
# working directory.
# NOTE(review): presumably one of the example files linked in the project
# description — confirm.

file = "small_cDNA_Sequences_pbmc_1k_v2_S1_L002_R2_001.fastq"


In [None]:
# Build the similarity-search object (VecSS) for the chosen FASTQ file.
# All settings are passed by keyword, one per line, so each can be tuned
# independently.

VECSS = simSearch.VecSS(
    f=file,
    length=10000,
    encoding="one-hot-encoding",
    bits=8,
)


In [None]:
# Read the sequences from the file through the VecSS object.
#
# The result is bound to `vecss_sequences` rather than `sequences`:
# `sequences` is assigned further down from the reader.Reader output, and
# nothing in between reads it, so reusing the name here would make one
# identifier stand for two different results.
# NOTE(review): the following cells use `VECSS.s`, not this return value —
# presumably readq() also stores the sequences on the instance; confirm.

vecss_sequences = VECSS.readq()


In [None]:
# Embed the sequences held on the VecSS instance (`VECSS.s`) and print the
# result.
# NOTE(review): `VECSS.s` is presumably populated by the earlier readq()
# call — confirm. The whole embedding is printed, which may be large.

embedded = VECSS.embed(VECSS.s)
print("Embedded sequences:", embedded)


In [None]:
# Run the similarity search.
#
# run_search() returns three values, unpacked here as D, I and the time
# taken (printed below under "Differences", "Indices" and "Time taken").
# The third value is bound to `search_time` instead of `time` so the name of
# the standard-library `time` module is not shadowed in the kernel namespace.

D, I, search_time = VECSS.run_search()
print("Similarity search results:\nDifferences:", D, "\nIndices:", I, "\nTime taken:", search_time)


In [None]:
# Round-trip check: embedding VECSS.s and then unembedding it must give back
# VECSS.s. The message makes a failure self-explanatory in the cell output
# instead of a bare AssertionError.

assert VECSS.unembed(VECSS.embed(VECSS.s)) == VECSS.s, (
    "unembed(embed(VECSS.s)) did not reproduce VECSS.s"
)


In [None]:
# Load the same FASTQ file through reader.Reader and collect its sequences.

R = reader.Reader()

# read_fastq returns four values; only the first one (`mp`) is used below.
mp, count, total_len, quality = R.read_fastq(file)

# Materialise the values of `mp` as a list so they can be passed around and
# printed.
sequences = list(mp.values())
print("Extracted sequences:", sequences)


In [None]:
# Cluster the extracted sequences.
#
# 1. Build a Mapper from `sequences`.
#    NOTE(review): the meaning of the positional arguments 2 and 3 is not
#    visible here — TODO name them or pass them by keyword.
# 2. Run mapper.cluster on the Mapper's `hfs` attribute.
# 3. Pass the result of step 2 together with `sequences` to mapper.groupings
#    (presumably mapping the groups back onto the sequences — confirm).
# The final clustering is printed in full.

mapObj = mapper.Mapper(sequences, 2, 3)
groups_of_similar_kmers = mapper.cluster(mapObj.hfs)
cluster_of_sequences = mapper.groupings(groups_of_similar_kmers, sequences)
print("Clustered sequences:", cluster_of_sequences)


In [None]:
# Encode the extracted sequences with an Encoder instance and print the
# result.
# NOTE(review): the meaning of the constructor argument 4 is not visible
# here — TODO confirm and name it.
# `encoder_instance` is reused by the cluster encode/decode cells below.

encoder_instance = encoder.Encoder(4)
encoded_sequences = encoder_instance.encode_sequences(sequences)
print("Encoded sequences:", encoded_sequences)


In [None]:
# Encode the clustered sequences with the same Encoder instance and print
# the encoded form. The result is fed to decode_clusters in the next step.

encoded_clusters_compressed = encoder_instance.encode_clusters(cluster_of_sequences)
print("Compressed clusters:", encoded_clusters_compressed)


In [None]:
# Decode the encoded clusters with the same Encoder instance and print the
# result. It is compared against `cluster_of_sequences` in the final check.

decoded_clusters_compressed = encoder_instance.decode_clusters(encoded_clusters_compressed)
print("Decompressed clusters:", decoded_clusters_compressed)


In [None]:
# Round-trip check: decoding the encoded clusters must give back the
# original clustering. The message makes a failure self-explanatory in the
# cell output instead of a bare AssertionError.

assert cluster_of_sequences == decoded_clusters_compressed, (
    "decode_clusters(encode_clusters(cluster_of_sequences)) "
    "did not reproduce cluster_of_sequences"
)
