In [None]:
# Import necessary libraries and modules
import numpy as np
from encoder import Encoder
from mapper import Mapper, kmeans, dist, mindist, cluster, transform, groupings, compress_clusters, get_size_compressed
from vectorize import Mapper as VecMapper
from library.reader import Reader


In [None]:
# Encoder smoke test: push a small hand-written cluster through
# Encoder.encode_clusters and then back through Encoder.decode_clusters,
# printing both results so they can be compared by eye.
print("Testing Encoder Class")

# Input fixture: two groups, each a list of [sequence string, integer] pairs.
# (The meaning of the integer is defined by Encoder — not visible here.)
sample_cluster = [
    [
        ["ACGT", 1],
        ["TCGA", 1],
        ["ACGTGTCGAGTGT", 2],
    ],
    [
        ["ACGATGCGCGCTAGGT", 1],
        ["ACGTGTCGCGCAATCGCTAGAC", 2],
    ],
]

# Forward pass: encode the fixture.
# `encoded_cluster` is kept as a notebook-level name; a later cell reads it.
encoded_cluster = Encoder.encode_clusters(sample_cluster)
print("Encoded Cluster:", encoded_cluster)

# Reverse pass: decode what was just encoded.
decoded_cluster = Encoder.decode_clusters(encoded_cluster)
print("Decoded Cluster:", decoded_cluster)


In [None]:
# Mapper smoke test: build a Mapper over a few sequences, then run the
# module-level helpers (cluster, groupings, compress_clusters,
# get_size_compressed) and print each intermediate result.
print("\nTesting Mapper Class")

# Input fixture: five short strings over the alphabet A/C/G/T.
sample_sequences = [
    "ACGT",
    "TCGA",
    "ACGTGTCGAGTGT",
    "ACGATGCGCGCTAGGT",
    "ACGTGTCGCGCAATCGCTAGAC",
]

# The sequences are passed wrapped in an outer list, followed by two integer
# settings (their meaning is defined by Mapper — TODO confirm, e.g. k and m).
mapper = Mapper([sample_sequences], 2, 3)
print("Feature Set:", mapper.fs)

# Clustering is run on `mapper.hfs`, which is a different attribute from the
# `mapper.fs` printed above.
# NOTE(review): confirm `hfs` (not `fs`) is the intended input here.
clustered_data = cluster(mapper.hfs)
print("Clustered Data:", clustered_data)

# Map the clustering result back onto the original sequences.
grouped_sequences = groupings(clustered_data, sample_sequences)
print("Grouped Sequences:", grouped_sequences)

# Compression operates on `encoded_cluster`, which is NOT defined in this
# cell: it is produced by the Encoder test cell, so that cell must have run
# first. Nothing computed in this cell feeds into the compression step.
compressed_clusters = compress_clusters(encoded_cluster)
print("Compressed Clusters:", compressed_clusters)

# Report the size of the compressed representation.
compressed_size = get_size_compressed(compressed_clusters)
print("Compressed Size:", compressed_size)


In [None]:
# Test Vectorize Mapper Class
# Runs the vectorize pipeline on sequences read from a FASTA file:
# k-mers -> feature vectors -> highest-variance feature selection -> clustering,
# printing each intermediate result.
print("\nTesting Vectorize Mapper Class")

# Read sequences from a sample fasta file
# The result of read_fasta is unpacked into three values; only the first is
# used, and it is treated as a mapping whose values are collected into a list.
# The path is relative — presumably resolved against the notebook's working
# directory; confirm against Reader.
reader = Reader()
sequences_dict, _, _ = reader.read_fasta("sample/pathogen.fa")
sequences_list = list(sequences_dict.values())

# Create VecMapper object (this is vectorize.Mapper, imported under the alias
# VecMapper so it does not clash with mapper.Mapper)
vec_mapper = VecMapper()

# Create k-mers
# The second argument (2) is presumably the k-mer length — TODO confirm.
kmers = vec_mapper.make_kmers(sequences_list, 2)
print("K-mers:", kmers)

# Create feature vectors from the sequences and the k-mers built above
feature_vectors = vec_mapper.feature_vector(sequences_list, kmers)
print("Feature Vectors:", feature_vectors)

# Select features with highest variance
# The second argument (3) is presumably the number of features to keep — TODO confirm.
selected_features = vec_mapper.select_m_highest_variance(feature_vectors, 3)
print("Selected Features with Highest Variance:", selected_features)

# Perform clustering
# NOTE(review): `Cluster` is not among the names imported in the import cell at
# the top of this notebook, so on a fresh kernel (Restart & Run All) this line
# raises NameError. Add the appropriate import for `Cluster` to the import cell
# once its defining module is confirmed.
# The meaning of the 0.5 argument is defined by Cluster.k_means — not visible here.
cluster_set = Cluster().k_means(selected_features, 0.5)
print("Cluster Set:", cluster_set)
