### Showcases notebook

This notebook demonstrates example usage of each module in the project.

In [1]:
import os
import sys
import time
import datetime
import requests
import numpy as np
import time
from sklearn.datasets import make_classification
from concurrent.futures import ThreadPoolExecutor
from sklearn.tree import (DecisionTreeClassifier)
from Bio import SeqIO
from Bio.SeqUtils import GC
from functools import wraps
from io import StringIO
from dotenv import load_dotenv


from custom_random_forest import RandomForestClassifierCustom
from bio_files_processor import OpenFasta, FastaRecord, convert_multiline_fasta_to_oneline, change_fasta_start_pos
from bioinformatics_utils import (DNASequence, RNASequence, AminoAcidSequence, filter_fastq, 
                                  telegram_logger, send_telegram_message)

### Test 1. custom_random_forest.py

In [2]:
# Benchmark RandomForestClassifierCustom with n_jobs=1 versus n_jobs=2 and check
# that both configurations produce the same class probabilities.

# Seed the synthetic dataset so that every run works on the same data.
X, y = make_classification(n_samples=100000, random_state=42)
random_forest = RandomForestClassifierCustom(max_depth=30, n_estimators=10, max_features=2, random_state=42)

# time.perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed intervals; time.time() follows the wall clock and may jump.
start_time_single_thread_fit = time.perf_counter()
random_forest.fit(X, y, n_jobs=1)
end_time_single_thread_fit = time.perf_counter()

start_time_single_thread_predict = time.perf_counter()
predictions_single_thread = random_forest.predict_proba(X, n_jobs=1)
end_time_single_thread_predict = time.perf_counter()

# The same estimator object is refitted with n_jobs=2 on identical data.
start_time_multi_thread_fit = time.perf_counter()
random_forest.fit(X, y, n_jobs=2)
end_time_multi_thread_fit = time.perf_counter()

start_time_multi_thread_predict = time.perf_counter()
predictions_multi_thread = random_forest.predict_proba(X, n_jobs=2)
end_time_multi_thread_predict = time.perf_counter()

print('Execution time for 1 thread fit function:', end_time_single_thread_fit - start_time_single_thread_fit)
print('Execution time for 2 threads fit function:', end_time_multi_thread_fit - start_time_multi_thread_fit)

print('Execution time for 1 thread predict function:', end_time_single_thread_predict - start_time_single_thread_predict)
print('Execution time for 2 threads predict function:', end_time_multi_thread_predict - start_time_multi_thread_predict)

# Compare with an absolute tolerance instead of rounding and testing exact equality:
# two values that differ only by floating-point noise can still round to different
# 5-digit results when they straddle a rounding boundary. The explicit shape check
# prevents np.allclose from broadcasting arrays of different shapes.
match_predictions = (
    np.shape(predictions_single_thread) == np.shape(predictions_multi_thread)
    and np.allclose(predictions_single_thread, predictions_multi_thread, rtol=0.0, atol=1e-5)
)
print('Predictions are equal:', match_predictions)

Execution time for 1 thread fit function: 4.931288957595825
Execution time for 2 threads fit function: 2.7084004878997803
Execution time for 1 thread predict function: 0.10542893409729004
Execution time for 2 threads predict function: 0.12195634841918945
Predictions are equal: True


### Test 2. OpenFasta 

In [3]:
# Open the example FASTA through the OpenFasta context manager and print every
# record produced by read_records().
with OpenFasta('data/example_fasta.fasta') as fasta_handle:
    for fasta_record in fasta_handle.read_records():
        print(fasta_record)

id = GTD323452, description = 5S_rRNA NODE_272_length_223_cov_0.720238:18-129(+), seq = ACGGCCATAGGACTTTGAAAGCACCGCATCCCGTCCGATCTGCGAAGTTAACCAAGATGCCGCCTGGTTAGTACCATGGTGGGGGACCACATGGGAATCCCTGGTGCTGTG
id = GTD678345, description = 16S_rRNA NODE_80_length_720_cov_1.094737:313-719(+), seq = TTGGCTTCTTAGAGGGACTTTTGATGTTTAATCAAAGGAAGTTTGAGGCAATAACAGGTCTGTGATGCCCTTAGATGTTCTGGGCCGCACGCGCGCTACACTGAGCCCTTGGGAGTGGTCCATTTGAGCCGGCAACGGCACGTTTGGACTGCAAACTTGGGCAAACTTGGTCATTTAGAGGAAGTAAAAGTCGTAACAAGGT
id = GTD174893, description = 16S_rRNA NODE_1_length_2558431_cov_75.185164:2153860-2155398(+), seq = TTGAAGAGTTTGATCATGGCTCAGATTGAACGCTGGCGGCAGGCCTAACACATGCAAGTCGAACGGTAACAGGAAACAGCTTGCTGTTTCGCTGACGAGTGGGAAGTAGGTAGCTTAACCTTCGGGAGGGCGCTTACCACTTTGTGATTCATGACTGGGGTGAAGTCGTAACAAGGTAACCGTAGGGGAACCTGCGGTTGGATCACCTCCTT
id = GTD906783, description = 16S_rRNA NODE_1_length_2558431_cov_75.185164:793941-795479(-), seq = TTGAAGAGTTTGATCATGGCTCAGATTGAACGCTGGCGGCAGGCCTAACACATGCAAGTCGAACGGTAACAGGAAACAGCTTGCTGTTTCGCTGA

In [4]:
# Read just one record from the example FASTA with read_record() and print it.
with OpenFasta('data/example_fasta.fasta') as fasta_handle:
    single_record = fasta_handle.read_record()
    print(single_record)

id = GTD323452, description = 5S_rRNA NODE_272_length_223_cov_0.720238:18-129(+), seq = ACGGCCATAGGACTTTGAAAGCACCGCATCCCGTCCGATCTGCGAAGTTAACCAAGATGCCGCCTGGTTAGTACCATGGTGGGGGACCACATGGGAATCCCTGGTGCTGTG


### Test 3. convert_multiline_fasta_to_oneline and change_fasta_start_pos

In [5]:
# Run convert_multiline_fasta_to_oneline on the example FASTA.
# An example of the output is stored in the data directory.
input_fasta = 'data/example_fasta.fasta'
output_fasta = 'data/output_oneline.fasta'
convert_multiline_fasta_to_oneline(
    input_fasta,
    output_fasta,
)

In [6]:
# Run change_fasta_start_pos on the example FASTA with 2 as its second argument.
# An example of the output is stored in the data directory.
input_fasta = 'data/example_fasta.fasta'
output_fasta = 'data/output_changed_start_pos.fasta'
change_fasta_start_pos(
    input_fasta,
    2,
    output_fasta,
)

### Test 4. BiologicalSequence

In [7]:
# Demonstrate the sequence classes: DNA transcription, hydrophobic-residue counting
# for a peptide, and the error raised when an RNA complement is requested.

rna_seq = DNASequence('ATGCATGC').transcribe()
print(rna_seq)

amino_seq = AminoAcidSequence('ACDEFGHIKLMNPQRSTVWY')
print(amino_seq.count_hydrophobic_residues())

# RNASequence.complement() raises NotImplementedError. The error itself is the
# demonstration, so it is caught and printed; an uncaught exception would stop
# "Restart Kernel -> Run All" at this cell.
try:
    rna_complement = RNASequence('ATGCATGC').complement()
except NotImplementedError as error:
    print(f'{type(error).__name__}: {error}')
else:
    print(rna_complement)

AUGCAUGC
8


NotImplementedError: RNA complement is not defined.

### Test 5. filter_fastq

In [None]:
# Correct input format: filter the example FASTQ file. The call is the last
# expression of the cell, so its return value is shown as the cell output.
# NOTE(review): the positional arguments (30, 60), 100 and 30 are presumably the
# filtering thresholds -- confirm their meaning against filter_fastq's signature.
filter_fastq('data/example.fastq', (30, 60), 100, 30)

{'M71059:67:000000000-L7CLC:1:1104:21412:16850': ('TACGTAGGGGGCGAACATTGCTCGGAATTACTGGGCGTAAAGGGCGCGTAGGCGGCCTGCCAAGTCAGATGTGAAAGCCCACGGCTCAACCGTGGAAGTG',
  '\'\'%&!%&\x0e $\x0e\x0e$$$$\x1d&$\x10%\x1f!$$%$"!%\x1d&\'\'$$$$$&&%\'"%\x1f\x1f\x1f\x1b!"\x1e#"\x1e\x1e\x1f \r\x0f"&\'$%\x1b\x0f\x1c\x0f\x1c#\x1c#&\'%\x0f!&\x1b\x1b"""\x0c \r&\x19&\'\x1f"$& \x0f"\x18\x18'),
 'M71059:67:000000000-L7CLC:1:1106:8948:6488': ('TACGTAGGGGGCGAGCGTTGTCCGGAATCATTGGGCGTAAAGGGCGCGTAGGCGGCCTATTAAGTCAGATGTGAAAGCCCACGGCTTAACCGTGGAAGTG',
  '!&&$%$&$&&&"$\x0f$$&$$\x0e%\x12\x12\x1d\x0e\x10\x1d\x0e!\x1e&!\x1e\x12%\x0f$\x1b\x0e\x0e&$!%&&&"\x1f\x0e\x1f\x0e\x1b"\x0e\x0c\x1b""%%\'\x10%%%!&%%%\'\'\'\x0f&&%%\'"$"&&&"\'""\x0f %&&#!!\x0f\x1a'),
 'M71059:67:000000000-L7CLC:1:1108:14502:21447': ('TACGGAGGGTGCAAGCGTTATTCGGAATTATTGGGCGTAAAGGGCGCGTAGGCGGTCTTTTAAGTCAGATGTGAAAGCCCGGGGCTCAACCCCGGAAGTG',
  '%\x11\x10%$\x0e$&$\x0e \x0e% %$$\x0e$$%%!&%$&$\'&\'\'%\'$%"$$& #&\x0e\x1e\x1d$$&\x1e$&!$&&\x1e"%%\'\'\'\'\'%%&\x10%&\x1e#%\'%\x

In [None]:
# Incorrect input format: a FASTA file is passed where FASTQ is expected, which
# makes filter_fastq raise ValueError. The error is the demonstration, so it is
# caught and printed instead of being left as an uncaught exception that would
# stop "Restart Kernel -> Run All".
try:
    filter_fastq('data/output_oneline.fasta', (30, 60), 100, 30)
except ValueError as error:
    print(f'{type(error).__name__}: {error}')

ValueError: Records in Fastq files should start with '@' character

### Test 6. Telegram-logger

In [8]:
# Chat that receives the logger messages. Set the TELEGRAM_CHAT_ID environment
# variable to use your own chat without editing the notebook; the literal below
# is only the fallback value.
chat_id = os.getenv('TELEGRAM_CHAT_ID', '319825173')


@telegram_logger(chat_id)
def good_function():
    """Write to stdout and stderr, sleep for two seconds and finish normally."""
    print("This goes to stdout")
    print("And this goes to stderr", file=sys.stderr)
    time.sleep(2)
    print("Wake up, Neo")


@telegram_logger(chat_id)
def bad_function():
    """Write to stdout and stderr, then fail with RuntimeError."""
    print("Some text to stdout")
    time.sleep(2)
    print("Some text to stderr", file=sys.stderr)
    raise RuntimeError("Ooops, exception here!")
    # Deliberately unreachable: shows that nothing after the exception is logged.
    print("This text follows exception and should not appear in logs")


@telegram_logger(chat_id)
def long_lasting_function():
    """Sleep for 2000 seconds. Defined for demonstration; not called in this cell."""
    time.sleep(2000)


good_function()

try:
    bad_function()
except Exception as error:
    # The failure is expected here; report it instead of discarding it silently so
    # that an unexpected error type remains visible in the cell output.
    print(f'bad_function raised {type(error).__name__}: {error}')

Telegram-logger output example:

![logger example](data/pic.png)

### Unit tests

In [12]:
! python -m unittest discover -v

test_amino_acid_sequence_count_hydrophilic_residues (test_AminoAcidSequence.TestBiologicalSequences.test_amino_acid_sequence_count_hydrophilic_residues) ... ok
test_amino_acid_sequence_count_hydrophobic_residues (test_AminoAcidSequence.TestBiologicalSequences.test_amino_acid_sequence_count_hydrophobic_residues) ... FAIL
test_amino_acid_sequence_is_valid_alphabet (test_AminoAcidSequence.TestBiologicalSequences.test_amino_acid_sequence_is_valid_alphabet) ... ok
test_amino_acid_sequence_len (test_AminoAcidSequence.TestBiologicalSequences.test_amino_acid_sequence_len) ... ok
test_amino_acid_sequence_repr (test_AminoAcidSequence.TestBiologicalSequences.test_amino_acid_sequence_repr) ... ok
test_amino_acid_sequence_str (test_AminoAcidSequence.TestBiologicalSequences.test_amino_acid_sequence_str) ... ok
test_dna_sequence_complement (test_DNASequence.TestBiologicalSequences.test_dna_sequence_complement) ... ok
test_dna_sequence_gc_content (test_DNASequence.TestBiologicalSequences.test_dna_sequ