In [23]:
import timeit
import uuid
from collections import Counter
from pathlib import Path

import numpy as np
import pandas as pd
from numba import njit

from ShaidurovAlgorithm import get_convolution

In [24]:
# Experiment configuration: everything a reader might want to tune lives here.
# Alphabet symbols mapped to the probability used when sampling the source string.
symbol_to_probability = dict(A=0.2, C=0.3, G=0.3, T=0.2)
# Number of symbols in the generated source string.
source_string_length = 10_000
# Number of symbols in the inserted fragment.
insertion_length = 1
# Distance between consecutive insertion positions.
step = 5
# Half-width (in indices) of the window taken around the convolution maximum.
eps = 5
# Random identifier of this run; its integer value also seeds the random generator
# and names the exported workbook.
experiment_id = uuid.uuid4()
experiment_id

UUID('4fa3543c-82cd-4caa-99f3-3ead1c0bb0e6')

In [25]:
# functions for experiments
def calculate_ro(first, second, alphabet):
    """Return the dot product of the symbol-frequency vectors of two sequences.

    For every symbol in ``alphabet`` the relative frequency of that symbol in
    ``first`` is multiplied by its relative frequency in ``second``; the
    products are added up in alphabet order.

    Parameters
    ----------
    first, second : sized iterables of hashable symbols
        The two sequences to compare.
    alphabet : iterable
        Symbols to include in the product.

    Returns
    -------
    The accumulated sum (``0`` when ``alphabet`` is empty).
    """
    first_counts, second_counts = Counter(first), Counter(second)
    first_size, second_size = len(first), len(second)

    total = 0
    for symbol in alphabet:
        first_share = first_counts[symbol] / first_size
        second_share = second_counts[symbol] / second_size
        total += first_share * second_share
    return total

In [26]:
# Gather the run's parameters into one labelled Series so they can be
# displayed here and exported next to the results.
experiment_metadata = pd.Series(
    dict(
        experiment_id=experiment_id,
        source_string_length=source_string_length,
        insertion_length=insertion_length,
        probabilities=symbol_to_probability,
        step=step,
        eps=eps,
    ),
    name='value',
)
experiment_metadata

experiment_id               4fa3543c-82cd-4caa-99f3-3ead1c0bb0e6
source_string_length                                       10000
insertion_length                                               1
probabilities           {'A': 0.2, 'C': 0.3, 'G': 0.3, 'T': 0.2}
step                                                           5
eps                                                            5
Name: value, dtype: object

In [27]:
# Seed the generator with the integer value of the experiment UUID, then draw
# the source string symbol by symbol using the configured probabilities.
generator = np.random.default_rng(seed=experiment_id.int)
alphabet_symbols = list(symbol_to_probability)
symbol_weights = list(symbol_to_probability.values())
source_string = generator.choice(
    alphabet_symbols,
    size=source_string_length,
    p=symbol_weights,
)
source_string

array(['G', 'C', 'A', ..., 'G', 'C', 'A'], dtype='<U1')

In [28]:
# Draw the fragment that will be inserted into the source string.
# NOTE(review): no `p=` is passed, so symbols are drawn uniformly here, unlike
# source_string which uses symbol_to_probability — confirm this is intended.
inserted_string = generator.choice(
    list(symbol_to_probability),
    size=insertion_length,
)
inserted_string

array(['A'], dtype='<U1')

In [29]:
# Insertion indices: every `step`-th index, starting at `step` and stopping
# before `source_string_length - 1`.
positions = np.arange(step, source_string_length - 1, step)
positions

array([   5,   10,   15, ..., 9985, 9990, 9995])

In [30]:
# One variant of the source string per insertion position.
# np.insert returns a new array and never modifies its input, so the source
# string does not need to be copied before each insertion.
strings_with_insertion = [
    np.insert(source_string, pos, inserted_string) for pos in positions
]
# Sanity check: the first variant should match the source up to the first
# position and contain the inserted fragment there.
print(inserted_string)
print(source_string[0:11])
print(strings_with_insertion[0][:11])

['A']
['G' 'C' 'A' 'A' 'C' 'T' 'C' 'C' 'G' 'C' 'C']
['G' 'C' 'A' 'A' 'C' 'A' 'T' 'C' 'C' 'G' 'C']


In [31]:
# Frequency dot product between the source string and the FIRST insertion
# variant only; this single value is reused for every row of the results table.
ro = calculate_ro(
    first=source_string,
    second=strings_with_insertion[0],
    alphabet=symbol_to_probability.keys(),
)
ro

0.25824253574642536

In [32]:
# Convolve the source string with every insertion variant and stack the
# results into one array (one entry per insertion position).
convolution_rows = [
    get_convolution(source_string, variant)
    for variant in strings_with_insertion
]
convolutions = np.array(convolution_rows)
convolutions

array([[-1.70530257e-13, -2.27373675e-13,  3.00000000e+00, ...,
         3.00000000e+00,  1.70530257e-13,  2.84217094e-13],
       [ 0.00000000e+00, -1.13686838e-13,  3.00000000e+00, ...,
         3.00000000e+00, -1.13686838e-13,  3.41060513e-13],
       [ 0.00000000e+00, -1.13686838e-13,  3.00000000e+00, ...,
         3.00000000e+00,  0.00000000e+00, -1.13686838e-13],
       ...,
       [ 0.00000000e+00,  0.00000000e+00,  3.00000000e+00, ...,
         3.00000000e+00,  5.68434189e-14, -1.70530257e-13],
       [ 5.68434189e-14,  1.70530257e-13,  3.00000000e+00, ...,
         3.00000000e+00, -3.41060513e-13,  1.70530257e-13],
       [-5.68434189e-14, -1.13686838e-13,  3.00000000e+00, ...,
         3.00000000e+00,  2.84217094e-13, -1.70530257e-13]])

In [36]:
def _two_largest_near_peak(values, half_width):
    """Return ``(largest, second_largest)`` among the entries of ``values``
    lying within ``half_width`` indices of its global maximum."""
    peak = int(np.argmax(values))
    # Clamp the left edge at 0: a negative slice start would be counted from
    # the END of the array, giving an empty or wrong window whenever the peak
    # lies closer than `half_width` to index 0.
    window = values[max(peak - half_width, 0): peak + half_width + 1]
    # After partitioning at kth=-2 the last two entries are the two largest,
    # in ascending order.
    second_largest, largest = np.partition(window, kth=-2)[-2:]
    return largest, second_largest


# Two largest convolution values around each row's maximum, found once per row.
peak_pairs = np.array(
    [_two_largest_near_peak(c, eps) for c in convolutions], dtype=float
).reshape(-1, 2)
largest_peak, second_peak = peak_pairs[:, 0], peak_pairs[:, 1]

# Check l2 > l1 and swap the values: the larger peak is assigned to whichever
# of l1 / l2 is larger (b1 gets it when l1 >= l2, b2 gets it when l2 > l1).
l2_is_larger = positions > source_string_length - positions

convolution_results = pd.DataFrame(
    {
        'l1': source_string_length - positions,
        'l2': positions,
        'b1': np.where(l2_is_larger, second_peak, largest_peak),
        'b2': np.where(l2_is_larger, largest_peak, second_peak),
    }
).assign(
    **{
        # Estimates of l1 / l2 from the peak values and ro.
        'l1 estimate': lambda df: (df['b1'] - df['b2'] * ro) / (1 - ro ** 2),
        'l2 estimate': lambda df: (df['b2'] - df['b1'] * ro) / (1 - ro ** 2),
        # Relative errors of the estimates, in percent of the true value.
        'l1 error': lambda df: np.abs(df['l1 estimate'] - df['l1']) * 100 / df['l1'],
        'l2 error': lambda df: np.abs(df['l2 estimate'] - df['l2']) * 100 / df['l2'],
        'errors sum': lambda df: df['l1 error'] + df['l2 error'],
    }
)

convolution_results

Unnamed: 0,l1,l2,b1,b2,l1 estimate,l2 estimate,l1 error,l2 error,errors sum
0,9995,5,9996.0,2610.0,9988.084413,30.651754,0.069190,513.035077,513.104267
1,9990,10,9992.0,2611.0,9983.521900,32.829989,0.064846,228.299887,228.364733
2,9985,15,9991.0,2613.0,9981.897056,35.249593,0.031076,134.997285,135.028361
3,9980,20,9987.0,2614.0,9977.334543,37.427828,0.026708,87.139138,87.165846
4,9975,25,9984.0,2614.0,9974.120180,38.257913,0.008820,53.031652,53.040472
...,...,...,...,...,...,...,...,...,...
1994,25,9975,2614.0,9979.0,39.641389,9968.762907,58.565554,0.062527,58.628082
1995,20,9980,2612.0,9984.0,36.115004,9974.673570,80.575021,0.053371,80.628392
1996,15,9985,2608.0,9988.0,30.722406,9980.066168,104.816039,0.049412,104.865451
1997,10,9990,2609.0,9992.0,30.687080,9984.075291,206.870799,0.059306,206.930105


In [37]:
# Summary statistics of every column of the results table, as produced by
# DataFrame.describe(); also exported to the workbook below.
statistics = convolution_results.describe()
statistics

Unnamed: 0,l1,l2,b1,b2,l1 estimate,l2 estimate,l1 error,l2 error,errors sum
count,1999.0,1999.0,1999.0,1999.0,1999.0,1999.0,1999.0,1999.0,1999.0
mean,5000.0,5000.0,6252.364182,6280.677839,4961.290544,4999.461588,2.572643,2.355649,4.928292
std,2886.029568,2886.029568,2151.135928,2151.12479,2900.047622,2900.03877,13.10467,14.034716,18.91497
min,5.0,5.0,2608.0,2610.0,29.856995,30.651754,0.001114,1.4e-05,0.548447
25%,2502.5,2502.5,4379.5,4424.0,2436.954536,2496.532149,0.419584,0.161758,1.307478
50%,5000.0,5000.0,6267.0,6266.0,4981.033443,4979.685293,1.034418,0.335076,2.100372
75%,7497.5,7497.5,8108.5,8151.5,7463.789207,7522.174681,2.402075,0.60842,3.311992
max,9995.0,9995.0,9996.0,9995.0,9988.084413,9987.289654,497.13989,513.035077,513.104267


In [35]:
# Export the results, their summary statistics and the run metadata into one
# workbook named after the experiment id.
# NOTE(review): this cell's execution count (35) is lower than that of the
# cells producing convolution_results / statistics (36, 37) — re-run it after
# them so the saved workbook is not stale.
# Create the output directory first; ExcelWriter fails if it does not exist.
Path('artifacts').mkdir(parents=True, exist_ok=True)
with pd.ExcelWriter(f'artifacts/{experiment_id}.xlsx') as writer:
    convolution_results.to_excel(writer, sheet_name='Conclusions table')
    statistics.to_excel(writer, sheet_name='Statistics table')
    experiment_metadata.to_excel(writer, sheet_name='Metadata')