In [1]:
import pandas as pd
import numpy as np
from multiprocessing import Pool, cpu_count
from functools import partial
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_context("paper", font_scale=2.0)
sns.set_style('whitegrid')
from itertools import product

from functions import *

from pandarallel import pandarallel

# Initialise pandarallel with its progress bar disabled.
pandarallel.initialize(progress_bar=False)

# Read every record of the FASTA file into a list.
# NOTE(review): `SequenceFile` and `alphabet` are not imported by name in this cell;
# presumably they are provided by `from functions import *` -- confirm.
with SequenceFile("data/uniprot-9606.fasta", digital=True, alphabet=alphabet) as seq_file:
    sequences = list(seq_file)



INFO: Pandarallel will run on 48 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [2]:
# Load the same FASTA file through `fasta_reader`; the preview below shows the
# resulting `Accession` and `Sequence` columns.
uniprot = fasta_reader('data/uniprot-9606.fasta')
uniprot.head(3)

Unnamed: 0,Accession,Sequence
0,A0A087X1C5,MGLEALVPLAMIVAIFLLLVDLMHRHQRWAARYPPGPLPLPGLGNL...
1,A0A0B4J2F0,MFRRLTFAQLLFATVLGIAGGVYIFQPVFEQYAKDQKELKEKMQLV...
2,A0A0B4J2F2,MVIMSEFSADPAGQGQGQQKPLRVGFYDIERTLGKGNFAVVKLARH...


#### Results for full length all AA

In [3]:
# Alphabet label and max-probability settings that appear in the result file names.
aa_known = 'ACDEFGHIKLMNPQRSTVWY'
prob_range = [0.9, 0.8, 0.6, 0.4, 0.2, 0.3, 0.1, 0.09, 0.08, 0.07, 0.06, 0.05]

# One pickled full-length scan result per probability setting, in `prob_range` order.
all_files = [
    'results/scan_results/' + f'full_length_{aa_known}_max_prob_{prob}_del.pkl.gz'
    for prob in prob_range
]
result_dfs = list(map(pd.read_pickle, all_files))

# One integer index per row of `uniprot`; passed to `gen_stats` in the next cell.
hmms = [*range(len(uniprot))]

In [4]:
# Summary table: one row per loaded scan result (i.e. per max-probability setting).
plot_df = pd.DataFrame(
    columns=['known aa', 'max_prob', 'total queries', 'hits', 'identified']
)

for row_no, scan_df in enumerate(result_dfs):
    # `gen_stats` yields a mapping with 'Total queries', 'Hits' and 'Identified' entries.
    stats = gen_stats(hmms, scan_df)
    plot_df.loc[row_no] = [
        aa_known,
        prob_range[row_no],
        stats['Total queries'],
        stats['Hits'],
        stats['Identified'],
    ]

# Fraction of all queries that were identified.
plot_df['identified_frac'] = plot_df['identified'] / plot_df['total queries']
plot_df['Type'] = 'Full length'
plot_df

Unnamed: 0,known aa,max_prob,total queries,hits,identified,identified_frac,Type
0,ACDEFGHIKLMNPQRSTVWY,0.9,20181,20178,19985,0.990288,Full length
1,ACDEFGHIKLMNPQRSTVWY,0.8,20181,20179,19983,0.990189,Full length
2,ACDEFGHIKLMNPQRSTVWY,0.6,20181,20179,19937,0.987909,Full length
3,ACDEFGHIKLMNPQRSTVWY,0.4,20181,20178,19879,0.985035,Full length
4,ACDEFGHIKLMNPQRSTVWY,0.2,20181,20180,19858,0.983995,Full length
5,ACDEFGHIKLMNPQRSTVWY,0.3,20181,20178,19868,0.98449,Full length
6,ACDEFGHIKLMNPQRSTVWY,0.1,20181,20125,19930,0.987563,Full length
7,ACDEFGHIKLMNPQRSTVWY,0.09,20181,20095,19858,0.983995,Full length
8,ACDEFGHIKLMNPQRSTVWY,0.08,20181,20107,19373,0.959962,Full length
9,ACDEFGHIKLMNPQRSTVWY,0.07,20181,20137,14911,0.738863,Full length


In [5]:
# Persist the full-length summary table as a gzip-compressed pickle.
plot_df.to_pickle('results/full_length_all_p_max.pkl.gz')

In [8]:
# Drop the list of loaded scan DataFrames; only the summary in `plot_df` is used from here on.
del result_dfs

#### Results for fragment all AA

In [17]:
# Summarise the fragment scans for every fragment length, repeat and max-probability
# setting. Each scan result is reduced to its `gen_stats` counts as soon as it is
# loaded, so only one result frame is alive at a time (previously all
# repeats x probabilities for a length were kept in memory before any statistics
# were computed).
frag_plot_dfs = []

lengths = [100, 50, 25, 15, 10, 5]

# Settings that do not depend on the fragment length are defined once, outside the loop.
aa_known = 'ACDEFGHIKLMNPQRSTVWY'
prob_range = [0.9, 0.8, 0.6, 0.4, 0.2, 0.3, 0.1, 0.09, 0.08, 0.07, 0.06, 0.05]
repeats = 10
# One integer index per row of `uniprot`; passed to `gen_stats`.
hmms = list(range(uniprot.shape[0]))

for length in lengths:
    plot_dfs1 = []
    for frag in range(repeats):
        # One summary frame per repeat, one row per probability setting.
        temp_df = pd.DataFrame(columns=['known aa', 'max_prob',
                                        'total queries', 'hits', 'identified', 'fragment#'])
        for i, prob in enumerate(prob_range):
            fname = ('results/scan_results/'
                     + f'{frag}_frag_length_{length}_{aa_known}_max_prob_{prob}_del.pkl.gz')
            result_df = pd.read_pickle(fname)
            stats = gen_stats(hmms, result_df)
            temp_df.loc[i] = [aa_known, prob, stats['Total queries'],
                              stats['Hits'], stats['Identified'], frag]
            # Release the scan frame before the next one is loaded.
            del result_df
        temp_df['identified_frac'] = temp_df['identified'] / temp_df['total queries']
        plot_dfs1.append(temp_df)
    plot_df1 = pd.concat(plot_dfs1).reset_index(drop=True)
    plot_df1['Type'] = f'{length} AA'
    frag_plot_dfs.append(plot_df1)

In [18]:
# Stack the per-length summaries into one table, save it, and preview the first rows.
plot_df_ = pd.concat(frag_plot_dfs).reset_index(drop=True)
plot_df_.to_pickle('results/all_fragments_all_AA_no_indels.pkl.gz')
plot_df_.head(5)

Unnamed: 0,known aa,max_prob,total queries,hits,identified,fragment#,identified_frac,Type
0,ACDEFGHIKLMNPQRSTVWY,0.9,20181,20178,19446,0,0.96358,100 AA
1,ACDEFGHIKLMNPQRSTVWY,0.8,20181,20179,19436,0,0.963084,100 AA
2,ACDEFGHIKLMNPQRSTVWY,0.6,20181,20179,19342,0,0.958426,100 AA
3,ACDEFGHIKLMNPQRSTVWY,0.4,20181,20178,19310,0,0.956841,100 AA
4,ACDEFGHIKLMNPQRSTVWY,0.2,20181,20180,19333,0,0.95798,100 AA


In [19]:
# Reload the saved fragment summary and append it to the full-length summary (`plot_df`).
# `plot_df` has no 'fragment#' column, so that column is empty for the full-length rows.
plot_df_ = pd.read_pickle('results/all_fragments_all_AA_no_indels.pkl.gz')
frag_and_full = pd.concat([plot_df, plot_df_]).reset_index(drop=True)
frag_and_full

Unnamed: 0,known aa,max_prob,total queries,hits,identified,identified_frac,Type,fragment#
0,ACDEFGHIKLMNPQRSTVWY,0.90,20181,20178,19985,0.990288,Full length,
1,ACDEFGHIKLMNPQRSTVWY,0.80,20181,20179,19983,0.990189,Full length,
2,ACDEFGHIKLMNPQRSTVWY,0.60,20181,20179,19937,0.987909,Full length,
3,ACDEFGHIKLMNPQRSTVWY,0.40,20181,20178,19879,0.985035,Full length,
4,ACDEFGHIKLMNPQRSTVWY,0.20,20181,20180,19858,0.983995,Full length,
...,...,...,...,...,...,...,...,...
727,ACDEFGHIKLMNPQRSTVWY,0.09,20181,1518,0,0.0,5 AA,9
728,ACDEFGHIKLMNPQRSTVWY,0.08,20181,2204,2,0.000099,5 AA,9
729,ACDEFGHIKLMNPQRSTVWY,0.07,20181,7316,3,0.000149,5 AA,9
730,ACDEFGHIKLMNPQRSTVWY,0.06,20181,19834,2,0.000099,5 AA,9


In [20]:
# Persist the combined full-length + fragment summary table.
frag_and_full.to_pickle('results/frag_and_full_len_all_aa_no_indels_.pkl.gz')

In [21]:
# Sanity check: every max_prob value should occur 6 lengths x 10 repeats = 60 times.
plot_df_['max_prob'].value_counts()

0.90    60
0.80    60
0.60    60
0.40    60
0.20    60
0.30    60
0.10    60
0.09    60
0.08    60
0.07    60
0.06    60
0.05    60
Name: max_prob, dtype: int64

#### Times each sequence is identified from 10 fragments

In [3]:
# For every fragment length and max-probability setting, count in how many of the
# 10 fragment repeats each UniProt accession is recovered as its own best hit.
aa_known = 'ACDEFGHIKLMNPQRSTVWY'
prob_range = [0.9, 0.8, 0.6, 0.4, 0.2, 0.3, 0.1, 0.09, 0.08, 0.07, 0.06, 0.05]
repeats = 10


lengths = [100, 50, 25, 15, 10, 5]
for l in lengths:
    # counts per repeat, one frame per probability setting
    cpr_dfs = []

    for p in prob_range:
        # Start from every accession so sequences that are never hit still get a row.
        cpr = pd.DataFrame({'Query': uniprot.Accession})
        for r in range(repeats):
            fname = 'results/scan_results/' + f'{r}_frag_length_{l}_{aa_known}_max_prob_{p}_del.pkl.gz'
            result_df = pd.read_pickle(fname)
            # Collapse repeated (Query, Accession) rows to their maximum E-value / Score ...
            tt = result_df.groupby(['Query', 'Accession'],
                                   sort=False)[['E-value', 'Score']].max()
            # ... then keep, for each query, the accession with the smallest such E-value.
            tt = tt.loc[tt.groupby(['Query'], sort=False)['E-value'].idxmin()].reset_index()
            # Left-merge only the self-hits: the column added for repeat r holds the
            # accession when the best hit equals the query and NaN otherwise.
            cpr = pd.merge(cpr, tt[tt.Query == tt.Accession][['Query', 'Accession']], on='Query',
                           suffixes=('', str(r)), how='left')
        # Count the per-repeat columns equal to the query accession. This replaces the
        # row-wise `x == x[0]` lambda, which relied on positional Series.__getitem__
        # (deprecated in pandas 2.x); the resulting counts are the same.
        cpr['Times identified'] = cpr.drop(columns='Query').eq(cpr['Query'], axis=0).sum(axis=1)
        cpr['Prob'] = p
        cpr_dfs.append(cpr)

    cpr_df = pd.concat(cpr_dfs).reset_index(drop=True)
    cpr_df.to_pickle(f'results/length_{l}_times_identified_from_10_fragments.pkl.gz')

In [5]:
lengths = [100, 50, 25, 15, 10, 5]

# Reload each per-length table and tag its rows with the fragment length.
times_dfs = [
    pd.read_pickle(f'results/length_{l}_times_identified_from_10_fragments.pkl.gz').assign(Length=l)
    for l in lengths
]
# Stack all lengths and expose the probability column under the name 'Posterior'.
times_df = pd.concat(times_dfs).rename(columns={'Prob': 'Posterior'})
times_df.to_pickle('results/all_times_identified_from_10_frag.pkl.gz')

#### Results for full length reduced AA

In [43]:
# Summarise the full-length scans for every reduced amino-acid alphabet.
reduced_AA_full_length_dfs = []

aa_knowns = ['L', 'LS', 'LSE', 'C', 'K', 'CK', 'KY', 'YC', 'CKY', 'W',
             'WM', 'WMC', 'LSEA', 'Y', 'WMCH', 'LSEAG', 'WMCHY']

for aa_known in aa_knowns:
    prob_range = [0.9, 0.8, 0.6, 0.4, 0.2, 0.1, 0.09, 0.08, 0.07, 0.06, 0.05]
    # One integer index per row of `uniprot`; passed to `gen_stats`.
    hmms = list(range(uniprot.shape[0]))
    repeats = 1

    # Load phase: one scan frame per probability setting
    # (the full-length file names carry no repeat index).
    all_result_dfs = []
    for frag in range(repeats):
        files = []
        for prob in prob_range:
            files.append('results/scan_results/' + f'full_length_{aa_known}_max_prob_{prob}_del.pkl.gz')
        result_dfs = list(map(pd.read_pickle, files))
        all_result_dfs.append(result_dfs)

    # Summary phase: one row per loaded frame.
    plot_dfs1 = []
    for frag, loaded in enumerate(all_result_dfs):
        temp_df = pd.DataFrame(
            columns=['Reduced AA', 'max_prob', 'total queries', 'hits', 'identified', 'fragment#']
        )
        for i, scan_df in enumerate(loaded):
            stats = gen_stats(hmms, scan_df)
            temp_df.loc[i] = [
                aa_known,
                prob_range[i],
                stats['Total queries'],
                stats['Hits'],
                stats['Identified'],
                frag,
            ]
        temp_df['identified_frac'] = temp_df['identified'] / temp_df['total queries']
        plot_dfs1.append(temp_df)

    plot_df1 = pd.concat(plot_dfs1).reset_index(drop=True)
    reduced_AA_full_length_dfs.append(plot_df1)
    # Drop the loaded frames before moving on to the next alphabet.
    del all_result_dfs, result_dfs

In [44]:
# Stack the per-alphabet summaries into one table and preview it.
final_df = pd.concat(reduced_AA_full_length_dfs).reset_index(drop=True)
# final_df = final_df.rename(columns={'Known AA': 'Reduced AA'})
final_df.head(2)

Unnamed: 0,Reduced AA,max_prob,total queries,hits,identified,fragment#,identified_frac
0,L,0.9,20181,20125,15795,0,0.782667
1,L,0.8,20181,20129,15137,0,0.750062


In [28]:
# 'CK', 'KY', 'YC', 'LSEA', 'LSEAG', 'WMCH', 'WMCHY'

In [47]:

# Map every reduced alphabet to the family it is grouped under for plotting.
reduced_aa_type = {
    aa: family
    for family, members in (
        ('LSEAG', ('L', 'LS', 'LSE', 'LSEA', 'LSEAG')),
        ('CKY', ('C', 'K', 'Y', 'CK', 'KY', 'YC', 'CKY')),
        ('WMCHY', ('W', 'WM', 'WMC', 'WMCH', 'WMCHY')),
    )
    for aa in members
}

# Add the alphabet size and the family label, then rename max_prob to 'Posterior'.
long_plot = (
    final_df
    .assign(**{
        '# Reduced AA': final_df['Reduced AA'].apply(len),
        ' Reduced AA ': final_df['Reduced AA'].apply(reduced_aa_type.__getitem__),
    })
    .rename(columns={'max_prob': 'Posterior'})
)
long_plot.head(5)

Unnamed: 0,Reduced AA,Posterior,total queries,hits,identified,fragment#,identified_frac,# Reduced AA,Reduced AA.1
0,L,0.9,20181,20125,15795,0,0.782667,1,LSEAG
1,L,0.8,20181,20129,15137,0,0.750062,1,LSEAG
2,L,0.6,20181,20141,12772,0,0.632873,1,LSEAG
3,L,0.4,20181,20145,8090,0,0.400872,1,LSEAG
4,L,0.2,20181,20145,509,0,0.025222,1,LSEAG


In [48]:
# Persist the full-length reduced-alphabet table.
long_plot.to_pickle('results/full_length_reduced_AA_no_indels.pkl.gz')

In [None]:
# sns.choose_colorbrewer_palette('qualitative',)

#### Results for fragment length 50 reduced AA

In [9]:
# Summarise the 50-residue fragment scans for every reduced amino-acid alphabet.
frag_length = 50
reduced_AA_frag_length_50_dfs = []

aa_knowns = ['L', 'LS', 'C', 'K', 'Y', 'CK', 'KY', 'YC', 'LSE', 'W', 'CKY',
             'WM', 'WMC', 'LSEA', 'WMCH', 'WMCHY', 'LSEAG']

for aa_known in aa_knowns:
    prob_range = [0.9, 0.8, 0.6, 0.4, 0.2, 0.1, 0.09, 0.08, 0.07, 0.06, 0.05]
    # One integer index per row of `uniprot`; passed to `gen_stats`.
    hmms = list(range(uniprot.shape[0]))
    repeats = 1

    # Load phase: for each repeat, one scan frame per probability setting.
    all_result_dfs = []
    for frag in range(repeats):
        files = []
        for prob in prob_range:
            files.append('results/scan_results/' + f'{frag}_frag_length_{frag_length}_{aa_known}_max_prob_{prob}_del.pkl.gz')
        result_dfs = list(map(pd.read_pickle, files))
        all_result_dfs.append(result_dfs)

    # Summary phase: one row per loaded frame.
    plot_dfs1 = []
    for frag, loaded in enumerate(all_result_dfs):
        temp_df = pd.DataFrame(
            columns=['Reduced AA', 'max_prob', 'total queries', 'hits', 'identified', 'fragment#']
        )
        for i, scan_df in enumerate(loaded):
            stats = gen_stats(hmms, scan_df)
            temp_df.loc[i] = [
                aa_known,
                prob_range[i],
                stats['Total queries'],
                stats['Hits'],
                stats['Identified'],
                frag,
            ]
        temp_df['identified_frac'] = temp_df['identified'] / temp_df['total queries']
        plot_dfs1.append(temp_df)

    plot_df1 = pd.concat(plot_dfs1).reset_index(drop=True)
    reduced_AA_frag_length_50_dfs.append(plot_df1)
    # Drop the loaded frames before moving on to the next alphabet.
    del all_result_dfs, result_dfs

In [10]:
# Stack the per-alphabet summaries for the 50-residue fragments into one table.
final_df_frag_50 = pd.concat(reduced_AA_frag_length_50_dfs).reset_index(drop=True)
final_df_frag_50

Unnamed: 0,Reduced AA,max_prob,total queries,hits,identified,fragment#,identified_frac
0,L,0.90,20181,16153,337,0,0.016699
1,L,0.80,20181,17653,316,0,0.015658
2,L,0.60,20181,20109,90,0,0.00446
3,L,0.40,20181,20145,12,0,0.000595
4,L,0.20,20181,20145,1,0,0.00005
...,...,...,...,...,...,...,...
182,LSEAG,0.09,20181,20036,1,0,0.00005
183,LSEAG,0.08,20181,20138,0,0,0.0
184,LSEAG,0.07,20181,20140,1,0,0.00005
185,LSEAG,0.06,20181,20141,1,0,0.00005


In [12]:

# Map every reduced alphabet to the family it is grouped under for plotting.
reduced_aa_type = {
    aa: family
    for family, members in (
        ('LSEAG', ('L', 'LS', 'LSE', 'LSEA', 'LSEAG')),
        ('CKY', ('C', 'K', 'Y', 'CK', 'KY', 'YC', 'CKY')),
        ('WMCHY', ('W', 'WM', 'WMC', 'WMCH', 'WMCHY')),
    )
    for aa in members
}

# Add the alphabet size and the family label, then rename max_prob to 'Posterior'.
long_plot_frag_50 = (
    final_df_frag_50
    .assign(**{
        '# Reduced AA': final_df_frag_50['Reduced AA'].apply(len),
        ' Reduced AA ': final_df_frag_50['Reduced AA'].apply(reduced_aa_type.__getitem__),
    })
    .rename(columns={'max_prob': 'Posterior'})
)
long_plot_frag_50

Unnamed: 0,Reduced AA,Posterior,total queries,hits,identified,fragment#,identified_frac,# Reduced AA,Reduced AA.1
0,L,0.90,20181,16153,337,0,0.016699,1,LSEAG
1,L,0.80,20181,17653,316,0,0.015658,1,LSEAG
2,L,0.60,20181,20109,90,0,0.00446,1,LSEAG
3,L,0.40,20181,20145,12,0,0.000595,1,LSEAG
4,L,0.20,20181,20145,1,0,0.00005,1,LSEAG
...,...,...,...,...,...,...,...,...,...
182,LSEAG,0.09,20181,20036,1,0,0.00005,5,LSEAG
183,LSEAG,0.08,20181,20138,0,0,0.0,5,LSEAG
184,LSEAG,0.07,20181,20140,1,0,0.00005,5,LSEAG
185,LSEAG,0.06,20181,20141,1,0,0.00005,5,LSEAG


In [58]:
# Persist the 50-residue-fragment reduced-alphabet table.
long_plot_frag_50.to_pickle('results/50_AA_frag_reduced_AA_no_indels.pkl.gz')

In [18]:
# long_plot_frag_50[long_plot_frag_50['Reduced AA'] == 'LSEAG']

#### Results for fragment length 100 reduced AA

In [3]:
# Summarise the 100-residue fragment scans for every reduced amino-acid alphabet.
frag_length = 100
reduced_AA_frag_length_100_dfs = []

aa_knowns = ['L', 'LS', 'C', 'K', 'Y', 'CK', 'KY', 'YC', 'LSE', 'CKY', 'W',
             'WM', 'WMC', 'LSEA', 'WMCH', 'WMCHY', 'LSEAG']

for aa_known in aa_knowns:
    prob_range = [0.9, 0.8, 0.6, 0.4, 0.2, 0.1, 0.09, 0.08, 0.07, 0.06, 0.05]
    # One integer index per row of `uniprot`; passed to `gen_stats`.
    hmms = list(range(uniprot.shape[0]))
    repeats = 1

    # Load phase: for each repeat, one scan frame per probability setting.
    all_result_dfs = []
    for frag in range(repeats):
        files = []
        for prob in prob_range:
            files.append('results/scan_results/' + f'{frag}_frag_length_{frag_length}_{aa_known}_max_prob_{prob}_del.pkl.gz')
        result_dfs = list(map(pd.read_pickle, files))
        all_result_dfs.append(result_dfs)

    # Summary phase: one row per loaded frame.
    plot_dfs1 = []
    for frag, loaded in enumerate(all_result_dfs):
        temp_df = pd.DataFrame(
            columns=['Reduced AA', 'max_prob', 'total queries', 'hits', 'identified', 'fragment#']
        )
        for i, scan_df in enumerate(loaded):
            stats = gen_stats(hmms, scan_df)
            temp_df.loc[i] = [
                aa_known,
                prob_range[i],
                stats['Total queries'],
                stats['Hits'],
                stats['Identified'],
                frag,
            ]
        temp_df['identified_frac'] = temp_df['identified'] / temp_df['total queries']
        plot_dfs1.append(temp_df)

    plot_df1 = pd.concat(plot_dfs1).reset_index(drop=True)
    reduced_AA_frag_length_100_dfs.append(plot_df1)
    # Drop the loaded frames before moving on to the next alphabet.
    del all_result_dfs, result_dfs

In [4]:
# Stack the per-alphabet summaries for the 100-residue fragments and preview the result.
final_df_frag_100 = pd.concat(reduced_AA_frag_length_100_dfs).reset_index(drop=True)
final_df_frag_100.head(5)

Unnamed: 0,Reduced AA,max_prob,total queries,hits,identified,fragment#,identified_frac
0,L,0.9,20181,20122,3589,0,0.177841
1,L,0.8,20181,20128,2984,0,0.147862
2,L,0.6,20181,20141,1434,0,0.071057
3,L,0.4,20181,20145,255,0,0.012636
4,L,0.2,20181,20145,5,0,0.000248


In [6]:

# Map every reduced alphabet to the family it is grouped under for plotting.
reduced_aa_type = {
    aa: family
    for family, members in (
        ('LSEAG', ('L', 'LS', 'LSE', 'LSEA', 'LSEAG')),
        ('CKY', ('C', 'K', 'Y', 'CK', 'KY', 'YC', 'CKY')),
        ('WMCHY', ('W', 'WM', 'WMC', 'WMCH', 'WMCHY')),
    )
    for aa in members
}

# Add the alphabet size and the family label, then rename max_prob to 'Posterior'.
long_plot_frag_100 = (
    final_df_frag_100
    .assign(**{
        '# Reduced AA': final_df_frag_100['Reduced AA'].apply(len),
        ' Reduced AA ': final_df_frag_100['Reduced AA'].apply(reduced_aa_type.__getitem__),
    })
    .rename(columns={'max_prob': 'Posterior'})
)
long_plot_frag_100.head(10)

Unnamed: 0,Reduced AA,Posterior,total queries,hits,identified,fragment#,identified_frac,# Reduced AA,Reduced AA.1
0,L,0.9,20181,20122,3589,0,0.177841,1,LSEAG
1,L,0.8,20181,20128,2984,0,0.147862,1,LSEAG
2,L,0.6,20181,20141,1434,0,0.071057,1,LSEAG
3,L,0.4,20181,20145,255,0,0.012636,1,LSEAG
4,L,0.2,20181,20145,5,0,0.000248,1,LSEAG
5,L,0.1,20181,20140,1,0,5e-05,1,LSEAG
6,L,0.09,20181,20140,0,0,0.0,1,LSEAG
7,L,0.08,20181,20140,2,0,9.9e-05,1,LSEAG
8,L,0.07,20181,20140,1,0,5e-05,1,LSEAG
9,L,0.06,20181,20140,1,0,5e-05,1,LSEAG


In [7]:
# Persist the 100-residue-fragment reduced-alphabet table.
long_plot_frag_100.to_pickle('results/100_AA_frag_reduced_AA_no_indels.pkl.gz')

### Errors

#### full length sequence

In [35]:
import os

# Reference timestamp: modification time of the full-length LSEAG scan result at
# max_prob 0.8 with ins = 0 and del = 0.
aa_known = 'LSEAG'
rep = 0
prob = 0.8
ins_rate = 0
del_rate = 0
fname = f'{rep}_full_length_{aa_known}_max_prob_{prob}_ins_{ins_rate}_del_{del_rate}_1_ins.pkl.gz'
# Stat the file once; the earlier duplicate call discarded its result, and the
# unused `frag` variable has been dropped.
start_time = os.path.getmtime('results/scan_results/' + fname)
start_time

1694939293.980853

In [36]:
# start_time

1694939293.980853

In [27]:
from glob import glob

# Build the insertion/deletion heatmap table for full-length scans at max_prob 0.8.
# The former `res = ...` / `hmms = ...` lines were removed: they read a `results`
# list whose defining line is commented out (NameError on a fresh kernel) and their
# values were never used.
cols = [0, 10, 20, 30, 40, 50, 60]
# Every (insertion, deletion) setting pair.
all_indels = list(product(cols, cols))

aa_knowns = ['LSEAG', 'CKY', 'WMCHY', 'ACDEFGHIKLMNPQRSTVWY']
prob_range = [0.8,]
repeats = 1

# One integer index per row of `uniprot`; passed to `gen_stats` (loop-invariant).
hmms = list(range(uniprot.shape[0]))

heatmap_df = pd.DataFrame(columns=['insertions', 'deletions', 'known aa',
                                   'total queries', 'hits', 'identified', 'Type'])

# (file name, error) for every combination that could not be summarised. Failures are
# still tolerated as before, but they are now recorded and reported instead of hidden.
skipped = []

idx = 0
for rep in range(repeats):
    for aa_known in aa_knowns:
        for ins_rate, del_rate in all_indels:
            for prob in prob_range:
                fname = f'{rep}_full_length_{aa_known}_max_prob_{prob}_ins_{ins_rate}_del_{del_rate}_1_ins.pkl.gz'

                try:
                    tmp0 = pd.read_pickle('results/scan_results/' + fname)
                    tmp0 = tmp0.reset_index(drop=True)
                    stats = gen_stats(hmms, tmp0)
                    row = [ins_rate, del_rate, aa_known, stats['Total queries'],
                           stats['Hits'], stats['Identified'], 'Full length']
                except Exception as exp:
                    skipped.append((fname, repr(exp)))
                    continue

                heatmap_df.loc[idx] = row
                idx += 1
                del tmp0

if skipped:
    print(f'{len(skipped)} result file(s) skipped; first: {skipped[0]}')

# Fraction of all queries that were identified (stored under the name 'precision').
heatmap_df['precision'] = heatmap_df['identified']/heatmap_df['total queries']

heatmap_df.to_pickle('results/heatmap_df_full_length_1_ins_p_0.8.pkl.gz')
heatmap_df.head(2)

Unnamed: 0,insertions,deletions,known aa,total queries,hits,identified,Type,precision
0,0,0,LSEAG,20181,20164,19883,Full length,0.985234
1,0,10,LSEAG,20181,20161,19851,Full length,0.983648


In [28]:
# (rows, columns); 4 alphabets x 49 insertion/deletion pairs = 196 rows when every file was read.
heatmap_df.shape

(196, 8)

#### 100 AA fragment

In [237]:
import os

# Reference timestamp: modification time of the 100-residue-fragment LSEAG scan
# result at max_prob 0.8 with ins = 0 and del = 0.
aa_known = 'LSEAG'
prob = 0.8
ins_rate = 0
del_rate = 0
frag = 0
fname = f'{frag}_frag_length_100_{aa_known}_max_prob_{prob}_ins_{ins_rate}_del_{del_rate}_1_ins.pkl.gz'
# Stat the file once; the earlier duplicate call discarded its result, and the
# unused `rep` variable has been dropped.
start_time_100 = os.path.getmtime('results/scan_results/' + fname)
start_time_100

1694290242.813897

In [259]:
# for single inserts

from glob import glob

# Build the insertion/deletion heatmap table for 100-residue fragments at max_prob 0.8.
# The former `res = ...` / `hmms = ...` lines were removed: they read a `results`
# list whose defining line is commented out (NameError on a fresh kernel) and their
# values were never used.
cols = [0, 10, 20, 30, 40, 50, 60]
# Every (insertion, deletion) setting pair.
all_indels = list(product(cols, cols))

aa_knowns = ['LSEAG', 'CKY', 'WMCHY', 'ACDEFGHIKLMNPQRSTVWY']
prob_range = [0.8,]
num_frag = 1

# One integer index per row of `uniprot`; passed to `gen_stats` (loop-invariant).
hmms = list(range(uniprot.shape[0]))

heatmap_df = pd.DataFrame(columns=['insertions', 'deletions', 'known aa',
                                   'total queries', 'hits', 'identified', 'Type'])

# (file name, error) for every combination that could not be summarised. Failures are
# still tolerated as before, but they are now recorded and reported instead of hidden.
skipped = []

idx = 0
for frag in range(num_frag):
    for aa_known in aa_knowns:
        for ins_rate, del_rate in all_indels:
            for prob in prob_range:
                fname = f'{frag}_frag_length_100_{aa_known}_max_prob_{prob}_ins_{ins_rate}_del_{del_rate}_1_ins.pkl.gz'

                try:
                    curr_time = os.path.getmtime('results/scan_results/' + fname)
                    if curr_time < start_time_100:
                        # Older than the reference file: not part of the recent run.
                        continue
                    tmp0 = pd.read_pickle('results/scan_results/' + fname)
                    tmp0 = tmp0.reset_index(drop=True)
                    stats = gen_stats(hmms, tmp0)
                    row = [ins_rate, del_rate, aa_known, stats['Total queries'],
                           stats['Hits'], stats['Identified'], '100 AA fragment']
                except Exception as exp:
                    skipped.append((fname, repr(exp)))
                    continue

                heatmap_df.loc[idx] = row
                idx += 1
                del tmp0

if skipped:
    print(f'{len(skipped)} result file(s) skipped; first: {skipped[0]}')

# Fraction of all queries that were identified (stored under the name 'precision').
heatmap_df['precision'] = heatmap_df['identified']/heatmap_df['total queries']


heatmap_df.to_pickle('results/heatmap_df_frag_100AA_1_ins_p_0.8.pkl.gz')
heatmap_df.head()

Unnamed: 0,insertions,deletions,known aa,total queries,hits,identified,Type,precision
0,0,0,LSEAG,20181,20164,19313,100 AA fragment,0.956989
1,0,10,LSEAG,20181,20117,19184,100 AA fragment,0.950597
2,0,20,LSEAG,20181,19843,18499,100 AA fragment,0.916654
3,0,30,LSEAG,20181,13372,6749,100 AA fragment,0.334423
4,0,40,LSEAG,20181,11343,4509,100 AA fragment,0.223428


In [239]:
# (rows, columns); 4 alphabets x 49 insertion/deletion pairs = 196 rows when every file was read.
heatmap_df.shape

(196, 8)

#### 50 AA fragment

In [4]:
import os

# Reference timestamp: modification time of the 50-residue-fragment LSEAG scan
# result at max_prob 0.8 with ins = 0 and del = 0.
aa_known = 'LSEAG'
prob = 0.8
ins_rate = 0
del_rate = 0
frag = 0
fname = f'{frag}_frag_length_50_{aa_known}_max_prob_{prob}_ins_{ins_rate}_del_{del_rate}_1_ins.pkl.gz'
# Stat the file once; the earlier duplicate call discarded its result, and the
# unused `rep` variable has been dropped.
start_time_50 = os.path.getmtime('results/scan_results/' + fname)
start_time_50

1694598479.501456

In [5]:
# for single inserts

from glob import glob

# Build the insertion/deletion heatmap table for 50-residue fragments at max_prob 0.8.
# The former `res = ...` / `hmms = ...` lines were removed: they read a `results`
# list whose defining line is commented out (NameError on a fresh kernel) and their
# values were never used.
cols = [0, 10, 20, 30, 40, 50, 60]
# Every (insertion, deletion) setting pair.
all_indels = list(product(cols, cols))

aa_knowns = ['LSEAG', 'CKY', 'WMCHY', 'ACDEFGHIKLMNPQRSTVWY']
prob_range = [0.8,]
num_frag = 1

# One integer index per row of `uniprot`; passed to `gen_stats` (loop-invariant).
hmms = list(range(uniprot.shape[0]))

heatmap_df = pd.DataFrame(columns=['insertions', 'deletions', 'known aa',
                                   'total queries', 'hits', 'identified', 'Type'])

# (file name, error) for every combination that could not be summarised. Failures are
# still tolerated as before, but they are now recorded and reported instead of hidden.
skipped = []

idx = 0
for frag in range(num_frag):
    for aa_known in aa_knowns:
        for ins_rate, del_rate in all_indels:
            for prob in prob_range:
                fname = f'{frag}_frag_length_50_{aa_known}_max_prob_{prob}_ins_{ins_rate}_del_{del_rate}_1_ins.pkl.gz'

                try:
                    curr_time = os.path.getmtime('results/scan_results/' + fname)
                    if curr_time < start_time_50:
                        # Older than the reference file: not part of the recent run.
                        continue
                    tmp0 = pd.read_pickle('results/scan_results/' + fname)
                    tmp0 = tmp0.reset_index(drop=True)
                    stats = gen_stats(hmms, tmp0)
                    row = [ins_rate, del_rate, aa_known, stats['Total queries'],
                           stats['Hits'], stats['Identified'], '50 AA fragment']
                except Exception as exp:
                    skipped.append((fname, repr(exp)))
                    continue

                heatmap_df.loc[idx] = row
                idx += 1
                del tmp0

if skipped:
    print(f'{len(skipped)} result file(s) skipped; first: {skipped[0]}')

# Fraction of all queries that were identified (stored under the name 'precision').
heatmap_df['precision'] = heatmap_df['identified']/heatmap_df['total queries']


heatmap_df.to_pickle('results/heatmap_df_frag_50AA_1_ins_p_0.8.pkl.gz')
heatmap_df.head(2)

Unnamed: 0,insertions,deletions,known aa,total queries,hits,identified,Type,precision
0,0,0,LSEAG,20181,20117,18911,50 AA fragment,0.93707
1,0,10,LSEAG,20181,19518,18017,50 AA fragment,0.89277


In [6]:
# (rows, columns); 4 alphabets x 49 insertion/deletion pairs = 196 rows when every file was read.
heatmap_df.shape

(196, 8)