In [1]:
from __future__ import print_function

from optparse import OptionParser
import collections
import gzip
import heapq
import json
import math
import pdb
import os
import random
import shutil
import subprocess
import sys
import tempfile
import time

import h5py
import numpy as np
import pandas as pd

from basenji import genome
from basenji import util

# Checking for train/test leakage. For each chromosome, extract its contigs and check whether the genomic coordinates of neighbouring train and test model contigs overlap.

In [2]:
# Path to the contigs file. It is parsed later as tab-separated text with
# the columns chr, start, end, label (one contig per row).
contigs_file = 'contigs.bed'

In [3]:
# Lightweight immutable record for one model contig: chromosome name,
# start and end coordinates, and the data-split label (e.g. 'train', 'test').
ModelContig = collections.namedtuple('ModelContig', ['chr', 'start', 'end', 'label'])

In [4]:
# load a tab-separated contigs file into a dataframe
def file_to_df(file, headers):
    """Read a tab-separated file into a pandas DataFrame.

    Parameters
    ----------
    file : str, path or file-like
        Anything accepted by ``pd.read_csv`` as its input.
    headers : list of str
        Column names to assign, in file order. Because names are supplied,
        the first line of the file is treated as data, not as a header.

    Returns
    -------
    pandas.DataFrame
        One row per line of the file, with columns named after ``headers``.
    """
    return pd.read_csv(file, names=headers, sep='\t')

In [28]:
# Load the contigs table, naming its four tab-separated columns.
contig_df = file_to_df(contigs_file, ['chr', 'start', 'end', 'label'])
# Chromosome identifiers with the first three characters removed
# (assumes every name starts with 'chr', e.g. 'chr1' -> '1'),
# in order of first appearance in the table.
chr_num = [chrom[3:] for chrom in contig_df.chr.unique()]
chr_num

['1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 '10',
 '11',
 '12',
 '13',
 '14',
 '15',
 '16',
 '17',
 '18',
 '19',
 '20',
 '21',
 '22',
 'X']

Unnamed: 0,chr,start,end,label
0,chr1,9481,11209,train
1,chr1,13303,14647,train
2,chr1,19463,20807,train
3,chr1,55633,57937,train
4,chr1,87043,88387,train
...,...,...,...,...
492879,chrX,121633781,121636469,test
492880,chrX,132120057,132122937,test
492881,chrX,139460368,139461712,test
492882,chrX,140508527,140510063,test


In [31]:
# create function that extracts the train/test contigs of one chromosome
# and sorts them by start coordinate
def extract_and_sort(chr_n, ctgs_df):
    """Return the train/test contigs of one chromosome, ordered by start.

    Parameters
    ----------
    chr_n : str or int
        Chromosome identifier without the 'chr' prefix (e.g. '1', 'X').
    ctgs_df : pandas.DataFrame
        Contigs table containing at least the columns 'chr', 'start',
        'end' and 'label'. Additional columns are ignored.

    Returns
    -------
    list of ModelContig
        One record per row whose 'chr' equals ``chr<chr_n>`` and whose
        'label' is 'train' or 'test', sorted by ascending ``start``.
        Rows with equal ``start`` keep their original relative order.
        Empty list if no row matches.
    """
    # subsetting specific chr, keeping only the train and test splits
    on_chrom = ctgs_df['chr'] == f'chr{chr_n}'
    in_split = ctgs_df['label'].isin(['train', 'test'])

    # select exactly the ModelContig fields so that extra columns in the
    # table cannot break the row -> record conversion
    chr_cont_df = ctgs_df.loc[on_chrom & in_split, list(ModelContig._fields)]

    # itertuples yields plain tuples and avoids building one Series per row,
    # which iterrows does
    contigs = [ModelContig(*row)
               for row in chr_cont_df.itertuples(index=False, name=None)]

    # sort by start coordinate (list.sort is stable)
    contigs.sort(key=lambda contig: contig.start)
    return contigs

# For chromosome 1

ModelContig(chr='chr2', start=9619, end=11155, label='train')

In [33]:
def check_coverage(modelSeqs, crop_bp=576, fix_length=1344):
    """Find test contigs that overlap a neighbouring train contig's window.

    Every contig is compared with the contig directly before it in
    ``modelSeqs``. Only pairs with different train/test labels are examined.
    The start of the later contig is moved ``crop_bp`` positions to the left
    (the start of its extended input window). If that position lies before
    the end of the earlier contig, the pair is flagged and the overlap is
    expressed as a percentage of ``fix_length``.

    Parameters
    ----------
    modelSeqs : list
        Contigs of a single chromosome sorted by ``start``. Each element
        must expose ``start``, ``end`` and ``label`` attributes.
    crop_bp : int, optional
        Number of positions by which a contig start is extended to the left.
        The default of 576 comes from the original notebook, which describes
        it as following the basenji methodology.
    fix_length : int, optional
        Length used as the denominator of the percentage (default 1344).

    Returns
    -------
    list of (contig, float)
        For every flagged pair, the *test* contig of that pair together with
        the overlap percentage. The same test contig can appear twice (once
        for the train contig before it and once for the one after it).
        Entries whose percentage exceeds 100 are discarded, as in the
        original implementation.
    """
    leak = []

    # Pair each contig with its predecessor. Starting at the second element
    # means the first contig is never compared with the last one, which a
    # negative index (modelSeqs[-1]) would silently do.
    for prev_seq, mseq in zip(modelSeqs, modelSeqs[1:]):
        labels = (prev_seq.label, mseq.label)

        # we always record the 'test' member of the pair, as we are
        # checking leak from the train set into the test set
        if labels == ('train', 'test'):
            flagged = mseq
        elif labels == ('test', 'train'):
            flagged = prev_seq
        else:
            continue

        # start index after extending the contig to the model input window
        input_start = mseq.start - crop_bp

        if input_start < prev_seq.end:
            nt_cover = prev_seq.end - input_start
            perc_cover = nt_cover / fix_length * 100
            leak.append((flagged, perc_cover))

    # keep the original cut-off: values above 100 % are dropped
    return [con for con in leak if con[1] <= 100.00]

In [35]:
# Per-chromosome leak summary.
#   percCover - mean overlap percentage over the flagged contigs
#   percSet   - flagged contigs as a percentage of all train/test contigs
#               of that chromosome
leak_each_chr = []
df = {'chrom': chr_num,
      'percCover': [],
      'percSet': []}
for chrNum in chr_num:
    contigs = extract_and_sort(chrNum, contig_df)
    leak = check_coverage(contigs)
    sumPerc = sum(con[1] for con in leak)

    # NaN instead of ZeroDivisionError when nothing was flagged
    percCover = sumPerc / len(leak) if leak else float('nan')
    df['percCover'].append(percCover)

    # NaN instead of ZeroDivisionError when the chromosome has no contigs
    percSet = round(len(leak) / len(contigs) * 100, 3) if contigs else float('nan')
    df['percSet'].append(percSet)

    leak_each_chr.append(leak)


Second Case 9 ModelContig(chr='chr1', start=90838, end=92182, label='train') 39.0625
Second Case 39 ModelContig(chr='chr1', start=756811, end=759499, label='train') 31.398809523809522
First Case 46 ModelContig(chr='chr1', start=866419, end=881971, label='test') 4.985119047619048
Second Case 53 ModelContig(chr='chr1', start=947867, end=964187, label='train') 27.976190476190478
First Case 74 ModelContig(chr='chr1', start=1184591, end=1187279, label='test') 33.63095238095239
Second Case 75 ModelContig(chr='chr1', start=1187317, end=1197493, label='train') 40.029761904761905
First Case 83 ModelContig(chr='chr1', start=1271867, end=1277627, label='test') 14.806547619047619
Second Case 85 ModelContig(chr='chr1', start=1286219, end=1298315, label='train') 29.017857142857146
First Case 122 ModelContig(chr='chr1', start=1548545, end=1559105, label='test') 22.916666666666664
Second Case 123 ModelContig(chr='chr1', start=1559591, end=1568423, label='train') 6.696428571428571
Second Case 140 Model

Second Case 0 ModelContig(chr='chr2', start=9619, end=11155, label='train') 18093495.386904765
First Case 3 ModelContig(chr='chr2', start=33042, end=37458, label='test') 35.342261904761905
First Case 12 ModelContig(chr='chr2', start=85281, end=89313, label='test') 26.190476190476193
Second Case 13 ModelContig(chr='chr2', start=89726, end=93374, label='train') 12.12797619047619
Second Case 20 ModelContig(chr='chr2', start=134972, end=138428, label='train') 25.669642857142854
First Case 26 ModelContig(chr='chr2', start=162171, end=163899, label='test') 35.49107142857143
First Case 35 ModelContig(chr='chr2', start=206207, end=212927, label='test') 24.702380952380953
Second Case 36 ModelContig(chr='chr2', start=213208, end=220312, label='train') 21.949404761904763
Second Case 44 ModelContig(chr='chr2', start=294530, end=297410, label='train') 24.107142857142858
Second Case 63 ModelContig(chr='chr2', start=364683, end=369291, label='train') 20.684523809523807
Second Case 70 ModelContig(chr=

Second Case 42 ModelContig(chr='chr3', start=262003, end=263347, label='train') 13.392857142857142
Second Case 68 ModelContig(chr='chr3', start=405467, end=409883, label='train') 29.910714285714285
Second Case 74 ModelContig(chr='chr3', start=437417, end=438953, label='train') 33.92857142857143
Second Case 89 ModelContig(chr='chr3', start=502203, end=503547, label='train') 36.01190476190476
First Case 123 ModelContig(chr='chr3', start=675663, end=677967, label='test') 9.375
Second Case 151 ModelContig(chr='chr3', start=835548, end=836892, label='train') 17.485119047619047
First Case 163 ModelContig(chr='chr3', start=902923, end=904267, label='test') 14.732142857142858
Second Case 164 ModelContig(chr='chr3', start=904641, end=907329, label='train') 15.029761904761903
Second Case 194 ModelContig(chr='chr3', start=1036563, end=1037907, label='train') 42.26190476190476
Second Case 214 ModelContig(chr='chr3', start=1125063, end=1126407, label='train') 16.815476190476193
Second Case 251 Mode

Second Case 2 ModelContig(chr='chr4', start=15582, end=17118, label='train') 4.017857142857143
Second Case 5 ModelContig(chr='chr4', start=27381, end=30069, label='train') 25.44642857142857
Second Case 43 ModelContig(chr='chr4', start=319960, end=321880, label='train') 29.6875
First Case 92 ModelContig(chr='chr4', start=747473, end=749777, label='test') 7.5892857142857135
First Case 110 ModelContig(chr='chr4', start=1054263, end=1060407, label='test') 4.613095238095238
First Case 115 ModelContig(chr='chr4', start=1101634, end=1113346, label='test') 22.172619047619047
Second Case 161 ModelContig(chr='chr4', start=1707481, end=1709209, label='train') 12.946428571428573
First Case 163 ModelContig(chr='chr4', start=1730861, end=1733933, label='test') 34.970238095238095
Second Case 164 ModelContig(chr='chr4', start=1734187, end=1735723, label='train') 23.958333333333336
First Case 166 ModelContig(chr='chr4', start=1738000, end=1743760, label='test') 45.23809523809524
Second Case 168 ModelCo

First Case 9 ModelContig(chr='chr5', start=43606, end=58774, label='test') 13.244047619047619
Second Case 13 ModelContig(chr='chr5', start=88728, end=98712, label='train') 8.928571428571429
Second Case 15 ModelContig(chr='chr5', start=101061, end=114693, label='train') 23.214285714285715
First Case 17 ModelContig(chr='chr5', start=117868, end=119212, label='test') 20.238095238095237
First Case 22 ModelContig(chr='chr5', start=175864, end=177976, label='test') 21.13095238095238
Second Case 24 ModelContig(chr='chr5', start=183253, end=210901, label='train') 24.404761904761905
First Case 26 ModelContig(chr='chr5', start=216929, end=226721, label='test') 26.339285714285715
First Case 46 ModelContig(chr='chr5', start=461281, end=467809, label='test') 33.63095238095239
Second Case 56 ModelContig(chr='chr5', start=579465, end=591369, label='train') 31.175595238095237
First Case 62 ModelContig(chr='chr5', start=690421, end=696373, label='test') 9.895833333333332
Second Case 63 ModelContig(chr=

Second Case 2 ModelContig(chr='chr6', start=151161, end=160569, label='train') 30.80357142857143
First Case 6 ModelContig(chr='chr6', start=199154, end=201266, label='test') 25.967261904761905
Second Case 7 ModelContig(chr='chr6', start=201539, end=237635, label='train') 22.544642857142858
Second Case 24 ModelContig(chr='chr6', start=495747, end=500547, label='train') 10.491071428571429
Second Case 45 ModelContig(chr='chr6', start=640433, end=642161, label='train') 4.538690476190476
First Case 65 ModelContig(chr='chr6', start=805783, end=808087, label='test') 41.44345238095239
Second Case 82 ModelContig(chr='chr6', start=946057, end=949513, label='train') 41.220238095238095
First Case 97 ModelContig(chr='chr6', start=1059292, end=1061788, label='test') 27.67857142857143
Second Case 98 ModelContig(chr='chr6', start=1061917, end=1084957, label='train') 33.25892857142857
First Case 112 ModelContig(chr='chr6', start=1181645, end=1184909, label='test') 9.821428571428571
Second Case 123 Mode

First Case 12 ModelContig(chr='chr7', start=185953, end=224737, label='test') 36.086309523809526
First Case 41 ModelContig(chr='chr7', start=742157, end=744077, label='test') 37.797619047619044
Second Case 42 ModelContig(chr='chr7', start=744389, end=758405, label='train') 19.642857142857142
First Case 53 ModelContig(chr='chr7', start=973366, end=978934, label='test') 9.300595238095239
Second Case 55 ModelContig(chr='chr7', start=1038341, end=1042373, label='train') 30.505952380952383
First Case 57 ModelContig(chr='chr7', start=1046343, end=1056711, label='test') 14.880952380952381
First Case 61 ModelContig(chr='chr7', start=1109543, end=1156391, label='test') 21.726190476190478
Second Case 62 ModelContig(chr='chr7', start=1156365, end=1165965, label='train') 44.79166666666667
First Case 63 ModelContig(chr='chr7', start=1165989, end=1167525, label='test') 41.07142857142857
First Case 85 ModelContig(chr='chr7', start=1418321, end=1430993, label='test') 0.4464285714285714
Second Case 91 

First Case 10 ModelContig(chr='chr8', start=185783, end=187127, label='test') 23.660714285714285
Second Case 11 ModelContig(chr='chr8', start=187255, end=189175, label='train') 33.33333333333333
Second Case 77 ModelContig(chr='chr8', start=875113, end=902761, label='train') 1.488095238095238
First Case 84 ModelContig(chr='chr8', start=933519, end=936015, label='test') 30.431547619047617
Second Case 85 ModelContig(chr='chr8', start=936429, end=940845, label='train') 12.053571428571429
First Case 94 ModelContig(chr='chr8', start=1059740, end=1062620, label='test') 2.604166666666667
Second Case 95 ModelContig(chr='chr8', start=1062883, end=1064227, label='train') 23.288690476190478
First Case 107 ModelContig(chr='chr8', start=1212804, end=1216836, label='test') 35.639880952380956
Second Case 108 ModelContig(chr='chr8', start=1217028, end=1218372, label='train') 28.57142857142857
Second Case 111 ModelContig(chr='chr8', start=1225783, end=1227127, label='train') 38.392857142857146
First Cas

First Case 21 ModelContig(chr='chr9', start=177639, end=180711, label='test') 21.428571428571427
Second Case 46 ModelContig(chr='chr9', start=347097, end=355353, label='train') 43.154761904761905
First Case 47 ModelContig(chr='chr9', start=355383, end=356727, label='test') 40.625
Second Case 48 ModelContig(chr='chr9', start=357055, end=358399, label='train') 18.452380952380953
Second Case 56 ModelContig(chr='chr9', start=402643, end=408211, label='train') 24.25595238095238
First Case 57 ModelContig(chr='chr9', start=408377, end=409913, label='test') 30.505952380952383
Second Case 64 ModelContig(chr='chr9', start=443955, end=451635, label='train') 40.922619047619044
First Case 77 ModelContig(chr='chr9', start=513832, end=521128, label='test') 32.663690476190474
First Case 83 ModelContig(chr='chr9', start=554349, end=558765, label='test') 36.30952380952381
Second Case 84 ModelContig(chr='chr9', start=559242, end=564618, label='train') 7.366071428571429
First Case 96 ModelContig(chr='chr9

Second Case 34 ModelContig(chr='chr10', start=286791, end=288903, label='train') 31.101190476190478
First Case 45 ModelContig(chr='chr10', start=428967, end=445863, label='test') 26.041666666666668
Second Case 61 ModelContig(chr='chr10', start=720946, end=746674, label='train') 3.943452380952381
Second Case 86 ModelContig(chr='chr10', start=964539, end=971451, label='train') 11.904761904761903
Second Case 98 ModelContig(chr='chr10', start=1125568, end=1126912, label='train') 40.848214285714285
First Case 111 ModelContig(chr='chr10', start=1365189, end=1379205, label='test') 37.723214285714285
Second Case 117 ModelContig(chr='chr10', start=1439839, end=1460575, label='train') 3.050595238095238
First Case 139 ModelContig(chr='chr10', start=1771938, end=1795362, label='test') 43.154761904761905
Second Case 142 ModelContig(chr='chr10', start=1806520, end=1809400, label='train') 1.0416666666666665
First Case 151 ModelContig(chr='chr10', start=1852940, end=1854860, label='test') 19.940476190

First Case 11 ModelContig(chr='chr11', start=245435, end=247739, label='test') 31.845238095238095
First Case 16 ModelContig(chr='chr11', start=322153, end=323497, label='test') 20.238095238095237
Second Case 26 ModelContig(chr='chr11', start=454627, end=463267, label='train') 37.94642857142857
First Case 48 ModelContig(chr='chr11', start=651498, end=664362, label='test') 39.360119047619044
Second Case 49 ModelContig(chr='chr11', start=664578, end=667842, label='train') 26.785714285714285
First Case 51 ModelContig(chr='chr11', start=675444, end=677556, label='test') 25.372023809523807
Second Case 52 ModelContig(chr='chr11', start=677711, end=685583, label='train') 31.324404761904763
Second Case 77 ModelContig(chr='chr11', start=930235, end=935419, label='train') 37.5
First Case 83 ModelContig(chr='chr11', start=1004901, end=1022373, label='test') 28.794642857142854
Second Case 84 ModelContig(chr='chr11', start=1022441, end=1066409, label='train') 37.797619047619044
First Case 88 ModelCo

Second Case 0 ModelContig(chr='chr12', start=59963, end=61691, label='train') 9952442.55952381
Second Case 5 ModelContig(chr='chr12', start=85953, end=87297, label='train') 32.217261904761905
Second Case 19 ModelContig(chr='chr12', start=315346, end=325714, label='train') 30.729166666666668
Second Case 34 ModelContig(chr='chr12', start=436877, end=438797, label='train') 38.392857142857146
Second Case 55 ModelContig(chr='chr12', start=552495, end=597999, label='train') 29.910714285714285
Second Case 146 ModelContig(chr='chr12', start=1236359, end=1243271, label='train') 25.595238095238095
Second Case 151 ModelContig(chr='chr12', start=1254363, end=1255707, label='train') 14.880952380952381
First Case 158 ModelContig(chr='chr12', start=1284903, end=1286247, label='test') 27.529761904761905
First Case 165 ModelContig(chr='chr12', start=1310495, end=1320095, label='test') 29.464285714285715
Second Case 167 ModelContig(chr='chr12', start=1331903, end=1333247, label='train') 38.2440476190476

Second Case 0 ModelContig(chr='chr13', start=19168449, end=19170945, label='train') 7137957.440476191
First Case 1 ModelContig(chr='chr13', start=19171510, end=19175350, label='test') 0.818452380952381
First Case 11 ModelContig(chr='chr13', start=19245788, end=19247132, label='test') 0.6696428571428571
Second Case 12 ModelContig(chr='chr13', start=19247470, end=19250350, label='train') 17.708333333333336
Second Case 62 ModelContig(chr='chr13', start=19641020, end=19652540, label='train') 34.077380952380956
Second Case 69 ModelContig(chr='chr13', start=19704188, end=19705532, label='train') 10.416666666666668
Second Case 97 ModelContig(chr='chr13', start=19925788, end=19927132, label='train') 29.613095238095237
First Case 115 ModelContig(chr='chr13', start=20006363, end=20007707, label='test') 29.761904761904763
Second Case 138 ModelContig(chr='chr13', start=20140581, end=20143653, label='train') 30.877976190476193
First Case 140 ModelContig(chr='chr13', start=20148248, end=20149592, la

First Case 17 ModelContig(chr='chr14', start=19444368, end=19445712, label='test') 17.485119047619047
Second Case 87 ModelContig(chr='chr14', start=20245823, end=20247167, label='train') 29.6875
Second Case 98 ModelContig(chr='chr14', start=20342828, end=20344172, label='train') 0.2232142857142857
First Case 135 ModelContig(chr='chr14', start=20671031, end=20673719, label='test') 14.285714285714285
First Case 157 ModelContig(chr='chr14', start=20775648, end=20777952, label='test') 25.44642857142857
First Case 173 ModelContig(chr='chr14', start=20888687, end=20891567, label='test') 36.160714285714285
Second Case 175 ModelContig(chr='chr14', start=20894745, end=20917785, label='train') 31.547619047619047
Second Case 180 ModelContig(chr='chr14', start=20982555, end=20986779, label='train') 12.946428571428573
Second Case 197 ModelContig(chr='chr14', start=21040037, end=21041573, label='train') 32.88690476190476
First Case 215 ModelContig(chr='chr14', start=21193542, end=21206598, label='te

First Case 0 ModelContig(chr='chr15', start=20048811, end=20050539, label='test') 6135739.583333334
First Case 2 ModelContig(chr='chr15', start=20071463, end=20072807, label='test') 15.476190476190476
First Case 30 ModelContig(chr='chr15', start=20397569, end=20400641, label='test') 36.75595238095239
First Case 45 ModelContig(chr='chr15', start=20471869, end=20483005, label='test') 38.24404761904761
Second Case 74 ModelContig(chr='chr15', start=20846823, end=20860647, label='train') 30.208333333333332
Second Case 98 ModelContig(chr='chr15', start=21316655, end=21318575, label='train') 30.357142857142854
First Case 108 ModelContig(chr='chr15', start=21932203, end=21938731, label='test') 6.9940476190476195
Second Case 109 ModelContig(chr='chr15', start=21939048, end=21942312, label='train') 19.270833333333336
Second Case 136 ModelContig(chr='chr15', start=22345348, end=22346692, label='train') 24.925595238095237
Second Case 139 ModelContig(chr='chr15', start=22355471, end=22358159, label

Second Case 0 ModelContig(chr='chr16', start=59321, end=62009, label='train') 6713769.866071429
Second Case 10 ModelContig(chr='chr16', start=145369, end=153625, label='train') 18.75
First Case 16 ModelContig(chr='chr16', start=171837, end=176637, label='test') 24.851190476190478
Second Case 49 ModelContig(chr='chr16', start=436019, end=441011, label='train') 23.660714285714285
First Case 64 ModelContig(chr='chr16', start=515324, end=519356, label='test') 13.318452380952381
Second Case 65 ModelContig(chr='chr16', start=519833, end=529241, label='train') 7.366071428571429
First Case 67 ModelContig(chr='chr16', start=535035, end=561339, label='test') 33.92857142857143
First Case 72 ModelContig(chr='chr16', start=647583, end=651807, label='test') 35.714285714285715
Second Case 95 ModelContig(chr='chr16', start=917254, end=919366, label='train') 37.276785714285715
First Case 97 ModelContig(chr='chr16', start=926639, end=953135, label='test') 27.306547619047617
Second Case 109 ModelContig(c

First Case 2 ModelContig(chr='chr17', start=21338, end=38042, label='test') 29.538690476190478
Second Case 3 ModelContig(chr='chr17', start=38549, end=40085, label='train') 5.133928571428571
First Case 4 ModelContig(chr='chr17', start=40367, end=47087, label='test') 21.875
Second Case 5 ModelContig(chr='chr17', start=47165, end=59069, label='train') 37.05357142857143
First Case 11 ModelContig(chr='chr17', start=74747, end=92603, label='test') 14.434523809523808
First Case 20 ModelContig(chr='chr17', start=163537, end=175057, label='test') 36.01190476190476
Second Case 21 ModelContig(chr='chr17', start=175067, end=211547, label='train') 42.11309523809524
Second Case 34 ModelContig(chr='chr17', start=417221, end=427013, label='train') 43.601190476190474
First Case 43 ModelContig(chr='chr17', start=498597, end=501477, label='test') 30.282738095238095
First Case 48 ModelContig(chr='chr17', start=527003, end=528731, label='test') 11.755952380952381
First Case 75 ModelContig(chr='chr17', sta

First Case 16 ModelContig(chr='chr18', start=124508, end=125852, label='test') 22.916666666666664
Second Case 17 ModelContig(chr='chr18', start=126052, end=127588, label='train') 27.976190476190478
First Case 18 ModelContig(chr='chr18', start=127708, end=129052, label='test') 33.92857142857143
Second Case 27 ModelContig(chr='chr18', start=171929, end=175961, label='train') 2.083333333333333
First Case 38 ModelContig(chr='chr18', start=226977, end=229473, label='test') 18.303571428571427
Second Case 50 ModelContig(chr='chr18', start=297277, end=300157, label='train') 30.952380952380953
Second Case 53 ModelContig(chr='chr18', start=350419, end=357715, label='train') 39.13690476190476
First Case 56 ModelContig(chr='chr18', start=384305, end=389105, label='test') 30.505952380952383
Second Case 57 ModelContig(chr='chr18', start=389093, end=391781, label='train') 43.75
Second Case 69 ModelContig(chr='chr18', start=468269, end=483245, label='train') 27.529761904761905
First Case 107 ModelCont

Second Case 3 ModelContig(chr='chr20', start=70815, end=72159, label='train') 0.744047619047619
Second Case 6 ModelContig(chr='chr20', start=83603, end=84947, label='train') 37.202380952380956
Second Case 39 ModelContig(chr='chr20', start=264162, end=266658, label='train') 17.782738095238095
First Case 40 ModelContig(chr='chr20', start=266663, end=268007, label='test') 42.485119047619044
Second Case 41 ModelContig(chr='chr20', start=268325, end=274469, label='train') 19.196428571428573
First Case 62 ModelContig(chr='chr20', start=478093, end=480781, label='test') 37.351190476190474
First Case 65 ModelContig(chr='chr20', start=493545, end=499689, label='test') 13.244047619047619
First Case 70 ModelContig(chr='chr20', start=527148, end=528492, label='test') 4.761904761904762
Second Case 90 ModelContig(chr='chr20', start=780029, end=784445, label='train') 44.642857142857146
First Case 96 ModelContig(chr='chr20', start=845769, end=847881, label='test') 38.24404761904761
Second Case 97 Mode

Second Case 30 ModelContig(chr='chr22', start=16865828, end=16875812, label='train') 19.345238095238095
Second Case 79 ModelContig(chr='chr22', start=17239659, end=17242155, label='train') 40.476190476190474
First Case 101 ModelContig(chr='chr22', start=17338959, end=17340495, label='test') 19.49404761904762
First Case 141 ModelContig(chr='chr22', start=17659286, end=17663894, label='test') 5.5059523809523805
Second Case 142 ModelContig(chr='chr22', start=17664142, end=17667598, label='train') 24.404761904761905
Second Case 147 ModelContig(chr='chr22', start=17689268, end=17690612, label='train') 41.81547619047619
First Case 154 ModelContig(chr='chr22', start=17743783, end=17752231, label='test') 35.788690476190474
Second Case 155 ModelContig(chr='chr22', start=17752339, end=17755411, label='train') 34.82142857142857
Second Case 174 ModelContig(chr='chr22', start=17869415, end=17872679, label='train') 9.077380952380953
Second Case 180 ModelContig(chr='chr22', start=17935347, end=179411

Second Case 6 ModelContig(chr='chrX', start=2760708, end=2762052, label='train') 40.25297619047619
First Case 17 ModelContig(chr='chrX', start=2859463, end=2860807, label='test') 32.217261904761905
First Case 22 ModelContig(chr='chrX', start=2878061, end=2887853, label='test') 31.845238095238095
Second Case 23 ModelContig(chr='chrX', start=2887944, end=2893896, label='train') 36.086309523809526
First Case 35 ModelContig(chr='chrX', start=2939985, end=2941329, label='test') 22.470238095238095
First Case 76 ModelContig(chr='chrX', start=3180329, end=3184361, label='test') 12.5
First Case 85 ModelContig(chr='chrX', start=3231150, end=3240750, label='test') 34.30059523809524
Second Case 86 ModelContig(chr='chrX', start=3241299, end=3245715, label='train') 2.0089285714285716
Second Case 98 ModelContig(chr='chrX', start=3293439, end=3298815, label='train') 36.904761904761905
First Case 109 ModelContig(chr='chrX', start=3342543, end=3343887, label='test') 16.36904761904762
Second Case 120 Mod

{'chrom': ['1',
  '2',
  '3',
  '4',
  '5',
  '6',
  '7',
  '8',
  '9',
  '10',
  '11',
  '12',
  '13',
  '14',
  '15',
  '16',
  '17',
  '18',
  '19',
  '20',
  '21',
  '22',
  'X'],
 'percCover': [23.9986405746509,
  23.832193795998702,
  23.75414847814384,
  23.806583049886598,
  23.07604361424641,
  23.776916584321473,
  23.965957273320292,
  23.806689342403633,
  24.552800087339527,
  23.340739566701085,
  23.729761096014514,
  23.961411824436983,
  23.270994208494216,
  23.29578398717005,
  23.548310761613525,
  24.122296229704684,
  23.803516504641067,
  23.405294953802382,
  24.302455357142872,
  24.589353067959856,
  23.98004955570745,
  24.524355999461925,
  23.25931491665596],
 'percSet': [8.008,
  7.241,
  7.501,
  6.501,
  7.252,
  7.585,
  7.486,
  7.373,
  7.424,
  7.871,
  7.691,
  7.554,
  6.85,
  7.106,
  8.352,
  8.069,
  8.782,
  7.292,
  9.414,
  8.983,
  8.032,
  9.547,
  5.015]}

In [37]:
# Turn the per-chromosome summary dict into a DataFrame for display.
# NOTE(review): this rebinds `df` from a dict to a DataFrame.
df = pd.DataFrame(df)
df

Unnamed: 0,chrom,percCover,percSet
0,1,23.998641,8.008
1,2,23.832194,7.241
2,3,23.754148,7.501
3,4,23.806583,6.501
4,5,23.076044,7.252
5,6,23.776917,7.585
6,7,23.965957,7.486
7,8,23.806689,7.373
8,9,24.5528,7.424
9,10,23.34074,7.871


In [22]:
# remove cases when comparing last and first model sequence
# (entries whose coverage exceeds 100 % are dropped)
# NOTE(review): check_coverage applies the same `<= 100.00` filter before
# returning, so on its output this is a no-op.
leak = [con for con in leak if con[1] <= 100.00]

In [24]:
# mean percentage overlap between test and training sequences within a given set
sumPerc = sum([con[1] for con in leak])
sumPerc/len(leak)

23.832193795998702

In [25]:
# percentage of test sequences that may also occur in the training set
# (number of flagged entries relative to the number of entries in `contigs`)
round(len(leak)/len(contigs)*100,3)

7.241

In [40]:
# mean of the per-chromosome percSet values
df.percSet.mean()

7.692565217391303

In [28]:
# inspect the flagged (test contig, percent coverage) pairs
leak

[(ModelContig(chr='chr1', start=89443, end=90787, label='test'), 39.0625),
 (ModelContig(chr='chr1', start=749553, end=756657, label='test'),
  31.398809523809522),
 (ModelContig(chr='chr1', start=866419, end=881971, label='test'),
  4.985119047619048),
 (ModelContig(chr='chr1', start=946323, end=947667, label='test'),
  27.976190476190478),
 (ModelContig(chr='chr1', start=1184591, end=1187279, label='test'),
  33.63095238095239),
 (ModelContig(chr='chr1', start=1184591, end=1187279, label='test'),
  40.029761904761905),
 (ModelContig(chr='chr1', start=1271867, end=1277627, label='test'),
  14.806547619047619),
 (ModelContig(chr='chr1', start=1278161, end=1286033, label='test'),
  29.017857142857146),
 (ModelContig(chr='chr1', start=1548545, end=1559105, label='test'),
  22.916666666666664),
 (ModelContig(chr='chr1', start=1548545, end=1559105, label='test'),
  6.696428571428571),
 (ModelContig(chr='chr1', start=1644387, end=1647843, label='test'),
  22.842261904761905),
 (ModelContig(

In [29]:
# NOTE(review): hard-coded counts; presumably len(leak)/len(contigs) for chr1
# (about 8.008 %, which matches its percSet value) - confirm
2660/33218

0.08007706665061111