# Creating protein_coding_transcripts

In [15]:
!head protein_coding.gtf -n20

chr1	HAVANA	gene	65419	71585	.	+	.	gene_id "ENSG00000186092.6"; gene_type "protein_coding"; gene_name "OR4F5"; level 2; hgnc_id "HGNC:14825"; havana_gene "OTTHUMG00000001094.1";
chr1	HAVANA	transcript	65419	71585	.	+	.	gene_id "ENSG00000186092.6"; transcript_id "ENST00000641515.2"; gene_type "protein_coding"; gene_name "OR4F5"; transcript_type "protein_coding"; transcript_name "OR4F5-202"; level 2; protein_id "ENSP00000493376.2"; hgnc_id "HGNC:14825"; tag "RNA_Seq_supported_partial"; tag "basic"; havana_gene "OTTHUMG00000001094.1"; havana_transcript "OTTHUMT00000003223.1";
chr1	HAVANA	exon	65419	65433	.	+	.	gene_id "ENSG00000186092.6"; transcript_id "ENST00000641515.2"; gene_type "protein_coding"; gene_name "OR4F5"; transcript_type "protein_coding"; transcript_name "OR4F5-202"; exon_number 1; exon_id "ENSE00003812156.1"; level 2; protein_id "ENSP00000493376.2"; hgnc_id "HGNC:14825"; tag "RNA_Seq_supported_partial"; tag "basic"; havana_gene "OTTHUMG00000001094.1"; havana_transcript "O

In [16]:
!awk -v OFS="\t" '{if ($3 == "transcript" || $3 == "exon" || $3 == "CDS" || $3 == "UTR") print $1, $3, $4 - 1, $5, $7}' protein_coding.gtf > protein_coding_transcripts.txt
!cat protein_coding_transcripts.txt | head #it's very similar to .bed but there's an extra column so I changed it to .txt

chr1	transcript	65418	71585	+
chr1	exon	65418	65433	+
chr1	exon	65519	65573	+
chr1	CDS	65564	65573	+
chr1	exon	69036	71585	+
chr1	CDS	69036	70005	+
chr1	UTR	65418	65433	+
chr1	UTR	65519	65564	+
chr1	UTR	70005	71585	+
chr1	transcript	69054	70108	+
cat: write error: Broken pipe


In [17]:
f = open('protein_coding_transcripts.txt')
protein_coding_transcripts = f.readlines()
protein_coding_transcripts

['chr1\ttranscript\t65418\t71585\t+\n',
 'chr1\texon\t65418\t65433\t+\n',
 'chr1\texon\t65519\t65573\t+\n',
 'chr1\tCDS\t65564\t65573\t+\n',
 'chr1\texon\t69036\t71585\t+\n',
 'chr1\tCDS\t69036\t70005\t+\n',
 'chr1\tUTR\t65418\t65433\t+\n',
 'chr1\tUTR\t65519\t65564\t+\n',
 'chr1\tUTR\t70005\t71585\t+\n',
 'chr1\ttranscript\t69054\t70108\t+\n',
 'chr1\texon\t69054\t70108\t+\n',
 'chr1\tCDS\t69090\t70005\t+\n',
 'chr1\tUTR\t69054\t69090\t+\n',
 'chr1\tUTR\t70005\t70108\t+\n',
 'chr1\ttranscript\t450702\t451697\t-\n',
 'chr1\texon\t450702\t451697\t-\n',
 'chr1\tCDS\t450742\t451678\t-\n',
 'chr1\tUTR\t450702\t450742\t-\n',
 'chr1\tUTR\t451678\t451697\t-\n',
 'chr1\ttranscript\t685678\t686673\t-\n',
 'chr1\texon\t685678\t686673\t-\n',
 'chr1\tCDS\t685718\t686654\t-\n',
 'chr1\tUTR\t685678\t685718\t-\n',
 'chr1\tUTR\t686654\t686673\t-\n',
 'chr1\ttranscript\t923927\t939291\t+\n',
 'chr1\texon\t923927\t924948\t+\n',
 'chr1\tCDS\t924431\t924948\t+\n',
 'chr1\texon\t925921\t926013\t+\n',
 'chr

# Making classes.
## Chromosome, Transcript, Intron, Exon, CDS, UTR

In [18]:
class Chromosome:
    def __init__(self, nm): #string (format chr1, chr2, chrx, etc.)
        self.name = nm #string
        self.transcripts = []
    def show(self):
        show = str(str(self.name) + ": ")
        show += str(len(self.transcripts)) + " transcripts"
        print(show)
    def add_transcript(self, s, e, strand):
        self.transcripts.append(Transcript(s, e, strand))

In [19]:
class Transcript:
    def __init__(self, s, e, posneg): #int, int, string
        self.start = int(s)
        self.end = int(e)
        self.strand = (posneg == "+") #boolean
        self.exon_intron = []
        self.cds_start = -1 #smaller number, regardless of whether it's 5' or 3'
        self.cds_end = -1    #larger number, regardless of whether it's 3' or 5'
    def show(self):
        show = str("transcript: ")
        show += str(self.start) + " "      #start position
        show += str(self.end) + " "        #end position
        show += str(self.strand) + " "     #positive or negative strand
        show += str(len(self.exon_intron)) #how many exons
        print(show)
    def add_exon(self, s, e):
        self.exon_intron.append(Exon(s, e, self.strand))
        self.sort()
    def add_intron(self, s, e):
        self.exon_intron.append(Intron(s, e, self.strand))
        self.sort()
    def sort(self):
        self.exon_intron.sort(key=lambda x: x.start)
    def setCdsPos(self): #assumes there aren't any UTRs
        for currExon in self.exon_intron:
            if (not currExon.has_cds()):
                continue
            exon_cds_start = currExon.cds_utr[0].start
            exon_cds_end = currExon.cds_utr[0].end
            if (self.cds_start == -1 or self.cds_start > exon_cds_start):
                self.cds_start = exon_cds_start
            if (self.cds_end == -1 or self.cds_end < exon_cds_end):
                self.cds_end = exon_cds_end
    def has_intron(self):
        #for x in self.exon_intron:
            #if (type(x) == Intron):
                #print("there's an intron")
                #return True
        #print("  no introns!")
        #return False
        return any([(type(x) == Intron) for x in self.exon_intron])
    def has_utr(self):
        #for x in self.exon_intron:
            #if x.has_utr():
                #return True
        #return False
        return any([x.has_utr() for x in self.exon_intron])

In [20]:
class Intron:
    def __init__(self, s, e, trnscrpt_strand): #int, int, boolean
        self.start = int(s)
        self.end = int(e)
        self.strand = trnscrpt_strand #boolean
    def show(self):
        show = str("intron: ")
        show += str(self.start) + " "      #start position
        show += str(self.end) + " "        #end position
        show += str(self.strand)           #positive or negative strand
        print(show)
    def has_utr(self):
        return False
    def has_cds(self):
        return False

In [21]:
class Exon:
    def __init__(self, s, e, trnscrpt_strand): #int, int, boolean
        self.start = int(s)
        self.end = int(e)
        self.strand = trnscrpt_strand #boolean
        self.cds_utr = []
    def show(self):
        show = str("exon: ")
        show += str(self.start) + " "      #start position
        show += str(self.end) + " "        #end position
        show += str(self.strand) + " "     #positive or negative strand
        show += str(len(self.cds_utr))     #how many CDSs and UTRs
        print(show)
    def add_cds(self, s, e):
        self.cds_utr.append(Cds(s, e, self.strand))
        self.sort()
    def add_utr(self, s, e, is_start):
        self.cds_utr.append(Utr(s, e, self.strand, is_start)) #start = UTR index < CDS index (5' if pos, 3' if neg)
        self.sort()
    def sort(self):
        self.cds_utr.sort(key=lambda x: x.start)
    def has_cds(self):
        #for x in self.cds_utr:
            #if (type(x) == Cds):
                #return True
        #return False
        return any([(type(x) == Cds) for x in self.cds_utr])
    def has_utr(self):
        #for x in self.cds_utr:
            #if (type(x) == Utr):
                #return True
        #return False
        return any([(type(x) == Utr) for x in self.cds_utr])

In [22]:
class Cds:
    def __init__(self, s, e, exon_strand): #int, int, boolean
        self.start = int(s)
        self.end = int(e)
        self.strand = exon_strand #boolean
    def show(self):
        show = str("CDS: ")
        show += str(self.start) + " "       #start position
        show += str(self.end) + " "        #end position
        show += str(self.strand)           #positive or negative strand
        print(show)

In [23]:
class Utr:
    def __init__(self, s, e, exon_strand, is_start): #int, int, boolean, boolean
        self.start = int(s)
        self.end = int(e)
        self.strand = exon_strand #boolean
        self.is_five = is_start                #True = 5' UTR, False = 3' UTR
        if (not self.strand):                  #if it is on the negative strand
            self.is_five = not self.is_five    #flip
    def show(self):
        if(self.is_five):
            show = str("5' ")
        else:
            show = str("3' ")
        show += str("UTR: ")
        show += str(self.start) + " "      #start position
        show += str(self.end) + " "        #end position
        show += str(self.strand) + " "     #positive or negative strand
        print(show)

In [24]:
example_transcript1 = Transcript(1, 100, "+")
example_transcript2 = Transcript(30, 600, "-")

example_transcript1.show()
example_transcript2.show()

print(example_transcript1)
print(example_transcript2)

print()
print("-------------")
print()

example_transcript1.add_exon(2, 10)
example_transcript1.add_exon(21, 22)
example_transcript1.add_exon(40, 50)
example_transcript2.add_exon(99, 580)

example_transcript1.show()
example_transcript2.show()

for i in range(3):
    print("example_transcript1")
    example_transcript1.exon_intron[i].show()
print("example_transcript2")
example_transcript2.exon_intron[0].show()

transcript: 1 100 True 0
transcript: 30 600 False 0
<__main__.Transcript object at 0x7f979611a750>
<__main__.Transcript object at 0x7f979611a710>

-------------

transcript: 1 100 True 3
transcript: 30 600 False 1
example_transcript1
exon: 2 10 True 0
example_transcript1
exon: 21 22 True 0
example_transcript1
exon: 40 50 True 0
example_transcript2
exon: 99 580 False 0


In [25]:
example_chromosome1 = Chromosome("chr1")
example_chromosome5 = Chromosome("chr5")

example_chromosome1.show()
example_chromosome5.show()

print()
print("-------------")
print()

example_chromosome1.add_transcript(11, 20, "+")

example_chromosome1.show()
example_chromosome5.show()

print()
print("-------------")
print()

test = example_chromosome1.transcripts[0]
print(example_chromosome1.transcripts[0])
print(test)
test.show()

chr1: 0 transcripts
chr5: 0 transcripts

-------------

chr1: 1 transcripts
chr5: 0 transcripts

-------------

<__main__.Transcript object at 0x7f9796129450>
<__main__.Transcript object at 0x7f9796129450>
transcript: 11 20 True 0


# Filling chr_list

### Chromosomes, Exons, CDSs, UTRs

In [26]:
rightChrs = ['chr1', 'chr2', 'chr3', 'chr4', 'chr5', 'chr6', 'chr7', 'chr8', 'chr9', 'chr10', 'chr11', 'chr12', 'chr13', 'chr14', 'chr15', 'chr16', 'chr17', 'chr18', 'chr19', 'chr20', 'chr21', 'chr22', 'chrX', 'chrY']

In [27]:
chr_list = []
chr_ind = -1
trn_ind = -1
exn_ind = -1
for i in range(len(protein_coding_transcripts)):
    if (i % 200000 == 0): #just to make sure it's actually running (not stuck)
        print(i)
    x = protein_coding_transcripts[i]
    temp = x.split()
    if (chr_ind == -1 or temp[0] != chr_list[chr_ind].name): #if it's the next chromosome
        #create new chromosome in chr_list, reset transcript & exon indices
        #if ("1" <= temp[0][3:4] and temp[0][3:4] <= "9"): #if it's a number
            #if ((int(temp[0][3:]) < 1 or 22 < int(temp[0][3:]))):
                #continue
        #elif (temp[0] != "chrX" and temp[0] != "chrY"): #the rest are letters
            #continue
        if (not temp[0] in rightChrs):
            continue
        chr_list.append(Chromosome(temp[0]))
        chr_ind += 1
        trn_ind = -1
    if (temp[1] == 'transcript'): #if it's a transcript
        #add new transcript to current chromosome, reset exon index
        chr_list[chr_ind].add_transcript(int(temp[2]), int(temp[3]), temp[4])
        trn_ind += 1
        exn_ind = -1
    elif (temp[1] == 'exon'): #if it's an exon
        #add new exon to current transcript
        chr_list[chr_ind].transcripts[trn_ind].add_exon(int(temp[2]), int(temp[3]))
        exn_ind += 1
    elif (temp[1] == 'CDS'): #if it's a CDS
        #find the exon it's in
        currTranscript = chr_list[chr_ind].transcripts[trn_ind]
        for exn in currTranscript.exon_intron: #assumes no introns yet
            if (exn.start <= int(temp[2]) and int(temp[3]) <= exn.end):
                currExon = exn
                break
        
        #add new CDS to whatever exon it's in
        currExon.add_cds(int(temp[2]), int(temp[3]))
    elif (temp[1] == 'UTR'): # if it's a UTR
        #find the exon it's in
        currTranscript = chr_list[chr_ind].transcripts[trn_ind]
        currTranscript.setCdsPos()
        
        #find the exon it's in
        for exn in currTranscript.exon_intron: #assumes no introns yet
            if (exn.start <= int(temp[3]) and int(temp[3]) <= exn.end):
                currExon = exn
                break
        
        #add new UTR to whatever exon it's in
        if (int(temp[3]) <= currTranscript.cds_start): #if UTR position is smaller than CDS position
            currExon.add_utr(temp[2], temp[3], True)
        elif (int(temp[2]) >= currTranscript.cds_end): #if UTR position is larger than CDS position
            currExon.add_utr(temp[2], temp[3], False)
        else:
            print("something's wrong")
            print("  UTR start: " + temp[2])
            print("  UTR end: " + temp[3])
            print("  CDS start: " + str(currTranscript.cds_start))
            print("  CDS end: " + str(currTranscript.cds_end))

for chrm in chr_list:
    chrm.show()

0
200000
400000
600000
800000
1000000
1200000
1400000
1600000
1800000
2000000
2200000
chr1: 13955 transcripts
chr2: 10476 transcripts
chr3: 9508 transcripts
chr4: 5910 transcripts
chr5: 6843 transcripts
chr6: 6468 transcripts
chr7: 7185 transcripts
chr8: 5650 transcripts
chr9: 4932 transcripts
chr10: 4884 transcripts
chr11: 10221 transcripts
chr12: 9227 transcripts
chr13: 2057 transcripts
chr14: 5595 transcripts
chr15: 5433 transcripts
chr16: 7978 transcripts
chr17: 10284 transcripts
chr18: 2556 transcripts
chr19: 10902 transcripts
chr20: 3383 transcripts
chr21: 1581 transcripts
chr22: 3377 transcripts
chrX: 4745 transcripts
chrY: 309 transcripts


In [28]:
chr_list[0].show()
for thingy in chr_list[0].transcripts[0].exon_intron:
    print()
    thingy.show()
    if (type(thingy) == Exon):
        for thing in thingy.cds_utr:
            thing.show()

print()
print()
print()

chr_list[1].show()
for thingy in chr_list[1].transcripts[0].exon_intron:
    print()
    thingy.show()
    if (type(thingy) == Exon):
        for thing in thingy.cds_utr:
            thing.show()

chr1: 13955 transcripts

exon: 65418 65433 True 1
5' UTR: 65418 65433 True 

exon: 65519 65573 True 2
5' UTR: 65519 65564 True 
CDS: 65564 65573 True

exon: 69036 71585 True 2
CDS: 69036 70005 True
3' UTR: 70005 71585 True 



chr2: 10476 transcripts

exon: 38813 41627 False 2
3' UTR: 38813 41610 False 
CDS: 41610 41627 False

exon: 45439 46505 False 2
CDS: 45439 46385 False
5' UTR: 46385 46505 False 


### Introns

In [29]:
#for each transcript, whatever's not an exon is an intron

In [30]:
chr_list[1].transcripts[0].show()

transcript: 38813 46505 False 2


In [31]:
for thing in chr_list[1].transcripts[0].exon_intron:
    print(str(type(thing)) + " " + str(thing.start) + " " + str(thing.end))

<class '__main__.Exon'> 38813 41627
<class '__main__.Exon'> 45439 46505


In [32]:
#for chrm in chr_list:
for chrm in chr_list:
    #if (chrm == chr_list[2]):
        #break
    #chrm.show()
    for trn in chrm.transcripts:
        #if (trn == chrm.transcripts[1]):
            #break
        if (trn.has_intron()): #don't add introns if there already are introns
            #trn.sort()
            #print("  yes intron")
            continue
        #print("  moving on")
        prev = trn.start
        exon_list_copy = trn.exon_intron.copy()
        for exn in exon_list_copy:
            #print("    next exon")
            curr = exn.start
            #print("    prev = " + str(prev) + " curr = " + str(curr))
            if (prev < curr):
                #print("    adding intron from " + str(prev) + " to " + str(curr))
                trn.add_intron(prev, curr) #0-indexed
            prev = exn.end
        #print("  ended for loop")
        #print("    prev = " + str(prev) + " trn.end = " + str(trn.end))
        if (prev < trn.end):
            #print("    adding intron from " + str(prev) + " to " + str(trn.end))
            trn.add_intron(prev, trn.end) #0-indexed
        #trn.sort()

chr_list[1].show()
for thing in chr_list[1].transcripts[0].exon_intron:
    print(str(type(thing)) + " " + str(thing.start) + " " + str(thing.end))

chr2: 10476 transcripts
<class '__main__.Exon'> 38813 41627
<class '__main__.Intron'> 41627 45439
<class '__main__.Exon'> 45439 46505


In [33]:
chr_list[0].show() #gene on positive strand
for thingy in chr_list[0].transcripts[0].exon_intron:
    thingy.show()
print()
chr_list[1].show() #gene on negative strand
for thingy in chr_list[1].transcripts[0].exon_intron:
    thingy.show()

chr1: 13955 transcripts
exon: 65418 65433 True 1
intron: 65433 65519 True
exon: 65519 65573 True 2
intron: 65573 69036 True
exon: 69036 71585 True 2

chr2: 10476 transcripts
exon: 38813 41627 False 2
intron: 41627 45439 False
exon: 45439 46505 False 2


# Converting data to 200bp windows (bed)

In [34]:
#Option 1: pcg_windows_sorted_200.bed (extends the non-200bp windows to 200bp)
#Option 2: pcg_windows_sorted_rem-not200.bed (removes non-200bp windows)
#will be using option 2

In [35]:
f = open('pcg_windows_sorted_rem-not200.bed')
windows = f.readlines()
for i in range(len(windows)):
    windows[i] = windows[i].split()
    windows[i][1] = int(windows[i][1])
    windows[i][2] = int(windows[i][2])
    for n in range(4): #4 categories
        windows[i].append(0.0)
    windows[i].append(0) #intron/exon boolean column
    windows[i].append(0) #cds/utr boolean column
windows

[['chr1', 65418, 65618, 0.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 65518, 65718, 0.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 65618, 65818, 0.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 65718, 65918, 0.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 65818, 66018, 0.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 65918, 66118, 0.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 66018, 66218, 0.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 66118, 66318, 0.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 66218, 66418, 0.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 66318, 66518, 0.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 66418, 66618, 0.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 66518, 66718, 0.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 66618, 66818, 0.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 66718, 66918, 0.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 66818, 67018, 0.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 66918, 67118, 0.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 67018, 67218, 0.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 67118, 67318, 0.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 67218, 67418, 0.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 67318, 67518, 0.0, 0.

In [36]:
#for each window
###limit range to appropriate chromosome
###ignore transcripts that don't overlap with range
###find all introns, CDSs, and UTRs (5' and 3') that overlap with window
###sum up totals for each of the four categories
###find percentages (as decimals) and save those values into the column

In [37]:
def findFraction(s1, e1, s2, e2):  #s2/e2 is the window
    #assumes they overlap
    fraction = 200
    #if (s1 > s2):
        #fraction -= (s1 - s2)
    fraction -= (s1 > s2) * (s1 - s2)
    fraction -= (e2 > e1) * (e2 - e1)
    #if (e2 > e1):
        #fraction -= (e2 - e1)
    return fraction

In [38]:
def hasOverlap(s1, e1, s2, e2): 
    if ((s1 >= e2) or (s2 >= e1)): #if there is no overlap
        return False
    return True

In [39]:
import time

In [40]:
len(windows)

13621619

In [41]:
chr_list[0].show()
for thingy in chr_list[0].transcripts[0].exon_intron:
    print()
    thingy.show()
    if (type(thingy) == Exon):
        for thing in thingy.cds_utr:
            thing.show()

chr1: 13955 transcripts

exon: 65418 65433 True 1
5' UTR: 65418 65433 True 

intron: 65433 65519 True

exon: 65519 65573 True 2
5' UTR: 65519 65564 True 
CDS: 65564 65573 True

intron: 65573 69036 True

exon: 69036 71585 True 2
CDS: 69036 70005 True
3' UTR: 70005 71585 True 


In [35]:
windows_200bp = windows.copy()
ignoreTheseIndices = []

start = time.time()

for i in range(len(windows_200bp)):
    if (i % 10000 == 0):
        print(i)
    if (i == 1000000):
        break
    window = windows_200bp[i]
    intron = 0.0
    cds = 0.0
    utr5 = 0.0
    utr3 = 0.0
    if (window[0] in rightChrs):
        chrm = chr_list[int(window[0][3:]) - 1]
    else:
        print('invalid chromosome name')
        break
    for trn in chrm.transcripts:
        if (window[2] < trn.start or window[1] > trn.end):
            continue
        for x in trn.exon_intron:
            if (type(x) == Intron):
                if (hasOverlap(x.start, x.end, window[1], window[2])):
                    intron += findFraction(x.start, x.end, window[1], window[2])
                continue
            for ind in range(len(x.cds_utr)):
                y = x.cds_utr[ind]
                if (not hasOverlap(y.start, y.end, window[1], window[2])):
                    continue
                frac = findFraction(y.start, y.end, window[1], window[2]) 
                if (type(y) == Cds):
                    cds += frac
                    ###make window[8] equal to 1 if on boundary btwn cds & utr
                    has_boundary1 = ind > 0 and type(x.cds_utr[ind - 1]) == Utr
                    has_boundary2 = ind < (len(x.cds_utr) - 1) and type(x.cds_utr[ind + 1]) == Utr
                    is_boundary1 = has_boundary1 and window[1] < y.start and y.start < window[2]
                    is_boundary2 = has_boundary2 and window[1] < y.end and y.end < window[2]
                    window[8] = int(window[8] or is_boundary1 or is_boundary2)
                elif (y.is_five):
                    utr5 += frac
                else:
                    utr3 += frac
    total = intron + cds + utr5 + utr3
    if (total == 0):
        ignoreTheseIndices.append(i)
        continue
    window[3] = intron / total
    window[4] = cds / total
    window[5] = utr5 / total
    window[6] = utr3 / total
    
    ###make window[7] equal to 1 if window[3] < 1 (a.k.a. on intron/exon boundary)
    window[7] = int(0 < window[3] and window[3] < 1)

print(str((time.time() - start) / 60) + " minutes")

for i in range(len(windows_200bp)): #this shouldn't print anything if everything's correct
    window = windows_200bp[i]
    sum = window[3] + window[4] + window[5] + window[6]
    if (abs(sum - 1.0) > 0.000000001 and abs(sum - 0.0) > 0.000000001): #adds up to 1, leeway for rounding errors
        print("hi")
        print(window)
        print(sum)

print("ignore these indices:")
print(ignoreTheseIndices)

windows_200bp[:1000000]

0
10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000
160000
170000
180000
190000
200000
210000
220000
230000
240000
250000
260000
270000
280000
290000
300000
310000
320000
330000
340000
350000
360000
370000
380000
390000
400000
410000
420000
430000
440000
450000
460000
470000
480000
490000
500000
510000
520000
530000
540000
550000
560000
570000
580000
590000
600000
610000
620000
630000
640000
650000
660000
670000
680000
690000
700000
710000
720000
730000
740000
750000
760000
770000
780000
790000
800000
810000
820000
830000
840000
850000
860000
870000
880000
890000
900000
910000
920000
930000
940000
950000
960000
970000
980000
990000
1000000
53.022200254599255 minutes
ignore these indices:
[20787, 66938, 66939, 98466, 120808, 120809, 120810, 120811, 142915, 142916, 142917, 142918, 143803, 143804, 143805, 155524, 155525, 155526, 155527, 155528, 155529, 155530, 155531, 155532, 155533, 155534, 156548, 156549, 156550, 156551, 156552, 156553, 1716

[['chr1', 65418, 65618, 0.655, 0.045, 0.3, 0.0, 1, 1],
 ['chr1', 65518, 65718, 0.73, 0.045, 0.225, 0.0, 1, 1],
 ['chr1', 65618, 65818, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 65718, 65918, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 65818, 66018, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 65918, 66118, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 66018, 66218, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 66118, 66318, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 66218, 66418, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 66318, 66518, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 66418, 66618, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 66518, 66718, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 66618, 66818, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 66718, 66918, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 66818, 67018, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 66918, 67118, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 67018, 67218, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 67118, 67318, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 67218, 67418, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 67318, 67518

In [29]:
import os

In [36]:
if os.path.isfile('6col-windows-pt01.bed'):
    os.remove('6col-windows-pt01.bed')

myFile = open('6col-windows-pt01.bed','w')

v1 = False

if (v1):
    for i in range(len(windows_200bp)):
        if (i == 1000000):
            break
        if (i in ignoreTheseIndices):
            print(i)
            continue
        window = windows_200bp[i]
        for j in range(len(window)):
            myFile.write(str(window[j]))
            myFile.write('\t')
        myFile.write('\n')
else:
    for window in windows_200bp[:1000000]:
        val = [str(item) for item in window]
        myFile.write('\t'.join(val) + '\n')
myFile.close()

In [37]:
windows_200bp = windows.copy()
ignoreTheseIndices = []

start = time.time()

for i in range(len(windows_200bp)):
    if (i < 1000000):
        continue
    if (i % 10000 == 0):
        print(i)
    if (i == 2000000):
        break
    window = windows_200bp[i]
    intron = 0.0
    cds = 0.0
    utr5 = 0.0
    utr3 = 0.0
    if (window[0] in rightChrs):
        chrm = chr_list[int(window[0][3:]) - 1]
    else:
        print('invalid chromosome name')
        break
    for trn in chrm.transcripts:
        if (window[2] < trn.start or window[1] > trn.end):
            continue
        for x in trn.exon_intron:
            if (type(x) == Intron):
                if (hasOverlap(x.start, x.end, window[1], window[2])):
                    intron += findFraction(x.start, x.end, window[1], window[2])
                continue
            for ind in range(len(x.cds_utr)):
                y = x.cds_utr[ind]
                if (not hasOverlap(y.start, y.end, window[1], window[2])):
                    continue
                frac = findFraction(y.start, y.end, window[1], window[2]) 
                if (type(y) == Cds):
                    cds += frac
                    ###make window[8] equal to 1 if on boundary btwn cds & utr
                    has_boundary1 = ind > 0 and type(x.cds_utr[ind - 1]) == Utr
                    has_boundary2 = ind < (len(x.cds_utr) - 1) and type(x.cds_utr[ind + 1]) == Utr
                    is_boundary1 = has_boundary1 and window[1] < y.start and y.start < window[2]
                    is_boundary2 = has_boundary2 and window[1] < y.end and y.end < window[2]
                    window[8] = int(window[8] or is_boundary1 or is_boundary2)
                elif (y.is_five):
                    utr5 += frac
                else:
                    utr3 += frac
    total = intron + cds + utr5 + utr3
    if (total == 0):
        ignoreTheseIndices.append(i)
        continue
    window[3] = intron / total
    window[4] = cds / total
    window[5] = utr5 / total
    window[6] = utr3 / total
    
    ###make window[7] equal to 1 if window[3] < 1 (a.k.a. on intron/exon boundary)
    window[7] = int(0 < window[3] and window[3] < 1)

print(str((time.time() - start) / 60) + " minutes")

for i in range(len(windows_200bp)): #this shouldn't print anything if everything's correct
    window = windows_200bp[i]
    sum = window[3] + window[4] + window[5] + window[6]
    if (abs(sum - 1.0) > 0.000000001 and abs(sum - 0.0) > 0.000000001): #adds up to 1, leeway for rounding errors
        print("hi")
        print(window)
        print(sum)

print("ignore these indices:")
print(ignoreTheseIndices)

windows_200bp[1000000:2000000]

1000000
1010000
1020000
1030000
1040000
1050000
1060000
1070000
1080000
1090000
1100000
1110000
1120000
1130000
1140000
1150000
1160000
1170000
1180000
1190000
1200000
1210000
1220000
1230000
1240000
1250000
1260000
1270000
1280000
1290000
1300000
1310000
1320000
1330000
1340000
1350000
1360000
1370000
1380000
1390000
1400000
1410000
1420000
1430000
1440000
1450000
1460000
1470000
1480000
1490000
1500000
1510000
1520000
1530000
1540000
1550000
1560000
1570000
1580000
1590000
1600000
1610000
1620000
1630000
1640000
1650000
1660000
1670000
1680000
1690000
1700000
1710000
1720000
1730000
1740000
1750000
1760000
1770000
1780000
1790000
1800000
1810000
1820000
1830000
1840000
1850000
1860000
1870000
1880000
1890000
1900000
1910000
1920000
1930000
1940000
1950000
1960000
1970000
1980000
1990000
2000000
30.884448285897573 minutes
ignore these indices:
[1020900, 1020901, 1021959, 1021960, 1021961, 1021962, 1021963, 1021964, 1062920, 1062921, 1062922, 1062923, 1062924, 1062925, 1062926, 1062927

[['chr1', 65418, 65618, 0.655, 0.045, 0.3, 0.0, 1, 1],
 ['chr1', 65518, 65718, 0.73, 0.045, 0.225, 0.0, 1, 1],
 ['chr1', 65618, 65818, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 65718, 65918, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 65818, 66018, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 65918, 66118, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 66018, 66218, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 66118, 66318, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 66218, 66418, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 66318, 66518, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 66418, 66618, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 66518, 66718, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 66618, 66818, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 66718, 66918, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 66818, 67018, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 66918, 67118, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 67018, 67218, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 67118, 67318, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 67218, 67418, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 67318, 67518

In [38]:
if os.path.isfile('6col-windows-pt02.bed'):
    os.remove('6col-windows-pt02.bed')

myFile = open('6col-windows-pt02.bed','w')

for window in windows_200bp[1000000:2000000]:
    val = [str(item) for item in window]
    myFile.write('\t'.join(val) + '\n')
myFile.close()

In [39]:
windows_200bp = windows.copy()
ignoreTheseIndices = []

start = time.time()

for i in range(len(windows_200bp)):
    if (i < 2000000):
        continue
    if (i % 10000 == 0):
        print(i)
    if (i == 3000000):
        break
    window = windows_200bp[i]
    intron = 0.0
    cds = 0.0
    utr5 = 0.0
    utr3 = 0.0
    if (window[0] in rightChrs):
        chrm = chr_list[int(window[0][3:]) - 1]
    else:
        print('invalid chromosome name')
        break
    for trn in chrm.transcripts:
        if (window[2] < trn.start or window[1] > trn.end):
            continue
        for x in trn.exon_intron:
            if (type(x) == Intron):
                if (hasOverlap(x.start, x.end, window[1], window[2])):
                    intron += findFraction(x.start, x.end, window[1], window[2])
                continue
            for ind in range(len(x.cds_utr)):
                y = x.cds_utr[ind]
                if (not hasOverlap(y.start, y.end, window[1], window[2])):
                    continue
                frac = findFraction(y.start, y.end, window[1], window[2]) 
                if (type(y) == Cds):
                    cds += frac
                    ###make window[8] equal to 1 if on boundary btwn cds & utr
                    has_boundary1 = ind > 0 and type(x.cds_utr[ind - 1]) == Utr
                    has_boundary2 = ind < (len(x.cds_utr) - 1) and type(x.cds_utr[ind + 1]) == Utr
                    is_boundary1 = has_boundary1 and window[1] < y.start and y.start < window[2]
                    is_boundary2 = has_boundary2 and window[1] < y.end and y.end < window[2]
                    window[8] = int(window[8] or is_boundary1 or is_boundary2)
                elif (y.is_five):
                    utr5 += frac
                else:
                    utr3 += frac
    total = intron + cds + utr5 + utr3
    if (total == 0):
        ignoreTheseIndices.append(i)
        continue
    window[3] = intron / total
    window[4] = cds / total
    window[5] = utr5 / total
    window[6] = utr3 / total
    
    ###make window[7] equal to 1 if window[3] < 1 (a.k.a. on intron/exon boundary)
    window[7] = int(0 < window[3] and window[3] < 1)

print(str((time.time() - start) / 60) + " minutes")

for i in range(len(windows_200bp)): #this shouldn't print anything if everything's correct
    window = windows_200bp[i]
    sum = window[3] + window[4] + window[5] + window[6]
    if (abs(sum - 1.0) > 0.000000001 and abs(sum - 0.0) > 0.000000001): #adds up to 1, leeway for rounding errors
        print("hi")
        print(window)
        print(sum)

print("ignore these indices:")
print(ignoreTheseIndices)

windows_200bp[2000000:3000000]

2000000
2010000
2020000
2030000
2040000
2050000
2060000
2070000
2080000
2090000
2100000
2110000
2120000
2130000
2140000
2150000
2160000
2170000
2180000
2190000
2200000
2210000
2220000
2230000
2240000
2250000
2260000
2270000
2280000
2290000
2300000
2310000
2320000
2330000
2340000
2350000
2360000
2370000
2380000
2390000
2400000
2410000
2420000
2430000
2440000
2450000
2460000
2470000
2480000
2490000
2500000
2510000
2520000
2530000
2540000
2550000
2560000
2570000
2580000
2590000
2600000
2610000
2620000
2630000
2640000
2650000
2660000
2670000
2680000
2690000
2700000
2710000
2720000
2730000
2740000
2750000
2760000
2770000
2780000
2790000
2800000
2810000
2820000
2830000
2840000
2850000
2860000
2870000
2880000
2890000
2900000
2910000
2920000
2930000
2940000
2950000
2960000
2970000
2980000
2990000
3000000
38.26554145415624 minutes
ignore these indices:
[2044657, 2044658, 2052717, 2052718, 2052719, 2052720, 2052721, 2052722, 2052723, 2052724, 2052725, 2052726, 2052727, 2052728, 2052729, 2052730,

[['chr1', 65418, 65618, 0.655, 0.045, 0.3, 0.0, 1, 1],
 ['chr1', 65518, 65718, 0.73, 0.045, 0.225, 0.0, 1, 1],
 ['chr1', 65618, 65818, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 65718, 65918, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 65818, 66018, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 65918, 66118, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 66018, 66218, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 66118, 66318, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 66218, 66418, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 66318, 66518, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 66418, 66618, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 66518, 66718, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 66618, 66818, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 66718, 66918, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 66818, 67018, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 66918, 67118, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 67018, 67218, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 67118, 67318, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 67218, 67418, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 67318, 67518

In [40]:
if os.path.isfile('6col-windows-pt03.bed'):
    os.remove('6col-windows-pt03.bed')

myFile = open('6col-windows-pt03.bed','w')

for window in windows_200bp[2000000:3000000]:
    val = [str(item) for item in window]
    myFile.write('\t'.join(val) + '\n')
myFile.close()

In [41]:
windows_200bp = windows.copy()
ignoreTheseIndices = []

start = time.time()

for i in range(len(windows_200bp)):
    if (i < 3000000):
        continue
    if (i % 10000 == 0):
        print(i)
    if (i == 4000000):
        break
    window = windows_200bp[i]
    intron = 0.0
    cds = 0.0
    utr5 = 0.0
    utr3 = 0.0
    if (window[0] in rightChrs):
        chrm = chr_list[int(window[0][3:]) - 1]
    else:
        print('invalid chromosome name')
        break
    for trn in chrm.transcripts:
        if (window[2] < trn.start or window[1] > trn.end):
            continue
        for x in trn.exon_intron:
            if (type(x) == Intron):
                if (hasOverlap(x.start, x.end, window[1], window[2])):
                    intron += findFraction(x.start, x.end, window[1], window[2])
                continue
            for ind in range(len(x.cds_utr)):
                y = x.cds_utr[ind]
                if (not hasOverlap(y.start, y.end, window[1], window[2])):
                    continue
                frac = findFraction(y.start, y.end, window[1], window[2]) 
                if (type(y) == Cds):
                    cds += frac
                    ###make window[8] equal to 1 if on boundary btwn cds & utr
                    has_boundary1 = ind > 0 and type(x.cds_utr[ind - 1]) == Utr
                    has_boundary2 = ind < (len(x.cds_utr) - 1) and type(x.cds_utr[ind + 1]) == Utr
                    is_boundary1 = has_boundary1 and window[1] < y.start and y.start < window[2]
                    is_boundary2 = has_boundary2 and window[1] < y.end and y.end < window[2]
                    window[8] = int(window[8] or is_boundary1 or is_boundary2)
                elif (y.is_five):
                    utr5 += frac
                else:
                    utr3 += frac
    total = intron + cds + utr5 + utr3
    if (total == 0):
        ignoreTheseIndices.append(i)
        continue
    window[3] = intron / total
    window[4] = cds / total
    window[5] = utr5 / total
    window[6] = utr3 / total
    
    ###make window[7] equal to 1 if window[3] < 1 (a.k.a. on intron/exon boundary)
    window[7] = int(0 < window[3] and window[3] < 1)

print(str((time.time() - start) / 60) + " minutes")

for i in range(len(windows_200bp)): #this shouldn't print anything if everything's correct
    window = windows_200bp[i]
    sum = window[3] + window[4] + window[5] + window[6]
    if (abs(sum - 1.0) > 0.000000001 and abs(sum - 0.0) > 0.000000001): #adds up to 1, leeway for rounding errors
        print("hi")
        print(window)
        print(sum)

print("ignore these indices:")
print(ignoreTheseIndices)

windows_200bp[3000000:4000000]

3000000
3010000
3020000
3030000
3040000
3050000
3060000
3070000
3080000
3090000
3100000
3110000
3120000
3130000
3140000
3150000
3160000
3170000
3180000
3190000
3200000
3210000
3220000
3230000
3240000
3250000
3260000
3270000
3280000
3290000
3300000
3310000
3320000
3330000
3340000
3350000
3360000
3370000
3380000
3390000
3400000
3410000
3420000
3430000
3440000
3450000
3460000
3470000
3480000
3490000
3500000
3510000
3520000
3530000
3540000
3550000
3560000
3570000
3580000
3590000
3600000
3610000
3620000
3630000
3640000
3650000
3660000
3670000
3680000
3690000
3700000
3710000
3720000
3730000
3740000
3750000
3760000
3770000
3780000
3790000
3800000
3810000
3820000
3830000
3840000
3850000
3860000
3870000
3880000
3890000
3900000
3910000
3920000
3930000
3940000
3950000
3960000
3970000
3980000
3990000
4000000
21.45166195631027 minutes
ignore these indices:
[3054415, 3054416, 3089794, 3112659, 3121881, 3121882, 3128548, 3128549, 3128550, 3156037, 3156038, 3156039, 3156040, 3156041, 3157232, 3157233,

[['chr1', 65418, 65618, 0.655, 0.045, 0.3, 0.0, 1, 1],
 ['chr1', 65518, 65718, 0.73, 0.045, 0.225, 0.0, 1, 1],
 ['chr1', 65618, 65818, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 65718, 65918, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 65818, 66018, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 65918, 66118, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 66018, 66218, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 66118, 66318, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 66218, 66418, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 66318, 66518, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 66418, 66618, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 66518, 66718, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 66618, 66818, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 66718, 66918, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 66818, 67018, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 66918, 67118, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 67018, 67218, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 67118, 67318, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 67218, 67418, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 67318, 67518

In [42]:
if os.path.isfile('6col-windows-pt04.bed'):
    os.remove('6col-windows-pt04.bed')

myFile = open('6col-windows-pt04.bed','w')

for window in windows_200bp[3000000:4000000]:
    val = [str(item) for item in window]
    myFile.write('\t'.join(val) + '\n')
myFile.close()

In [43]:
windows_200bp = windows.copy()
ignoreTheseIndices = []

start = time.time()

for i in range(len(windows_200bp)):
    if (i < 4000000):
        continue
    if (i % 10000 == 0):
        print(i)
    if (i == 5000000):
        break
    window = windows_200bp[i]
    intron = 0.0
    cds = 0.0
    utr5 = 0.0
    utr3 = 0.0
    if (window[0] in rightChrs):
        chrm = chr_list[int(window[0][3:]) - 1]
    else:
        print('invalid chromosome name')
        break
    for trn in chrm.transcripts:
        if (window[2] < trn.start or window[1] > trn.end):
            continue
        for x in trn.exon_intron:
            if (type(x) == Intron):
                if (hasOverlap(x.start, x.end, window[1], window[2])):
                    intron += findFraction(x.start, x.end, window[1], window[2])
                continue
            for ind in range(len(x.cds_utr)):
                y = x.cds_utr[ind]
                if (not hasOverlap(y.start, y.end, window[1], window[2])):
                    continue
                frac = findFraction(y.start, y.end, window[1], window[2]) 
                if (type(y) == Cds):
                    cds += frac
                    ###make window[8] equal to 1 if on boundary btwn cds & utr
                    has_boundary1 = ind > 0 and type(x.cds_utr[ind - 1]) == Utr
                    has_boundary2 = ind < (len(x.cds_utr) - 1) and type(x.cds_utr[ind + 1]) == Utr
                    is_boundary1 = has_boundary1 and window[1] < y.start and y.start < window[2]
                    is_boundary2 = has_boundary2 and window[1] < y.end and y.end < window[2]
                    window[8] = int(window[8] or is_boundary1 or is_boundary2)
                elif (y.is_five):
                    utr5 += frac
                else:
                    utr3 += frac
    total = intron + cds + utr5 + utr3
    if (total == 0):
        ignoreTheseIndices.append(i)
        continue
    window[3] = intron / total
    window[4] = cds / total
    window[5] = utr5 / total
    window[6] = utr3 / total
    
    ###make window[7] equal to 1 if window[3] < 1 (a.k.a. on intron/exon boundary)
    window[7] = int(0 < window[3] and window[3] < 1)

print(str((time.time() - start) / 60) + " minutes")

for i in range(len(windows_200bp)): #this shouldn't print anything if everything's correct
    window = windows_200bp[i]
    sum = window[3] + window[4] + window[5] + window[6]
    if (abs(sum - 1.0) > 0.000000001 and abs(sum - 0.0) > 0.000000001): #adds up to 1, leeway for rounding errors
        print("hi")
        print(window)
        print(sum)

print("ignore these indices:")
print(ignoreTheseIndices)

windows_200bp[4000000:5000000]

4000000
4010000
4020000
4030000
4040000
4050000
4060000
4070000
4080000
4090000
4100000
4110000
4120000
4130000
4140000
4150000
4160000
4170000
4180000
4190000
4200000
4210000
4220000
4230000
4240000
4250000
4260000
4270000
4280000
4290000
4300000
4310000
4320000
4330000
4340000
4350000
4360000
4370000
4380000
4390000
4400000
4410000
4420000
4430000
4440000
4450000
4460000
4470000
4480000
4490000
4500000
4510000
4520000
4530000
4540000
4550000
4560000
4570000
4580000
4590000
4600000
4610000
4620000
4630000
4640000
4650000
4660000
4670000
4680000
4690000
4700000
4710000
4720000
4730000
4740000
4750000
4760000
4770000
4780000
4790000
4800000
4810000
4820000
4830000
4840000
4850000
4860000
4870000
4880000
4890000
4900000
4910000
4920000
4930000
4940000
4950000
4960000
4970000
4980000
4990000
5000000
27.197817413012185 minutes
ignore these indices:
[4011132, 4011133, 4011134, 4011135, 4011136, 4011137, 4011138, 4011139, 4011140, 4011141, 4011142, 4011143, 4011144, 4011145, 4011146, 4011147

[['chr1', 65418, 65618, 0.655, 0.045, 0.3, 0.0, 1, 1],
 ['chr1', 65518, 65718, 0.73, 0.045, 0.225, 0.0, 1, 1],
 ['chr1', 65618, 65818, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 65718, 65918, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 65818, 66018, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 65918, 66118, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 66018, 66218, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 66118, 66318, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 66218, 66418, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 66318, 66518, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 66418, 66618, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 66518, 66718, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 66618, 66818, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 66718, 66918, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 66818, 67018, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 66918, 67118, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 67018, 67218, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 67118, 67318, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 67218, 67418, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 67318, 67518

In [44]:
if os.path.isfile('6col-windows-pt05.bed'):
    os.remove('6col-windows-pt05.bed')

myFile = open('6col-windows-pt05.bed','w')

for window in windows_200bp[4000000:5000000]:
    val = [str(item) for item in window]
    myFile.write('\t'.join(val) + '\n')
myFile.close()

In [45]:
windows_200bp = windows.copy()
ignoreTheseIndices = []

start = time.time()

for i in range(len(windows_200bp)):
    if (i < 5000000):
        continue
    if (i % 10000 == 0):
        print(i)
    if (i == 6000000):
        break
    window = windows_200bp[i]
    intron = 0.0
    cds = 0.0
    utr5 = 0.0
    utr3 = 0.0
    if (window[0] in rightChrs):
        chrm = chr_list[int(window[0][3:]) - 1]
    else:
        print('invalid chromosome name')
        break
    for trn in chrm.transcripts:
        if (window[2] < trn.start or window[1] > trn.end):
            continue
        for x in trn.exon_intron:
            if (type(x) == Intron):
                if (hasOverlap(x.start, x.end, window[1], window[2])):
                    intron += findFraction(x.start, x.end, window[1], window[2])
                continue
            for ind in range(len(x.cds_utr)):
                y = x.cds_utr[ind]
                if (not hasOverlap(y.start, y.end, window[1], window[2])):
                    continue
                frac = findFraction(y.start, y.end, window[1], window[2]) 
                if (type(y) == Cds):
                    cds += frac
                    ###make window[8] equal to 1 if on boundary btwn cds & utr
                    has_boundary1 = ind > 0 and type(x.cds_utr[ind - 1]) == Utr
                    has_boundary2 = ind < (len(x.cds_utr) - 1) and type(x.cds_utr[ind + 1]) == Utr
                    is_boundary1 = has_boundary1 and window[1] < y.start and y.start < window[2]
                    is_boundary2 = has_boundary2 and window[1] < y.end and y.end < window[2]
                    window[8] = int(window[8] or is_boundary1 or is_boundary2)
                elif (y.is_five):
                    utr5 += frac
                else:
                    utr3 += frac
    total = intron + cds + utr5 + utr3
    if (total == 0):
        ignoreTheseIndices.append(i)
        continue
    window[3] = intron / total
    window[4] = cds / total
    window[5] = utr5 / total
    window[6] = utr3 / total
    
    ###make window[7] equal to 1 if window[3] < 1 (a.k.a. on intron/exon boundary)
    window[7] = int(0 < window[3] and window[3] < 1)

print(str((time.time() - start) / 60) + " minutes")

for i in range(len(windows_200bp)): #this shouldn't print anything if everything's correct
    window = windows_200bp[i]
    sum = window[3] + window[4] + window[5] + window[6]
    if (abs(sum - 1.0) > 0.000000001 and abs(sum - 0.0) > 0.000000001): #adds up to 1, leeway for rounding errors
        print("hi")
        print(window)
        print(sum)

print("ignore these indices:")
print(ignoreTheseIndices)

windows_200bp[5000000:6000000]

5000000
5010000
5020000
5030000
5040000
5050000
5060000
5070000
5080000
5090000
5100000
5110000
5120000
5130000
5140000
5160000
5170000
5180000
5190000
5200000
5210000
5220000
5230000
5240000
5250000
5260000
5270000
5280000
5290000
5300000
5310000
5320000
5330000
5340000
5350000
5360000
5370000
5380000
5390000
5400000
5410000
5420000
5430000
5440000
5450000
5460000
5470000
5480000
5490000
5500000
5510000
5520000
5530000
5540000
5550000
5560000
5570000
5580000
5590000
5600000
5610000
5620000
5630000
5640000
5650000
5660000
5670000
5680000
5690000
5700000
5710000
5720000
5730000
5740000
5750000
5760000
5770000
5780000
5790000
5800000
5810000
5820000
5830000
5840000
5850000
5860000
5870000
5880000
5890000
5900000
5910000
5920000
5930000
5940000
5950000
5960000
5970000
5980000
5990000
6000000
31.52472451130549 minutes
ignore these indices:
[5023010, 5023011, 5023012, 5023098, 5023099, 5048952, 5048953, 5054295, 5054296, 5060494, 5060495, 5061104, 5061105, 5061106, 5074979, 5074980, 5074981

[['chr1', 65418, 65618, 0.655, 0.045, 0.3, 0.0, 1, 1],
 ['chr1', 65518, 65718, 0.73, 0.045, 0.225, 0.0, 1, 1],
 ['chr1', 65618, 65818, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 65718, 65918, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 65818, 66018, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 65918, 66118, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 66018, 66218, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 66118, 66318, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 66218, 66418, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 66318, 66518, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 66418, 66618, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 66518, 66718, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 66618, 66818, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 66718, 66918, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 66818, 67018, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 66918, 67118, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 67018, 67218, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 67118, 67318, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 67218, 67418, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 67318, 67518

In [46]:
if os.path.isfile('6col-windows-pt06.bed'):
    os.remove('6col-windows-pt06.bed')

myFile = open('6col-windows-pt06.bed','w')

for window in windows_200bp[5000000:6000000]:
    val = [str(item) for item in window]
    myFile.write('\t'.join(val) + '\n')
myFile.close()

In [47]:
windows_200bp = windows.copy()
ignoreTheseIndices = []

start = time.time()

for i in range(len(windows_200bp)):
    if (i < 6000000):
        continue
    if (i % 10000 == 0):
        print(i)
    if (i == 7000000):
        break
    window = windows_200bp[i]
    intron = 0.0
    cds = 0.0
    utr5 = 0.0
    utr3 = 0.0
    if (window[0] in rightChrs):
        chrm = chr_list[int(window[0][3:]) - 1]
    else:
        print('invalid chromosome name')
        break
    for trn in chrm.transcripts:
        if (window[2] < trn.start or window[1] > trn.end):
            continue
        for x in trn.exon_intron:
            if (type(x) == Intron):
                if (hasOverlap(x.start, x.end, window[1], window[2])):
                    intron += findFraction(x.start, x.end, window[1], window[2])
                continue
            for ind in range(len(x.cds_utr)):
                y = x.cds_utr[ind]
                if (not hasOverlap(y.start, y.end, window[1], window[2])):
                    continue
                frac = findFraction(y.start, y.end, window[1], window[2]) 
                if (type(y) == Cds):
                    cds += frac
                    ###make window[8] equal to 1 if on boundary btwn cds & utr
                    has_boundary1 = ind > 0 and type(x.cds_utr[ind - 1]) == Utr
                    has_boundary2 = ind < (len(x.cds_utr) - 1) and type(x.cds_utr[ind + 1]) == Utr
                    is_boundary1 = has_boundary1 and window[1] < y.start and y.start < window[2]
                    is_boundary2 = has_boundary2 and window[1] < y.end and y.end < window[2]
                    window[8] = int(window[8] or is_boundary1 or is_boundary2)
                elif (y.is_five):
                    utr5 += frac
                else:
                    utr3 += frac
    total = intron + cds + utr5 + utr3
    if (total == 0):
        ignoreTheseIndices.append(i)
        continue
    window[3] = intron / total
    window[4] = cds / total
    window[5] = utr5 / total
    window[6] = utr3 / total
    
    ###make window[7] equal to 1 if window[3] < 1 (a.k.a. on intron/exon boundary)
    window[7] = int(0 < window[3] and window[3] < 1)

print(str((time.time() - start) / 60) + " minutes")

for i in range(len(windows_200bp)): #this shouldn't print anything if everything's correct
    window = windows_200bp[i]
    sum = window[3] + window[4] + window[5] + window[6]
    if (abs(sum - 1.0) > 0.000000001 and abs(sum - 0.0) > 0.000000001): #adds up to 1, leeway for rounding errors
        print("hi")
        print(window)
        print(sum)

print("ignore these indices:")
print(ignoreTheseIndices)

windows_200bp[6000000:7000000]

6000000
6010000
6020000
6030000
6040000
6050000
6060000
6070000
6080000
6090000
6100000
6110000
6120000
6130000
6140000
6150000
6160000
6170000
6180000
6190000
6200000
6210000
6220000
6230000
6240000
6250000
6260000
6270000
6280000
6290000
6300000
6310000
6320000
6330000
6340000
6350000
6360000
6370000
6380000
6390000
6400000
6410000
6420000
6430000
6440000
6450000
6460000
6470000
6480000
6490000
6500000
6510000
6520000
6530000
6540000
6550000
6560000
6570000
6580000
6590000
6600000
6610000
6620000
6630000
6640000
6650000
6660000
6670000
6680000
6690000
6700000
6710000
6720000
6730000
6740000
6750000
6760000
6770000
6780000
6790000
6800000
6810000
6820000
6830000
6840000
6850000
6860000
6870000
6880000
6890000
6900000
6910000
6920000
6930000
6940000
6950000
6960000
6970000
6980000
6990000
7000000
48.75632092555364 minutes
ignore these indices:
[6004482, 6004483, 6012728, 6030390, 6030391, 6030392, 6030393, 6030394, 6030395, 6030396, 6030397, 6030398, 6030399, 6040084, 6040085, 6040086,

[['chr1', 65418, 65618, 0.655, 0.045, 0.3, 0.0, 1, 1],
 ['chr1', 65518, 65718, 0.73, 0.045, 0.225, 0.0, 1, 1],
 ['chr1', 65618, 65818, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 65718, 65918, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 65818, 66018, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 65918, 66118, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 66018, 66218, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 66118, 66318, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 66218, 66418, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 66318, 66518, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 66418, 66618, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 66518, 66718, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 66618, 66818, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 66718, 66918, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 66818, 67018, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 66918, 67118, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 67018, 67218, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 67118, 67318, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 67218, 67418, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 67318, 67518

In [48]:
if os.path.isfile('6col-windows-pt07.bed'):
    os.remove('6col-windows-pt07.bed')

myFile = open('6col-windows-pt07.bed','w')

for window in windows_200bp[6000000:7000000]:
    val = [str(item) for item in window]
    myFile.write('\t'.join(val) + '\n')
myFile.close()

In [49]:
windows_200bp = windows.copy()
ignoreTheseIndices = []

start = time.time()

for i in range(len(windows_200bp)):
    if (i < 7000000):
        continue
    if (i % 10000 == 0):
        print(i)
    if (i == 8000000):
        break
    window = windows_200bp[i]
    intron = 0.0
    cds = 0.0
    utr5 = 0.0
    utr3 = 0.0
    if (window[0] in rightChrs):
        chrm = chr_list[int(window[0][3:]) - 1]
    else:
        print('invalid chromosome name')
        break
    for trn in chrm.transcripts:
        if (window[2] < trn.start or window[1] > trn.end):
            continue
        for x in trn.exon_intron:
            if (type(x) == Intron):
                if (hasOverlap(x.start, x.end, window[1], window[2])):
                    intron += findFraction(x.start, x.end, window[1], window[2])
                continue
            for ind in range(len(x.cds_utr)):
                y = x.cds_utr[ind]
                if (not hasOverlap(y.start, y.end, window[1], window[2])):
                    continue
                frac = findFraction(y.start, y.end, window[1], window[2]) 
                if (type(y) == Cds):
                    cds += frac
                    ###make window[8] equal to 1 if on boundary btwn cds & utr
                    has_boundary1 = ind > 0 and type(x.cds_utr[ind - 1]) == Utr
                    has_boundary2 = ind < (len(x.cds_utr) - 1) and type(x.cds_utr[ind + 1]) == Utr
                    is_boundary1 = has_boundary1 and window[1] < y.start and y.start < window[2]
                    is_boundary2 = has_boundary2 and window[1] < y.end and y.end < window[2]
                    window[8] = int(window[8] or is_boundary1 or is_boundary2)
                elif (y.is_five):
                    utr5 += frac
                else:
                    utr3 += frac
    total = intron + cds + utr5 + utr3
    if (total == 0):
        ignoreTheseIndices.append(i)
        continue
    window[3] = intron / total
    window[4] = cds / total
    window[5] = utr5 / total
    window[6] = utr3 / total
    
    ###make window[7] equal to 1 if window[3] < 1 (a.k.a. on intron/exon boundary)
    window[7] = int(0 < window[3] and window[3] < 1)

print(str((time.time() - start) / 60) + " minutes")

for i in range(len(windows_200bp)): #this shouldn't print anything if everything's correct
    window = windows_200bp[i]
    sum = window[3] + window[4] + window[5] + window[6]
    if (abs(sum - 1.0) > 0.000000001 and abs(sum - 0.0) > 0.000000001): #adds up to 1, leeway for rounding errors
        print("hi")
        print(window)
        print(sum)

print("ignore these indices:")
print(ignoreTheseIndices)

windows_200bp[7000000:8000000]

7000000
7010000
7020000
7030000
7040000
7050000
7060000
7070000
7080000
7090000
7100000
7110000
7120000
7130000
7140000
7150000
7160000
7170000
7180000
7190000
7200000
7210000
7220000
7230000
7240000
7250000
7260000
7270000
7280000
7290000
7300000
7310000
7320000
7330000
7340000
7350000
7360000
7370000
7380000
7390000
7400000
7410000
7420000
7430000
7440000
7450000
7460000
7470000
7480000
7490000
7500000
7510000
7520000
7530000
7540000
7550000
7560000
7570000
7580000
7590000
7600000
7610000
7620000
7630000
7640000
7650000
7660000
7670000
7680000
7690000
7700000
7710000
7720000
7730000
7740000
7750000
7760000
7770000
7780000
7790000
7800000
7810000
7820000
7830000
7840000
7850000
7860000
7870000
7880000
7890000
7900000
7910000
7920000
7930000
7940000
7950000
7960000
7970000
7980000
7990000
8000000
34.12101310491562 minutes
ignore these indices:
[7010199, 7010200, 7010201, 7010202, 7010203, 7010204, 7010205, 7010206, 7010207, 7010208, 7010209, 7010210, 7010594, 7010595, 7010596, 7010597,

[['chr1', 65418, 65618, 0.655, 0.045, 0.3, 0.0, 1, 1],
 ['chr1', 65518, 65718, 0.73, 0.045, 0.225, 0.0, 1, 1],
 ['chr1', 65618, 65818, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 65718, 65918, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 65818, 66018, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 65918, 66118, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 66018, 66218, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 66118, 66318, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 66218, 66418, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 66318, 66518, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 66418, 66618, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 66518, 66718, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 66618, 66818, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 66718, 66918, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 66818, 67018, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 66918, 67118, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 67018, 67218, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 67118, 67318, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 67218, 67418, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 67318, 67518

In [50]:
if os.path.isfile('6col-windows-pt08.bed'):
    os.remove('6col-windows-pt08.bed')

myFile = open('6col-windows-pt08.bed','w')

for window in windows_200bp[7000000:8000000]:
    val = [str(item) for item in window]
    myFile.write('\t'.join(val) + '\n')
myFile.close()

In [42]:
windows_200bp = windows.copy()
ignoreTheseIndices = []

start = time.time()

for i in range(len(windows_200bp)):
    if (i < 8000000):
        continue
    if (i % 10000 == 0):
        print(i)
    if (i == 9000000):
        break
    window = windows_200bp[i]
    intron = 0.0
    cds = 0.0
    utr5 = 0.0
    utr3 = 0.0
    if (window[0] in rightChrs):
        chrm = chr_list[int(window[0][3:]) - 1]
    else:
        print('invalid chromosome name')
        break
    for trn in chrm.transcripts:
        if (window[2] < trn.start or window[1] > trn.end):
            continue
        for x in trn.exon_intron:
            if (type(x) == Intron):
                if (hasOverlap(x.start, x.end, window[1], window[2])):
                    intron += findFraction(x.start, x.end, window[1], window[2])
                continue
            for ind in range(len(x.cds_utr)):
                y = x.cds_utr[ind]
                if (not hasOverlap(y.start, y.end, window[1], window[2])):
                    continue
                frac = findFraction(y.start, y.end, window[1], window[2]) 
                if (type(y) == Cds):
                    cds += frac
                    ###make window[8] equal to 1 if on boundary btwn cds & utr
                    has_boundary1 = ind > 0 and type(x.cds_utr[ind - 1]) == Utr
                    has_boundary2 = ind < (len(x.cds_utr) - 1) and type(x.cds_utr[ind + 1]) == Utr
                    is_boundary1 = has_boundary1 and window[1] < y.start and y.start < window[2]
                    is_boundary2 = has_boundary2 and window[1] < y.end and y.end < window[2]
                    window[8] = int(window[8] or is_boundary1 or is_boundary2)
                elif (y.is_five):
                    utr5 += frac
                else:
                    utr3 += frac
    total = intron + cds + utr5 + utr3
    if (total == 0):
        ignoreTheseIndices.append(i)
        continue
    window[3] = intron / total
    window[4] = cds / total
    window[5] = utr5 / total
    window[6] = utr3 / total
    
    ###make window[7] equal to 1 if window[3] < 1 (a.k.a. on intron/exon boundary)
    window[7] = int(0 < window[3] and window[3] < 1)

print(str((time.time() - start) / 60) + " minutes")

for i in range(len(windows_200bp)): #this shouldn't print anything if everything's correct
    window = windows_200bp[i]
    sum = window[3] + window[4] + window[5] + window[6]
    if (abs(sum - 1.0) > 0.000000001 and abs(sum - 0.0) > 0.000000001): #adds up to 1, leeway for rounding errors
        print("hi")
        print(window)
        print(sum)

print("ignore these indices:")
print(ignoreTheseIndices)

windows_200bp[8000000:9000000]

8000000
8010000
8020000
8030000
8040000
8050000
8060000
8070000
8080000
8090000
8100000
8110000
8120000
8130000
8140000
8150000
8160000
8170000
8180000
8190000
8200000
8210000
8220000
8230000
8240000
8250000
8260000
8270000
8280000
8290000
8300000
8310000
8320000
8330000
8340000
8350000
8360000
8370000
8380000
8390000
8400000
8410000
8420000
8430000
8440000
8450000
8460000
8470000
8480000
8490000
8500000
8510000
8520000
8530000
8540000
8550000
8560000
8570000
8580000
8590000
8600000
8610000
8620000
8630000
8640000
8650000
8660000
8670000
8680000
8690000
8700000
8710000
8720000
8730000
8740000
8750000
8760000
8770000
8780000
8790000
8800000
8810000
8820000
8830000
8840000
8850000
8860000
8870000
8880000
8890000
8900000
8910000
8920000
8930000
8940000
8950000
8960000
8970000
8980000
8990000
9000000
33.76397041082382 minutes
ignore these indices:
[8019980, 8019981, 8019982, 8019983, 8035180, 8035181, 8035182, 8035199, 8035236, 8035237, 8035238, 8037091, 8037092, 8045991, 8045992, 8062336,

[['chr1', 65418, 65618, 0.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 65518, 65718, 0.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 65618, 65818, 0.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 65718, 65918, 0.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 65818, 66018, 0.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 65918, 66118, 0.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 66018, 66218, 0.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 66118, 66318, 0.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 66218, 66418, 0.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 66318, 66518, 0.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 66418, 66618, 0.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 66518, 66718, 0.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 66618, 66818, 0.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 66718, 66918, 0.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 66818, 67018, 0.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 66918, 67118, 0.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 67018, 67218, 0.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 67118, 67318, 0.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 67218, 67418, 0.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr1', 67318, 67518, 0.0, 0.

In [45]:
if os.path.isfile('6col-windows-pt09.bed'):
    os.remove('6col-windows-pt09.bed')

myFile = open('6col-windows-pt09.bed','w')

for window in windows_200bp[8000000:9000000]:
    val = [str(item) for item in window]
    myFile.write('\t'.join(val) + '\n')
myFile.close()

In [46]:
windows_200bp = windows.copy()
ignoreTheseIndices = []

start = time.time()

for i in range(len(windows_200bp)):
    if (i < 9000000):
        continue
    if (i % 10000 == 0):
        print(i)
    if (i == 10000000):
        break
    window = windows_200bp[i]
    intron = 0.0
    cds = 0.0
    utr5 = 0.0
    utr3 = 0.0
    if (window[0] in rightChrs):
        chrm = chr_list[int(window[0][3:]) - 1]
    else:
        print('invalid chromosome name')
        break
    for trn in chrm.transcripts:
        if (window[2] < trn.start or window[1] > trn.end):
            continue
        for x in trn.exon_intron:
            if (type(x) == Intron):
                if (hasOverlap(x.start, x.end, window[1], window[2])):
                    intron += findFraction(x.start, x.end, window[1], window[2])
                continue
            for ind in range(len(x.cds_utr)):
                y = x.cds_utr[ind]
                if (not hasOverlap(y.start, y.end, window[1], window[2])):
                    continue
                frac = findFraction(y.start, y.end, window[1], window[2]) 
                if (type(y) == Cds):
                    cds += frac
                    ###make window[8] equal to 1 if on boundary btwn cds & utr
                    has_boundary1 = ind > 0 and type(x.cds_utr[ind - 1]) == Utr
                    has_boundary2 = ind < (len(x.cds_utr) - 1) and type(x.cds_utr[ind + 1]) == Utr
                    is_boundary1 = has_boundary1 and window[1] < y.start and y.start < window[2]
                    is_boundary2 = has_boundary2 and window[1] < y.end and y.end < window[2]
                    window[8] = int(window[8] or is_boundary1 or is_boundary2)
                elif (y.is_five):
                    utr5 += frac
                else:
                    utr3 += frac
    total = intron + cds + utr5 + utr3
    if (total == 0):
        ignoreTheseIndices.append(i)
        continue
    window[3] = intron / total
    window[4] = cds / total
    window[5] = utr5 / total
    window[6] = utr3 / total
    
    ###make window[7] equal to 1 if window[3] < 1 (a.k.a. on intron/exon boundary)
    window[7] = int(0 < window[3] and window[3] < 1)

print(str((time.time() - start) / 60) + " minutes")

for i in range(len(windows_200bp)): #this shouldn't print anything if everything's correct
    window = windows_200bp[i]
    sum = window[3] + window[4] + window[5] + window[6]
    if (abs(sum - 1.0) > 0.000000001 and abs(sum - 0.0) > 0.000000001): #adds up to 1, leeway for rounding errors
        print("hi")
        print(window)
        print(sum)

print("ignore these indices:")
print(ignoreTheseIndices)

windows_200bp[9000000:10000000]

9000000
9010000
9020000
9030000
9040000
9050000
9060000
9070000
9080000
9090000
9100000
9110000
9120000
9130000
9140000
9150000
9160000
9170000
9180000
9190000
9200000
9210000
9220000
9230000
9240000
9250000
9260000
9270000
9280000
9290000
9300000
9310000
9320000
9330000
9340000
9350000
9360000
9370000
9380000
9390000
9400000
9410000
9420000
9430000
9440000
9450000
9460000
9470000
9480000
9490000
9500000
9510000
9520000
9530000
9540000
9550000
9560000
9570000
9580000
9590000
9600000
9610000
9620000
9630000
9640000
9650000
9660000
9670000
9680000
9690000
9700000
9710000
9720000
9730000
9740000
9750000
9760000
9770000
9780000
9790000
9800000
9810000
9820000
9830000
9840000
9850000
9860000
9870000
9880000
9890000
9900000
9910000
9920000
9930000
9940000
9950000
9960000
9970000
9980000
9990000
10000000
23.69607275724411 minutes
ignore these indices:
[9069095, 9069096, 9076957, 9087140, 9087141, 9102939, 9147552, 9147553, 9147554, 9147555, 9147556, 9160516, 9160517, 9299458, 9334241, 9334242

[['chr4', 53399138, 53399338, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr4', 53399140, 53399340, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr4', 53399238, 53399438, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr4', 53399240, 53399440, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr4', 53399338, 53399538, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr4', 53399340, 53399540, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr4', 53399438, 53399638, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr4', 53399440, 53399640, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr4',
  53399538,
  53399738,
  0.965865992414665,
  0.028445006321112517,
  0.0,
  0.005689001264222503,
  1,
  0],
 ['chr4',
  53399540,
  53399740,
  0.9581749049429658,
  0.03485424588086185,
  0.0,
  0.00697084917617237,
  1,
  0],
 ['chr4',
  53399638,
  53399838,
  0.5267727930535456,
  0.394356005788712,
  0.0,
  0.0788712011577424,
  1,
  0],
 ['chr4',
  53399640,
  53399840,
  0.5217391304347826,
  0.39855072463768115,
  0.0,
  0.07971014492753623,
  1,
  0],
 ['chr4',
  53399738,
  53399938,
  0.5665236051502146,
  0.361230

In [47]:
if os.path.isfile('6col-windows-pt10.bed'):
    os.remove('6col-windows-pt10.bed')

myFile = open('6col-windows-pt10.bed','w')

for window in windows_200bp[9000000:10000000]:
    val = [str(item) for item in window]
    myFile.write('\t'.join(val) + '\n')
myFile.close()

In [48]:
windows_200bp = windows.copy()
ignoreTheseIndices = []

start = time.time()

for i in range(len(windows_200bp)):
    if (i < 10000000):
        continue
    if (i % 10000 == 0):
        print(i)
    if (i == 11000000):
        break
    window = windows_200bp[i]
    intron = 0.0
    cds = 0.0
    utr5 = 0.0
    utr3 = 0.0
    if (window[0] in rightChrs):
        chrm = chr_list[int(window[0][3:]) - 1]
    else:
        print('invalid chromosome name')
        break
    for trn in chrm.transcripts:
        if (window[2] < trn.start or window[1] > trn.end):
            continue
        for x in trn.exon_intron:
            if (type(x) == Intron):
                if (hasOverlap(x.start, x.end, window[1], window[2])):
                    intron += findFraction(x.start, x.end, window[1], window[2])
                continue
            for ind in range(len(x.cds_utr)):
                y = x.cds_utr[ind]
                if (not hasOverlap(y.start, y.end, window[1], window[2])):
                    continue
                frac = findFraction(y.start, y.end, window[1], window[2]) 
                if (type(y) == Cds):
                    cds += frac
                    ###make window[8] equal to 1 if on boundary btwn cds & utr
                    has_boundary1 = ind > 0 and type(x.cds_utr[ind - 1]) == Utr
                    has_boundary2 = ind < (len(x.cds_utr) - 1) and type(x.cds_utr[ind + 1]) == Utr
                    is_boundary1 = has_boundary1 and window[1] < y.start and y.start < window[2]
                    is_boundary2 = has_boundary2 and window[1] < y.end and y.end < window[2]
                    window[8] = int(window[8] or is_boundary1 or is_boundary2)
                elif (y.is_five):
                    utr5 += frac
                else:
                    utr3 += frac
    total = intron + cds + utr5 + utr3
    if (total == 0):
        ignoreTheseIndices.append(i)
        continue
    window[3] = intron / total
    window[4] = cds / total
    window[5] = utr5 / total
    window[6] = utr3 / total
    
    ###make window[7] equal to 1 if window[3] < 1 (a.k.a. on intron/exon boundary)
    window[7] = int(0 < window[3] and window[3] < 1)

print(str((time.time() - start) / 60) + " minutes")

for i in range(len(windows_200bp)): #this shouldn't print anything if everything's correct
    window = windows_200bp[i]
    sum = window[3] + window[4] + window[5] + window[6]
    if (abs(sum - 1.0) > 0.000000001 and abs(sum - 0.0) > 0.000000001): #adds up to 1, leeway for rounding errors
        print("hi")
        print(window)
        print(sum)

print("ignore these indices:")
print(ignoreTheseIndices)

windows_200bp[10000000:11000000]

10000000
10010000
10020000
10030000
10040000
10050000
10060000
10070000
10080000
10090000
10100000
10110000
10120000
10130000
10140000
10150000
10160000
10170000
10180000
10190000
10200000
10210000
10220000
10230000
10240000
10250000
10260000
10270000
10280000
10290000
10300000
10310000
10320000
10330000
10340000
10350000
10360000
10370000
10380000
10390000
10400000
10410000
10420000
10430000
10440000
10450000
10460000
10470000
10480000
10490000
10500000
10510000
10520000
10530000
10540000
10550000
10560000
10570000
10580000
10590000
10600000
10610000
10620000
10630000
10640000
10650000
10660000
10670000
10680000
10690000
10700000
10710000
10720000
10730000
10740000
10750000
10760000
10770000
10780000
10790000
10800000
10810000
10820000
10830000
10840000
10850000
10860000
10870000
10880000
10890000
10900000
10910000
10920000
10930000
10940000
10950000
10960000
10970000
10980000
10990000
11000000
25.42213575442632 minutes
ignore these indices:
[10040023, 10040024, 10041431, 10093376, 10

[['chr5', 127009213, 127009413, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr5', 127009313, 127009513, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr5', 127009413, 127009613, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr5', 127009513, 127009713, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr5', 127009613, 127009813, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr5', 127009713, 127009913, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr5', 127009813, 127010013, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr5', 127009913, 127010113, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr5', 127010013, 127010213, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr5', 127010113, 127010313, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr5', 127010213, 127010413, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr5', 127010313, 127010513, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr5', 127010413, 127010613, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr5', 127010513, 127010713, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr5', 127010613, 127010813, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr5', 127010713, 127010913, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr5', 127010813, 127011013, 1.0, 0.0, 0.0, 0.0, 0, 0

In [49]:
if os.path.isfile('6col-windows-pt11.bed'):
    os.remove('6col-windows-pt11.bed')

myFile = open('6col-windows-pt11.bed','w')

for window in windows_200bp[10000000:11000000]:
    val = [str(item) for item in window]
    myFile.write('\t'.join(val) + '\n')
myFile.close()

In [50]:
windows_200bp = windows.copy()
ignoreTheseIndices = []

start = time.time()

for i in range(len(windows_200bp)):
    if (i < 11000000):
        continue
    if (i % 10000 == 0):
        print(i)
    if (i == 12000000):
        break
    window = windows_200bp[i]
    intron = 0.0
    cds = 0.0
    utr5 = 0.0
    utr3 = 0.0
    if (window[0] in rightChrs):
        chrm = chr_list[int(window[0][3:]) - 1]
    else:
        print('invalid chromosome name')
        break
    for trn in chrm.transcripts:
        if (window[2] < trn.start or window[1] > trn.end):
            continue
        for x in trn.exon_intron:
            if (type(x) == Intron):
                if (hasOverlap(x.start, x.end, window[1], window[2])):
                    intron += findFraction(x.start, x.end, window[1], window[2])
                continue
            for ind in range(len(x.cds_utr)):
                y = x.cds_utr[ind]
                if (not hasOverlap(y.start, y.end, window[1], window[2])):
                    continue
                frac = findFraction(y.start, y.end, window[1], window[2]) 
                if (type(y) == Cds):
                    cds += frac
                    ###make window[8] equal to 1 if on boundary btwn cds & utr
                    has_boundary1 = ind > 0 and type(x.cds_utr[ind - 1]) == Utr
                    has_boundary2 = ind < (len(x.cds_utr) - 1) and type(x.cds_utr[ind + 1]) == Utr
                    is_boundary1 = has_boundary1 and window[1] < y.start and y.start < window[2]
                    is_boundary2 = has_boundary2 and window[1] < y.end and y.end < window[2]
                    window[8] = int(window[8] or is_boundary1 or is_boundary2)
                elif (y.is_five):
                    utr5 += frac
                else:
                    utr3 += frac
    total = intron + cds + utr5 + utr3
    if (total == 0):
        ignoreTheseIndices.append(i)
        continue
    window[3] = intron / total
    window[4] = cds / total
    window[5] = utr5 / total
    window[6] = utr3 / total
    
    ###make window[7] equal to 1 if window[3] < 1 (a.k.a. on intron/exon boundary)
    window[7] = int(0 < window[3] and window[3] < 1)

print(str((time.time() - start) / 60) + " minutes")

for i in range(len(windows_200bp)): #this shouldn't print anything if everything's correct
    window = windows_200bp[i]
    sum = window[3] + window[4] + window[5] + window[6]
    if (abs(sum - 1.0) > 0.000000001 and abs(sum - 0.0) > 0.000000001): #adds up to 1, leeway for rounding errors
        print("hi")
        print(window)
        print(sum)

print("ignore these indices:")
print(ignoreTheseIndices)

windows_200bp[11000000:12000000]

11000000
11010000
11020000
11030000
11040000
11050000
11060000
11070000
11080000
11090000
11100000
11110000
11120000
11130000
11140000
11150000
11160000
11170000
11180000
11190000
11200000
11210000
11220000
11230000
11240000
11250000
11260000
11270000
11280000
11290000
11300000
11310000
11320000
11330000
11340000
11350000
11360000
11370000
11380000
11390000
11400000
11410000
11420000
11430000
11440000
11450000
11460000
11470000
11480000
11490000
11500000
11510000
11520000
11530000
11540000
11550000
11560000
11570000
11580000
11590000
11600000
11610000
11620000
11630000
11640000
11650000
11660000
11670000
11680000
11690000
11700000
11710000
11720000
11730000
11740000
11750000
11760000
11770000
11780000
11790000
11800000
11810000
11820000
11830000
11840000
11850000
11860000
11870000
11880000
11890000
11900000
11910000
11920000
11930000
11940000
11950000
11960000
11970000
11980000
11990000
12000000
26.27369483311971 minutes
ignore these indices:
[11018185, 11018186, 11028875, 11028876, 11

[['chr6', 154866896, 154867096, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr6', 154866996, 154867196, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr6', 154867096, 154867296, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr6', 154867196, 154867396, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr6', 154867296, 154867496, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr6', 154867396, 154867596, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr6', 154867496, 154867696, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr6', 154867596, 154867796, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr6', 154867696, 154867896, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr6', 154867796, 154867996, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr6', 154867896, 154868096, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr6', 154867996, 154868196, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr6', 154868096, 154868296, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr6', 154868196, 154868396, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr6', 154868296, 154868496, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr6', 154868396, 154868596, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr6', 154868496, 154868696, 1.0, 0.0, 0.0, 0.0, 0, 0

In [51]:
if os.path.isfile('6col-windows-pt12.bed'):
    os.remove('6col-windows-pt12.bed')

myFile = open('6col-windows-pt12.bed','w')

for window in windows_200bp[11000000:12000000]:
    val = [str(item) for item in window]
    myFile.write('\t'.join(val) + '\n')
myFile.close()

In [52]:
windows_200bp = windows.copy()
ignoreTheseIndices = []

start = time.time()

for i in range(len(windows_200bp)):
    if (i < 12000000):
        continue
    if (i % 10000 == 0):
        print(i)
    if (i == 13000000):
        break
    window = windows_200bp[i]
    intron = 0.0
    cds = 0.0
    utr5 = 0.0
    utr3 = 0.0
    if (window[0] in rightChrs):
        chrm = chr_list[int(window[0][3:]) - 1]
    else:
        print('invalid chromosome name')
        break
    for trn in chrm.transcripts:
        if (window[2] < trn.start or window[1] > trn.end):
            continue
        for x in trn.exon_intron:
            if (type(x) == Intron):
                if (hasOverlap(x.start, x.end, window[1], window[2])):
                    intron += findFraction(x.start, x.end, window[1], window[2])
                continue
            for ind in range(len(x.cds_utr)):
                y = x.cds_utr[ind]
                if (not hasOverlap(y.start, y.end, window[1], window[2])):
                    continue
                frac = findFraction(y.start, y.end, window[1], window[2]) 
                if (type(y) == Cds):
                    cds += frac
                    ###make window[8] equal to 1 if on boundary btwn cds & utr
                    has_boundary1 = ind > 0 and type(x.cds_utr[ind - 1]) == Utr
                    has_boundary2 = ind < (len(x.cds_utr) - 1) and type(x.cds_utr[ind + 1]) == Utr
                    is_boundary1 = has_boundary1 and window[1] < y.start and y.start < window[2]
                    is_boundary2 = has_boundary2 and window[1] < y.end and y.end < window[2]
                    window[8] = int(window[8] or is_boundary1 or is_boundary2)
                elif (y.is_five):
                    utr5 += frac
                else:
                    utr3 += frac
    total = intron + cds + utr5 + utr3
    if (total == 0):
        ignoreTheseIndices.append(i)
        continue
    window[3] = intron / total
    window[4] = cds / total
    window[5] = utr5 / total
    window[6] = utr3 / total
    
    ###make window[7] equal to 1 if window[3] < 1 (a.k.a. on intron/exon boundary)
    window[7] = int(0 < window[3] and window[3] < 1)

print(str((time.time() - start) / 60) + " minutes")

for i in range(len(windows_200bp)): #this shouldn't print anything if everything's correct
    window = windows_200bp[i]
    sum = window[3] + window[4] + window[5] + window[6]
    if (abs(sum - 1.0) > 0.000000001 and abs(sum - 0.0) > 0.000000001): #adds up to 1, leeway for rounding errors
        print("hi")
        print(window)
        print(sum)

print("ignore these indices:")
print(ignoreTheseIndices)

windows_200bp[12000000:13000000]

12000000
12010000
12020000
12030000
12040000
12050000
12060000
12070000
12080000
12090000
12100000
12110000
12120000
12130000
12140000
12150000
12160000
12170000
12180000
12190000
12200000
12210000
12220000
12230000
12240000
12250000
12260000
12270000
12280000
12290000
12300000
12310000
12320000
12330000
12340000
12350000
12360000
12370000
12380000
12390000
12400000
12410000
12420000
12430000
12440000
12450000
12460000
12470000
12480000
12490000
12500000
12510000
12520000
12530000
12540000
12550000
12560000
12570000
12580000
12590000
12600000
12610000
12620000
12630000
12640000
12650000
12660000
12670000
12680000
12690000
12700000
12710000
12720000
12730000
12740000
12750000
12760000
12770000
12780000
12790000
12800000
12810000
12820000
12830000
12840000
12850000
12860000
12870000
12880000
12890000
12900000
12910000
12920000
12930000
12940000
12950000
12960000
12970000
12980000
12990000
13000000
20.44086560408274 minutes
ignore these indices:
[12010360, 12010361, 12010362, 12010363, 12

[['chr8', 19026202, 19026402, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr8', 19026302, 19026502, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr8', 19026402, 19026602, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr8', 19026502, 19026702, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr8', 19026602, 19026802, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr8', 19026702, 19026902, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr8', 19026802, 19027002, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr8', 19026902, 19027102, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr8', 19027002, 19027202, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr8', 19027102, 19027302, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr8', 19027202, 19027402, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr8', 19027302, 19027502, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr8', 19027402, 19027602, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr8', 19027502, 19027702, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr8', 19027602, 19027802, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr8', 19027702, 19027902, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr8', 19027802, 19028002, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr8', 19027902, 19028102, 1

In [53]:
if os.path.isfile('6col-windows-pt13.bed'):
    os.remove('6col-windows-pt13.bed')

myFile = open('6col-windows-pt13.bed','w')

for window in windows_200bp[12000000:13000000]:
    val = [str(item) for item in window]
    myFile.write('\t'.join(val) + '\n')
myFile.close()

In [54]:
windows_200bp = windows.copy()
ignoreTheseIndices = []
invalid = -1

start = time.time()

for i in range(len(windows_200bp)):
    if (i < 13000000):
        continue
    if (i % 10000 == 0):
        print(i)
    window = windows_200bp[i]
    intron = 0.0
    cds = 0.0
    utr5 = 0.0
    utr3 = 0.0
    if (window[0] in rightChrs):
        chrm = chr_list[int(window[0][3:]) - 1]
    else:
        print('invalid chromosome name')
        invalid = window
        break
    for trn in chrm.transcripts:
        if (window[2] < trn.start or window[1] > trn.end):
            continue
        for x in trn.exon_intron:
            if (type(x) == Intron):
                if (hasOverlap(x.start, x.end, window[1], window[2])):
                    intron += findFraction(x.start, x.end, window[1], window[2])
                continue
            for ind in range(len(x.cds_utr)):
                y = x.cds_utr[ind]
                if (not hasOverlap(y.start, y.end, window[1], window[2])):
                    continue
                frac = findFraction(y.start, y.end, window[1], window[2]) 
                if (type(y) == Cds):
                    cds += frac
                    ###make window[8] equal to 1 if on boundary btwn cds & utr
                    has_boundary1 = ind > 0 and type(x.cds_utr[ind - 1]) == Utr
                    has_boundary2 = ind < (len(x.cds_utr) - 1) and type(x.cds_utr[ind + 1]) == Utr
                    is_boundary1 = has_boundary1 and window[1] < y.start and y.start < window[2]
                    is_boundary2 = has_boundary2 and window[1] < y.end and y.end < window[2]
                    window[8] = int(window[8] or is_boundary1 or is_boundary2)
                elif (y.is_five):
                    utr5 += frac
                else:
                    utr3 += frac
    total = intron + cds + utr5 + utr3
    if (total == 0):
        ignoreTheseIndices.append(i)
        continue
    window[3] = intron / total
    window[4] = cds / total
    window[5] = utr5 / total
    window[6] = utr3 / total
    
    ###make window[7] equal to 1 if window[3] < 1 (a.k.a. on intron/exon boundary)
    window[7] = int(0 < window[3] and window[3] < 1)

print(str((time.time() - start) / 60) + " minutes")

for i in range(len(windows_200bp)): #this shouldn't print anything if everything's correct
    window = windows_200bp[i]
    sum = window[3] + window[4] + window[5] + window[6]
    if (abs(sum - 1.0) > 0.000000001 and abs(sum - 0.0) > 0.000000001): #adds up to 1, leeway for rounding errors
        print("hi")
        print(window)
        print(sum)

print("ignore these indices:")
print(ignoreTheseIndices)

windows_200bp[13000000:]

13000000
13010000
13020000
13030000
13040000
13050000
invalid chromosome name
1.1950910806655883 minutes
ignore these indices:
[13003510, 13003511, 13003512, 13003513, 13003514, 13024376, 13024377, 13024378, 13024379, 13024380, 13024381, 13031026, 13031027, 13031028, 13035307, 13035637, 13035638, 13042607, 13042608, 13042609, 13042610, 13042611, 13042612, 13042613, 13042614, 13042615, 13042616, 13042617, 13042618, 13042619, 13042620, 13042621, 13042622, 13044124, 13047570, 13047571]


[['chr9', 130078125, 130078325, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr9', 130078225, 130078425, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr9', 130078325, 130078525, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr9', 130078425, 130078625, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr9', 130078525, 130078725, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr9', 130078625, 130078825, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr9', 130078725, 130078925, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr9', 130078825, 130079025, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr9', 130078925, 130079125, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr9', 130079025, 130079225, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr9', 130079125, 130079325, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr9', 130079225, 130079425, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr9', 130079325, 130079525, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr9', 130079425, 130079625, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr9', 130079525, 130079725, 0.52, 0.386, 0.0, 0.094, 1, 1],
 ['chr9', 130079625, 130079825, 0.3, 0.562, 0.0, 0.138, 1, 1],
 ['chr9', 130079725, 130079925, 0.78, 0.176, 0.

In [55]:
if os.path.isfile('6col-windows-pt14.bed'):
    os.remove('6col-windows-pt14.bed')

myFile = open('6col-windows-pt14.bed','w')

for window in windows_200bp[13000000:]:
    if (window == invalid):
        print("invalid chromosome ") #it's supposed to do this b/c there's chrM
        break
    val = [str(item) for item in window]
    myFile.write('\t'.join(val) + '\n')
myFile.close()

invalid chromosome 


In [56]:
#group the files into a folder
!mkdir labelled_windows-group2

mkdir: cannot create directory ‘labelled_windows-group2’: File exists


In [57]:
!cp 6col-windows-pt* labelled_windows-group2/

In [58]:
!rm 6col-windows-pt*

In [59]:
#merge the 14 files together
import glob
import fileinput

In [60]:
read_files = glob.glob("labelled_windows-group2/6col-windows-pt*")

with open('labelled_windows-group2/6col-windows.bed', 'w') as file:
    file.writelines(fileinput.input(read_files))
file.close()

In [61]:
!bedtools sort -i "labelled_windows-group2/6col-windows.bed" > "labelled_windows-group2/sorted-6col-windows.bed"

# Thresholds

In [1]:
f = open('labelled_windows-group2/sorted-6col-windows.bed', 'r')
sorted_windows = f.readlines()
f.close()

for i in range(len(sorted_windows)):
    sorted_windows[i] = sorted_windows[i].split()

sorted_windows

[['chr1', '65418', '65618', '0.655', '0.045', '0.0', '0.3', '1', '1'],
 ['chr1', '65518', '65718', '0.73', '0.045', '0.0', '0.225', '1', '1'],
 ['chr1', '65618', '65818', '1.0', '0.0', '0.0', '0.0', '0', '0'],
 ['chr1', '65718', '65918', '1.0', '0.0', '0.0', '0.0', '0', '0'],
 ['chr1', '65818', '66018', '1.0', '0.0', '0.0', '0.0', '0', '0'],
 ['chr1', '65918', '66118', '1.0', '0.0', '0.0', '0.0', '0', '0'],
 ['chr1', '66018', '66218', '1.0', '0.0', '0.0', '0.0', '0', '0'],
 ['chr1', '66118', '66318', '1.0', '0.0', '0.0', '0.0', '0', '0'],
 ['chr1', '66218', '66418', '1.0', '0.0', '0.0', '0.0', '0', '0'],
 ['chr1', '66318', '66518', '1.0', '0.0', '0.0', '0.0', '0', '0'],
 ['chr1', '66418', '66618', '1.0', '0.0', '0.0', '0.0', '0', '0'],
 ['chr1', '66518', '66718', '1.0', '0.0', '0.0', '0.0', '0', '0'],
 ['chr1', '66618', '66818', '1.0', '0.0', '0.0', '0.0', '0', '0'],
 ['chr1', '66718', '66918', '1.0', '0.0', '0.0', '0.0', '0', '0'],
 ['chr1', '66818', '67018', '1.0', '0.0', '0.0', '0.0

In [2]:
threshold = 0.5
counter = [0, 0, 0, 0]

for sw in sorted_windows:
    counter[0] += int(float(sw[3]) > threshold)
    counter[1] += int(float(sw[4]) > threshold)
    counter[2] += int(float(sw[5]) > threshold)
    counter[3] += int(float(sw[6]) > threshold)

print(counter)

KeyboardInterrupt: 

In [None]:
threshold = 0.95
counter = [0, 0, 0, 0]

for sw in sorted_windows:
    counter[0] += int(float(sw[3]) > threshold)
    counter[1] += int(float(sw[4]) > threshold)
    counter[2] += int(float(sw[5]) > threshold)
    counter[3] += int(float(sw[6]) > threshold)

print(counter)