# Creating protein_coding_transcripts

In [1]:
!head protein_coding.gtf -n20

chr1	HAVANA	gene	65419	71585	.	+	.	gene_id "ENSG00000186092.6"; gene_type "protein_coding"; gene_name "OR4F5"; level 2; hgnc_id "HGNC:14825"; havana_gene "OTTHUMG00000001094.1";
chr1	HAVANA	transcript	65419	71585	.	+	.	gene_id "ENSG00000186092.6"; transcript_id "ENST00000641515.2"; gene_type "protein_coding"; gene_name "OR4F5"; transcript_type "protein_coding"; transcript_name "OR4F5-202"; level 2; protein_id "ENSP00000493376.2"; hgnc_id "HGNC:14825"; tag "RNA_Seq_supported_partial"; tag "basic"; havana_gene "OTTHUMG00000001094.1"; havana_transcript "OTTHUMT00000003223.1";
chr1	HAVANA	exon	65419	65433	.	+	.	gene_id "ENSG00000186092.6"; transcript_id "ENST00000641515.2"; gene_type "protein_coding"; gene_name "OR4F5"; transcript_type "protein_coding"; transcript_name "OR4F5-202"; exon_number 1; exon_id "ENSE00003812156.1"; level 2; protein_id "ENSP00000493376.2"; hgnc_id "HGNC:14825"; tag "RNA_Seq_supported_partial"; tag "basic"; havana_gene "OTTHUMG00000001094.1"; havana_transcript "O

In [2]:
!awk -v OFS="\t" '{if ($3 == "transcript" || $3 == "exon" || $3 == "CDS" || $3 == "UTR") print $1, $3, $4 - 1, $5, $7}' protein_coding.gtf > protein_coding_transcripts.txt
!cat protein_coding_transcripts.txt | head #it's very similar to .bed but there's an extra column so I changed it to .txt

chr1	transcript	65418	71585	+
chr1	exon	65418	65433	+
chr1	exon	65519	65573	+
chr1	CDS	65564	65573	+
chr1	exon	69036	71585	+
chr1	CDS	69036	70005	+
chr1	UTR	65418	65433	+
chr1	UTR	65519	65564	+
chr1	UTR	70005	71585	+
chr1	transcript	69054	70108	+
cat: write error: Broken pipe


In [3]:
f = open('protein_coding_transcripts.txt')
protein_coding_transcripts = f.readlines()
protein_coding_transcripts

['chr1\ttranscript\t65418\t71585\t+\n',
 'chr1\texon\t65418\t65433\t+\n',
 'chr1\texon\t65519\t65573\t+\n',
 'chr1\tCDS\t65564\t65573\t+\n',
 'chr1\texon\t69036\t71585\t+\n',
 'chr1\tCDS\t69036\t70005\t+\n',
 'chr1\tUTR\t65418\t65433\t+\n',
 'chr1\tUTR\t65519\t65564\t+\n',
 'chr1\tUTR\t70005\t71585\t+\n',
 'chr1\ttranscript\t69054\t70108\t+\n',
 'chr1\texon\t69054\t70108\t+\n',
 'chr1\tCDS\t69090\t70005\t+\n',
 'chr1\tUTR\t69054\t69090\t+\n',
 'chr1\tUTR\t70005\t70108\t+\n',
 'chr1\ttranscript\t450702\t451697\t-\n',
 'chr1\texon\t450702\t451697\t-\n',
 'chr1\tCDS\t450742\t451678\t-\n',
 'chr1\tUTR\t450702\t450742\t-\n',
 'chr1\tUTR\t451678\t451697\t-\n',
 'chr1\ttranscript\t685678\t686673\t-\n',
 'chr1\texon\t685678\t686673\t-\n',
 'chr1\tCDS\t685718\t686654\t-\n',
 'chr1\tUTR\t685678\t685718\t-\n',
 'chr1\tUTR\t686654\t686673\t-\n',
 'chr1\ttranscript\t923927\t939291\t+\n',
 'chr1\texon\t923927\t924948\t+\n',
 'chr1\tCDS\t924431\t924948\t+\n',
 'chr1\texon\t925921\t926013\t+\n',
 'chr

# Making classes.
## Chromosome, Transcript, Intron, Exon, CDS, UTR

In [4]:
class Chromosome:
    def __init__(self, nm): #string (format chr1, chr2, chrx, etc.)
        self.name = nm #string
        self.transcripts = []
    def show(self):
        show = str(str(self.name) + ": ")
        show += str(len(self.transcripts)) + " transcripts"
        print(show)
    def add_transcript(self, s, e, strand):
        self.transcripts.append(Transcript(s, e, strand))

In [5]:
class Transcript:
    def __init__(self, s, e, posneg): #int, int, string
        self.start = int(s)
        self.end = int(e)
        self.strand = (posneg == "+") #boolean
        self.exon_intron = []
        self.cds_start = -1 #smaller number, regardless of whether it's 5' or 3'
        self.cds_end = -1    #larger number, regardless of whether it's 3' or 5'
    def show(self):
        show = str("transcript: ")
        show += str(self.start) + " "      #start position
        show += str(self.end) + " "        #end position
        show += str(self.strand) + " "     #positive or negative strand
        show += str(len(self.exon_intron)) #how many exons
        print(show)
    def add_exon(self, s, e):
        self.exon_intron.append(Exon(s, e, self.strand))
        self.sort()
    def add_intron(self, s, e):
        self.exon_intron.append(Intron(s, e, self.strand))
        self.sort()
    def sort(self):
        self.exon_intron.sort(key=lambda x: x.start)
    def setCdsPos(self): #assumes there aren't any UTRs
        for currExon in self.exon_intron:
            if (not currExon.has_cds()):
                continue
            exon_cds_start = currExon.cds_utr[0].start
            exon_cds_end = currExon.cds_utr[0].end
            if (self.cds_start == -1 or self.cds_start > exon_cds_start):
                self.cds_start = exon_cds_start
            if (self.cds_end == -1 or self.cds_end < exon_cds_end):
                self.cds_end = exon_cds_end
    def has_intron(self):
        for x in self.exon_intron:
            if (type(x) == Intron):
                #print("there's an intron")
                return True
        #print("  no introns!")
        return False
    def has_utr(self):
        for x in self.exon_intron:
            if x.has_utr():
                return True
        return False

In [6]:
class Intron:
    def __init__(self, s, e, trnscrpt_strand): #int, int, boolean
        self.start = int(s)
        self.end = int(e)
        self.strand = trnscrpt_strand #boolean
    def show(self):
        show = str("intron: ")
        show += str(self.start) + " "      #start position
        show += str(self.end) + " "        #end position
        show += str(self.strand)           #positive or negative strand
        print(show)
    def has_utr(self):
        return False
    def has_cds(self):
        return False

In [7]:
class Exon:
    def __init__(self, s, e, trnscrpt_strand): #int, int, boolean
        self.start = int(s)
        self.end = int(e)
        self.strand = trnscrpt_strand #boolean
        self.cds_utr = []
    def show(self):
        show = str("exon: ")
        show += str(self.start) + " "      #start position
        show += str(self.end) + " "        #end position
        show += str(self.strand) + " "     #positive or negative strand
        show += str(len(self.cds_utr))     #how many CDSs and UTRs
        print(show)
    def add_cds(self, s, e):
        self.cds_utr.append(Cds(s, e, self.strand))
        self.sort()
    def add_utr(self, s, e, is_start):
        self.cds_utr.append(Utr(s, e, self.strand, is_start)) #start = UTR index < CDS index (5' if pos, 3' if neg)
        self.sort()
    def sort(self):
        self.cds_utr.sort(key=lambda x: x.start)
    def has_cds(self):
        for x in self.cds_utr:
            if (type(x) == Cds):
                return True
        return False
    def has_utr(self):
        for x in self.cds_utr:
            if (type(x) == Utr):
                return True
        return False

In [8]:
class Cds:
    def __init__(self, s, e, exon_strand): #int, int, boolean
        self.start = int(s)
        self.end = int(e)
        self.strand = exon_strand #boolean
    def show(self):
        show = str("CDS: ")
        show += str(self.start) + " "       #start position
        show += str(self.end) + " "        #end position
        show += str(self.strand)           #positive or negative strand
        print(show)

In [9]:
class Utr:
    def __init__(self, s, e, exon_strand, is_start): #int, int, boolean, boolean
        self.start = int(s)
        self.end = int(e)
        self.strand = exon_strand #boolean
        self.is_five = is_start                #True = 5' UTR, False = 3' UTR
        if (not self.strand):                  #if it is on the negative strand
            self.is_five = not self.is_five    #flip
    def show(self):
        if(self.is_five):
            show = str("5' ")
        else:
            show = str("3' ")
        show += str("UTR: ")
        show += str(self.start) + " "      #start position
        show += str(self.end) + " "        #end position
        show += str(self.strand) + " "     #positive or negative strand
        print(show)

In [10]:
example_transcript1 = Transcript(1, 100, "+")
example_transcript2 = Transcript(30, 600, "-")

example_transcript1.show()
example_transcript2.show()

print(example_transcript1)
print(example_transcript2)

print()
print("-------------")
print()

example_transcript1.add_exon(2, 10)
example_transcript1.add_exon(21, 22)
example_transcript1.add_exon(40, 50)
example_transcript2.add_exon(99, 580)

example_transcript1.show()
example_transcript2.show()

for i in range(3):
    print("example_transcript1")
    example_transcript1.exon_intron[i].show()
print("example_transcript2")
example_transcript2.exon_intron[0].show()

transcript: 1 100 True 0
transcript: 30 600 False 0
<__main__.Transcript object at 0x7f3c7a2a3b50>
<__main__.Transcript object at 0x7f3c7a2a3b10>

-------------

transcript: 1 100 True 3
transcript: 30 600 False 1
example_transcript1
exon: 2 10 True 0
example_transcript1
exon: 21 22 True 0
example_transcript1
exon: 40 50 True 0
example_transcript2
exon: 99 580 False 0


In [11]:
example_chromosome1 = Chromosome("chr1")
example_chromosome5 = Chromosome("chr5")

example_chromosome1.show()
example_chromosome5.show()

print()
print("-------------")
print()

example_chromosome1.add_transcript(11, 20, "+")

example_chromosome1.show()
example_chromosome5.show()

print()
print("-------------")
print()

test = example_chromosome1.transcripts[0]
print(example_chromosome1.transcripts[0])
print(test)
test.show()

chr1: 0 transcripts
chr5: 0 transcripts

-------------

chr1: 1 transcripts
chr5: 0 transcripts

-------------

<__main__.Transcript object at 0x7f3c7a23fc90>
<__main__.Transcript object at 0x7f3c7a23fc90>
transcript: 11 20 True 0


# Filling chr_list

### Chromosomes, Exons, CDSs, UTRs

In [12]:
chr_list = []
chr_ind = -1
trn_ind = -1
exn_ind = -1
for i in range(len(protein_coding_transcripts)):
    if (i % 200000 == 0): #just to make sure it's actually running (not stuck)
        print(i)
    x = protein_coding_transcripts[i]
    temp = x.split()
    if (chr_ind == -1 or temp[0] != chr_list[chr_ind].name): #if it's the next chromosome
        #create new chromosome in chr_list, reset transcript & exon indices
        if ("1" <= temp[0][3:4] and temp[0][3:4] <= "9"): #if it's a number
            if ((int(temp[0][3:]) < 1 or 22 < int(temp[0][3:]))):
                continue
        elif (temp[0] != "chrX" and temp[0] != "chrY"): #the rest are letters
            continue
        chr_list.append(Chromosome(temp[0]))
        chr_ind += 1
        trn_ind = -1
    if (temp[1] == 'transcript'): #if it's a transcript
        #add new transcript to current chromosome, reset exon index
        chr_list[chr_ind].add_transcript(int(temp[2]), int(temp[3]), temp[4])
        trn_ind += 1
        exn_ind = -1
    elif (temp[1] == 'exon'): #if it's an exon
        #add new exon to current transcript
        chr_list[chr_ind].transcripts[trn_ind].add_exon(int(temp[2]), int(temp[3]))
        exn_ind += 1
    elif (temp[1] == 'CDS'): #if it's a CDS
        #find the exon it's in
        currTranscript = chr_list[chr_ind].transcripts[trn_ind]
        for exn in currTranscript.exon_intron: #assumes no introns yet
            if (exn.start <= int(temp[2]) and int(temp[3]) <= exn.end):
                currExon = exn
                break
        
        #add new CDS to whatever exon it's in
        currExon.add_cds(int(temp[2]), int(temp[3]))
    elif (temp[1] == 'UTR'): # if it's a UTR
        #find the exon it's in
        currTranscript = chr_list[chr_ind].transcripts[trn_ind]
        currTranscript.setCdsPos()
        
        #find the exon it's in
        for exn in currTranscript.exon_intron: #assumes no introns yet
            if (exn.start <= int(temp[3]) and int(temp[3]) <= exn.end):
                currExon = exn
                break
        
        #add new UTR to whatever exon it's in
        if (int(temp[3]) <= currTranscript.cds_start): #if UTR position is smaller than CDS position
            currExon.add_utr(temp[2], temp[3], True)
        elif (int(temp[2]) >= currTranscript.cds_end): #if UTR position is larger than CDS position
            currExon.add_utr(temp[2], temp[3], False)
        else:
            print("something's wrong")
            print("  UTR start: " + temp[2])
            print("  UTR end: " + temp[3])
            print("  CDS start: " + str(currTranscript.cds_start))
            print("  CDS end: " + str(currTranscript.cds_end))

for chrm in chr_list:
    chrm.show()

0
200000
400000
600000
800000
1000000
1200000
1400000
1600000
1800000
2000000
2200000
chr1: 13955 transcripts
chr2: 10476 transcripts
chr3: 9508 transcripts
chr4: 5910 transcripts
chr5: 6843 transcripts
chr6: 6468 transcripts
chr7: 7185 transcripts
chr8: 5650 transcripts
chr9: 4932 transcripts
chr10: 4884 transcripts
chr11: 10221 transcripts
chr12: 9227 transcripts
chr13: 2057 transcripts
chr14: 5595 transcripts
chr15: 5433 transcripts
chr16: 7978 transcripts
chr17: 10284 transcripts
chr18: 2556 transcripts
chr19: 10902 transcripts
chr20: 3383 transcripts
chr21: 1581 transcripts
chr22: 3377 transcripts
chrX: 4745 transcripts
chrY: 309 transcripts


In [13]:
chr_list[0].show()
for thingy in chr_list[0].transcripts[0].exon_intron:
    print()
    thingy.show()
    if (type(thingy) == Exon):
        for thing in thingy.cds_utr:
            thing.show()

print()
print()
print()

chr_list[1].show()
for thingy in chr_list[1].transcripts[0].exon_intron:
    print()
    thingy.show()
    if (type(thingy) == Exon):
        for thing in thingy.cds_utr:
            thing.show()

chr1: 13955 transcripts

exon: 65418 65433 True 1
5' UTR: 65418 65433 True 

exon: 65519 65573 True 2
5' UTR: 65519 65564 True 
CDS: 65564 65573 True

exon: 69036 71585 True 2
CDS: 69036 70005 True
3' UTR: 70005 71585 True 



chr2: 10476 transcripts

exon: 38813 41627 False 2
3' UTR: 38813 41610 False 
CDS: 41610 41627 False

exon: 45439 46505 False 2
CDS: 45439 46385 False
5' UTR: 46385 46505 False 


### Introns

In [14]:
#for each transcript, whatever's not an exon is an intron

In [15]:
chr_list[1].transcripts[0].show()

transcript: 38813 46505 False 2


In [16]:
for thing in chr_list[1].transcripts[0].exon_intron:
    print(str(type(thing)) + " " + str(thing.start) + " " + str(thing.end))

<class '__main__.Exon'> 38813 41627
<class '__main__.Exon'> 45439 46505


In [17]:
#for chrm in chr_list:
for chrm in chr_list:
    #if (chrm == chr_list[2]):
        #break
    #chrm.show()
    for trn in chrm.transcripts:
        #if (trn == chrm.transcripts[1]):
            #break
        if (trn.has_intron()): #don't add introns if there already are introns
            #trn.sort()
            #print("  yes intron")
            continue
        #print("  moving on")
        prev = trn.start
        exon_list_copy = trn.exon_intron.copy()
        for exn in exon_list_copy:
            #print("    next exon")
            curr = exn.start
            #print("    prev = " + str(prev) + " curr = " + str(curr))
            if (prev < curr):
                #print("    adding intron from " + str(prev) + " to " + str(curr))
                trn.add_intron(prev, curr) #0-indexed
            prev = exn.end
        #print("  ended for loop")
        #print("    prev = " + str(prev) + " trn.end = " + str(trn.end))
        if (prev < trn.end):
            #print("    adding intron from " + str(prev) + " to " + str(trn.end))
            trn.add_intron(prev, trn.end) #0-indexed
        #trn.sort()

chr_list[1].show()
for thing in chr_list[1].transcripts[0].exon_intron:
    print(str(type(thing)) + " " + str(thing.start) + " " + str(thing.end))

chr2: 10476 transcripts
<class '__main__.Exon'> 38813 41627
<class '__main__.Intron'> 41627 45439
<class '__main__.Exon'> 45439 46505


In [18]:
chr_list[0].show() #gene on positive strand
for thingy in chr_list[0].transcripts[0].exon_intron:
    thingy.show()
print()
chr_list[1].show() #gene on negative strand
for thingy in chr_list[1].transcripts[0].exon_intron:
    thingy.show()

chr1: 13955 transcripts
exon: 65418 65433 True 1
intron: 65433 65519 True
exon: 65519 65573 True 2
intron: 65573 69036 True
exon: 69036 71585 True 2

chr2: 10476 transcripts
exon: 38813 41627 False 2
intron: 41627 45439 False
exon: 45439 46505 False 2


# Converting data to 200bp windows (bed)

In [19]:
#Option 1: pcg_windows_sorted_200.bed (extends the non-200bp windows to 200bp)
#Option 2: pcg_windows_sorted_rem-not200.bed (removes non-200bp windows)
#will be using option 1 (seems better imho)

In [20]:
f = open('pcg_windows_sorted_200.bed')
windows = f.readlines()
for i in range(len(windows)):
    windows[i] = windows[i].split()
    windows[i][1] = int(windows[i][1])
    windows[i][2] = int(windows[i][2])
    for n in range(6): #4 categories + intron/exon boolean column + cds/utr boolean column
        windows[i].append(0.0)
windows

[['chr1', 65418, 65618, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 ['chr1', 65518, 65718, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 ['chr1', 65618, 65818, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 ['chr1', 65718, 65918, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 ['chr1', 65818, 66018, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 ['chr1', 65918, 66118, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 ['chr1', 66018, 66218, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 ['chr1', 66118, 66318, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 ['chr1', 66218, 66418, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 ['chr1', 66318, 66518, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 ['chr1', 66418, 66618, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 ['chr1', 66518, 66718, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 ['chr1', 66618, 66818, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 ['chr1', 66718, 66918, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 ['chr1', 66818, 67018, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 ['chr1', 66918, 67118, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 ['chr1', 67018, 67218, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 ['chr1', 67118, 67318, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 ['chr1', 

In [21]:
#for each window
###limit range to appropriate chromosome
###ignore transcripts that don't overlap with range
###find all introns, CDSs, and UTRs (5' and 3') that overlap with window
###sum up totals for each of the four categories
###find percentages (as decimals) and save those values into the column

In [22]:
def findFraction(s1, e1, s2, e2):  #s2/e2 is the window
    if ((s1 >= e2) or (s2 >= e1)): #if there is no overlap
        return 0.0
    fraction = 200
    #if (s1 > s2):
        #fraction -= (s1 - s2)
    fraction -= (s1 > s2) * (s1 - s2)
    fraction -= (e2 > e1) * (e2 - e1)
    #if (e2 > e1):
        #fraction -= (e2 - e1)
    return fraction

In [23]:
import time

In [24]:
len(windows)

13661310

In [27]:
windows_200bp = windows.copy()
ignoreTheseIndices = []

start = time.time()

for i in range(len(windows_200bp)):
    if (i % 100 == 0):
        print(i)
    if (i == 1000):
        break
    window = windows_200bp[i]
    intron = 0.0
    cds = 0.0
    utr5 = 0.0
    utr3 = 0.0
    if (window[0] < 'chrA'): #if it is a number (assuming there's only chr1-chr22, chrX, and chrY)
        chrm = chr_list[int(window[0][3:]) - 1]
    elif (window[0] == 'chrX'):
        chrm = chr_list[22]
    elif (window[0] == 'chrY'):
        chrm = chr_list[23]
    else:
        print('invalid chromosome name')
        break
    for trn in chrm.transcripts:
        if (window[2] < trn.start or window[1] > trn.end):
            continue
        for x in trn.exon_intron:
            if (type(x) == Intron):
                intron += findFraction(x.start, x.end, window[1], window[2])
                continue
            for ind in range(len(x.cds_utr)):
                y = x.cds_utr[ind]
                ###make window[8] equal to 1 if on boundary btwn cds & utr
                frac = findFraction(y.start, y.end, window[1], window[2]) 
                if (type(y) == Cds):
                    cds += frac
                    has_boundary1 = ind > 0 and type(x.cds_utr[ind - 1]) == Utr
                    has_boundary2 = ind < (len(x.cds_utr) - 1) and type(x.cds_utr[ind + 1]) == Utr
                    is_boundary1 = has_boundary1 and window[1] < y.start and y.start < window[2]
                    is_boundary2 = has_boundary2 and window[1] < y.end and y.end < window[2]
                    window[8] = int(window[8] or is_boundary1 or is_boundary2)
                elif (y.is_five):
                    utr5 += frac
                else:
                    utr3 += frac
    total = intron + cds + utr5 + utr3
    if (total == 0):
        ignoreTheseIndices.append(i)
        continue
    window[3] = intron / total
    window[4] = cds / total
    window[5] = utr5 / total
    window[6] = utr3 / total
    
    ###make window[7] equal to 1 if window[3] < 1 (a.k.a. on intron/exon boundary)
    window[7] = int(window[3] < 1)

print(str((time.time() - start) / 60) + " minutes")

for i in range(len(windows_200bp)): #this shouldn't print anything if everything's correct
    window = windows_200bp[i]
    #sum = window[7]
    #if (window[7] == 0.0):
        #break
    sum = window[3] + window[4] + window[5] + window[6]
    if (abs(sum - 1.0) > 0.000000001 and abs(sum - 0.0) > 0.000000001):
        print("hi")
        print(window)
        print(sum)

print("ignore these indices:")
print(ignoreTheseIndices)

windows_200bp

0
100
200
300
400
500
600
700
800
900
1000
0.043989694118499754 minutes
ignore these indices:
[440]


[['chr1', 65418, 65618, 0.655, 0.045, 0.3, 0.0, 1, 1],
 ['chr1', 65518, 65718, 0.73, 0.045, 0.225, 0.0, 1, 1],
 ['chr1', 65618, 65818, 1.0, 0.0, 0.0, 0.0, 0, 1],
 ['chr1', 65718, 65918, 1.0, 0.0, 0.0, 0.0, 0, 1],
 ['chr1', 65818, 66018, 1.0, 0.0, 0.0, 0.0, 0, 1],
 ['chr1', 65918, 66118, 1.0, 0.0, 0.0, 0.0, 0, 1],
 ['chr1', 66018, 66218, 1.0, 0.0, 0.0, 0.0, 0, 1],
 ['chr1', 66118, 66318, 1.0, 0.0, 0.0, 0.0, 0, 1],
 ['chr1', 66218, 66418, 1.0, 0.0, 0.0, 0.0, 0, 1],
 ['chr1', 66318, 66518, 1.0, 0.0, 0.0, 0.0, 0, 1],
 ['chr1', 66418, 66618, 1.0, 0.0, 0.0, 0.0, 0, 1],
 ['chr1', 66518, 66718, 1.0, 0.0, 0.0, 0.0, 0, 1],
 ['chr1', 66618, 66818, 1.0, 0.0, 0.0, 0.0, 0, 1],
 ['chr1', 66718, 66918, 1.0, 0.0, 0.0, 0.0, 0, 1],
 ['chr1', 66818, 67018, 1.0, 0.0, 0.0, 0.0, 0, 1],
 ['chr1', 66918, 67118, 1.0, 0.0, 0.0, 0.0, 0, 1],
 ['chr1', 67018, 67218, 1.0, 0.0, 0.0, 0.0, 0, 1],
 ['chr1', 67118, 67318, 1.0, 0.0, 0.0, 0.0, 0, 1],
 ['chr1', 67218, 67418, 1.0, 0.0, 0.0, 0.0, 0, 1],
 ['chr1', 67318, 67518

In [None]:
#current issue: last column is all 1

In [None]:
chr_list[0].show()
for thingy in chr_list[0].transcripts[0].exon_intron:
    print()
    thingy.show()
    if (type(thingy) == Exon):
        for thing in thingy.cds_utr:
            thing.show()

In [None]:
maxEILen = 0
chrInd = -1
trnInd = -1
ei_Ind = -1
for i in range(len(chr_list)):
    if (i > 0):
        break
    for j in range(len(chr_list[i].transcripts)):
        for k in range(len(chr_list[i].transcripts[j].exon_intron)):
            x = chr_list[i].transcripts[j].exon_intron[k]
            if (maxEILen < (x.end - x.start)):
                maxEILen = x.end - x.start
                chrInd = i
                trnInd = j
                ei_Ind = k
print(maxEILen)
print(chrInd)
print(trnInd)
print(ei_Ind)

In [None]:
#MyFile = open('labelled_windows-pt1.bed','w')
MyFile = open('test-pt1.bed','w')
for i in range(len(windows_200bp)):
    if (i == 10000):
        break
    if (i in ignoreTheseIndices):
        print(i)
        continue
    window = windows_200bp[i]
    for j in range(len(window)):
        MyFile.write(str(window[j]))
        if (j == 6):
            break
        MyFile.write('\t')
    MyFile.write('\n')
MyFile.close()