In [103]:
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt

In [55]:
ESSENTIALITY_FILE = 'essentiality.csv'
ESSENTIALITY_DICT = {
    "Nonessential": 0,
    "Conditional": 1,
    "Essential": 2
}
essentialities = {}

num_lines_ess = sum(1 for line in open(ESSENTIALITY_FILE,'r'))
with open(ESSENTIALITY_FILE) as f:
    # titles
    f.readline()
    
    for i in tqdm(range(num_lines_ess)):
        line = f.readline().split(",")

        # ignore empty lines
        if len(line) < 2:
            continue

        ess = line[1]
        # sanity check
        if ess in ESSENTIALITY_DICT:
            essentialities[line[0]] = ESSENTIALITY_DICT[ess]

100%|██████████| 21557/21557 [00:00<00:00, 380290.01it/s]


In [58]:
print(sum([1 if v == ESSENTIALITY_DICT["Nonessential"] else 0 for k, v in essentialities.items()]))
print(sum([1 if v == ESSENTIALITY_DICT["Conditional"] else 0 for k, v in essentialities.items()]))
print(sum([1 if v == ESSENTIALITY_DICT["Essential"] else 0 for k, v in essentialities.items()]))

14388
6985
183


In [10]:
num_lines = sum(1 for line in open('introns.csv','r'))

with open('introns.csv') as f:
    
    num_start_gt = 0
    num_end_ag = 0
    total = 0
    
    for i in tqdm(range(num_lines)):
        code = f.readline().split(',')[4][:-1]
        if code[:2] == "GT":
            num_start_gt += 1
        if code[-2:] == "AG":
            num_end_ag += 1
        total += 1

print("percent start with GT: ", num_start_gt, 'out of ', total, ' = ', num_start_gt/total, '%')
print("percent end with AG: ", num_end_ag, 'out of ', total, ' = ', num_end_ag/total, '%')

100%|██████████| 248692/248692 [00:06<00:00, 40284.53it/s]

percent start with GT:  241805 out of  248692  =  0.9723071108037251 %
percent end with AG:  244830 out of  248692  =  0.9844707509690702 %





In [17]:
keys = {'A':0, 'C':1, 'G':2, 'T':3}
probs = np.zeros((4, 4))

with open('introns.csv') as f:
    for i in tqdm(range(num_lines)):
        code = f.readline().split(',')[4][:-1]
        code = code[2:6]
        
        for i, char in enumerate(code):
            if char == "N": 
                probs[i] += 0.25 # add equal weight to each base pair
            else:
                char_key = keys[char]
                probs[i][char_key] += 1

100%|██████████| 248692/248692 [00:07<00:00, 33640.60it/s]


In [40]:
print('Probabilities of each base pair appearing immediately after GT:')
print('  A          C          G          T')
print(probs / num_lines)

Probabilities of each base pair appearing immediately after GT:
  A          C          G          T
[[0.58617085 0.03152896 0.34753028 0.03443215]
 [0.69059419 0.0752668  0.12010137 0.1135712 ]
 [0.09108154 0.05997077 0.76940251 0.07901038]
 [0.18356944 0.14681715 0.18900186 0.48002449]]


In [44]:
probs / num_lines

array([[0.58617085, 0.03152896, 0.34753028, 0.03443215],
       [0.69059419, 0.0752668 , 0.12010137, 0.1135712 ],
       [0.09108154, 0.05997077, 0.76940251, 0.07901038],
       [0.18356944, 0.14681715, 0.18900186, 0.48002449]])

In [46]:
total_bp = 0
total_n = 0
with open('introns.csv') as f:
    for _ in tqdm(range(num_lines)):
        code = f.readline().split(',')[4][:-1]
        total_bp += len(code)
        total_n += sum([1 if x == "N" else 0 for x in code])

print(total_n, "Ns out of", total_bp, "--> percent: ", total_n/total_bp)
# 582320 Ns out of 1535783156 --> percent:  0.00037916811219408897

100%|██████████| 248692/248692 [04:31<00:00, 916.30it/s] 

582320 Ns out of 1535783156 --> percent:  0.00037916811219408897





In [48]:
# as calculated above
probabilities = [
    [0.58617085, 0.03152896, 0.34753028, 0.03443215],
    [0.69059419, 0.0752668 , 0.12010137, 0.1135712 ],
    [0.09108154, 0.05997077, 0.76940251, 0.07901038],
    [0.18356944, 0.14681715, 0.18900186, 0.48002449]]

keys = {'A':0, 'C':1, 'G':2, 'T':3}
BRANCH_LOC = 40
GC_count = 0
GT_count = 0
GC_prob_count = 0
GT_prob_count = 0
GC_prob = 0
GT_prob = 0
AG_count = 0

num_base_pairs = 0

with open('introns.csv') as f:
    for _ in tqdm(range(num_lines)):
        code = f.readline().split(',')[4][:-1]
        
        #We begin at 6 to ignore the first 5' splice site
        for i in range(6,len(code)-1):
            

            if i < len(code)-(BRANCH_LOC+6):


                if code[i:i+2] == "GC":
                    GC_count += 1
                    if 'N' not in code[i+2:i+6]:
                        consensus = sum([probabilities[j][keys[code[i+2+j]]] for j in range(0,4)])
                        GC_prob += consensus
                        GC_prob_count += 1
                elif code[i:i+2] == "GT":
                    GT_count += 1
                    if 'N' not in code[i+2:i+6]:
                        consensus = sum([probabilities[j][keys[code[i+2+j]]] for j in range(0,4)])
                        GT_prob += consensus
                        GT_prob_count += 1

            elif i > len(code) - BRANCH_LOC:
                if code[i:i+2] == "AG":
                    AG_count += 1

print("Length: " + str(len(code)))
print("GC: " + str(GC_count / num_lines))
print("GT: " + str(GT_count / num_lines))
print("GCAAGU: " + str(GC_prob / GC_prob_count))
print("GUAAGU: " + str(GT_prob / GT_prob_count))
print("AG: " + str(AG_count / num_lines))

100%|██████████| 248692/248692 [25:22<00:00, 163.30it/s] 

Length: 2029
GC: 264.6191192318209
GT: 330.2089291171409
GCAAGU: 0.9855244293248102
GUAAGU: 1.0081794636100954
AG: 2.01617261512232





In [49]:
num_base_pairs = 0
with open('introns.csv') as f:
    for _ in tqdm(range(num_lines)):
        code = f.readline().split(',')[4][:-1]
        num_base_pairs += len(code) - BRANCH_LOC - 6
print(num_base_pairs)

100%|██████████| 248692/248692 [00:03<00:00, 66709.11it/s]


1524343324


In [51]:
print("Number of base pairs: " + str(num_base_pairs))
print("GC frequency: " + str(GC_count / num_base_pairs))
print("GT frequency: " + str(GT_count / num_base_pairs))
print("GC[AAGU] consensus avg: " + str(GC_prob / GC_prob_count))
print("GT[AAGU] consensus avg: " + str(GT_prob / GT_prob_count))
print("AG frequency (per last 40): " + str(AG_count / num_lines))

Number of base pairs: 1524343324
GC frequency: 0.04317180845277871
GT frequency: 0.0538725874329345
GC[AAGU] consensus avg: 0.9855244293248102
GT[AAGU] consensus avg: 1.0081794636100954
AG frequency (per last 40): 2.01617261512232


In [93]:

def calculate_ethans_stuff(essential_filter=None, only_first_intron=False):
    # as calculated above
    probabilities = [
        [0.58617085, 0.03152896, 0.34753028, 0.03443215],
        [0.69059419, 0.0752668 , 0.12010137, 0.1135712 ],
        [0.09108154, 0.05997077, 0.76940251, 0.07901038],
        [0.18356944, 0.14681715, 0.18900186, 0.48002449]]
    keys = {'A':0, 'C':1, 'G':2, 'T':3}
    BRANCH_LOC = 40

    GC_count = 0
    GT_count = 0
    GC_prob_count = 0
    GT_prob_count = 0
    GC_prob = 0
    GT_prob = 0
    AG_count = 0

    start_with_gt = 0
    start_with_gt_count = 0
    end_with_ag = 0
    end_with_ag_count = 0

    num_base_pairs = 0
    num_introns_considered = 0
    genes_considered = set()

    with open('introns.csv') as f:
        for _ in tqdm(range(num_lines), position=0, leave=True):
            line = f.readline().split(',')
            gene = line[0]
            intron_num = int(line[3]) # <--- TODO check this is right
            code = line[4][:-1]

            # First intron filter, if applicable
            if only_first_intron and intron_num != 1:
                continue
                
            # Essentiality filter, if applicable
            if essential_filter != None:
                if gene in essentialities:
                    # essentiality is not what we want; skip it
                    if essentialities[gene] != ESSENTIALITY_DICT[essential_filter]:
                        continue
                else:
                    # we don't know essentiality; skip it
                    continue

            # If we passed all filters

            genes_considered.add(gene)
            num_introns_considered += 1
            num_base_pairs += len(code) - BRANCH_LOC - 6
            
            # Check start for GT
            if code[:2] == "GT":
                start_with_gt += 1
            start_with_gt_count += 1

            # Check end fot AG
            if code[-2:] == "AG":
                end_with_ag += 1
            end_with_ag_count += 1

            # Josh wrote this stuff:
            
            #We begin at 6 to ignore the first 5' splice site
            for i in range(6,len(code)-1):

                if i < len(code)-(BRANCH_LOC+6):

                    # Look at sequences immediately following GC
                    if code[i:i+2] == "GC":
                        GC_count += 1
                        if 'N' not in code[i+2:i+6]:
                            consensus = sum([probabilities[j][keys[code[i+2+j]]] for j in range(0,4)])
                            GC_prob += consensus
                            GC_prob_count += 1

                    # Look at sequences immediately following GT
                    elif code[i:i+2] == "GT":
                        GT_count += 1
                        if 'N' not in code[i+2:i+6]:
                            consensus = sum([probabilities[j][keys[code[i+2+j]]] for j in range(0,4)])
                            GT_prob += consensus
                            GT_prob_count += 1

                elif i > len(code) - BRANCH_LOC:
                    if code[i:i+2] == "AG":
                        AG_count += 1
                        
    if essential_filter != None:
        print(">>> ONLY CONSIDERING '" + essential_filter + "' GENES <<<")
    if only_first_intron:
        print(">>> ONLY CONSIDERING FIRST INTRONS <<<")

    print("Looked at", num_base_pairs, "base pairs from", num_introns_considered, "introns belonging to", len(genes_considered), "genes.")
    print("--------------------------")
    print("[PRE-BRANCH] Frequency of GCs: " + str(GC_count / num_base_pairs) + "  =  (" + str(GC_count) + "/" + str(num_base_pairs) + ")")
    print("[PRE-BRANCH] Frequency of GTs: " + str(GT_count / num_base_pairs) + "  =  (" + str(GT_count) + "/" + str(num_base_pairs) + ")")
    print("[PRE-BRANCH] Post-GC consensus score avg: " + str(GC_prob / GC_prob_count) + "  =  (" + str(GC_prob) + "/" + str(GC_prob_count) + ")")
    print("[PRE-BRANCH] Post-GT consensus score avg: " + str(GT_prob / GT_prob_count) + "  =  (" + str(GT_prob) + "/" + str(GT_prob_count) + ")")
    print()
    print("[POST-BRANCH] Average number of AGs in last 40: " + str(AG_count / num_introns_considered) + "  =  (" + str(AG_count) + "/" + str(num_introns_considered) + ")")
    print()
    print("[MISC] % Start with GT: " + str(start_with_gt/start_with_gt_count) + "%  =  (" + str(start_with_gt) + "/" + str(start_with_gt_count) + ")")
    print("[MISC] % Ends with AG:  " + str(end_with_ag/end_with_ag_count) + "%  =  (" + str(end_with_ag) + "/" + str(end_with_ag_count) + ")")
                            
    return {
        "essential_filter": essential_filter,
        "only_first_intron": only_first_intron,
        "num_base_pairs": num_base_pairs,
        "num_introns_considered": num_introns_considered,
        
        "GC_count": GC_count, 
        "GC_prob_count": GC_prob_count,
        "GT_count": GT_count,
        "GT_prob_count": GT_prob_count,
        "GC_prob": GC_prob,
        "GT_prob": GT_prob,
        "AG_count": AG_count,

        "start_with_gt": start_with_gt,
        "start_with_gt_count": start_with_gt_count,
        "end_with_ag": end_with_ag,
        "end_with_ag_count": end_with_ag_count
    }

In [90]:
results = {}
results_first_intron = {}
for e in ["Essential", "Conditional", "Nonessential"]:
    results[e] = calculate_ethans_stuff(essential_filter=e, only_first_intron=False)

results_first_intron = {}
for e in ["Essential", "Conditional", "Nonessential"]:
    results_first_intron[e] = calculate_ethans_stuff(essential_filter=e, only_first_intron=True)

100%|██████████| 248692/248692 [00:06<00:00, 39239.20it/s]
  0%|          | 195/248692 [00:00<02:53, 1436.19it/s]

>>> ONLY CONSIDERING 'Essential' GENES <<<
Looked at 4520173 base pairs from 1992 introns belonging to 164 genes.
--------------------------
[PRE-BRANCH] Frequency of GCs: 0.047538003523316476  =  (214880/4520173)
[PRE-BRANCH] Frequency of GTs: 0.05686817739055563  =  (257054/4520173)
[PRE-BRANCH] Post-GC consensus score avg: 0.9778034559302915  =  (210109.4288068451/214879)
[PRE-BRANCH] Post-GT consensus score avg: 0.9987838284118109  =  (256741.37822856964/257054)

[POST-BRANCH] Average number of AGs in last 40: 2.012550200803213  =  (4009/1992)

[MISC] % Start with GT: 0.9929718875502008%  =  (1978/1992)
[MISC] % Ends with AG:  0.9984939759036144%  =  (1989/1992)


100%|██████████| 248692/248692 [04:01<00:00, 1027.67it/s] 
  0%|          | 58/248692 [00:00<07:09, 578.55it/s]

>>> ONLY CONSIDERING 'Conditional' GENES <<<
Looked at 346670126 base pairs from 69371 introns belonging to 6640 genes.
--------------------------
[PRE-BRANCH] Frequency of GCs: 0.044053614241914806  =  (15272072/346670126)
[PRE-BRANCH] Frequency of GTs: 0.05560133843202861  =  (19275323/346670126)
[PRE-BRANCH] Post-GC consensus score avg: 0.9825078024899346  =  (15004927.935172455/15272070)
[PRE-BRANCH] Post-GT consensus score avg: 1.0073144149794255  =  (19416306.682026803/19275319)

[POST-BRANCH] Average number of AGs in last 40: 1.9731299822692479  =  (136878/69371)

[MISC] % Start with GT: 0.9907021666114082%  =  (68726/69371)
[MISC] % Ends with AG:  0.9980106961122083%  =  (69233/69371)


100%|██████████| 248692/248692 [07:49<00:00, 529.72it/s]  
  6%|▋         | 16034/248692 [00:00<00:02, 83019.03it/s]

>>> ONLY CONSIDERING 'Nonessential' GENES <<<
Looked at 672872863 base pairs from 109640 introns belonging to 12476 genes.
--------------------------
[PRE-BRANCH] Frequency of GCs: 0.04345758851029782  =  (29241432/672872863)
[PRE-BRANCH] Frequency of GTs: 0.054244382567706556  =  (36499573/672872863)
[PRE-BRANCH] Post-GC consensus score avg: 0.9843128790598097  =  (28782712.213874374/29241426)
[PRE-BRANCH] Post-GT consensus score avg: 1.0081491347396379  =  (36797005.8812723/36499566)

[POST-BRANCH] Average number of AGs in last 40: 1.9850237139730025  =  (217638/109640)

[MISC] % Start with GT: 0.9892192630426851%  =  (108458/109640)
[MISC] % Ends with AG:  0.9978657424297701%  =  (109406/109640)


100%|██████████| 248692/248692 [00:03<00:00, 70239.85it/s]
  0%|          | 435/248692 [00:00<00:58, 4218.18it/s]

>>> ONLY CONSIDERING 'Essential' GENES <<<
>>> ONLY CONSIDERING FIRST INTRONS <<<
Looked at 694923 base pairs from 164 introns belonging to 164 genes.
--------------------------
[PRE-BRANCH] Frequency of GCs: 0.050719288324030144  =  (35246/694923)
[PRE-BRANCH] Frequency of GTs: 0.057446652362923664  =  (39921/694923)
[PRE-BRANCH] Post-GC consensus score avg: 0.9759394131680657  =  (34397.96055652164/35246)
[PRE-BRANCH] Post-GT consensus score avg: 1.001694014833581  =  (39988.62676617138/39921)

[POST-BRANCH] Average number of AGs in last 40: 1.9634146341463414  =  (322/164)

[MISC] % Start with GT: 0.9939024390243902%  =  (163/164)
[MISC] % Ends with AG:  1.0%  =  (164/164)


100%|██████████| 248692/248692 [00:59<00:00, 4214.74it/s] 
  0%|          | 265/248692 [00:00<02:17, 1811.32it/s]

>>> ONLY CONSIDERING 'Conditional' GENES <<<
>>> ONLY CONSIDERING FIRST INTRONS <<<
Looked at 79612840 base pairs from 6640 introns belonging to 6640 genes.
--------------------------
[PRE-BRANCH] Frequency of GCs: 0.04652789675635237  =  (3704218/79612840)
[PRE-BRANCH] Frequency of GTs: 0.05627353577638984  =  (4480096/79612840)
[PRE-BRANCH] Post-GC consensus score avg: 0.9767425729027093  =  (3618067.419912528/3704218)
[PRE-BRANCH] Post-GT consensus score avg: 1.005820565797406  =  (4506168.670264432/4480092)

[POST-BRANCH] Average number of AGs in last 40: 1.9480421686746987  =  (12935/6640)

[MISC] % Start with GT: 0.9890060240963855%  =  (6567/6640)
[MISC] % Ends with AG:  0.9989457831325301%  =  (6633/6640)


100%|██████████| 248692/248692 [02:04<00:00, 1990.20it/s] 

>>> ONLY CONSIDERING 'Nonessential' GENES <<<
>>> ONLY CONSIDERING FIRST INTRONS <<<
Looked at 175231813 base pairs from 12476 introns belonging to 12476 genes.
--------------------------
[PRE-BRANCH] Frequency of GCs: 0.04504613554389236  =  (7893516/175231813)
[PRE-BRANCH] Frequency of GTs: 0.05480476881215627  =  (9603539/175231813)
[PRE-BRANCH] Post-GC consensus score avg: 0.9801005332218404  =  (7736437.280394062/7893514)
[PRE-BRANCH] Post-GT consensus score avg: 1.00777503609017  =  (9678204.846768284/9603537)

[POST-BRANCH] Average number of AGs in last 40: 1.968499519076627  =  (24559/12476)

[MISC] % Start with GT: 0.9845302981724912%  =  (12283/12476)
[MISC] % Ends with AG:  0.9961526130169926%  =  (12428/12476)





In [87]:
calculate_ethans_stuff(essential_filter="Essential", only_first_intron=True)


  0%|          | 0/248692 [00:00<?, ?it/s][A
  4%|▎         | 8861/248692 [00:00<00:02, 88591.50it/s][A
  6%|▋         | 15867/248692 [00:00<00:02, 81394.90it/s][A
 10%|▉         | 24335/248692 [00:00<00:02, 82352.86it/s][A
 12%|█▏        | 29615/248692 [00:00<00:03, 63344.88it/s][A
 14%|█▍        | 35805/248692 [00:00<00:03, 62904.17it/s][A
 18%|█▊        | 43628/248692 [00:00<00:03, 66741.72it/s][A
 21%|██        | 51343/248692 [00:00<00:02, 69554.08it/s][A
 23%|██▎       | 57893/248692 [00:00<00:02, 67961.10it/s][A
 26%|██▌       | 64414/248692 [00:00<00:02, 66961.58it/s][A
 29%|██▉       | 71523/248692 [00:01<00:02, 68143.98it/s][A
 32%|███▏      | 78675/248692 [00:01<00:02, 69122.43it/s][A
 34%|███▍      | 85749/248692 [00:01<00:02, 69472.93it/s][A
 38%|███▊      | 93311/248692 [00:01<00:02, 71190.38it/s][A
 40%|████      | 100399/248692 [00:01<00:02, 67153.82it/s][A
 43%|████▎     | 107889/248692 [00:01<00:02, 69303.84it/s][A
 46%|████▌     | 114855/248692 [00:01

>>> ONLY CONSIDERING 'Essential' GENES <<<
>>> ONLY CONSIDERING FIRST INTRONS <<<
Looked at 694923 base pairs from 164 introns belonging to 164 genes.
--------------------------
[PRE-BRANCH] Frequency of GCs: 0.050719288324030144  =  (35246/694923)
[PRE-BRANCH] Frequency of GTs: 0.057446652362923664  =  (39921/694923)
[PRE-BRANCH] Post-GC consensus score avg: 0.9759394131680657  =  (34397.96055652164/35246)
[PRE-BRANCH] Post-GT consensus score avg: 1.001694014833581  =  (39988.62676617138/39921)

[POST-BRANCH] Average number of AGs in last 40: 1.9634146341463414  =  (322/164)

[MISC] % Start with GT: 0.9939024390243902%  =  (163/164)
[MISC] % Ends with AG:  1.0%  =  (164/164)





{'essential_filter': 'Essential',
 'only_first_intron': True,
 'num_base_pairs': 694923,
 'num_introns_considered': 164,
 'GC_count': 35246,
 'GC_prob_count': 35246,
 'GT_count': 39921,
 'GT_prob_count': 39921,
 'GC_prob': 34397.96055652164,
 'GT_prob': 39988.62676617138,
 'AG_count': 322,
 'start_with_gt': 163,
 'start_with_gt_count': 164,
 'end_with_ag': 164,
 'end_with_ag_count': 164}

In [None]:
with open('introns.csv') as f:
        for _ in tqdm(range(num_lines)):
            line = f.readline().split(',')
            gene = line[0]
            

In [97]:

def calculate_start_end_freq(essential_filter=None, only_first_intron=False):
    # as calculated above
    probabilities = [
        [0.58617085, 0.03152896, 0.34753028, 0.03443215],
        [0.69059419, 0.0752668 , 0.12010137, 0.1135712 ],
        [0.09108154, 0.05997077, 0.76940251, 0.07901038],
        [0.18356944, 0.14681715, 0.18900186, 0.48002449]]
    keys = {'A':0, 'C':1, 'G':2, 'T':3}
    BRANCH_LOC = 40

    AT_AC = 0
    ATATCCT_AC = 0
    ATATCCT_AG = 0
    GTATCCT_AC = 0
    GTATCCT_AG = 0

    num_base_pairs = 0
    num_introns_considered = 0
    genes_considered = set()

    with open('introns.csv') as f:
        for _ in tqdm(range(num_lines), position=0, leave=True):
            line = f.readline().split(',')
            gene = line[0]
            intron_num = int(line[3]) # <--- TODO check this is right
            code = line[4][:-1]

            # First intron filter, if applicable
            if only_first_intron and intron_num != 1:
                continue
                
            # Essentiality filter, if applicable
            if essential_filter != None:
                if gene in essentialities:
                    # essentiality is not what we want; skip it
                    if essentialities[gene] != ESSENTIALITY_DICT[essential_filter]:
                        continue
                else:
                    # we don't know essentiality; skip it
                    continue

            # If we passed all filters

            genes_considered.add(gene)
            num_introns_considered += 1
            num_base_pairs += len(code) - BRANCH_LOC - 6
            
            # Check start for AT
            if code[:2] == "AT":
                if code[-2:] == "AC": # end with AC
                    AT_AC += 1
        
            # Check start for GTATCCT
            if code[:7] == "GTATCCT":
                if code[-2:] == "AC": # end with AC
                    GTATCCT_AC += 1
                elif code[-2:] == "AG": # end with AG
                    GTATCCT_AG += 1

            # Check start for ATATCCT
            if code[:7] == "ATATCCT":
                if code[-2:] == "AC": # end with AC
                    ATATCCT_AC += 1
                elif code[-2:] == "AG": # end with AG
                    ATATCCT_AG += 1

                        
    if essential_filter != None:
        print(">>> ONLY CONSIDERING '" + essential_filter + "' GENES <<<")
    if only_first_intron:
        print(">>> ONLY CONSIDERING FIRST INTRONS <<<")

    print("Looked at", num_base_pairs, "base pairs from", num_introns_considered, "introns belonging to", len(genes_considered), "genes.")
    print("--------------------------")
    print("Start with AT      and end with AC: " + str(AT_AC / num_introns_considered) + "  =  (" + str(AT_AC) + "/" + str(num_introns_considered) + ")")
    print("Start with ATATCCT and end with AC: " + str(ATATCCT_AC / num_introns_considered) + "  =  (" + str(ATATCCT_AC) + "/" + str(num_introns_considered) + ")")
    print("Start with ATATCCT and end with AG: " + str(ATATCCT_AG / num_introns_considered) + "  =  (" + str(ATATCCT_AG) + "/" + str(num_introns_considered) + ")")
    print("Start with GTATCCT and end with AC: " + str(GTATCCT_AC / num_introns_considered) + "  =  (" + str(GTATCCT_AC) + "/" + str(num_introns_considered) + ")")
    print("Start with GTATCCT and end with AG: " + str(GTATCCT_AG / num_introns_considered) + "  =  (" + str(GTATCCT_AG) + "/" + str(num_introns_considered) + ")")
                         
    return {
        "essential_filter": essential_filter,
        "only_first_intron": only_first_intron,
        "num_base_pairs": num_base_pairs,
        "num_introns_considered": num_introns_considered,
        
        "AT_AC": GC_count, 
        "ATATCCT_AC": ATATCCT_AC,
        "ATATCCT_AG": ATATCCT_AG,
        "GTATCCT_AC": GTATCCT_AC,
        "GTATCCT_AG": GTATCCT_AG,
    }

In [98]:
results_start_end = {}
results_start_end_first = {}
for e in ["Essential", "Conditional", "Nonessential"]:
    results_start_end[e] = calculate_start_end_freq(essential_filter=e, only_first_intron=False)

results_first_intron = {}
for e in ["Essential", "Conditional", "Nonessential"]:
    results_start_end_first[e] = calculate_start_end_freq(essential_filter=e, only_first_intron=True)

100%|██████████| 248692/248692 [00:03<00:00, 76136.16it/s]
  8%|▊         | 20244/248692 [00:00<00:02, 102694.02it/s]

>>> ONLY CONSIDERING 'Essential' GENES <<<
Looked at 4520173 base pairs from 1992 introns belonging to 164 genes.
--------------------------
Start with AT      and end with AC: 0.0015060240963855422  =  (3/1992)
Start with ATATCCT and end with AC: 0.0015060240963855422  =  (3/1992)
Start with ATATCCT and end with AG: 0.0  =  (0/1992)
Start with GTATCCT and end with AC: 0.0  =  (0/1992)
Start with GTATCCT and end with AG: 0.004016064257028112  =  (8/1992)


100%|██████████| 248692/248692 [00:03<00:00, 73633.07it/s]
  8%|▊         | 19215/248692 [00:00<00:02, 98469.20it/s] 

>>> ONLY CONSIDERING 'Conditional' GENES <<<
Looked at 346670126 base pairs from 69371 introns belonging to 6640 genes.
--------------------------
Start with AT      and end with AC: 0.0010667281717144051  =  (74/69371)
Start with ATATCCT and end with AC: 0.0009658214527684479  =  (67/69371)
Start with ATATCCT and end with AG: 2.8830491127416354e-05  =  (2/69371)
Start with GTATCCT and end with AC: 0.0  =  (0/69371)
Start with GTATCCT and end with AG: 0.003027201568378717  =  (210/69371)


100%|██████████| 248692/248692 [00:03<00:00, 73808.75it/s]
  4%|▍         | 10529/248692 [00:00<00:02, 105171.00it/s]

>>> ONLY CONSIDERING 'Nonessential' GENES <<<
Looked at 672872863 base pairs from 109640 introns belonging to 12476 genes.
--------------------------
Start with AT      and end with AC: 0.0008482305727836556  =  (93/109640)
Start with ATATCCT and end with AC: 0.0007022984312294783  =  (77/109640)
Start with ATATCCT and end with AG: 7.296607077708865e-05  =  (8/109640)
Start with GTATCCT and end with AC: 0.0  =  (0/109640)
Start with GTATCCT and end with AG: 0.001906238599051441  =  (209/109640)


100%|██████████| 248692/248692 [00:03<00:00, 76714.52it/s]
  4%|▍         | 11059/248692 [00:00<00:02, 110584.57it/s]

>>> ONLY CONSIDERING 'Essential' GENES <<<
>>> ONLY CONSIDERING FIRST INTRONS <<<
Looked at 694923 base pairs from 164 introns belonging to 164 genes.
--------------------------
Start with AT      and end with AC: 0.0  =  (0/164)
Start with ATATCCT and end with AC: 0.0  =  (0/164)
Start with ATATCCT and end with AG: 0.0  =  (0/164)
Start with GTATCCT and end with AC: 0.0  =  (0/164)
Start with GTATCCT and end with AG: 0.0  =  (0/164)


100%|██████████| 248692/248692 [00:03<00:00, 76606.96it/s]
  4%|▍         | 10831/248692 [00:00<00:02, 108309.07it/s]

>>> ONLY CONSIDERING 'Conditional' GENES <<<
>>> ONLY CONSIDERING FIRST INTRONS <<<
Looked at 79612840 base pairs from 6640 introns belonging to 6640 genes.
--------------------------
Start with AT      and end with AC: 0.00045180722891566266  =  (3/6640)
Start with ATATCCT and end with AC: 0.00030120481927710846  =  (2/6640)
Start with ATATCCT and end with AG: 0.0  =  (0/6640)
Start with GTATCCT and end with AC: 0.0  =  (0/6640)
Start with GTATCCT and end with AG: 0.00286144578313253  =  (19/6640)


100%|██████████| 248692/248692 [00:03<00:00, 75257.70it/s]

>>> ONLY CONSIDERING 'Nonessential' GENES <<<
>>> ONLY CONSIDERING FIRST INTRONS <<<
Looked at 175231813 base pairs from 12476 introns belonging to 12476 genes.
--------------------------
Start with AT      and end with AC: 0.0007213850593138826  =  (9/12476)
Start with ATATCCT and end with AC: 0.0005610772683552421  =  (7/12476)
Start with ATATCCT and end with AG: 0.0  =  (0/12476)
Start with GTATCCT and end with AC: 0.0  =  (0/12476)
Start with GTATCCT and end with AG: 0.0012824623276691247  =  (16/12476)



