# GFF Processing Metrics

Using a processed GFF file (either from NIH or Ensembl) we can apply some metrics to understand the characteristics of our RNA dataset.  In particular, we are interested in characterizing the number of intermediate states possible for a gene once it has left the DNA.  These intermediate states are relevant to the study of alternative splicing, secondary RNA structure, and transcription time.

# Libraries

In [1]:
# Libraries


# Combinations
from itertools import combinations

# NumPy
import numpy as np

# Pandas
import pandas as pd

# Sampling
# import random  
from random import sample

In [2]:
# Set the data folder.
# The value ends with a slash because the loading cell builds the file path
# by plain string concatenation (data_folder + file name).
data_folder = '/media/apollo/Samsung_T5/transfer/mayur/annotations/'
# Alternative location, currently disabled:
# data_folder = '/home/mad1188/rvallsamples/'

# NIH Processing

First, read in the processed GFF file from NIH.

In [3]:
# NIH file.

# The processed exon table is tab-separated and is read with pandas'
# default header handling; no rows are skipped by this call.
nih_exon_table_path = data_folder + 'GCF_000001405.40_GRCh38.p14_genomic.gff.exon.processed'
df = pd.read_csv(nih_exon_table_path, sep = '\t')

In [4]:
df

Unnamed: 0,nih_molecule_accession,source,category,start,stop,strand,information,ID,exon_parent,exon_gbkey,exon_gene,exon_transcript_id,predicted
0,NC_000001.11,BestRefSeq,exon,17369.0,17436.0,-,ID=exon-NR_106918.1-1;Parent=rna-NR_106918.1;D...,exon-NR_106918.1-1,rna-NR_106918.1,precursor_RNA,MIR6859-1,NR_106918.1,no
1,NC_000001.11,BestRefSeq,exon,17369.0,17391.0,-,ID=exon-MIR6859-1-1;Parent=rna-MIR6859-1;Dbxre...,exon-MIR6859-1-1,rna-MIR6859-1,ncRNA,MIR6859-1,,not_available
2,NC_000001.11,BestRefSeq,exon,17409.0,17431.0,-,ID=exon-MIR6859-1-2-1;Parent=rna-MIR6859-1-2;D...,exon-MIR6859-1-2-1,rna-MIR6859-1-2,ncRNA,MIR6859-1,,not_available
3,NC_000001.11,Gnomon,exon,29774.0,30667.0,+,ID=exon-XR_007065314.1-1;Parent=rna-XR_0070653...,exon-XR_007065314.1-1,rna-XR_007065314.1,ncRNA,MIR1302-2HG,XR_007065314.1,yes
4,NC_000001.11,Gnomon,exon,30976.0,31093.0,+,ID=exon-XR_007065314.1-2;Parent=rna-XR_0070653...,exon-XR_007065314.1-2,rna-XR_007065314.1,ncRNA,MIR1302-2HG,XR_007065314.1,yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1698714,NC_012920.1,RefSeq,exon,14149.0,14673.0,-,ID=exon-ND6-1;Parent=rna-ND6;Dbxref=GeneID:454...,exon-ND6-1,rna-ND6,mRNA,ND6,,not_available
1698715,NC_012920.1,RefSeq,exon,14674.0,14742.0,-,ID=exon-TRNE-1;Parent=rna-TRNE;Dbxref=GeneID:4...,exon-TRNE-1,rna-TRNE,tRNA,TRNE,,not_available
1698716,NC_012920.1,RefSeq,exon,14747.0,15887.0,+,ID=exon-CYTB-1;Parent=rna-CYTB;Dbxref=GeneID:4...,exon-CYTB-1,rna-CYTB,mRNA,CYTB,,not_available
1698717,NC_012920.1,RefSeq,exon,15888.0,15953.0,+,ID=exon-TRNT-1;Parent=rna-TRNT;Dbxref=GeneID:4...,exon-TRNT-1,rna-TRNT,tRNA,TRNT,,not_available


# Metrics (transcripts with <= 25 introns)

For the metrics that follow, we will concern ourselves only with exons for which a transcript ID is given (including predicted products).

In [5]:
# Keep every exon row whose 'predicted' flag is anything other than
# 'not_available'; work on a copy so the loaded table stays untouched.
transcript_available = df['predicted'].ne('not_available')
has_transcript = df[transcript_available].copy()

In [6]:
has_transcript

Unnamed: 0,nih_molecule_accession,source,category,start,stop,strand,information,ID,exon_parent,exon_gbkey,exon_gene,exon_transcript_id,predicted
0,NC_000001.11,BestRefSeq,exon,17369.0,17436.0,-,ID=exon-NR_106918.1-1;Parent=rna-NR_106918.1;D...,exon-NR_106918.1-1,rna-NR_106918.1,precursor_RNA,MIR6859-1,NR_106918.1,no
3,NC_000001.11,Gnomon,exon,29774.0,30667.0,+,ID=exon-XR_007065314.1-1;Parent=rna-XR_0070653...,exon-XR_007065314.1-1,rna-XR_007065314.1,ncRNA,MIR1302-2HG,XR_007065314.1,yes
4,NC_000001.11,Gnomon,exon,30976.0,31093.0,+,ID=exon-XR_007065314.1-2;Parent=rna-XR_0070653...,exon-XR_007065314.1-2,rna-XR_007065314.1,ncRNA,MIR1302-2HG,XR_007065314.1,yes
5,NC_000001.11,Gnomon,exon,34168.0,35418.0,+,ID=exon-XR_007065314.1-3;Parent=rna-XR_0070653...,exon-XR_007065314.1-3,rna-XR_007065314.1,ncRNA,MIR1302-2HG,XR_007065314.1,yes
6,NC_000001.11,BestRefSeq,exon,30366.0,30503.0,+,ID=exon-NR_036051.1-1;Parent=rna-NR_036051.1;D...,exon-NR_036051.1-1,rna-NR_036051.1,precursor_RNA,MIR1302-2,NR_036051.1,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1698677,NC_000024.10,Gnomon,exon,57192325.0,57192537.0,+,ID=exon-XM_017030055.2-1;Parent=rna-XM_0170300...,exon-XM_017030055.2-1,rna-XM_017030055.2,mRNA,IL9R,XM_017030055.2,yes
1698678,NC_000024.10,Gnomon,exon,57194043.0,57194127.0,+,ID=exon-XM_017030055.2-2;Parent=rna-XM_0170300...,exon-XM_017030055.2-2,rna-XM_017030055.2,mRNA,IL9R,XM_017030055.2,yes
1698679,NC_000024.10,Gnomon,exon,57196336.0,57197337.0,+,ID=exon-XM_017030055.2-3;Parent=rna-XM_0170300...,exon-XM_017030055.2-3,rna-XM_017030055.2,mRNA,IL9R,XM_017030055.2,yes
1698680,NC_000024.10,BestRefSeq,exon,57203182.0,57203350.0,-,ID=exon-NR_138048.1-2-1;Parent=rna-NR_138048.1...,exon-NR_138048.1-2-1,rna-NR_138048.1-2,ncRNA,WASIR1,NR_138048.1,no


Get exon information.

In [7]:
# Count exon rows per (gene, transcript) pair, keeping the result as a
# DataFrame rather than a Series.
# Source: https://stackoverflow.com/a/42324086
# NOTE(review): rows are pooled by transcript ID, not by exon_parent; confirm
# that parents sharing one transcript ID should be counted together.
exon_counts = (
    has_transcript
    .groupby(['exon_gene', 'exon_transcript_id'], as_index = False)
    .size()
    .rename(columns = {"size": "n_exons"})
)

In [8]:
exon_counts

Unnamed: 0,exon_gene,exon_transcript_id,n_exons
0,A1BG,NM_130786.4,8
1,A1BG-AS1,NR_015380.2,4
2,A1CF,NM_001198818.2,14
3,A1CF,NM_001198819.2,15
4,A1CF,NM_001198820.2,14
...,...,...,...
139214,ZZZ3,XM_047417329.1,14
139215,ZZZ3,XM_047417333.1,13
139216,ZZZ3,XM_047417337.1,14
139217,ZZZ3,XM_047417341.1,13


We can immediately identify the number of introns as well.

In [9]:
# Add the intron counts: one fewer than the number of exons.
exon_counts['n_introns'] = exon_counts['n_exons'].sub(1)

In [10]:
exon_counts

Unnamed: 0,exon_gene,exon_transcript_id,n_exons,n_introns
0,A1BG,NM_130786.4,8,7
1,A1BG-AS1,NR_015380.2,4,3
2,A1CF,NM_001198818.2,14,13
3,A1CF,NM_001198819.2,15,14
4,A1CF,NM_001198820.2,14,13
...,...,...,...,...
139214,ZZZ3,XM_047417329.1,14,13
139215,ZZZ3,XM_047417333.1,13,12
139216,ZZZ3,XM_047417337.1,14,13
139217,ZZZ3,XM_047417341.1,13,12


Now we will explicitly calculate the possible lengths for transcripts with 1 <= n_introns <= 25.  First, get the dataframe of such transcripts.

In [11]:
# Keep transcripts with at least 1 and at most 25 introns (both bounds
# inclusive), as an independent copy.
intron_count_in_range = exon_counts['n_introns'].between(1, 25)
lte_25_introns = exon_counts[intron_count_in_range].copy()

In [12]:
lte_25_introns

Unnamed: 0,exon_gene,exon_transcript_id,n_exons,n_introns
0,A1BG,NM_130786.4,8,7
1,A1BG-AS1,NR_015380.2,4,3
2,A1CF,NM_001198818.2,14,13
3,A1CF,NM_001198819.2,15,14
4,A1CF,NM_001198820.2,14,13
...,...,...,...,...
139214,ZZZ3,XM_047417329.1,14,13
139215,ZZZ3,XM_047417333.1,13,12
139216,ZZZ3,XM_047417337.1,14,13
139217,ZZZ3,XM_047417341.1,13,12


Subset to get the exon starts and stops.

In [13]:
# Distinct transcript IDs that passed the intron-count filter.
selected_transcript_ids = set(lte_25_introns['exon_transcript_id'])
lte_25_introns_transcript_accessions = list(selected_transcript_ids)

# Boolean mask over has_transcript: True for exon rows that belong to
# one of the selected transcripts.
exon_filter = has_transcript['exon_transcript_id'].isin(lte_25_introns_transcript_accessions)

In [14]:
# Exon rows of the selected transcripts, as an independent copy.
exon_positions = has_transcript.loc[exon_filter].copy()

In [15]:
exon_positions

Unnamed: 0,nih_molecule_accession,source,category,start,stop,strand,information,ID,exon_parent,exon_gbkey,exon_gene,exon_transcript_id,predicted
3,NC_000001.11,Gnomon,exon,29774.0,30667.0,+,ID=exon-XR_007065314.1-1;Parent=rna-XR_0070653...,exon-XR_007065314.1-1,rna-XR_007065314.1,ncRNA,MIR1302-2HG,XR_007065314.1,yes
4,NC_000001.11,Gnomon,exon,30976.0,31093.0,+,ID=exon-XR_007065314.1-2;Parent=rna-XR_0070653...,exon-XR_007065314.1-2,rna-XR_007065314.1,ncRNA,MIR1302-2HG,XR_007065314.1,yes
5,NC_000001.11,Gnomon,exon,34168.0,35418.0,+,ID=exon-XR_007065314.1-3;Parent=rna-XR_0070653...,exon-XR_007065314.1-3,rna-XR_007065314.1,ncRNA,MIR1302-2HG,XR_007065314.1,yes
8,NC_000001.11,BestRefSeq,exon,35721.0,36081.0,-,ID=exon-NR_026818.1-1;Parent=rna-NR_026818.1;D...,exon-NR_026818.1-1,rna-NR_026818.1,ncRNA,FAM138A,NR_026818.1,no
9,NC_000001.11,BestRefSeq,exon,35277.0,35481.0,-,ID=exon-NR_026818.1-2;Parent=rna-NR_026818.1;D...,exon-NR_026818.1-2,rna-NR_026818.1,ncRNA,FAM138A,NR_026818.1,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1698677,NC_000024.10,Gnomon,exon,57192325.0,57192537.0,+,ID=exon-XM_017030055.2-1;Parent=rna-XM_0170300...,exon-XM_017030055.2-1,rna-XM_017030055.2,mRNA,IL9R,XM_017030055.2,yes
1698678,NC_000024.10,Gnomon,exon,57194043.0,57194127.0,+,ID=exon-XM_017030055.2-2;Parent=rna-XM_0170300...,exon-XM_017030055.2-2,rna-XM_017030055.2,mRNA,IL9R,XM_017030055.2,yes
1698679,NC_000024.10,Gnomon,exon,57196336.0,57197337.0,+,ID=exon-XM_017030055.2-3;Parent=rna-XM_0170300...,exon-XM_017030055.2-3,rna-XM_017030055.2,mRNA,IL9R,XM_017030055.2,yes
1698680,NC_000024.10,BestRefSeq,exon,57203182.0,57203350.0,-,ID=exon-NR_138048.1-2-1;Parent=rna-NR_138048.1...,exon-NR_138048.1-2-1,rna-NR_138048.1-2,ncRNA,WASIR1,NR_138048.1,no


Let's process the positive strand first.

## Positive strand

In [16]:
# Exons annotated on the positive ('+') strand.
on_positive_strand = exon_positions['strand'] == '+'
positive_strand = exon_positions[on_positive_strand]

Order by gene, then transcript, then start position.

In [17]:
# Sort on gene, transcript, and start position (all ascending), so that
# the exons of one transcript sit on consecutive rows.
positive_sort_keys = ['exon_gene', 'exon_transcript_id', 'start']
positive_strand = positive_strand.sort_values(by = positive_sort_keys)

In [18]:
positive_strand[1:20]

Unnamed: 0,nih_molecule_accession,source,category,start,stop,strand,information,ID,exon_parent,exon_gbkey,exon_gene,exon_transcript_id,predicted
1557862,NC_000019.10,BestRefSeq,exon,58353379.0,58353474.0,+,ID=exon-NR_015380.2-2;Parent=rna-NR_015380.2;D...,exon-NR_015380.2-2,rna-NR_015380.2,ncRNA,A1BG-AS1,NR_015380.2,no
1557863,NC_000019.10,BestRefSeq,exon,58353714.0,58353857.0,+,ID=exon-NR_015380.2-3;Parent=rna-NR_015380.2;D...,exon-NR_015380.2-3,rna-NR_015380.2,ncRNA,A1BG-AS1,NR_015380.2,no
1557864,NC_000019.10,BestRefSeq,exon,58354369.0,58355183.0,+,ID=exon-NR_015380.2-4;Parent=rna-NR_015380.2;D...,exon-NR_015380.2-4,rna-NR_015380.2,ncRNA,A1BG-AS1,NR_015380.2,no
1060562,NC_000012.12,BestRefSeq,exon,9065177.0,9065228.0,+,ID=exon-NR_026971.1-1;Parent=rna-NR_026971.1;D...,exon-NR_026971.1-1,rna-NR_026971.1,ncRNA,A2M-AS1,NR_026971.1,no
1060563,NC_000012.12,BestRefSeq,exon,9065826.0,9066060.0,+,ID=exon-NR_026971.1-2;Parent=rna-NR_026971.1;D...,exon-NR_026971.1-2,rna-NR_026971.1,ncRNA,A2M-AS1,NR_026971.1,no
1060564,NC_000012.12,BestRefSeq,exon,9066156.0,9068055.0,+,ID=exon-NR_026971.1-3;Parent=rna-NR_026971.1;D...,exon-NR_026971.1-3,rna-NR_026971.1,ncRNA,A2M-AS1,NR_026971.1,no
1060558,NC_000012.12,BestRefSeq,exon,9065177.0,9065228.0,+,ID=exon-NR_137424.1-1;Parent=rna-NR_137424.1;D...,exon-NR_137424.1-1,rna-NR_137424.1,ncRNA,A2M-AS1,NR_137424.1,no
1060559,NC_000012.12,BestRefSeq,exon,9065808.0,9068055.0,+,ID=exon-NR_137424.1-2;Parent=rna-NR_137424.1;D...,exon-NR_137424.1-2,rna-NR_137424.1,ncRNA,A2M-AS1,NR_137424.1,no
1060560,NC_000012.12,BestRefSeq,exon,9065177.0,9065228.0,+,ID=exon-NR_137425.1-1;Parent=rna-NR_137425.1;D...,exon-NR_137425.1-1,rna-NR_137425.1,ncRNA,A2M-AS1,NR_137425.1,no
1060561,NC_000012.12,BestRefSeq,exon,9065826.0,9068055.0,+,ID=exon-NR_137425.1-2;Parent=rna-NR_137425.1;D...,exon-NR_137425.1-2,rna-NR_137425.1,ncRNA,A2M-AS1,NR_137425.1,no


We can avoid expensive grouping and subtraction operations by making parallel lists (starts, stops, and transcript IDs) that are offset by one row, and subtracting the lists element by element.  We then exclude meaningless subtractions (e.g. between different transcripts).

In [19]:
# Intron lengths on the positive strand.
#
# positive_strand is sorted by gene, transcript, and start position, so the
# exons of one transcript sit on consecutive rows.  Offsetting the columns
# by one row pairs every exon with the exon that follows it: position i
# holds the stop of row i and the start of row i + 1.
starts = list(positive_strand.start)[1:]
stops = list(positive_strand.stop)[:-1]
etids_starts = list(positive_strand['exon_transcript_id'])[1:]
etids_stops = list(positive_strand['exon_transcript_id'])[:-1]

# The intron lengths, keyed by transcript ID.  Every transcript present in
# the frame gets a list up front so later lookups cannot miss.
intron_lengths = {tid: [] for tid in set(positive_strand['exon_transcript_id'])}

# Subtract.  Every pair is visited, including pair 0 (rows 0 and 1);
# starting the loop at index 1 would silently drop the first intron of the
# first transcript in sort order.
for start_tid, stop_tid, next_start, previous_stop in zip(etids_starts, etids_stops, starts, stops):

    # Exploit the ordered nature of the data: a pair that straddles two
    # different transcripts is not an intron and is ignored.
    if start_tid == stop_tid:

        # TODO: fix data type for start and stop columns
        # earlier in analysis because we are doing
        # int typecasting here.
        # Start and stop are both treated as part of the exon (exon length
        # is stop - start + 1), so the gap between is start - stop - 1.
        intron_lengths[start_tid].append(int(next_start - previous_stop) - 1)

In [20]:
# intron_lengths

Use even simpler logic to find the exon lengths.

In [21]:
# Exon length per row; both end coordinates count, hence the + 1.
positive_strand['exon_length'] = positive_strand['stop'].sub(positive_strand['start']).add(1)

In [22]:
positive_strand

Unnamed: 0,nih_molecule_accession,source,category,start,stop,strand,information,ID,exon_parent,exon_gbkey,exon_gene,exon_transcript_id,predicted,exon_length
1557861,NC_000019.10,BestRefSeq,exon,58351970.0,58353044.0,+,ID=exon-NR_015380.2-1;Parent=rna-NR_015380.2;D...,exon-NR_015380.2-1,rna-NR_015380.2,ncRNA,A1BG-AS1,NR_015380.2,no,1075.0
1557862,NC_000019.10,BestRefSeq,exon,58353379.0,58353474.0,+,ID=exon-NR_015380.2-2;Parent=rna-NR_015380.2;D...,exon-NR_015380.2-2,rna-NR_015380.2,ncRNA,A1BG-AS1,NR_015380.2,no,96.0
1557863,NC_000019.10,BestRefSeq,exon,58353714.0,58353857.0,+,ID=exon-NR_015380.2-3;Parent=rna-NR_015380.2;D...,exon-NR_015380.2-3,rna-NR_015380.2,ncRNA,A1BG-AS1,NR_015380.2,no,144.0
1557864,NC_000019.10,BestRefSeq,exon,58354369.0,58355183.0,+,ID=exon-NR_015380.2-4;Parent=rna-NR_015380.2;D...,exon-NR_015380.2-4,rna-NR_015380.2,ncRNA,A1BG-AS1,NR_015380.2,no,815.0
1060562,NC_000012.12,BestRefSeq,exon,9065177.0,9065228.0,+,ID=exon-NR_026971.1-1;Parent=rna-NR_026971.1;D...,exon-NR_026971.1-1,rna-NR_026971.1,ncRNA,A2M-AS1,NR_026971.1,no,52.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
731296,NC_000007.14,Gnomon,exon,143381557.0,143381779.0,+,ID=exon-XM_047420818.1-2;Parent=rna-XM_0474208...,exon-XM_047420818.1-2,rna-XM_047420818.1,mRNA,ZYX,XM_047420818.1,yes,223.0
731297,NC_000007.14,Gnomon,exon,143382248.0,143382447.0,+,ID=exon-XM_047420818.1-3;Parent=rna-XM_0474208...,exon-XM_047420818.1-3,rna-XM_047420818.1,mRNA,ZYX,XM_047420818.1,yes,200.0
731298,NC_000007.14,Gnomon,exon,143382593.0,143382685.0,+,ID=exon-XM_047420818.1-4;Parent=rna-XM_0474208...,exon-XM_047420818.1-4,rna-XM_047420818.1,mRNA,ZYX,XM_047420818.1,yes,93.0
731299,NC_000007.14,Gnomon,exon,143382801.0,143383322.0,+,ID=exon-XM_047420818.1-5;Parent=rna-XM_0474208...,exon-XM_047420818.1-5,rna-XM_047420818.1,mRNA,ZYX,XM_047420818.1,yes,522.0


Get the exon lengths.

In [23]:
# Per-row exon lengths and the transcript each row belongs to, as two
# parallel lists in frame order.
exons = list(positive_strand.exon_length)
exon_transcripts = list(positive_strand.exon_transcript_id)

# Exon lengths grouped by transcript ID, in frame order.
exon_lengths = {}

# Walk both lists together; the first time a transcript is seen it gets
# an empty list, then the (integer) length is appended.
for tscript, exon_size in zip(exon_transcripts, exons):
    exon_lengths.setdefault(tscript, []).append(int(exon_size))

In [24]:
# exon_lengths

Check to see if the lengths of the intron list and the exon list match.

In [25]:
len(intron_lengths.keys()) == len(exon_lengths.keys())

True

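We can define minimum and maximum possible transcript lengths easily: the minimum is the sum of the exon lengths (every intron removed), and the maximum is the sum of the exon lengths plus the sum of the intron lengths (no intron removed).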

In [26]:
# Minimum and maximum possible transcript lengths, keyed by transcript ID:
# the minimum is the exon total, the maximum adds every intron on top.
transcript_length_extrema = {
    tid: {
        'minimum_possible_transcript_length': sum(exon_sizes),
        'maximum_possible_transcript_length': sum(exon_sizes) + sum(intron_lengths[tid])
    }
    for tid, exon_sizes in exon_lengths.items()
}

Now go transcript by transcript and find all possible transcript lengths, saving the information to file (expensive).  Start by getting all combinations of 1 to 25 introns.

TODO: split into files 1_1, 1_2, 1_3, ..., 1_25 and load based on the transcript list we are actually working with.

In [None]:
# All intronic combinations up to 25, stored as
# lte_25_intron_combinations[number of introns][number chosen] -> list of
# tuples holding the remaining fields of each line.
lte_25_intron_combinations = {}

# Initialize the dictionary with an empty bucket for every
# (number of introns, number chosen) pair.
for i in range(1, 26):
    lte_25_intron_combinations[i] = {j: [] for j in range(1, i + 1)}

# The combinations.
with open('/media/apollo/Samsung_T5/transfer/mayur/1_25_combinations.tsv', 'r') as f:

    # Skip the header line (an empty file is tolerated).
    next(f, None)

    # Stream the file line by line.  Reading it with readlines() and then
    # slicing off the header would hold two full copies of a very large
    # file in memory at once.
    for combo in f:

        # Strip and split.
        stripped_and_split = combo.strip().split('\t')

        # A blank line has no fields to parse; skip it instead of
        # failing on int('').
        if stripped_and_split == ['']:
            continue

        # Store it: field 0 is the number of introns, field 1 the number
        # chosen, and everything after that is kept as a tuple.
        lte_25_intron_combinations[int(stripped_and_split[0])][int(stripped_and_split[1])].append(tuple(stripped_and_split[2:]))

In [None]:
# lte_25_intron_combinations

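Now write the intermediate states for each transcript to file.  (The cell below is currently limited to the first five transcripts.)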

In [None]:
# Column names of the output table, in write order.
header_fields = [
    'transcript_id',
    'minimum_transcript_length',
    'maximum_transcript_length',
    'n_exons_in_transcript',
    'n_introns_in_transcript',
    'index',
    'choose_number',
    'combination',
    'intermediate_transcript_length'
]

# Create (or overwrite) the output file.
with open('intermediate_states.tsv', 'w') as f:

    f.write('\t'.join(header_fields) + '\n')

    # Only the first five transcripts are processed here.
    for transcript_id in list(intron_lengths.keys())[0:5]:

        # Shortest and longest possible length of this transcript.
        extrema = transcript_length_extrema[transcript_id]
        minimum_transcript_length = extrema['minimum_possible_transcript_length']
        maximum_transcript_length = extrema['maximum_possible_transcript_length']

        # Intron and exon counts for this transcript.
        transcript_introns = intron_lengths[transcript_id]
        n_introns_in_transcript = len(transcript_introns)
        n_exons_in_transcript = len(exon_lengths[transcript_id])

        # The five leading fields are identical on every row of a transcript.
        row_prefix = [
            transcript_id,
            str(minimum_transcript_length),
            str(maximum_transcript_length),
            str(n_exons_in_transcript),
            str(n_introns_in_transcript)
        ]

        # The "no selection" row: index, choose number and combination are
        # all '0' and the length is the minimum.
        # TODO: put a 0 selection in the 1_25 file
        # so we can use the file instead?
        f.write('\t'.join(row_prefix + ['0', '0', '0', str(minimum_transcript_length)]) + '\n')

        # Combinations available for this number of introns.
        combos = lte_25_intron_combinations[n_introns_in_transcript]

        # Running row number within the transcript, to track combinations.
        index = 1

        for choose_number, selection_group in combos.items():
            for selections in selection_group:

                # The first tuple element is a comma-separated list of
                # 1-based intron numbers; shift each down by one to index
                # the Python list, and add those introns to the minimum.
                chosen = selections[0]
                intermediate_transcript_length = minimum_transcript_length + sum(
                    transcript_introns[int(intron_number) - 1]
                    for intron_number in chosen.split(',')
                )

                f.write('\t'.join(row_prefix + [
                    str(index),
                    str(choose_number),
                    chosen,
                    str(intermediate_transcript_length)
                ]) + '\n')

                index += 1

## Negative strand

We use largely the same logic as above, with the exception of the order for the start and stop positions.

In [None]:
# Exons annotated on the negative ('-') strand.
on_negative_strand = exon_positions['strand'] == '-'
negative_strand = exon_positions[on_negative_strand]

In [None]:
negative_strand

Order by gene, then transcript, then stop position (descending).

In [None]:
# Sort on gene and transcript (ascending), then stop position (descending),
# so each transcript's exons run from the highest coordinate downwards.
negative_strand = negative_strand.sort_values(
    by = ['exon_gene', 'exon_transcript_id', 'stop'],
    ascending = [True, True, False]
)

In [None]:
negative_strand[1:30]

In [None]:
# Intron lengths on the negative strand.
#
# negative_strand is sorted by gene, transcript, and DESCENDING stop, so
# within a transcript row i lies at higher coordinates than row i + 1.
# The intron between them therefore runs from the stop of row i + 1 up to
# the start of row i.  Position i of the lists below holds the start of
# row i and the stop of row i + 1.  (Pairing the stop of row i with the
# start of row i + 1 instead would measure the span across both exons,
# not the gap between them.)
starts = list(negative_strand.start)[:-1]
stops = list(negative_strand.stop)[1:]
etids_starts = list(negative_strand['exon_transcript_id'])[:-1]
etids_stops = list(negative_strand['exon_transcript_id'])[1:]

# The intron lengths, keyed by transcript ID.  Every transcript present in
# the frame gets a list up front so later lookups cannot miss.
intron_lengths = {tid: [] for tid in set(negative_strand['exon_transcript_id'])}

# Subtract.  Every pair is visited, including pair 0 (rows 0 and 1);
# starting the loop at index 1 would silently drop the first intron of the
# first transcript in sort order.
for start_tid, stop_tid, upper_exon_start, lower_exon_stop in zip(etids_starts, etids_stops, starts, stops):

    # Exploit the ordered nature of the data: a pair that straddles two
    # different transcripts is not an intron and is ignored.
    if start_tid == stop_tid:

        # TODO: fix data type for start and stop columns
        # earlier in analysis because we are doing
        # int typecasting here.
        # Start and stop are both treated as part of the exon (exon length
        # is stop - start + 1), so the gap between is start - stop - 1.
        intron_lengths[start_tid].append(int(upper_exon_start - lower_exon_stop) - 1)

Use even simpler logic to find the exon lengths.

In [None]:
# Exon length per row; both end coordinates count, hence the + 1.
negative_strand['exon_length'] = negative_strand['stop'].sub(negative_strand['start']).add(1)

In [None]:
negative_strand

Get the exon lengths.

In [None]:
# Per-row exon lengths and the transcript each row belongs to, as two
# parallel lists in frame order.
exons = list(negative_strand.exon_length)
exon_transcripts = list(negative_strand.exon_transcript_id)

# Exon lengths grouped by transcript ID, in frame order.
exon_lengths = {}

# Walk both lists together; the first time a transcript is seen it gets
# an empty list, then the (integer) length is appended.
for tscript, exon_size in zip(exon_transcripts, exons):
    exon_lengths.setdefault(tscript, []).append(int(exon_size))

In [None]:
exon_lengths

Check to see if the lengths of the intron list and the exon list match.

In [None]:
len(intron_lengths.keys()) == len(exon_lengths.keys())

We can define minimum and maximum possible transcript lengths easily.

In [None]:
# Minimum and maximum possible transcript lengths, keyed by transcript ID:
# the minimum is the exon total, the maximum adds every intron on top.
transcript_length_extrema = {
    tid: {
        'minimum_possible_transcript_length': sum(exon_sizes),
        'maximum_possible_transcript_length': sum(exon_sizes) + sum(intron_lengths[tid])
    }
    for tid, exon_sizes in exon_lengths.items()
}

Now go transcript by transcript and find all possible transcript lengths, saving the information to file (expensive).  Start by getting all combinations of 1 to 25 introns.

TODO: split into files 1_1, 1_2, 1_3, ..., 1_25 and load based on the transcript list we are actually working with.

Now append the intermediate states for each negative-strand transcript to the same file.

In [None]:
# Append the negative-strand rows to the output file.
with open('intermediate_states.tsv', 'a') as f:

    # Write the header only when the file is still empty.  In append mode
    # the initial position is the end of the file, so tell() == 0 means
    # nothing has been written yet.  Writing it unconditionally would leave
    # a second header row in the middle of a file that already has rows.
    if f.tell() == 0:
        f.write('\t'.join(['transcript_id', 'minimum_transcript_length', 'maximum_transcript_length', 'n_exons_in_transcript', 'n_introns_in_transcript', 'index', 'choose_number', 'combination', 'intermediate_transcript_length']) + '\n')

    # Each transcript (only the first five are processed here).
    for transcript_id in list(intron_lengths.keys())[0:5]:

        # Get the extrema of the transcript length.
        minimum_transcript_length = transcript_length_extrema[transcript_id]['minimum_possible_transcript_length']
        maximum_transcript_length = transcript_length_extrema[transcript_id]['maximum_possible_transcript_length']

        # Get the number of introns and exons in this transcript.
        n_introns_in_transcript = len(intron_lengths[transcript_id])
        n_exons_in_transcript = len(exon_lengths[transcript_id])

        # Write the "no selection" line: index, choose number and
        # combination are all '0' and the length is the minimum.

        # TODO: put a 0 selection in the 1_25 file
        # so we can use the file instead?
        f.write('\t'.join([
            transcript_id,
            str(minimum_transcript_length),
            str(maximum_transcript_length),
            str(n_exons_in_transcript),
            str(n_introns_in_transcript),
            '0',
            '0',
            '0',
            str(minimum_transcript_length)
        ]) + '\n')

        # Get the possible combinations for this number
        # of introns.
        combos = lte_25_intron_combinations[n_introns_in_transcript]

        # We will create an index so that we can
        # more easily track combinations.
        index = 1

        # Now iterate over the combinations.
        for choose_number in combos.keys():
            for selections in combos[choose_number]:

                # Start from the minimum length and add each selected intron.
                intermediate_transcript_length = minimum_transcript_length

                # The first tuple element is a comma-separated list of
                # 1-based intron numbers.
                for intron_number in selections[0].split(','):

                    # intron_number adjusted down because of 0-indexing
                    # in python.
                    intermediate_transcript_length += intron_lengths[transcript_id][int(intron_number) - 1]

                # Write to file.
                f.write('\t'.join([
                    transcript_id,
                    str(minimum_transcript_length),
                    str(maximum_transcript_length),
                    str(n_exons_in_transcript),
                    str(n_introns_in_transcript),
                    str(index),
                    str(choose_number),
                    selections[0],
                    str(intermediate_transcript_length)
                ]) + '\n')

                # Increase the index.
                index += 1