In [2]:
import pandas as pd
import numpy as np
import os
import sys
import re

import warnings
warnings.filterwarnings('ignore')

In [3]:
ens69_gtf = pd.read_table("../raw_data/Homo_sapiens.GRCh37.69.gtf", header = None, sep = '\t')

In [5]:
ens69_gtf.columns = ['chr', 'rna_type', 'transcript_region', 'start', 'end', '5', 'strand', '7', 'transcript_info']

In [7]:
def tidy_split(df, column, sep=';', keep=False):
    """
    Split the values of a column and expand so the new DataFrame has one split
    value per row. Filters rows where the column is missing.
    Params
    ------
    df : pandas.DataFrame
        dataframe with the column to split and expand
    column : str
        the column to split and expand
    sep : str
        the string used to split the column's values
    keep : bool
        whether to retain the presplit value as it's own row
    Returns
    -------
    pandas.DataFrame
        Returns a dataframe with the same columns as `df`.
    """
    indexes = list()
    new_values = list()
    df = df.dropna(subset=[column])
    for i, presplit in enumerate(df[column].astype(str)):
        values = presplit.split(sep)
        if keep and len(values) > 1:
            indexes.append(i)
            new_values.append(presplit)
        for value in values:
            indexes.append(i)
            new_values.append(value)
    new_df = df.iloc[indexes, :].copy()
    new_df[column] = new_values
    return new_df.reset_index(drop=True)

In [8]:
info_data = ens69_gtf['transcript_info'].str.split(';', expand=True)
info_data = info_data.iloc[:, :7]
info_data.columns = ['gene_id', 'transcript_id', 'exon_number', 'gene_name', 
                     'gene_biotype', 'transcript_name', 'exon_id']
info_data = info_data.fillna('exon id "none"')
info_data['exon_id'] = info_data['exon_id'].replace('', 'exon id "none"')

In [9]:
def update_columns(data, column):
    data[column] = data[column].map(lambda x: x.split('"')[1])
    return data
    
for i in info_data.columns:
    info_data = update_columns(info_data, i)

In [83]:
ens69_gtf_expand = pd.concat([ens69_gtf, info_data], axis=1)

In [84]:
ens69_gtf_expand.head(2)

Unnamed: 0,chr,rna_type,transcript_region,start,end,5,strand,7,transcript_info,gene_id,transcript_id,exon_number,gene_name,gene_biotype,transcript_name,exon_id
0,GL000213.1,miRNA,exon,104742,104817,.,+,.,"gene_id ""ENSG00000265283""; transcript_id ""ENS...",ENSG00000265283,ENST00000578976,1,MIR3118-5,miRNA,MIR3118-5-201,ENSE00002688538
1,GL000213.1,protein_coding,exon,138767,139339,.,-,.,"gene_id ""ENSG00000237375""; transcript_id ""ENS...",ENSG00000237375,ENST00000327822,1,BX072566.1,protein_coding,BX072566.1-201,ENSE00001628136


In [85]:
ens69_gtf_expand['chr'] = ens69_gtf_expand['chr'].astype(str)

In [80]:
# ens69_gtf_expand = ens69_gtf_expand.rename(columns={'start': 'tx_start', 'end': 'tx_end'})

In [86]:
chromosomes_lst = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', 'X', 'Y']

In [19]:
# ens69_gtf_expand['chr'].unique()

In [87]:
ens69_gtf_chr = ens69_gtf_expand[ens69_gtf_expand['chr'].isin(chromosomes_lst )]

In [88]:
transcripts_min = ens69_gtf_chr.groupby(['transcript_id'])[['start']].first().drop_duplicates()
transcripts_min = transcripts_min.dropna()
transcripts_min['transcript_id'] = transcripts_min.index
transcripts_min = transcripts_min.dropna()
transcripts_min = transcripts_min.reset_index(drop = True)

In [89]:
transcripts_max = ens69_gtf_chr.groupby(['transcript_id'])[['end']].max().drop_duplicates()
transcripts_max = transcripts_max.dropna()
transcripts_max['transcript_id'] = transcripts_max.index 
transcripts_max = transcripts_max.dropna()
transcripts_max = transcripts_max.reset_index(drop  = True)

In [91]:
transcripts_start_end = pd.merge(transcripts_min, transcripts_max, how= 'left')

transcripts_start_end = transcripts_start_end.rename(columns={'start': 'tx_start', 'end': 'tx_end'})

In [92]:
# Write start/end of full transcripts to file 
transcripts_start_end.to_csv("../tables_output/all_chr_start_end_transcripts.tsv", sep = '\t', index = False)

## Find start and end genes (for .PRO preparation)

In [109]:
ens69_gtf_chr_subset = ens69_gtf_chr[['chr', 'gene_name', 'strand', 'transcript_id', 'gene_id', 'gene_biotype']]

In [110]:
transcripts_start_end_genes = pd.merge(transcripts_start_end, ens69_gtf_chr_subset, how = 'left')
transcripts_start_end_genes = transcripts_start_end_genes.dropna()

In [111]:
transcripts_start_end_genes['tx_end'] = transcripts_start_end_genes['tx_end'].astype(int)

In [124]:
transcripts_start_end_genes = transcripts_start_end_genes.drop_duplicates()
transcripts_start_end_genes = transcripts_start_end_genes.sort_values(['chr','tx_start',  'gene_id' , 'gene_name'])

In [125]:
transcripts_start_end_genes.to_csv("../tables_output/all_chr_transcripts_start_end_genes.tsv", sep = '\t', index = False)

In [126]:
transcripts_start_end_genes.reset_index(inplace = True, drop = True)

In [127]:
transcripts_start_end_genes['tx_len'] = transcripts_start_end_genes['tx_end'] - transcripts_start_end_genes['tx_start']

In [128]:
# Count the number of transcripts and assume equal contribution of RNA from each transcript not accounting for length
transcripts_start_end_genes['count_transcripts'] = transcripts_start_end_genes.groupby('gene_id')['gene_id'].transform('count')

In [129]:
transcripts_start_end_genes['sum_tx_len'] = transcripts_start_end_genes.groupby('gene_name')['tx_len'].transform('sum')

In [130]:
transcripts_start_end_genes.head(2)

Unnamed: 0,tx_start,transcript_id,tx_end,chr,gene_name,strand,gene_id,gene_biotype,tx_len,count_transcripts,sum_tx_len
0,11869,ENST00000456328,14409,1,DDX11L1,+,ENSG00000223972,pseudogene,2540,3,6740
1,11872,ENST00000515242,14412,1,DDX11L1,+,ENSG00000223972,pseudogene,2540,3,6740


In [132]:
transcripts_start_end_genes_coding = transcripts_start_end_genes[transcripts_start_end_genes['gene_biotype'] == "protein_coding"]

In [135]:
transcripts_start_end_genes_coding = transcripts_start_end_genes_coding.sort_values(by = ['chr','tx_start',  'gene_id' , 'gene_name'])

transcripts_start_end_genes_coding = transcripts_start_end_genes_coding.reset_index(drop = True)

In [137]:
transcripts_start_end_genes_coding.head(10)

Unnamed: 0,tx_start,transcript_id,tx_end,chr,gene_name,strand,gene_id,gene_biotype,tx_len,count_transcripts,sum_tx_len
0,35721,ENST00000417324,36081,1,FAM138A,-,ENSG00000237613,protein_coding,360,1,360
1,69091,ENST00000335137,70008,1,OR4F5,+,ENSG00000186092,protein_coding,917,1,917
2,367640,ENST00000426406,368634,1,OR4F29,+,ENSG00000235249,protein_coding,994,1,994
3,621059,ENST00000332831,622053,1,OR4F16,-,ENSG00000185097,protein_coding,994,1,994
4,860260,ENST00000420190,874671,1,SAMD11,+,ENSG00000187634,protein_coding,14411,8,49471
5,860530,ENST00000437963,871173,1,SAMD11,+,ENSG00000187634,protein_coding,10643,8,49471
6,865692,ENST00000341065,879955,1,SAMD11,+,ENSG00000187634,protein_coding,14263,8,49471
7,874655,ENST00000455979,879639,1,SAMD11,+,ENSG00000187634,protein_coding,4984,8,49471
8,875726,ENST00000478729,877553,1,SAMD11,+,ENSG00000187634,protein_coding,1827,8,49471
9,876456,ENST00000474461,878374,1,SAMD11,+,ENSG00000187634,protein_coding,1918,8,49471


In [138]:
transcripts_start_end_genes_coding['tx_proportion'] = transcripts_start_end_genes_coding['tx_len'] / transcripts_start_end_genes_coding['sum_tx_len']

In [139]:
transcripts_start_end_genes_coding.to_csv('../tables_output/transcripts_start_end_genes_coding.tsv', sep = '\t', index = False)

In [140]:
transcripts_coding_lst = transcripts_start_end_genes_coding['transcript_id'].unique

In [142]:
ens69_gtf_coding = ens69_gtf[ens69_gtf['transcript_info'].str.contains('protein_coding')]

In [143]:
ens69_gtf_coding.to_csv('../tables_output/ens69_gtf_protein_coding.tsv', sep = '\t', index = False)