# Genic Notation

According to the paper, they have mapped to the human reference genome (hg19).
HG19 is the alias to GRCh37.

Human Gene Annotation - GENCODE Release 19 (GRCh37.p13)

https://www.gencodegenes.org/human/release_19.html

In [1]:
# Imports
import gzip
import shutil
import re

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# File_in contains the comprehensive gene annotation on the reference chromosomes only
file_in = 'Data/ComprehensiveGeneAnnotation/gencode.v19.annotation.gff3.gz'
file_out = 'Data/ComprehensiveGeneAnnotation/gencode.v19.annotation.gff3'

# # File_in contains the basic gene annotation on the reference chromosomes only
# # This is a subset of the corresponding comprehensive annotation, including only those transcripts tagged as 'basic' in every gene
# # This is the main annotation file for most users
# file_in = 'gencode.v42.basic.annotation.gff3.gz'
# file_out = 'gencode.v42.basic.annotation.gff3'

with gzip.open(file_in, 'rb') as f_in:
    with open(file_out, 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)

In [3]:
# Read dataset
data_df = pd.read_csv(file_out, sep='\t', header=None, skiprows=7)
#data_df.head(5)

In [4]:
print(data_df.shape)    # 2.6M x 9
data_df.head(5)

(2615590, 9)


Unnamed: 0,0,1,2,3,4,5,6,7,8
0,chr1,HAVANA,gene,11869.0,14412.0,.,+,.,ID=ENSG00000223972.4;gene_id=ENSG00000223972.4...
1,chr1,HAVANA,transcript,11869.0,14409.0,.,+,.,ID=ENST00000456328.2;Parent=ENSG00000223972.4;...
2,chr1,HAVANA,exon,11869.0,12227.0,.,+,.,ID=exon:ENST00000456328.2:1;Parent=ENST0000045...
3,chr1,HAVANA,exon,12613.0,12721.0,.,+,.,ID=exon:ENST00000456328.2:2;Parent=ENST0000045...
4,chr1,HAVANA,exon,13221.0,14409.0,.,+,.,ID=exon:ENST00000456328.2:3;Parent=ENST0000045...


In [5]:
# Copy dataset
data_df_mod = data_df.copy(deep=True)

In [6]:
# Remove rows that contain comments

# data_df_mod.shape
# data_df_mod.isnull().values.any()
# data_df_mod.isnull().sum()
# print(data_df_mod[data_df_mod.isnull().any(axis=1)])
data_df_mod = data_df_mod.dropna(axis = 0, how = 'any')
# print(data_df_mod[data_df_mod.isnull().any(axis=1)])
# data_df_mod.shape
data_df_mod.shape   # 2.6M x 9

(2615566, 9)

In [7]:
# Create column ID

data_df_mod['ID'] = data_df_mod.apply(
    lambda x: x[8][x[8].find('ID=')+3:x[8].find('.')], axis=1)


In [8]:
data_df_mod.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,ID
0,chr1,HAVANA,gene,11869.0,14412.0,.,+,.,ID=ENSG00000223972.4;gene_id=ENSG00000223972.4...,ENSG00000223972
1,chr1,HAVANA,transcript,11869.0,14409.0,.,+,.,ID=ENST00000456328.2;Parent=ENSG00000223972.4;...,ENST00000456328
2,chr1,HAVANA,exon,11869.0,12227.0,.,+,.,ID=exon:ENST00000456328.2:1;Parent=ENST0000045...,exon:ENST00000456328
3,chr1,HAVANA,exon,12613.0,12721.0,.,+,.,ID=exon:ENST00000456328.2:2;Parent=ENST0000045...,exon:ENST00000456328
4,chr1,HAVANA,exon,13221.0,14409.0,.,+,.,ID=exon:ENST00000456328.2:3;Parent=ENST0000045...,exon:ENST00000456328


In [9]:
# Create column Parent

#The find() method returns the index of first occurrence of the substring (if found). If not found, it returns -1.
data_df_mod['Parent'] = data_df_mod.apply(
    lambda x: x[8].split(';')[1][7:x[8].split(';')[1].find('.')] if x[2] != 'gene' else 'nan', 
    axis=1)


In [10]:
data_df_mod.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,ID,Parent
0,chr1,HAVANA,gene,11869.0,14412.0,.,+,.,ID=ENSG00000223972.4;gene_id=ENSG00000223972.4...,ENSG00000223972,
1,chr1,HAVANA,transcript,11869.0,14409.0,.,+,.,ID=ENST00000456328.2;Parent=ENSG00000223972.4;...,ENST00000456328,ENSG00000223972
2,chr1,HAVANA,exon,11869.0,12227.0,.,+,.,ID=exon:ENST00000456328.2:1;Parent=ENST0000045...,exon:ENST00000456328,ENST00000456328
3,chr1,HAVANA,exon,12613.0,12721.0,.,+,.,ID=exon:ENST00000456328.2:2;Parent=ENST0000045...,exon:ENST00000456328,ENST00000456328
4,chr1,HAVANA,exon,13221.0,14409.0,.,+,.,ID=exon:ENST00000456328.2:3;Parent=ENST0000045...,exon:ENST00000456328,ENST00000456328
5,chr1,ENSEMBL,transcript,11872.0,14412.0,.,+,.,ID=ENST00000515242.2;Parent=ENSG00000223972.4;...,ENST00000515242,ENSG00000223972
6,chr1,ENSEMBL,exon,11872.0,12227.0,.,+,.,ID=exon:ENST00000515242.2:1;Parent=ENST0000051...,exon:ENST00000515242,ENST00000515242
7,chr1,ENSEMBL,exon,12613.0,12721.0,.,+,.,ID=exon:ENST00000515242.2:2;Parent=ENST0000051...,exon:ENST00000515242,ENST00000515242
8,chr1,ENSEMBL,exon,13225.0,14412.0,.,+,.,ID=exon:ENST00000515242.2:3;Parent=ENST0000051...,exon:ENST00000515242,ENST00000515242
9,chr1,ENSEMBL,transcript,11874.0,14409.0,.,+,.,ID=ENST00000518655.2;Parent=ENSG00000223972.4;...,ENST00000518655,ENSG00000223972


In [11]:
# Create column with Exon length
data_df_mod['ExonLength'] = data_df_mod.apply(
    lambda x: x[4]-x[3] if x[2]=='exon' else 0,
    axis = 1
)



In [12]:
data_df_mod.head(15)

Unnamed: 0,0,1,2,3,4,5,6,7,8,ID,Parent,ExonLength
0,chr1,HAVANA,gene,11869.0,14412.0,.,+,.,ID=ENSG00000223972.4;gene_id=ENSG00000223972.4...,ENSG00000223972,,0.0
1,chr1,HAVANA,transcript,11869.0,14409.0,.,+,.,ID=ENST00000456328.2;Parent=ENSG00000223972.4;...,ENST00000456328,ENSG00000223972,0.0
2,chr1,HAVANA,exon,11869.0,12227.0,.,+,.,ID=exon:ENST00000456328.2:1;Parent=ENST0000045...,exon:ENST00000456328,ENST00000456328,358.0
3,chr1,HAVANA,exon,12613.0,12721.0,.,+,.,ID=exon:ENST00000456328.2:2;Parent=ENST0000045...,exon:ENST00000456328,ENST00000456328,108.0
4,chr1,HAVANA,exon,13221.0,14409.0,.,+,.,ID=exon:ENST00000456328.2:3;Parent=ENST0000045...,exon:ENST00000456328,ENST00000456328,1188.0
5,chr1,ENSEMBL,transcript,11872.0,14412.0,.,+,.,ID=ENST00000515242.2;Parent=ENSG00000223972.4;...,ENST00000515242,ENSG00000223972,0.0
6,chr1,ENSEMBL,exon,11872.0,12227.0,.,+,.,ID=exon:ENST00000515242.2:1;Parent=ENST0000051...,exon:ENST00000515242,ENST00000515242,355.0
7,chr1,ENSEMBL,exon,12613.0,12721.0,.,+,.,ID=exon:ENST00000515242.2:2;Parent=ENST0000051...,exon:ENST00000515242,ENST00000515242,108.0
8,chr1,ENSEMBL,exon,13225.0,14412.0,.,+,.,ID=exon:ENST00000515242.2:3;Parent=ENST0000051...,exon:ENST00000515242,ENST00000515242,1187.0
9,chr1,ENSEMBL,transcript,11874.0,14409.0,.,+,.,ID=ENST00000518655.2;Parent=ENSG00000223972.4;...,ENST00000518655,ENSG00000223972,0.0


In [13]:
# Original: 2.6 M rows
# Exon: 1.2M rows
data_df_ExonLength = data_df_mod.loc[data_df_mod[2]=='exon',[2,'ID','Parent','ExonLength']]
print(data_df_ExonLength.shape)
data_df_ExonLength.head(5)

(1196293, 4)


Unnamed: 0,2,ID,Parent,ExonLength
2,exon,exon:ENST00000456328,ENST00000456328,358.0
3,exon,exon:ENST00000456328,ENST00000456328,108.0
4,exon,exon:ENST00000456328,ENST00000456328,1188.0
6,exon,exon:ENST00000515242,ENST00000515242,355.0
7,exon,exon:ENST00000515242,ENST00000515242,108.0


In [14]:
# Dataframe that contains name of transcripts and length: data_df_TranscriptLength

# Groupby Parent: 117592 rows
data_df_TranscriptLength = data_df_ExonLength.groupby('Parent')['ExonLength'].sum()
data_df_TranscriptLength.head(5)
# data_df_TranscriptLength.shape
# data_df_TranscriptLength.loc['ENST00000450305']

Parent
ENST00000000233    1097.0
ENST00000000412    2749.0
ENST00000000442    2208.0
ENST00000001008    2257.0
ENST00000001146    4726.0
Name: ExonLength, dtype: float64

In [15]:
# Dataframe that contains name of gens and transcripts
data_df_gen = data_df_mod.loc[data_df_mod[2]=='transcript',['ID','Parent']]
data_df_gen = data_df_gen.rename(columns = {'ID':'transcript','Parent':'gene'})
data_df_gen.head(5)

Unnamed: 0,transcript,gene
1,ENST00000456328,ENSG00000223972
5,ENST00000515242,ENSG00000223972
9,ENST00000518655,ENSG00000223972
14,ENST00000450305,ENSG00000223972
22,ENST00000438504,ENSG00000227232


In [16]:
data_df_gen.index = data_df_gen['transcript']
data_df_gen.head(5)

Unnamed: 0_level_0,transcript,gene
transcript,Unnamed: 1_level_1,Unnamed: 2_level_1
ENST00000456328,ENST00000456328,ENSG00000223972
ENST00000515242,ENST00000515242,ENSG00000223972
ENST00000518655,ENST00000518655,ENSG00000223972
ENST00000450305,ENST00000450305,ENSG00000223972
ENST00000438504,ENST00000438504,ENSG00000227232


In [17]:
df_total = pd.merge(data_df_gen, data_df_TranscriptLength, left_index=True, right_index=True, how='inner')
df_total = df_total.rename(columns = {'ExonLength':'transcriptLength'})
print(df_total.shape)
df_total.head(5)
# df_total.loc['ENST00000450305']

(196520, 3)


Unnamed: 0,transcript,gene,transcriptLength
ENST00000456328,ENST00000456328,ENSG00000223972,1654.0
ENST00000515242,ENST00000515242,ENSG00000223972,1650.0
ENST00000518655,ENST00000518655,ENSG00000223972,1479.0
ENST00000450305,ENST00000450305,ENSG00000223972,626.0
ENST00000438504,ENST00000438504,ENSG00000227232,1771.0


In [18]:
# How many transcripts per gene
df_total.groupby('gene').count()

Unnamed: 0_level_0,transcript,transcriptLength
gene,Unnamed: 1_level_1,Unnamed: 2_level_1
ENSG00000000003,3,3
ENSG00000000005,2,2
ENSG00000000419,7,7
ENSG00000000457,5,5
ENSG00000000460,10,10
...,...,...
ENSGR0000264819,1,1
ENSGR0000265350,1,1
ENSGR0000265658,1,1
ENSGR0000266731,1,1


In [19]:
# Save file
file='GFF3.csv'
df_total.to_csv(file, sep=';', na_rep='n.a.')