In [115]:
import numpy as np
import pandas as pd

In [116]:
# Show every column when a DataFrame is displayed (None removes the column cap).
pd.options.display.max_columns = None

In [117]:
# Read the four raw Ensembl tables; paths are relative to the notebook's working directory.
# Only the general annotation table is read with low_memory=False.
ens_annot = pd.read_csv(
    filepath_or_buffer='../data/raw/ensembl_general.txt',
    low_memory=False,
)
ens_uniprot = pd.read_csv(filepath_or_buffer='../data/raw/ensembl_uniprot.txt')
ens_exons = pd.read_csv(filepath_or_buffer='../data/raw/ensembl_exons.txt')
ens_ccds = pd.read_csv(filepath_or_buffer='../data/raw/ensembl_ccds.txt')

In [119]:
# Mapping from the raw column headers of the general annotation table
# to the snake_case names used in the rest of the notebook.
annot_column_names = {
    'Transcript stable ID': 'ensembl_trs_id',
    'Gene stable ID': 'ensembl_gene_id',
    'Gene name': 'ensembl_gene_name',
    'Protein stable ID': 'ensembl_protein_id',
    'Chromosome/scaffold name': 'chromosome_or_scaffold',
    'Gene start (bp)': 'gene_start',
    'Gene end (bp)': 'gene_end',
    'Strand': 'strand',
    'Transcript start (bp)': 'trs_start',
    'Transcript end (bp)': 'trs_end',
    'Transcript length (including UTRs and CDS)': 'trs_length_bp',
    'Transcript support level (TSL)': 'trs_support_level',
    'GENCODE basic annotation': 'in_gencode_basic',
    'APPRIS annotation': 'appris_annotation',
    'Ensembl Canonical': 'is_ensembl_canonical',
    'Transcript name': 'ensembl_trs_name',
    'Gene type': 'gene_type',
    'Transcript type': 'trs_type',
}
# Headers that are not in the mapping are left unchanged by rename().
ens_annot = ens_annot.rename(columns=annot_column_names)

In [120]:
def _text_before_first_space(label):
    """Return the part of `label` before its first space (all of it when there is no space)."""
    head, _, _ = label.partition(' ')
    return head


# Turn the two flag columns into plain booleans: a row is True only when it
# carries the flag value; anything else (including missing) becomes False.
ens_annot['is_ensembl_canonical'] = ens_annot['is_ensembl_canonical'] == 1.0
ens_annot['in_gencode_basic'] = ens_annot['in_gencode_basic'] == 'GENCODE basic'

# Missing support levels are labelled 'tslNA'; any text after the first space is dropped.
ens_annot['trs_support_level'] = (
    ens_annot['trs_support_level']
    .fillna('tslNA')
    .map(_text_before_first_space)
)

In [121]:
# Raw UniProt cross-reference headers -> snake_case names used below.
uniprot_column_names = {
    'Transcript stable ID': 'ensembl_trs_id',
    'Gene stable ID': 'ensembl_gene_id',
    'Protein stable ID': 'ensembl_protein_id',
    'UniProtKB isoform ID': 'uniprot_isoform_id',
    'UniProtKB/Swiss-Prot ID': 'uniprot_base_id',
    'UniProtKB/TrEMBL ID': 'uniprot_trembl_id',
}
ens_uniprot = ens_uniprot.rename(columns=uniprot_column_names)

In [122]:
# Collapse the UniProt cross-reference table to one row per transcript.
#
# Missing values are detected with pandas' own NaN handling (dropna / notna)
# instead of `is np.nan` identity checks: identity only matches the np.nan
# singleton, so it breaks (TypeError on str concatenation / join) as soon as a
# column is float-typed, e.g. when it is entirely empty.

# Every TrEMBL accession seen for a transcript, sorted and joined with ', '.
# NOTE(review): repeated accessions within a transcript are kept as-is (no
# de-duplication), matching the previous behaviour - confirm this is intended.
trembl_joined = ens_uniprot.groupby('ensembl_trs_id')['uniprot_trembl_id'].transform(
    lambda ids: ', '.join(sorted(ids.dropna()))
)
# Transcripts without any TrEMBL accession produce '' above; store NaN instead.
ens_uniprot['uniprot_trembl_id'] = trembl_joined.mask(trembl_joined == '')

# Where no isoform ID is given, fall back to the Swiss-Prot ID with a '-1' suffix.
# Rows without a Swiss-Prot ID keep a missing fallback. 'uniprot_base_id' itself
# is left untouched: the earlier append-then-strip of '-1' on that column was a
# no-op for IDs that contain no '-'.
swissprot_ids = ens_uniprot['uniprot_base_id']
isoform_fallback = (swissprot_ids.astype(str) + '-1').where(swissprot_ids.notna())
ens_uniprot['uniprot_isoform_id'] = ens_uniprot['uniprot_isoform_id'].combine_first(isoform_fallback)

# Keep only the first row per transcript; the joined TrEMBL string is identical
# on every row of a transcript, the Swiss-Prot / isoform IDs come from that first row.
ens_uniprot = ens_uniprot.drop_duplicates('ensembl_trs_id', keep='first')

In [124]:
# Raw CCDS table headers -> snake_case names used below.
ccds_column_names = {
    'Transcript stable ID': 'ensembl_trs_id',
    'Gene stable ID': 'ensembl_gene_id',
    'Protein stable ID': 'ensembl_protein_id',
    'CCDS ID': 'ccds_id',
}
ens_ccds = ens_ccds.rename(columns=ccds_column_names)

In [125]:
# Look up the UniProt and CCDS identifiers for each annotated transcript.
# Transcripts absent from a lookup table get NaN in the corresponding column.
uniprot_by_trs = ens_uniprot.set_index('ensembl_trs_id')
for xref_col in ('uniprot_base_id', 'uniprot_isoform_id', 'uniprot_trembl_id'):
    ens_annot[xref_col] = ens_annot['ensembl_trs_id'].map(uniprot_by_trs[xref_col])

ccds_by_trs = ens_ccds.set_index('ensembl_trs_id')['ccds_id']
ens_annot['ccds_id'] = ens_annot['ensembl_trs_id'].map(ccds_by_trs)

In [126]:
# Raw exon table headers -> snake_case names used below.
exon_column_names = {
    'Gene stable ID': 'ensembl_gene_id',
    'Transcript stable ID': 'ensembl_trs_id',
    '5\' UTR start': '5utr_start',
    '5\' UTR end': '5utr_end',
    '3\' UTR start': '3utr_start',
    '3\' UTR end': '3utr_end',
    'Exon region start (bp)': 'exon_start',
    'Exon region end (bp)': 'exon_end',
    'Exon stable ID': 'ensembl_exon_id',
    'Constitutive exon': 'exon_is_constitutive',
    'Genomic coding start': 'cds_start',
    'Genomic coding end': 'cds_end',
    'CDS Length': 'cds_length_bp',
}
ens_exons = ens_exons.rename(columns=exon_column_names)

In [127]:
# Columns kept in the exon table, in output order; everything else is dropped.
exon_column_order = [
    'ensembl_gene_id',
    'ensembl_trs_id',
    'ensembl_exon_id',
    '5utr_start',
    '5utr_end',
    '3utr_start',
    '3utr_end',
    'exon_start',
    'exon_end',
    'exon_is_constitutive',
    'cds_start',
    'cds_end',
    'cds_length_bp',
]
ens_exons = ens_exons[exon_column_order]

In [128]:
# Per-transcript bounds of the UTR and CDS regions, taken over all exon rows of
# the transcript: the smallest start and the largest end of each region.
trs_ids = ens_annot['ensembl_trs_id']
exons_by_trs = ens_exons.groupby('ensembl_trs_id')
region_bounds = (
    ('5utr_start', 'min'),
    ('5utr_end', 'max'),
    ('3utr_start', 'min'),
    ('3utr_end', 'max'),
    ('cds_start', 'min'),
    ('cds_end', 'max'),
)
for bound_col, reducer in region_bounds:
    ens_annot[bound_col] = trs_ids.map(exons_by_trs[bound_col].agg(reducer))

In [129]:
# CDS length per transcript, read from the first exon row of each transcript.
first_exon_row_by_trs = ens_exons.drop_duplicates('ensembl_trs_id').set_index('ensembl_trs_id')
ens_annot['cds_length_bp'] = ens_annot['ensembl_trs_id'].map(first_exon_row_by_trs['cds_length_bp'])

# Protein length in residues: (CDS length - 3) / 3.
# NOTE(review): the "- 3" presumably discounts a stop codon - confirm this
# holds for every transcript (e.g. CDSs annotated without a stop codon).
# Plain vectorised arithmetic is used because NaN propagates through it; the
# earlier per-element `is not np.nan` guard never matched on a numeric column
# (values boxed by map() are not the np.nan singleton), so it had no effect.
ens_annot['protein_length_aa'] = (ens_annot['cds_length_bp'] - 3) / 3

In [130]:
# Final column layout of the transcript annotation table; columns not listed are dropped.
annot_column_order = [
    # gene-level fields
    'chromosome_or_scaffold',
    'ensembl_gene_id',
    'ensembl_gene_name',
    'gene_type',
    'strand',
    'gene_start',
    'gene_end',
    # transcript-level fields
    'ensembl_trs_id',
    'ensembl_trs_name',
    'trs_type',
    'is_ensembl_canonical',
    'trs_support_level',
    'trs_length_bp',
    'trs_start',
    'trs_end',
    # UTR / CDS bounds and CDS length
    '5utr_start',
    '5utr_end',
    '3utr_start',
    '3utr_end',
    'cds_start',
    'cds_end',
    'cds_length_bp',
    # annotation flags and protein cross-references
    'in_gencode_basic',
    'appris_annotation',
    'ensembl_protein_id',
    'uniprot_base_id',
    'uniprot_isoform_id',
    'uniprot_trembl_id',
    'ccds_id',
    'protein_length_aa',
]
ens_annot = ens_annot[annot_column_order]

In [132]:
# Write both processed tables as CSV without the DataFrame index.
processed_outputs = (
    (ens_annot, '../data/processed/ensembl_annotation_20220705.csv'),
    (ens_exons, '../data/processed/ensembl_exons_20220705.csv'),
)
for table, destination in processed_outputs:
    table.to_csv(destination, index=False)