# Get Conservation Scores

Get PhyloP conservation scores for Ensembl transcripts.

In [1]:
import pandas as pd
import requests
import sys
from tqdm import tqdm

from datasets import expanded_dataset_list

In [2]:

def get_conservation(chr, start, end, genome):
    """Fetch PhyloP conservation scores for a genomic interval from the UCSC REST API.

    :param chr: str, chromosome name without the 'chr' prefix (the prefix is added here)
    :param start: int, interval start, forwarded to the API unchanged
    :param end: int, interval end, forwarded to the API unchanged
    :param genome: str, 'hg38' (phyloP100way track) or 'mm39' (phyloP35way track)
    :return: DataFrame with one row per record returned by the API; the record's
        'value' field is renamed to 'conservation'
    :raises ValueError: if the genome is not recognized or the API answers with an
        error status
    """
    api_url = 'http://api.genome.ucsc.edu/getData/track'
    if genome == 'hg38':
        track = 'phyloP100way'
    elif genome == 'mm39':
        track = 'phyloP35way'
    else:
        raise ValueError('Genome not recognized')
    chrom = 'chr' + chr
    params = {
        'genome': genome,
        'track': track,
        'start': start,
        'end': end,
        'chrom': chrom
    }
    # GET arguments belong in the query string (`params=`), not in a request body (`data=`).
    # The timeout keeps a single stalled connection from hanging a long batch run.
    results = requests.get(api_url, params=params, timeout=60)
    if not results.ok:
        raise ValueError(results.reason)
    # The payload stores the per-position records under a key named after the chromosome.
    records = results.json()[chrom]
    value_df = (pd.DataFrame([pd.Series(record) for record in records])
                .rename(columns={'value': 'conservation'}))
    return value_df


def build_request_url(ext, server="https://rest.ensembl.org"):
    """Join an endpoint path onto a server base URL with exactly one '/' between them.

    :param ext: str, endpoint path and query string; a leading '/' is optional
    :param server: str, base URL of the REST server; a trailing '/' is optional
    :return: str, the full request URL
    """
    # Strip the slashes at the seam so "/lookup/..." does not become "server//lookup/...".
    return server.rstrip('/') + '/' + ext.lstrip('/')


def handle_results(results):
    """Return the decoded JSON body of a response, raising on an HTTP error status.

    :param results: response object exposing raise_for_status() and json()
    :return: the decoded JSON payload
    :raises: whatever results.raise_for_status() raises for an unsuccessful response
    """
    # raise_for_status() does nothing for a successful response and raises otherwise,
    # so no separate `ok` check (or interpreter exit) is needed afterwards.
    results.raise_for_status()
    return results.json()


def get_transcript_info(base_transcript):
    """Using an Ensembl transcript ID, get its exons, translation and chromosome.

    Queries the `lookup/id` endpoint with `expand=1` and reads the 'Exon',
    'Translation' and 'seq_region_name' entries of the decoded response.

    :param base_transcript: str, transcript ID, inserted into the lookup URL as given
    :return: (exon_df, trans_sr, chr)
        exon_df: DataFrame, with global exon start and end position
        trans_sr: Series, with global translation start and stop positions for CDS and translation length
        chr: str
    :raises KeyError: if the response lacks one of the three entries read here
        (presumably the case for transcripts without a translation - TODO confirm)
    """
    # No leading slash on the path, so the joined URL has a single '/' after the host.
    request_url = build_request_url("lookup/id/" + base_transcript + "?expand=1")
    # The timeout keeps a single stalled connection from hanging a long batch run.
    r = requests.get(request_url, headers={"Content-Type": "application/json"}, timeout=60)
    decoded = handle_results(r)
    exon_df = pd.DataFrame(decoded['Exon'])
    trans_sr = pd.Series(decoded['Translation'])
    chr = decoded['seq_region_name']
    return exon_df, trans_sr, chr


def get_exon_conservation(exon_df, chr, genome):
    """Collect conservation scores for every exon and stack them into one table.

    :param exon_df: DataFrame with 'id', 'start' and 'end' columns, one row per exon
    :param chr: str, forwarded to get_conservation
    :param genome: str, forwarded to get_conservation
    :return: DataFrame with an 'exon_id' column, the fetched 'end' column renamed to
        'genomic position', and the fetched 'start' column removed
    """
    exons_by_id = exon_df.set_index('id')
    # subtract one from the start since the nucleotide conservation corresponds to the "end" index
    scores_by_exon = {
        exon_id: get_conservation(chr, exon['start'] - 1, exon['end'], genome)
        for exon_id, exon in exons_by_id.iterrows()
    }
    stacked = pd.concat(scores_by_exon)
    # the outer index level holds the exon ids; move it into a column and discard the rest
    flattened = stacked.reset_index(level=0).reset_index(drop=True)
    renamed = flattened.rename({'level_0': 'exon_id',
                                'end': 'genomic position'}, axis=1)
    return renamed.drop('start', axis=1)


def get_transcript_conservation(transcript_id, target_strand, genome):
    """Build a per-position conservation table for the translated part of a transcript.

    :param transcript_id: str, transcript ID passed to get_transcript_info
    :param target_strand: str, '-' orders positions from high to low genomic
        coordinate; any other value orders them from low to high
    :param genome: str, genome name forwarded to get_exon_conservation
    :return: DataFrame as returned by get_exon_conservation, sorted by
        'genomic position', with added 'Transcript Base', 'target position'
        (1-based rank in the sorted order), 'chromosome', 'genome' and
        'translation length' columns
    """
    exon_df, trans_sr, chrom = get_transcript_info(transcript_id)
    # only include translated positions: clip each exon to the translation boundaries
    coding_exon_df = exon_df.assign(
        start=exon_df['start'].clip(lower=trans_sr['start']),
        end=exon_df['end'].clip(upper=trans_sr['end']))
    # get_exon_conservation requests (start - 1, end) per exon, so an exon clipped to
    # end == start still covers one base and must be kept; only exons that fall entirely
    # outside the translation (end < start after clipping) are dropped
    is_coding = coding_exon_df['end'] >= coding_exon_df['start']
    coding_exon_df = coding_exon_df[is_coding].reset_index(drop=True)
    conservation_df = get_exon_conservation(coding_exon_df, chrom, genome)
    conservation_df['Transcript Base'] = transcript_id
    # minus-strand targets are numbered from the highest genomic coordinate downwards
    ascending = target_strand != '-'
    conservation_df = (conservation_df
                       .sort_values('genomic position', ascending=ascending)
                       .reset_index(drop=True))
    conservation_df['target position'] = conservation_df.index + 1
    conservation_df['chromosome'] = chrom
    conservation_df['genome'] = genome
    conservation_df['translation length'] = trans_sr['length']
    return conservation_df


In [3]:
# keep only the datasets whose `endogenous` attribute is truthy
data_list = [ds for ds in expanded_dataset_list if ds.endogenous]

# each dataset has load_data() and set_sgrnas() called on it before its designs are collected
design_list = []
for ds in tqdm(data_list):
    ds.load_data()
    ds.set_sgrnas()
    design_list.append(ds.get_designs())

100%|██████████| 11/11 [05:21<00:00, 29.19s/it]


In [4]:
design_df = pd.concat(design_list).drop_duplicates()

# one row per distinct (transcript, strand, total length) combination
target_columns = ['Target Transcript', 'Strand of Target', 'Target Total Length']
transcript_refseq_df = design_df[target_columns].drop_duplicates()

# the transcript base is the part of the transcript ID before the first '.'
transcript_base = transcript_refseq_df['Target Transcript'].str.split('.', expand=True)[0]
transcript_refseq_df['Transcript Base'] = transcript_base

# IDs containing 'MUS' are assigned to mm39, everything else to hg38
transcript_refseq_df['genome'] = transcript_base.map(
    lambda trans: 'mm39' if 'MUS' in trans else 'hg38')
len(transcript_refseq_df)


3011

In [5]:
transcript_conservation_list = []
failed_list = []
failed_reasons = []  # repr of the exception for each entry of failed_list, in the same order
for _, row in tqdm(transcript_refseq_df.iterrows(), total=transcript_refseq_df.shape[0]):
    try:
        transcript_conservation_list.append(get_transcript_conservation(row['Transcript Base'],
                                                                        row['Strand of Target'],
                                                                        row['genome']))
    except Exception as err:
        # Best-effort: a failing transcript is recorded and skipped. Catching Exception
        # rather than using a bare `except:` keeps KeyboardInterrupt working, so this
        # long-running loop can still be stopped from the notebook.
        failed_list.append(row)
        failed_reasons.append(repr(err))

100%|██████████| 3011/3011 [4:00:45<00:00,  4.80s/it]  


In [6]:
# report how many transcripts could not be fetched
print(f'Failed transcripts: {len(failed_list)}')

Failed transcripts: 13


We'll filter out transcripts with a mismatched translation length.

In [7]:
transcript_conservation_df = pd.concat(transcript_conservation_list)
transcript_cons_designs = transcript_conservation_df.merge(
    transcript_refseq_df, how='inner', on=['Transcript Base', 'genome'])

# keep rows whose (translation length + 1) * 3 equals the design's total target length
# NOTE(review): the "+ 1" presumably accounts for the stop codon - confirm
expected_total_length = (transcript_cons_designs['translation length'] + 1)*3
length_matches = expected_total_length == transcript_cons_designs['Target Total Length']
filtered_transcript_conservation = transcript_cons_designs[length_matches].copy()

n_transcripts_before = transcript_conservation_df['Transcript Base'].nunique()
n_transcripts_after = filtered_transcript_conservation['Transcript Base'].nunique()
print(f'Filtered: {n_transcripts_before - n_transcripts_after} transcripts with mismatched length')

Filtered: 20 transcripts with mismatched length


In [8]:
# out_conservation is another name for the filtered table, so the new column lands on it too
out_conservation = filtered_transcript_conservation
# percentile rank of each conservation score within its own transcript
scores_by_transcript = out_conservation.groupby('Transcript Base')['conservation']
out_conservation['ranked_conservation'] = scores_by_transcript.rank(pct=True)
out_conservation['Transcript Base'].nunique()

2978

In [9]:
# write the conservation table to disk without the row index
conservation_path = '../data/interim/conservation.parquet'
out_conservation.to_parquet(conservation_path, index=False)