# Processing Exonic/Intronic ratios

In [None]:
import pandas as pd
import numpy as np
from plotnine import *

In [None]:
# Directory (relative to this notebook) holding every input and output file.
data_path = "../../data/"

GTEx samples are annotated at two levels: major tissues and subtissues. A subtissue is a more detailed description of the tissue a sample was taken from, and each subtissue is a subdivision of one of the major tissues.

## Subtissues

For each tissue we first keep only the genes that are expressed in it, and afterwards select the major transcript of each of those genes.

In [None]:
# Read the masked exon/intron (EI) ratio table from the data directory.
ei_ratios_df = pd.read_csv(filepath_or_buffer=data_path + 'EI-ratios-masked.csv')

In [None]:
# Display the EI ratio table exactly as read from the CSV, before any reshaping.
ei_ratios_df

In [None]:
# Rename the gene identifier column to 'gene_id' and use it as the row index.
# The remaining columns are matched against sample ids after the transpose below.
ei_ratios_df = (
    ei_ratios_df
    .rename(columns={'gene.id': 'gene_id'})
    .set_index('gene_id')
)
ei_ratios_df

Let's load the GTEx sample annotations:

In [None]:
# Read the public GTEx sample metadata; the first CSV column becomes the row index.
gtex_annotation = pd.read_csv(
    filepath_or_buffer=data_path + 'gtex_sample_metadata_public.csv',
    index_col=0,
)
gtex_annotation

In [None]:
# Transpose so that samples become rows, then attach each sample's subtissue
# label (SMTSD). The merge uses the pandas default (inner), so samples without
# a matching 'sample.id' in the annotation are dropped.
ei_ratios_df = pd.merge(
    ei_ratios_df.transpose(),
    gtex_annotation[['sample.id', 'SMTSD']],
    left_index=True,
    right_on='sample.id',
)
ei_ratios_df

Reading major isoforms table:

In [None]:
# Read the major-isoform table and index it by gene; its remaining columns are
# melted into a 'subtissue' variable further down.
major_iso_df = (
    pd.read_csv(filepath_or_buffer=data_path + 'gtex_major_isoform_per_subtissue.csv')
    .set_index('gene_id')
)
major_iso_df

In [None]:
# Wide -> long: one row per (sample, gene) pair carrying its EI ratio, with the
# sample id and subtissue label kept as identifier columns.
samples_ei_df_melted = pd.melt(
    ei_ratios_df,
    id_vars=['sample.id', 'SMTSD'],
    var_name='gene_id',
    value_name='ei_ratio',
)
samples_ei_df_melted

Let's melt the major isoforms table so we can merge it with the samples

In [None]:
# Wide -> long: one row per (gene, subtissue) pair with its major isoform.
# Pairs without a major isoform are removed.
melted_major_isoforms = (
    major_iso_df
    .reset_index()
    .melt(id_vars='gene_id', var_name='subtissue', value_name='major_isoform')
    .dropna()
)
melted_major_isoforms

In [None]:
# Attach to every (sample, gene) row the major isoform of that gene in the
# sample's subtissue. A left merge keeps rows without a match; their
# 'major_isoform' is left missing.
samples_mi_ei_df = pd.merge(
    samples_ei_df_melted,
    melted_major_isoforms,
    how='left',
    left_on=['gene_id', 'SMTSD'],
    right_on=['gene_id', 'subtissue'],
)
samples_mi_ei_df

Drop NAs (which correspond to non-expressed genes)

In [None]:
# Keep only complete rows: a row is removed when any of its columns is missing,
# which covers a missing major isoform as well as a missing EI ratio.
samples_mi_ei_df = samples_mi_ei_df.dropna(axis=0, how='any')
samples_mi_ei_df

In [None]:
# Long -> wide: one row per major isoform, one column per sample, EI ratios as values.
# Note: this rebinds `major_iso_df`, which held the major-isoform lookup table until here.
major_iso_df = samples_mi_ei_df.pivot(
    index='major_isoform',
    columns='sample.id',
    values='ei_ratio',
)
major_iso_df

In [None]:
# Minimum number of samples with a non-NA EI ratio that a transcript needs in
# order to be kept: two thirds of the sample columns, truncated to an integer.
thresh = int((2/3) * major_iso_df.shape[1])
thresh

In [None]:
# Keep the transcripts (rows) that have at least `thresh` non-NA values.
ei_iso_df_non_centered_t = major_iso_df.dropna(axis=0, thresh=thresh)
ei_iso_df_non_centered_t

In [None]:
# Put the subtissue label next to each sample's EI ratios: samples are rows,
# 'SMTSD' is the first column and the retained transcripts are the others.
# The inner join keeps only samples present on both sides.
ei_subtissue_df = (
    gtex_annotation[['sample.id', 'SMTSD']]
    .set_index('sample.id')
    .join(ei_iso_df_non_centered_t.transpose(), how='inner')
)
ei_subtissue_df

In order to get one exon/intron ratio value per subtissue, the median EI ratio of each major isoform is taken over the samples of that subtissue.

In [None]:
# Collapse the samples of each subtissue into their median, then transpose so
# that transcripts are rows and subtissues are columns.
ei_subtissue_df = (
    ei_subtissue_df
    .groupby('SMTSD')
    .median()
    .transpose()
)
ei_subtissue_df

In [None]:
# Minimum number of subtissues with a non-NA median that a transcript needs in
# order to be kept: 85% of the subtissue columns, truncated to an integer.
thresh = int(0.85 * ei_subtissue_df.shape[1])
thresh

In [None]:
# Keep the transcripts (rows) that have at least `thresh` non-NA subtissue medians.
ei_subtissue_df = ei_subtissue_df.dropna(axis=0, thresh=thresh)
ei_subtissue_df

In [None]:
# Center every transcript: subtract the mean of its row (taken over the
# subtissue columns) from each value in that row.
ei_subtissue_centered_df = ei_subtissue_df.sub(
    ei_subtissue_df.mean(axis='columns'),
    axis='index',
)
ei_subtissue_centered_df

In [None]:
# Write the centered per-subtissue EI ratios to the data directory.
ei_subtissue_centered_df.to_csv(
    path_or_buf=data_path + 'gtex_ei_ratio_subtissues_85_percent_non_nas.csv'
)

## Major tissues

In [None]:
# Reload the masked EI ratio table: `ei_ratios_df` was reshaped and merged in
# the subtissue section, so the major-tissue pipeline starts from the file again.
ei_ratios_df = pd.read_csv(filepath_or_buffer=data_path + 'EI-ratios-masked.csv')

In [None]:
# Display the freshly reloaded EI ratio table, before any reshaping.
ei_ratios_df

In [None]:
# Rename the gene identifier column to 'gene_id' and use it as the row index.
# The remaining columns are matched against sample ids after the transpose below.
ei_ratios_df = (
    ei_ratios_df
    .rename(columns={'gene.id': 'gene_id'})
    .set_index('gene_id')
)
ei_ratios_df

In [None]:
# Transpose so that samples become rows, then attach each sample's major-tissue
# label (SMTS) from the annotation table loaded in the subtissue section.
# The merge uses the pandas default (inner), so samples without a matching
# 'sample.id' in the annotation are dropped.
ei_ratios_df = pd.merge(
    ei_ratios_df.transpose(),
    gtex_annotation[['sample.id', 'SMTS']],
    left_index=True,
    right_on='sample.id',
)
ei_ratios_df

Reading major isoforms table:

In [None]:
# Read the per-major-tissue major-isoform table and index it by gene; its
# remaining columns are melted into a 'tissue' variable further down.
major_iso_df = (
    pd.read_csv(filepath_or_buffer=data_path + 'gtex_major_isoform_per_major_tissue.csv')
    .set_index('gene_id')
)
major_iso_df

In [None]:
# Wide -> long: one row per (sample, gene) pair carrying its EI ratio, with the
# sample id and major-tissue label kept as identifier columns.
samples_ei_df_melted = pd.melt(
    ei_ratios_df,
    id_vars=['sample.id', 'SMTS'],
    var_name='gene_id',
    value_name='ei_ratio',
)
samples_ei_df_melted

Let's melt the major isoforms table so we can merge it with the samples

In [None]:
# Wide -> long: one row per (gene, tissue) pair with its major isoform.
# Pairs without a major isoform are removed.
melted_major_isoforms = (
    major_iso_df
    .reset_index()
    .melt(id_vars='gene_id', var_name='tissue', value_name='major_isoform')
    .dropna()
)
melted_major_isoforms

In [None]:
# Attach to every (sample, gene) row the major isoform of that gene in the
# sample's major tissue. A left merge keeps rows without a match; their
# 'major_isoform' is left missing.
samples_mi_ei_df = pd.merge(
    samples_ei_df_melted,
    melted_major_isoforms,
    how='left',
    left_on=['gene_id', 'SMTS'],
    right_on=['gene_id', 'tissue'],
)
samples_mi_ei_df

Drop NAs (which correspond to non-expressed genes)

In [None]:
# Keep only complete rows: a row is removed when any of its columns is missing,
# which covers a missing major isoform as well as a missing EI ratio.
samples_mi_ei_df = samples_mi_ei_df.dropna(axis=0, how='any')
samples_mi_ei_df

In [None]:
# Long -> wide: one row per major isoform, one column per sample, EI ratios as values.
# Note: this rebinds `major_iso_df`, which held the major-isoform lookup table until here.
major_iso_df = samples_mi_ei_df.pivot(
    index='major_isoform',
    columns='sample.id',
    values='ei_ratio',
)
major_iso_df

In [None]:
# Minimum number of samples with a non-NA EI ratio that a transcript needs in
# order to be kept: two thirds of the sample columns, truncated to an integer.
thresh = int((2/3) * major_iso_df.shape[1])
thresh

In [None]:
# Keep the transcripts (rows) that have at least `thresh` non-NA values.
ei_iso_df_non_centered_t = major_iso_df.dropna(axis=0, thresh=thresh)
ei_iso_df_non_centered_t

In [None]:
# Put the major-tissue label next to each sample's EI ratios: samples are rows,
# 'SMTS' is the first column and the retained transcripts are the others.
# The inner join keeps only samples present on both sides.
ei_tissue_df = (
    gtex_annotation[['sample.id', 'SMTS']]
    .set_index('sample.id')
    .join(ei_iso_df_non_centered_t.transpose(), how='inner')
)
ei_tissue_df

In order to get one exon/intron ratio value per tissue, the median EI ratio of each major isoform is taken over the samples of that tissue.

In [None]:
# Collapse the samples of each major tissue into their median, then transpose
# so that transcripts are rows and tissues are columns.
ei_tissue_df = (
    ei_tissue_df
    .groupby('SMTS')
    .median()
    .transpose()
)
ei_tissue_df

In [None]:
# Minimum number of tissues with a non-NA median that a transcript needs in
# order to be kept: 85% of the tissue columns, truncated to an integer.
thresh = int(0.85 * ei_tissue_df.shape[1])
thresh

In [None]:
# Keep the transcripts (rows) that have at least `thresh` non-NA tissue medians.
ei_tissue_df = ei_tissue_df.dropna(axis=0, thresh=thresh)
ei_tissue_df

In [None]:
# Center every transcript: subtract the mean of its row (taken over the tissue
# columns) from each value in that row.
ei_tissue_centered_df = ei_tissue_df.sub(
    ei_tissue_df.mean(axis='columns'),
    axis='index',
)
ei_tissue_centered_df

In [None]:
# Write the centered per-major-tissue EI ratios to the data directory.
ei_tissue_centered_df.to_csv(
    path_or_buf=data_path + 'gtex_ei_ratio_major_tissues_85_percent_non_nas.csv'
)