# Exploring Data from Favate et al. 2021

In [7]:
import numpy as np 
import pandas as pd 
import diaux.viz 
import altair as alt 
# Call the project's Altair style helper and keep the two values it returns
# (presumably a color lookup and a palette -- TODO confirm against diaux.viz).
colors, palette = diaux.viz.altair_style()

In [17]:
# Read the tidied Favate et al. 2021 table into a DataFrame.
favate_reads_path = '../../data/Favate2021/processed/Favate2021_reads_tidy.csv'
data = pd.read_csv(favate_reads_path)

# Custom grouping of COG letters into four broad sectors
# (sector name -> list of COG letters belonging to it).
cog_hier = {
    'Information storage and processing': ['J', 'A', 'K', 'L', 'B'],
    'Cellular processes and signaling': [
        'D', 'Y', 'V', 'T', 'M', 'N', 'Z', 'W', 'U', 'O',
    ],
    'Metabolism': ['C', 'G', 'E', 'F', 'H', 'I', 'P', 'Q'],
    'Poorly Characterized/Not Assigned': ['R', 'S', 'Not Assigned'],
}

# Inverted lookup: COG letter -> sector name.
cog_hier_rev = {
    cog_letter: sector
    for sector, cog_letters in cog_hier.items()
    for cog_letter in cog_letters
}

# Label every row with its COG sector in one vectorized lookup instead of one
# boolean scan per letter. Letters that are not keys of cog_hier_rev (and
# missing letters) end up as NaN in 'cog_sector'.
data['cog_sector'] = data['cog_letter'].map(cog_hier_rev)

# For each (line, replicate) sample, compute the fraction of the sample's
# total est_counts that falls in each COG sector.
#
# NOTE(review): groupby drops rows whose 'cog_sector' is NaN (pandas default
# dropna=True), but those rows are still included in total_rnas below, so the
# fractions of a sample sum to less than 1 if any such rows exist -- confirm
# this is the intended normalization.
sample_keys = ['line', 'replicate']
grouped = []
for _, d in data.groupby(sample_keys):
    # Denominator: all est_counts in this sample, regardless of sector.
    total_rnas = d['est_counts'].sum()

    # numeric_only=True keeps the sum restricted to numeric columns; without
    # it, pandas >= 2.0 tries to "sum" object columns as well (concatenating
    # strings or raising TypeError).
    _grouped = (
        d.groupby(sample_keys + ['cog_sector'])
        .sum(numeric_only=True)
        .reset_index()
    )
    _grouped['fraction'] = _grouped['est_counts'].values / total_rnas
    grouped.append(_grouped)

# Stack the per-sample tables into one long-form DataFrame.
agg = pd.concat(grouped, sort=False)


In [19]:
# Bar chart of the RNA fraction per COG class for each evolved line,
# restricted to the rows where replicate == 1.
first_replicate = agg[agg['replicate'] == 1]

x_encoding = alt.X(field='fraction', type='quantitative',
                   title='RNA transcript fraction')
y_encoding = alt.Y(field='line', type='nominal', title='evolved line')
color_encoding = alt.Color(field='cog_sector', type='nominal',
                           title='COG class')

chart = alt.Chart(first_replicate).mark_bar().encode(
    x=x_encoding,
    y=y_encoding,
    color=color_encoding,
)
chart