In [1]:
import numpy as np
import pandas as pd
import altair as alt
from matplotlib import cm, colors

In [2]:
# Load the first column of every tissue sheet into one wide table:
# one column per sheet, NaN-padded where a sheet has fewer entries.
file = '../Tissue-Specific Cit IDs.xlsx'

# Open the workbook once and parse every sheet from that handle instead of
# re-reading the file per sheet; the context manager closes it afterwards.
with pd.ExcelFile(file) as xl:
    sheets = xl.sheet_names[1:]  # the first sheet is the summary page, not needed
    sheet_columns = []
    for sheet in sheets:
        # Only the first column is read; blanks and repeated entries are dropped.
        df = xl.parse(sheet_name=sheet, usecols=[0]).dropna().drop_duplicates()
        df.columns = [sheet]
        sheet_columns.append(df)

# Sheets are split by position: the first five are handled as brain regions,
# the remaining ones as organs.
brain_sheets = sheets[:5]
organ_sheets = sheets[5:]

# Create the master dataframe of all proteins in all tissues with a single
# outer alignment on the original row index. Unlike growing the table behind
# a `proteins.empty` check, this keeps the column of a sheet that has no
# entries, so the column selections below cannot fail on it.
if sheet_columns:
    proteins = pd.concat(sheet_columns, axis=1).sort_index()
else:
    proteins = pd.DataFrame()

brain = proteins[brain_sheets]
organs = proteins[organ_sheets]

In [3]:
# Create a new dataframe with the count of shared proteins between brain
# sections. One row is produced per (source, target) pair:
#   * source != target: proteins present in both sections
#   * source == target: proteins found in the source section only, i.e.
#     absent from every other brain section (the row is still labelled
#     'Overlaps <source>')
#
# Every column of `brain` is first reduced to the set of its non-null
# entries: the columns are NaN-padded to a common length, so their raw
# length says nothing about how many proteins a section contains.
# NOTE(review): set arithmetic matches the previous isin()-based counts as
# long as each column is duplicate-free, which the loading cell ensures via
# drop_duplicates().
brain_sets = {region: set(brain[region].dropna()) for region in brain}

share_rows = []
for source, found in brain_sets.items():
    for target in brain_sets:
        if source == target:
            elsewhere = set().union(
                *(ids for region, ids in brain_sets.items() if region != source)
            )
            sub = found - elsewhere
        else:
            sub = found & brain_sets[target]

        share_rows.append({
            'source_tissue': source,
            'Compared Tissue': 'Overlaps ' + target,
            'overlapping': len(sub),
            # Normalise by the number of proteins actually found in the
            # source section (not by the padded column length); undefined
            # for a section without any protein.
            'value_norm': len(sub) / len(found) if found else float('nan'),
        })

# Build the frame once from the collected records rather than concatenating
# one-row frames inside the loop.
brain_share = pd.DataFrame(
    share_rows,
    columns=['source_tissue', 'Compared Tissue', 'overlapping', 'value_norm'],
)


In [4]:
# Custom colors: one 'coolwarm' shade per compared tissue, listed in order of
# first appearance in the table. The colormap object is sampled directly at
# n evenly spaced points because `cm.get_cmap` is deprecated since
# matplotlib 3.7 and removed in 3.9.
# NOTE(review): sampling the full-resolution map may differ from the former
# n-entry resample by a rounding step in the hex codes.
domain = list(brain_share['Compared Tissue'].unique())
n = len(domain)
my_colors = [colors.to_hex(rgba) for rgba in cm.coolwarm(np.linspace(0, 1, n))]

# Stacked bar chart with the number of overlaps: one bar per source tissue,
# one coloured segment per compared tissue.
bars = alt.Chart(brain_share).mark_bar().encode(
    x=alt.X('overlapping:Q', stack='zero'),
    y=alt.Y('source_tissue:O'),
    color=alt.Color(
        'Compared Tissue:O',
        scale=alt.Scale(domain=domain, range=my_colors),
    ),
    order=alt.Order('Compared Tissue', sort='descending'),
).properties(
    title='Shared Proteins Between Brain Regions',
    width=1000,
    height=150,
)

# Count labels drawn on top of the bars. The same stack and order settings
# as the bar layer are used so each label follows its own segment; the text
# is right-aligned and shifted 10 px to the left of the stacked x position.
text = alt.Chart(brain_share).mark_text(
    align='right', dx=-10, dy=0, size=17, color='black'
).encode(
    x=alt.X(
        'overlapping:Q',
        stack='zero',
        axis=alt.Axis(tickCount=10),
        title='Number of Overlapping Proteins',
    ),
    y=alt.Y('source_tissue:O', title=''),
    detail='Compared Tissue:O',
    text=alt.Text('overlapping:Q'),
    order=alt.Order('Compared Tissue', sort='descending'),
).properties(
    width=1000,
    height=150,
)

# Layer both charts; as the last expression of the cell this is what gets
# displayed.
(bars + text).configure_axis(
    labelFontSize=20
).configure_title(
    fontSize=20,
    anchor='middle'
)

In [5]:
# Create a new dataframe with the count of shared proteins between organ
# tissues. One row is produced per (source, target) pair:
#   * source != target: proteins present in both tissues
#   * source == target: proteins found in the source tissue only, i.e.
#     absent from every other organ tissue (the row is still labelled
#     'Overlaps <source>')
#
# Every column of `organs` is first reduced to the set of its non-null
# entries: the columns are NaN-padded to a common length, so their raw
# length says nothing about how many proteins a tissue contains.
# NOTE(review): set arithmetic matches the previous isin()-based counts as
# long as each column is duplicate-free, which the loading cell ensures via
# drop_duplicates().
organ_sets = {tissue: set(organs[tissue].dropna()) for tissue in organs}

organ_rows = []
for source, found in organ_sets.items():
    for target in organ_sets:
        if source == target:
            elsewhere = set().union(
                *(ids for tissue, ids in organ_sets.items() if tissue != source)
            )
            sub = found - elsewhere
        else:
            sub = found & organ_sets[target]

        organ_rows.append({
            'source_tissue': source,
            'Compared Tissue': 'Overlaps ' + target,
            'overlapping': len(sub),
            # Normalise by the number of proteins actually found in the
            # source tissue (not by the padded column length); undefined
            # for a tissue without any protein.
            'value_norm': len(sub) / len(found) if found else float('nan'),
        })

# Build the frame once from the collected records rather than concatenating
# one-row frames inside the loop.
organ_share = pd.DataFrame(
    organ_rows,
    columns=['source_tissue', 'Compared Tissue', 'overlapping', 'value_norm'],
)


In [6]:
# Custom colors: one 'coolwarm' shade per compared tissue, listed in order of
# first appearance in the table. The colormap object is sampled directly at
# n evenly spaced points because `cm.get_cmap` is deprecated since
# matplotlib 3.7 and removed in 3.9.
# NOTE(review): sampling the full-resolution map may differ from the former
# n-entry resample by a rounding step in the hex codes.
domain = list(organ_share['Compared Tissue'].unique())
n = len(domain)
my_colors = [colors.to_hex(rgba) for rgba in cm.coolwarm(np.linspace(0, 1, n))]

# Stacked bar chart with the number of overlaps in organ tissue: one bar per
# source tissue, one coloured segment per compared tissue.
bars = alt.Chart(organ_share).mark_bar().encode(
    x=alt.X('overlapping:Q', stack='zero'),
    y=alt.Y('source_tissue:O'),
    color=alt.Color(
        'Compared Tissue:O',
        scale=alt.Scale(domain=domain, range=my_colors),
    ),
    order=alt.Order('Compared Tissue', sort='descending'),
).properties(
    title='Shared Proteins Between Organ Tissue',
    width=1000,
    height=150,
)

# Count labels drawn on top of the bars. The same stack and order settings
# as the bar layer are used so each label follows its own segment; the text
# is right-aligned and shifted 10 px to the left of the stacked x position.
text = alt.Chart(organ_share).mark_text(
    align='right', dx=-10, dy=0, size=17, color='black'
).encode(
    x=alt.X(
        'overlapping:Q',
        stack='zero',
        axis=alt.Axis(tickCount=10),
        title='Number of Overlapping Proteins',
    ),
    y=alt.Y('source_tissue:O', title=''),
    detail='Compared Tissue:O',
    text=alt.Text('overlapping:Q'),
    order=alt.Order('Compared Tissue', sort='descending'),
).properties(
    width=1000,
    height=150,
)

# Layer both charts and write the result to an SVG file in the working
# directory.
(bars + text).configure_axis(
    labelFontSize=20
).configure_title(
    fontSize=20,
    anchor='middle'
).save('Protein_Organs.svg')

# colors later edited in vector editing software