# Init

In [4]:
import pandas as pd
import seaborn as sns
import numpy as np
import scipy.stats as stats
%matplotlib inline
import os
from matplotlib import pyplot as plt
import matplotlib as mpl
import plotly
import plotly.express as px





savefig_args = {"dpi": 500, "bbox_inches": "tight", "pad_inches": 0.05}
mpl.rc('savefig', dpi=500)
output_dir = "../../figures/tcr_clonality/"
if not os.path.exists(output_dir):
            os.makedirs(output_dir)

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)


def tissue_colors():
    
    tissue_color_dict = {'Bladder': '#e7969c',
             'Blood': '#d6616b',
             'Bone_Marrow': '#cedb9c',
             'Eye': '#c7ea46',#"#00ff7f",
             'Fat': '#e7cb94',
             'Heart': '#ff0800',
             'Kidney': '#7b4173',
             'Large_Intestine': '#31a354',
             'Liver': '#000080',
             'Lung': '#3182bd',
             'Lymph_Node': '#8c6d31',
             'Mammary':'#ce6dbd',
             'Muscle': '#e7ba52',
             'Pancreas': '#fd8d3c',
             'Prostate':'#637939',#'#a55194',#
             'Salivary_Gland':'#622a0f',
             'Skin': '#de9ed6',
             'Small_Intestine': '#6baed6',
             'Spleen': '#393b79',
             'Thymus': '#9c9ede',
             'Tongue':'#b5cf6b',
             'Trachea': '#969696',
             'Uterus':'#c64b8c',#'#ff0090',
             'Vasculature': '#843c39'}
    
    return tissue_color_dict

def donor_colors():
    donors = ['TSP1','TSP2','TSP3','TSP4','TSP5','TSP6','TSP7','TSP8','TSP9','TSP10','TSP11','TSP12','TSP13','TSP14','TSP15']
    
    import matplotlib.colors as pltcolors
    
    cmap = plt.cm.get_cmap("YlGnBu")
        
    donor_color_dict = {}
    j=1/len(donors)
    for d in donors:
        donor_color_dict[d] = pltcolors.to_hex(cmap(j))
        j+=1/len(donors)
        
    return donor_color_dict


def donor_colors():
    donor_color_dict = {'TSP6': '#034001', 'TSP7': '#8FBC8F', 'TSP4': '#c5ba30', 'TSP5': '#DC143C', 
                    'TSP10': '#FFD700', 'TSP8': '#8839ff', 'TSP3': '#c34a17', 'TSP11': '#00edff', 
                    'TSP12': '#f507a0', 'TSP9': '#9CADCA', 'TSP14': '#27BFD2', 'TSP15': '#bc7c00', 
                    'TSP1': '#4169E1', 'TSP2': '#636c8c', 'TSP13': '#ADD8E6'}
    return donor_color_dict


In [5]:
df = pd.read_table('../../data/processed_data/merged_airr_scirpy.tsv')


df = df[df.receptor_type == 'TCR']
df['node_color'] = df.donor.map(donor_colors())

## Rename Clones from numbers to letters

In [6]:
# this is to name them alphabetically bc there are currently too many
selector = df.clone_id.value_counts()
selector = selector[selector > 8].index
df = df[df.clone_id.isin(selector)]

import string
def listAlphabet():
  return list(string.ascii_lowercase)

rename_clones = dict(zip(df.clone_id.unique(), listAlphabet()))

df.loc[:,'clone_id'] = df.clone_id.map(rename_clones)

## Sankey Code

In [7]:

import plotly
import plotly.graph_objects as go
def genSankey(df,cat_cols=[],value_cols='',title='Sankey Diagram'):
    # maximum of 6 value cols -> 6 colors
    colorPalette = ['#4B8BBE','#306998','#FFE873','#FFD43B','#646464']
    labelList = []
    colorNumList = []
    for catCol in cat_cols:
        labelListTemp =  list(set(df[catCol].values))
        colorNumList.append(len(labelListTemp))
        labelList = labelList + labelListTemp
        
    # remove duplicates from labelList
    labelList = list(dict.fromkeys(labelList))
    
    # define colors based on number of levels
    colorList = []
    for idx, colorNum in enumerate(colorNumList):
        print(idx, colorNum)
        colorList = colorList + [colorPalette[idx]]*colorNum
        print(colorList)
    # transform df into a source-target pair
    for i in range(len(cat_cols)-1):
        if i==0:
            sourceTargetDf = df[[cat_cols[i],cat_cols[i+1],value_cols]]
            sourceTargetDf.columns = ['source','target','count']
        else:
            tempDf = df[[cat_cols[i],cat_cols[i+1],value_cols]]
            tempDf.columns = ['source','target','count']
            sourceTargetDf = pd.concat([sourceTargetDf,tempDf])
        sourceTargetDf = sourceTargetDf.groupby(['source','target']).agg({'count':'sum'}).reset_index()
        
    # add index for source-target pair
    sourceTargetDf['sourceID'] = sourceTargetDf['source'].apply(lambda x: labelList.index(x))
    sourceTargetDf['targetID'] = sourceTargetDf['target'].apply(lambda x: labelList.index(x))
    
    # creating the sankey diagram
    data = dict(
        type='sankey',
        node = dict(
          pad = 15,
          thickness = 20,
          line = dict(
            color = "black",
            width = 0.5
          ),
          label = labelList,
          color = colorList
        ),
        link = dict(
          source = sourceTargetDf['sourceID'],
          target = sourceTargetDf['targetID'],
          value = sourceTargetDf['count']
        )
      )
    
    layout =  dict(
        title = title,
        font = dict(
          size = 10
        )
    )
       
    fig = dict(data=[data], layout=layout)
    return fig

In [8]:
### format data for sankey plot

In [9]:
selector = df.clone_id.value_counts() > 8

clone_ids = selector[selector == True].index

df_clone_ids = df[df.clone_id.isin(clone_ids)]
#df_clone_ids = df_clone_ids[~df_clone_ids.tissue.isin(['Fat', 'Heart', 'Skin'])]

forSankey = df_clone_ids.groupby(['clone_id', 'cell_ontology_class']).tissue.value_counts()

forSankey = pd.DataFrame(forSankey)

forSankey.columns = ['counts']

forSankey.reset_index(inplace = True)

forSankey.tissue = forSankey.tissue.str.replace('BM', 'Bone_Marrow')

forSankey.tissue = forSankey.tissue.str.replace('LymphNode', 'Lymph_Node')

fig = genSankey(forSankey, cat_cols=['tissue', 'clone_id', 'cell_ontology_class'],
                value_cols='counts', 
                title='')

0 9
['#4B8BBE', '#4B8BBE', '#4B8BBE', '#4B8BBE', '#4B8BBE', '#4B8BBE', '#4B8BBE', '#4B8BBE', '#4B8BBE']
1 17
['#4B8BBE', '#4B8BBE', '#4B8BBE', '#4B8BBE', '#4B8BBE', '#4B8BBE', '#4B8BBE', '#4B8BBE', '#4B8BBE', '#306998', '#306998', '#306998', '#306998', '#306998', '#306998', '#306998', '#306998', '#306998', '#306998', '#306998', '#306998', '#306998', '#306998', '#306998', '#306998', '#306998']
2 11
['#4B8BBE', '#4B8BBE', '#4B8BBE', '#4B8BBE', '#4B8BBE', '#4B8BBE', '#4B8BBE', '#4B8BBE', '#4B8BBE', '#306998', '#306998', '#306998', '#306998', '#306998', '#306998', '#306998', '#306998', '#306998', '#306998', '#306998', '#306998', '#306998', '#306998', '#306998', '#306998', '#306998', '#FFE873', '#FFE873', '#FFE873', '#FFE873', '#FFE873', '#FFE873', '#FFE873', '#FFE873', '#FFE873', '#FFE873', '#FFE873']


In [10]:
cell_type_colors = dict(zip(list(df_clone_ids.cell_ontology_class.unique()), px.colors.qualitative.Dark24[:]))
# Increase the font size
fig['layout']['font']['size'] = 30
# Update colors to be the TS colors
_num_tissues = df_clone_ids.tissue.unique().shape[0]
_num_clone_ids = df_clone_ids.clone_id.unique().shape[0]
# Breaks if there are too many clone_ids involved
_dict = dict(zip(list(df_clone_ids.clone_id.unique()), px.colors.qualitative.Alphabet[:]))

_dict.update(tissue_colors())
_dict.update(cell_type_colors)

# create color list according to color dictionary
old_dict = dict(zip(fig['data'][0]['node']['label'], fig['data'][0]['node']['color']))


new_dict = {key: _dict[key] for key, value in old_dict.items()}

fig['data'][0]['node']['color'] = list(new_dict.values())

In [11]:
plotly.offline.plot(fig, image_height=500, image_width=500, validate=False, image='svg', image_filename="T_Cell_Sankey")

'temp-plot.html'