In [None]:
import os
import sys

import anndata
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scanpy as sc
import seaborn as sns

# sys.path.append('/home/yike/phd/function/')

# Column layout of every per-pair interaction table stored in `res` / `res2`.
PAIR_COLUMNS = [
    'dataset', 'Condition',
    'cell_type1', 'cell_type2',
    'gene_name_a', 'gene_name_b',
    'frac1', 'frac2',
    'avg1', 'avg2',
    'fra_sum', 'av_sum',
]

# The same columns with the two interaction partners exchanged; selecting in
# this order and relabelling as PAIR_COLUMNS flips the orientation of a table.
SWAPPED_COLUMNS = [
    'dataset', 'Condition',
    'cell_type2', 'cell_type1',
    'gene_name_b', 'gene_name_a',
    'frac2', 'frac1',
    'avg2', 'avg1',
    'fra_sum', 'av_sum',
]


def find_expressed_pairs(stats, key, threshold, ga, gb, col, col2):
    """Tabulate the interactions whose two partners both pass a threshold.

    Parameters
    ----------
    stats : dict
        Must hold the tables 'frac_exp' and 'avg_exp'; both are looked up by
        gene name (rows) and by a (dataset, Condition, cell_type) key (columns).
    key : str
        Which entry of ``stats`` the threshold is applied to.
    threshold : float
        Both partners must be strictly above this value.
    ga, gb : sequence of gene names
        Partner genes of each interaction, position by position.
    col, col2 : tuple
        (dataset, Condition, cell_type) keys of the population in which
        partner ``a`` and partner ``b`` are measured, respectively.

    Returns
    -------
    pandas.DataFrame
        One row per passing interaction with the columns of ``PAIR_COLUMNS``.
        The columns are present even when no interaction passes, so callers
        can always select them.
    """
    datas, cond, cell_type1 = col
    cell_type2 = col2[2]

    fra = stats['frac_exp'].loc[ga, col].values
    frb = stats['frac_exp'].loc[gb, col2].values
    avga = stats['avg_exp'].loc[ga, col].values
    avgb = stats['avg_exp'].loc[gb, col2].values

    passed = (
        (stats[key].loc[ga, col].values > threshold)
        & (stats[key].loc[gb, col2].values > threshold)
    )

    # Built in one shot from masked arrays: scalars are broadcast over the
    # explicit index, and an all-False mask still yields every column.
    table = pd.DataFrame(
        {
            'dataset': datas,
            'Condition': cond,
            'cell_type1': cell_type1,
            'cell_type2': cell_type2,
            'gene_name_a': np.asarray(ga)[passed],
            'gene_name_b': np.asarray(gb)[passed],
            'frac1': fra[passed],
            'frac2': frb[passed],
            'avg1': avga[passed],
            'avg2': avgb[passed],
        },
        index=pd.RangeIndex(int(passed.sum())),
    )
    table['fra_sum'] = table['frac1'] + table['frac2']
    table['av_sum'] = table['avg1'] + table['avg2']
    return table[PAIR_COLUMNS]


def swap_partner_columns(table):
    """Return a copy of ``table`` with partner 1 and partner 2 exchanged.

    The input is left untouched.
    """
    swapped = table[SWAPPED_COLUMNS].copy()
    swapped.columns = PAIR_COLUMNS
    return swapped


def merge_pair_tables(forward, reverse):
    """Outer-merge ``forward`` with the partner-swapped ``reverse`` table.

    Rows are matched on every column, and the result is sorted by decreasing
    'av_sum'. Neither input is modified.
    """
    merged = pd.merge(
        forward,
        swap_partner_columns(reverse),
        on=PAIR_COLUMNS,
        how='outer',
    )
    return merged.sort_values('av_sum', ascending=False)


if __name__ == '__main__':

    print('Load interaction')
    fn_int = '/home/yike/phd/dengue/data/interaction_unpacked_mouse.tsv'
    interactions = pd.read_csv(fn_int, sep='\t')[['gene_name_a', 'gene_name_b']]
    ga, gb = interactions['gene_name_a'], interactions['gene_name_b']

    print('Load high-quality cells only')
    fn_h5ad = '/home/yike/phd/dengue/data/mergedata_20200930_high_quality.h5ad'
    adata = anndata.read_h5ad(fn_h5ad)
    adata.obs['dataset'] = adata.obs['platform'].replace({
        '10X': 'child',
        'plate': 'adult'
    })
    # Normalize data to CPM (counts per million)
    sc.pp.normalize_total(adata, target_sum=1e6)

    print('Restrict to interaction genes')
    genes = np.unique(interactions)
    # Explicit copy: a new obs column is added below, and assigning into a
    # view of `adata` would otherwise trigger an implicit copy with a warning.
    adatag = adata[:, genes].copy()

    print('Split by cell type, adult and child, and condition')
    split_cols = ['dataset', 'Condition', 'cell_type']
    obs = adatag.obs
    # Every part is cast to str so the concatenation also works when a column
    # is categorical.
    adatag.obs['split_col'] = (
        obs['dataset'].astype(str)
        + '+' + obs['Condition'].astype(str)
        + '+' + obs['cell_type'].astype(str)
    )

    # NOTE(review): `expressing_fractions` and `average` are not defined or
    # imported in the visible part of this notebook; presumably they come from
    # the project's anndata_utils helpers (see the commented-out import this
    # script used to carry) - confirm they are in scope before running.
    fracd = expressing_fractions(adatag, list(split_cols))
    avgd = average(adatag, list(split_cols), log=False)
    stats = {
        'frac_exp': fracd,
        'avg_exp': avgd,
    }

    # Flexible criterion. The threshold is either a single number or a dict
    # with one value per dataset.
    # Alternative: criterion = {'key': 'frac_exp', 'threshold': 0.1}
    criterion = {'key': 'avg_exp', 'threshold': {'child': 60, 'adult': 35}}
    th = criterion['threshold']

    # One table per (dataset, condition, cell_type1, cell_type2): partner `a`
    # is measured in cell_type1 and partner `b` in cell_type2.
    cell_types = list(obs['cell_type'].cat.categories)
    res = {}
    for col in fracd.columns:
        datas, cond, cell_type1 = col
        th1 = th[datas] if isinstance(th, dict) else th
        for cell_type2 in cell_types:
            col2 = (datas, cond, cell_type2)
            res[(datas, cond, cell_type1, cell_type2)] = find_expressed_pairs(
                stats, criterion['key'], th1, ga, gb, col, col2,
            )

    # Combine dataframe of T_cell, B_cell and dataframe of B_cell, T_cell.
    # Only one ordering of each cell-type pair (including same-type pairs) is
    # kept as a key of `res2`.
    cell_types = ['B_cells', 'NK_cells', 'T_cells', 'Monocytes', 'Plasmablasts', 'pDCs', 'cDCs']
    merge_res_list = [
        (datas, cond, cell_type1, cell_type2)
        for datas in ['child', 'adult']
        for cond in ['S_dengue', 'dengue', 'Healthy', 'DWS']
        for i, cell_type1 in enumerate(cell_types)
        for cell_type2 in cell_types[: i + 1]
    ]
    merge_keys = set(merge_res_list)  # constant-time membership test

    res2 = {}
    for key, value in res.items():
        if key not in merge_keys:
            continue
        datas, cond, cell_type1, cell_type2 = key
        key2 = (datas, cond, cell_type2, cell_type1)
        # For a same-cell-type pair both orientations describe the same
        # population, so the swapped table is merged with itself; this keeps
        # the content of `res2` as it was. `res` itself is no longer
        # overwritten, so this loop can be re-run safely.
        forward = swap_partner_columns(value) if key2 == key else value
        res2[key] = merge_pair_tables(forward, res[key2])

    #for key in res2.keys(): 
    #    (datas, cond, cell_type1, cell_type2) = key
    #    res2[key].to_excel('/home/yike/phd/dengue/data/excels/average_expressions/' 
    #                     + datas + '_' + cond + '_' + cell_type1 + '_' + cell_type2 + '.xls', index=False)

In [6]:
from interaction_functions import interaction_analysis_same