In [48]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm
import seaborn as sns
from bs4 import BeautifulSoup
import requests
import time
from tqdm import tqdm_notebook

In [49]:
sns.set(context="notebook", style="ticks", font="Helvetica")

In [50]:
overlaps = pd.read_csv("overlap_family_lower_raw.csv", index_col=[0])

In [51]:
datasets = [
    "Gaublomme_GSE75109_TPM_clean",
    "Gaublomme_GSE75110_TPM_clean",
    "Gaublomme_GSE75111_TPM_clean",
    "kakadarov_tpm",
    "somatosensory_converted_into_tpm",
    "Dopaminergic_TPM_clean",
    "Rbp4_positive_cells",
    "Cheng_ES_TPM",
    "Alveolar_cells_Type_II_Merged_Batches",
    "Alveolar_cells_Type_I_Merged_Batches",
    "klein",
    "hepat_TPM_yang_clean",
    "Yu_First_wave_endocrine_cells",
]

overlaps = overlaps.loc[datasets, datasets]

In [52]:
families = "clean_panther4march"

dfs = {x: pd.read_csv("{}/{}/results/family_IC.csv".format(x, families), index_col="family_id") for x in datasets}

In [53]:
threshold_dfs = {x: pd.read_csv("{}/{}/intermediate/family_thresholds.csv".format(x, families), index_col="family_id") for x in datasets}

In [54]:
raw_dfs = {x: pd.read_csv("{}.csv".format(x), index_col="gene_id") for x in datasets}

In [55]:
family_df = pd.read_csv("{}.csv".format(families))

In [56]:
panther_names = {}
def get_panther_name(pantherid):
    if pantherid in panther_names.keys():
        return panther_names[pantherid];
    else: 
        r = requests.get("http://pantherdb.org/panther/family.do?clsAccession={}".format(pantherid))
        soup = BeautifulSoup(r.text, 'html.parser')
        family_name = soup.find(id="mainBody").table.tr.find("td",class_="mainText").text.strip()
        panther_names[pantherid] = family_name
        time.sleep(1)
        return family_name

In [57]:
family_map = {}

for i, i_ds in enumerate(overlaps.index):
    for j, j_ds in zip(range(i+1,len(overlaps.columns)),overlaps.columns[i+1:]):
        x = overlaps.loc[i_ds, j_ds]
        if x is np.nan:
            continue
        for family in x.split(","):
            if family in family_map:
                family_map[family] += [i_ds, j_ds]
            else:
                family_map[family] = [i_ds, j_ds]

In [58]:
for k in family_map:
    family_map[k] = set(family_map[k])

In [60]:
import sys
sys.path.insert(0,'./bin')
from pipeline_utils.ic_utils import bootstrap_IC
def plot_heatmaps(k, k_name, v):
#     fig, axs = plt.subplots(2,len(v), figsize=(5*len(v),10))
#     fig_hist, axs_hist = plt.subplots(1,len(v), figsize=(5*len(v),5))
    summary_rows = []
    for i, dataset in enumerate(v):
        row = {
            "family_name": k_name,
            "dataset_name": dataset,
            "unit": "UMI" if ("Alveolar" in dataset or "klein" in dataset) else "TPM"
        }
        df = dfs[dataset]
        thresholds = threshold_dfs[dataset]
        raw_df = raw_dfs[dataset]
        x = raw_df.loc[family_df[family_df.family_id == k].gene_symbol].dropna().copy()#.T.describe()
        _raw_df = x.copy()
        x[x == 0] = 0.000001
#         sns.heatmap(x, norm=LogNorm(x.min().min(), x.max().max()), cbar_kws={"ticks": [0.5,1,10,100,1000]}, ax=axs[0,i])
        print(dataset,"Mean ON-cell TPM:",x[x > thresholds.loc[k].threshold].mean().mean())
        row["mean_on_cell_tpm"] = x[x > thresholds.loc[k].threshold].mean().mean()
        x[x <= thresholds.loc[k].threshold] = 0
        x[x > 0] = 1
        _dich_df = x.copy()
        bootstrap = bootstrap_IC(x)
#         sns.heatmap(x,ax=axs[1,i])
        exp_per_cell = x.sum(axis=0)
#         sns.distplot(exp_per_cell, kde=False, ax=axs_hist[i], hist_kws={"range": [exp_per_cell.min()-0.5, exp_per_cell.max()+0.5]}, bins=int(exp_per_cell.max() - exp_per_cell.min() + 1))
        row["mean_gene_per_cell"] = exp_per_cell.mean()
        row["std_gene_per_cell"] = exp_per_cell.std()
        row["threshold"] = thresholds.loc[k].threshold
        row["ic"] = df.loc[k].ic
        row["ic_mean"] = bootstrap.loc["mean"]
        row["ic_lower"] = bootstrap.lower
        row["ic_median"] = bootstrap.loc["median"]
        row["ic_upper"] = bootstrap.upper
        row["no_genes_total"] = family_df.loc[family_df.family_id == k].shape[0]
        row["no_genes_measured"] = df.loc[k].n_genes
        row["non_zero_genes"] = row["no_genes_measured"]
        row["genes"] = ", ".join(x.index)
        for i, gene in enumerate(_raw_df.index):
#             "gene_frequency", "gene_mean_on_cell"
            raw_x = _raw_df.loc[gene].dropna().copy()
            dich_x = _dich_df.loc[gene].dropna().copy()
            row["gene_name_{}".format(i)] = gene
            row["gene_frequency_{}".format(i)] = dich_x.sum() / dich_x.shape[0]
            if(row["gene_frequency_{}".format(i)] == 0):
                row["non_zero_genes"] -= 1
            row["gene_mean_on_cell_{}".format(i)] = raw_x[dich_x == 1].mean()
            
#             row[0+3*i] = gene
#             row[1+3*i] = dich_x.sum() / dich_x.shape[0]
#             row[2+3*i] = raw_x[dich_x == 1].mean()
            
#         axs[0,i].title.set_text("{} (ic={:.2f})".format(dataset, df.loc[k].ic))
#         axs_hist[i].title.set_text("{} (ic={:.2f})".format(dataset, df.loc[k].ic))
        summary_rows += [row]
        
#     plt.show()
    return summary_rows

In [61]:
family_df.loc[family_df.family_id == "PTHR23268"].shape[0]

21

In [62]:
summary_rows = []
for k in tqdm_notebook(family_map):
#     print(k)
    print(get_panther_name(k))
    summary_rows += plot_heatmaps(k, get_panther_name(k), family_map[k])

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=0.0, max=39.0), HTML(value='')))

T-CELL RECEPTOR BETA CHAIN (PTHR23268)
Gaublomme_GSE75109_TPM_clean Mean ON-cell TPM: 454.6432965171525


Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike


Gaublomme_GSE75111_TPM_clean Mean ON-cell TPM: 504.0290908100494
Gaublomme_GSE75110_TPM_clean Mean ON-cell TPM: 1121.8432320736922
POU DOMAIN (PTHR11636)
Gaublomme_GSE75109_TPM_clean Mean ON-cell TPM: 27.412746041659673
Gaublomme_GSE75110_TPM_clean Mean ON-cell TPM: 12.456513076369095
HEAT SHOCK PROTEIN 70KDA (PTHR19375)
Gaublomme_GSE75109_TPM_clean Mean ON-cell TPM: 808.1473795503863


  ics = obs_vars / pb_vars


somatosensory_converted_into_tpm Mean ON-cell TPM: 1878.1377069466207
Gaublomme_GSE75111_TPM_clean Mean ON-cell TPM: 829.5480028933174
SOLUTE CARRIER FAMILY 25 (PTHR24089)
Gaublomme_GSE75109_TPM_clean Mean ON-cell TPM: 71.38420719667536
Gaublomme_GSE75111_TPM_clean Mean ON-cell TPM: 86.29852995930051
ANION EXCHANGE PROTEIN (PTHR11453)
Gaublomme_GSE75109_TPM_clean Mean ON-cell TPM: 24.716871913943894
kakadarov_tpm Mean ON-cell TPM: 4.707337639269974
somatosensory_converted_into_tpm Mean ON-cell TPM: 30.026623893941526
LAR INTERACTING PROTEIN  LIP -RELATED PROTEIN (PTHR12587)
Gaublomme_GSE75109_TPM_clean Mean ON-cell TPM: 16.81462499492942
kakadarov_tpm Mean ON-cell TPM: 6.370612878668776
INTERCELLULAR ADHESION MOLECULE (PTHR13771)
Gaublomme_GSE75109_TPM_clean Mean ON-cell TPM: 53.37552178507702
klein Mean ON-cell TPM: 1.4227571539056458
UNCHARACTERIZED (PTHR12570)
hepat_TPM_yang_clean Mean ON-cell TPM: 29.110560560560565
somatosensory_converted_into_tpm Mean ON-cell TPM: 29.690268290115

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike


Cheng_ES_TPM Mean ON-cell TPM: 24.927657981082223
Gaublomme_GSE75110_TPM_clean Mean ON-cell TPM: 50.43864462596574
MAX DIMERIZATION, MAD (PTHR11969)
Rbp4_positive_cells Mean ON-cell TPM: 41.81177944862156
kakadarov_tpm Mean ON-cell TPM: 4.69348564843016
Gaublomme_GSE75110_TPM_clean Mean ON-cell TPM: 21.10817019772289
PROPROTEIN CONVERTASE SUBTILISIN/KEXIN-RELATED (PTHR42884)
somatosensory_converted_into_tpm Mean ON-cell TPM: 53.31018810919375
Gaublomme_GSE75110_TPM_clean Mean ON-cell TPM: 17.589024426514253
E3 UBIQUITIN-PROTEIN LIGASE KCMF1 (PTHR12268)
Alveolar_cells_Type_I_Merged_Batches Mean ON-cell TPM: 1.2
Gaublomme_GSE75110_TPM_clean Mean ON-cell TPM: 13.484628103052225
TROPOMODULIN (PTHR10901)
Yu_First_wave_endocrine_cells Mean ON-cell TPM: 56.58085115864524
Gaublomme_GSE75110_TPM_clean Mean ON-cell TPM: 18.446653330557126
CARBONIC ANHYDRASE (PTHR18952)
hepat_TPM_yang_clean Mean ON-cell TPM: 161.21630694037148
somatosensory_converted_into_tpm Mean ON-cell TPM: 75.81905857813865
G

KeyboardInterrupt: 

In [None]:
_df = pd.DataFrame(summary_rows)

In [None]:
_df

In [None]:
def filter_group(_df):
    if(np.any(_df.ic_upper < 1.0) and _df.shape[0] > 1):
        return _df
    else:
        return None

writer = pd.ExcelWriter("overlap_family_lower_analysis.xlsx")
_df.to_excel(writer, "unfiltered", index=False)
_df.groupby("family_name").apply(filter_group).dropna(how="all").to_excel(writer, "SNP-configuration", index=False)
_df.loc[_df.ic < 1.0].groupby("family_name").apply(filter_group).dropna(how="all").to_excel(writer, "SNN-configuration", index=False)
_df.loc[_df.ic_upper < 1.0].groupby("family_name").apply(filter_group).dropna(how="all").to_excel(writer, "SSS-configuration", index=False)
writer.save()