In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook
tqdm_notebook().pandas()

In [None]:
from bs4 import BeautifulSoup
import requests
import time

In [None]:
# In-memory cache mapping a PANTHER family accession to its scraped family
# name; get_panther_name fills it lazily so each accession is fetched once.
panther_names = {}

In [None]:
def get_panther_name(pantherid):
    """Return the family name for a PANTHER accession, scraping pantherdb.org on a cache miss.

    Results are memoised in the module-level ``panther_names`` dict, so every
    accession triggers at most one HTTP request per kernel session.

    Parameters
    ----------
    pantherid : str
        Accession inserted into the ``clsAccession`` query parameter.

    Returns
    -------
    str
        Stripped text of the ``td.mainText`` cell in the first table row of
        the page's ``mainBody`` element.

    Raises
    ------
    requests.RequestException
        If the request fails, times out, or returns an HTTP error status.
    ValueError
        If the page does not contain the expected family-name cell.
    """
    if pantherid in panther_names:
        return panther_names[pantherid]

    # A timeout keeps one stalled connection from hanging the whole notebook.
    r = requests.get(
        "http://pantherdb.org/panther/family.do?clsAccession={}".format(pantherid),
        timeout=30,
    )
    # Fail on HTTP errors instead of trying to parse an error page.
    r.raise_for_status()

    soup = BeautifulSoup(r.text, 'html.parser')
    main_body = soup.find(id="mainBody")
    name_cell = None
    if main_body is not None and main_body.table is not None and main_body.table.tr is not None:
        name_cell = main_body.table.tr.find("td", class_="mainText")
    if name_cell is None:
        raise ValueError("No family name found on the PANTHER page for {!r}".format(pantherid))

    family_name = name_cell.text.strip()
    panther_names[pantherid] = family_name
    # Pause after every real request to stay polite to the remote server.
    time.sleep(1)
    return family_name

In [None]:
# Show the accession -> family-name cache accumulated so far.
panther_names

In [None]:
# Datasets to compare. Each entry is a tuple of
# (dataset directory, family-annotation subdirectory, dichotomisation type);
# later cells use these values to build the paths of the family-level and
# chromosome-level IC result files.
datasets = [
    ("Gaublomme_GSE75109_TPM_clean", "clean_panther4march","geomean"),
    ("Gaublomme_GSE75110_TPM_clean","clean_panther4march","geomean"),
    ("Gaublomme_GSE75111_TPM_clean","clean_panther4march","geomean"),
#     "somatosensory_rpkm_suppl",
    ("kakadarov_tpm","clean_panther4march","geomean"),
    ("somatosensory_converted_into_tpm","clean_panther4march","geomean"),
    ("Dopaminergic_TPM_clean","clean_panther4march","geomean"),
    ("Rbp4_positive_cells","clean_panther4march","geomean"),
    ("Cheng_ES_TPM","clean_panther4march","geomean"),
    ("Alveolar_cells_Type_II_Merged_Batches","clean_panther4march","3max"),
    ("Alveolar_cells_Type_I_Merged_Batches","clean_panther4march","3max"),
#     ("Alveolar_cells_both_types","clean_panther4march","3max"),
    ("klein","clean_panther4march","3max"),
    ("hepat_TPM_yang_clean","clean_panther4march","geomean"),
    ("Yu_First_wave_endocrine_cells","clean_panther4march","geomean"),
#     ("lcl_european","HGNC_families"),
#     ("lcl_african","HGNC_families"),
#     ("cd4_zheng","HGNC_families"),
]



# Family-to-gene lookup: both annotation tables stacked and indexed by
# family_id. Later cells read its gene_symbol column for a given family.
family_to_gene = pd.concat([
    pd.read_csv("clean_panther4march.csv", index_col="family_id"),
    pd.read_csv("HGNC_families.csv", index_col="family_id")
], sort=False)

In [None]:
# family_to_gene.loc[1492.0]

In [None]:
# pd.options.display.max_colwidth = 1000
# Cap how many elements of a long sequence pandas prints inside a cell.
pd.options.display.max_seq_items = 25

# Family-wise

* T-cell receptor beta-chain is present in all three Gaublomme sets (exclusive)
* Multiple histone families are conserved in all three Gaublomme sets (co-occurrent)
* The Pcdh-clustered family is not shown as conserved (it is not exclusive in the somatosensory and dopaminergic datasets)
* A lot of Zinc finger proteins are conserved between all datasets (co-occurrent)

In [None]:
# Family-level IC table per dataset, keyed by dataset name and indexed by
# family_id; rows containing any NaN are dropped.
family_dfs = {
    dataset: pd.read_csv(
        "{}/{}/results/family_IC.csv".format(dataset, annotation),
        index_col="family_id",
    ).dropna()
    for dataset, annotation, _ in datasets
}

In [None]:
def get_lower(df, quantile=0.025):
    """Return the index labels of rows in the lower tail of the ``ic`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with an ``ic`` column.
    quantile : float, optional
        Quantile of ``ic`` used as the inclusive upper cutoff. The default of
        0.025 keeps the original behaviour.

    Returns
    -------
    numpy.ndarray
        Index values of the rows whose ``ic`` is at or below the cutoff.

    Notes
    -----
    A later cell defines another ``get_lower`` for chromosome stretches, so
    which version is active depends on which cell ran last.
    """
    cutoff = df.ic.quantile(quantile)
    return df[df.ic <= cutoff].index.values

def get_upper(df, quantile=0.975):
    """Return the index labels of rows in the upper tail of the ``ic`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with an ``ic`` column.
    quantile : float, optional
        Quantile of ``ic`` used as the inclusive lower cutoff. The default of
        0.975 keeps the original behaviour.

    Returns
    -------
    numpy.ndarray
        Index values of the rows whose ``ic`` is at or above the cutoff.

    Notes
    -----
    A later cell defines another ``get_upper`` for chromosome stretches, so
    which version is active depends on which cell ran last.
    """
    cutoff = df.ic.quantile(quantile)
    return df[df.ic >= cutoff].index.values

def get_intersection(l1, l2):
    """Return the set of elements present in both ``l1`` and ``l2``."""
    first, second = set(l1), set(l2)
    return first & second

In [None]:
# Dataset names only (first element of every entry), used as row and column labels.
dataset_names = [name for name, *_ in datasets]

In [None]:
# Overlap display formatters
def format_family_names(x):
    """Render a collection of family IDs or names as a newline-separated preview.

    Parameters
    ----------
    x : collection or None
        Family identifiers or names, for example a set of IDs or a list of
        names. Items are converted with ``str`` so numeric IDs are accepted.

    Returns
    -------
    str
        The first 10 items, one per line, followed by a ``+N families`` line
        when more exist. Empty string for ``None`` or an empty collection.
    """
    if x is None or len(x) == 0:
        return ""
    # Materialise once: sets cannot be sliced, and str() lets non-string IDs join.
    items = [str(item) for item in x]
    s = "\n".join(items[:10])
    if len(items) > 10:
        s += "\n+{} families".format(len(items) - 10)
    return s

def format_gene_names(x):
    """Render per-family gene lists as a compact multi-line preview.

    Parameters
    ----------
    x : sequence or None
        One entry per family; each entry is a sliceable collection of gene
        symbols, or a single gene symbol given as a plain string.

    Returns
    -------
    str
        Up to 5 lines, one per family, each listing up to 5 comma-separated
        genes plus a ``+N genes`` suffix when truncated, followed by a
        ``+N families`` line when more than 5 families exist. Empty string
        for ``None`` or an empty sequence.
    """
    if x is None or len(x) == 0:
        return ""

    lines = []
    for genes in x[:5]:
        # A lone symbol arrives as a bare string; wrap it so it is not
        # sliced and joined character by character.
        if isinstance(genes, str):
            genes = [genes]
        line = ",".join(genes[:5])
        if len(genes) > 5:
            line += " +{} genes".format(len(genes) - 5)
        lines.append(line)
    if len(x) > 5:
        lines.append("+{} families".format(len(x) - 5))
    return "\n".join(lines).strip()
    

In [None]:
# Pairwise overlap of lower-tail families between datasets. Each off-diagonal
# cell holds the intersection for that pair; the diagonal is None.
family_overlap_lower = pd.DataFrame(
    [
        [
            None
            if row_name == col_name
            else get_intersection(get_lower(family_dfs[row_name]), get_lower(family_dfs[col_name]))
            for col_name, _, _ in datasets
        ]
        for row_name, _, _ in datasets
    ],
    index=dataset_names,
    columns=dataset_names,
)

In [None]:
# Show the lower-tail family overlap in four views (IDs, family names, gene
# symbols, counts) and export the same views to Excel, plus a raw CSV of IDs.
pd.options.display.max_colwidth = 1000
pd.options.display.max_seq_items = 5

# Untruncated comma-separated IDs for downstream processing.
family_overlap_lower.applymap(lambda x: "" if x is None else ",".join(list(x))).to_csv("overlap_family_lower_raw.csv")

# Build every view once and reuse it for both display and export.
family_ids_lower = family_overlap_lower.applymap(format_family_names)
family_names_lower = family_overlap_lower.progress_applymap(
    lambda x: [get_panther_name(_x) for _x in x] if x is not None else None
).applymap(format_family_names)
# Builtin int replaces np.int, which was removed in NumPy 1.24.
family_genes_lower = family_overlap_lower.progress_applymap(
    lambda x: [family_to_gene.loc[int(family) if isinstance(family, np.float64) else family].gene_symbol for family in x] if x is not None else []
).applymap(format_gene_names)
family_counts_lower = family_overlap_lower.progress_applymap(lambda x: len(x) if x is not None else np.nan)

display(family_ids_lower)
display(family_names_lower)
display(family_genes_lower)
display(family_counts_lower)

# The context manager saves and closes the workbook; ExcelWriter.save() was
# removed in pandas 2.0. sheet_name is passed by keyword because positional
# use is deprecated in recent pandas.
with pd.ExcelWriter("overlap_family_lower.xlsx") as writer:
    family_ids_lower.to_excel(writer, sheet_name="by IDs")
    family_names_lower.to_excel(writer, sheet_name="by family names")
    family_genes_lower.to_excel(writer, sheet_name="by genes")
    family_counts_lower.to_excel(writer, sheet_name="count by families")

In [None]:
# Pairwise overlap of upper-tail families between datasets. Each off-diagonal
# cell holds the intersection for that pair; the diagonal is None.
family_overlap_upper = pd.DataFrame(
    [
        [
            None
            if row_name == col_name
            else get_intersection(get_upper(family_dfs[row_name]), get_upper(family_dfs[col_name]))
            for col_name, _, _ in datasets
        ]
        for row_name, _, _ in datasets
    ],
    index=dataset_names,
    columns=dataset_names,
)

In [None]:
# Show the upper-tail family overlap in four views (IDs, family names, gene
# symbols, counts) and export the same views to Excel.
pd.options.display.max_colwidth = 1000
pd.options.display.max_seq_items = 5

# Build every view once and reuse it for both display and export.
family_ids_upper = family_overlap_upper.applymap(format_family_names)
family_names_upper = family_overlap_upper.progress_applymap(
    lambda x: [get_panther_name(_x) for _x in x] if x is not None else None
).applymap(format_family_names)
# Builtin int replaces np.int, which was removed in NumPy 1.24.
family_genes_upper = family_overlap_upper.progress_applymap(
    lambda x: [family_to_gene.loc[int(family) if isinstance(family, np.float64) else family].gene_symbol for family in x] if x is not None else []
).applymap(format_gene_names)
family_counts_upper = family_overlap_upper.progress_applymap(lambda x: len(x) if x is not None else np.nan)

display(family_ids_upper)
display(family_names_upper)
display(family_genes_upper)
display(family_counts_upper)

# The context manager saves and closes the workbook; ExcelWriter.save() was
# removed in pandas 2.0. sheet_name is passed by keyword because positional
# use is deprecated in recent pandas.
with pd.ExcelWriter("overlap_family_upper.xlsx") as writer:
    family_ids_upper.to_excel(writer, sheet_name="by IDs")
    family_names_upper.to_excel(writer, sheet_name="by family names")
    family_genes_upper.to_excel(writer, sheet_name="by genes")
    family_counts_upper.to_excel(writer, sheet_name="count by families")

In [None]:
# print("Pcdh clustered, IC vs. 2.5 percentile")
# display("somatosensory_converted_into_tpm")
# display(family_dfs["somatosensory_converted_into_tpm"].loc["PTHR24028_clustered"].ic)
# display(family_dfs["somatosensory_converted_into_tpm"].ic.quantile(0.025))

# # display("somatosensory_rpkm_suppl")
# # display(family_dfs["somatosensory_rpkm_suppl"].loc["PTHR24028_clustered"].ic)
# # display(family_dfs["somatosensory_rpkm_suppl"].ic.quantile(0.025))

# display("dopaminergic")
# display(family_dfs["dopaminergic"].loc["PTHR24028_clustered"].ic)
# display(family_dfs["dopaminergic"].ic.quantile(0.025))

# display("Rbp4_positive_cells")
# display(family_dfs["Rbp4_positive_cells"].loc["PTHR24028_clustered"].ic)
# display(family_dfs["Rbp4_positive_cells"].ic.quantile(0.025))

# Chromosome-wise

* Some Pcdh stretches are overlapping between different sets of neurons (exclusive)
* The highest overlap of co-occurrent stretches is found among the Gaublomme sets (mostly comprising histone genes)

In [None]:
from tqdm import tqdm_notebook 

In [None]:
# Chromosome labels 1-19 plus X and Y, and one filtered gene table per label.
# NOTE(review): get_genes looks gene names up in these tables for every
# dataset, yet they are read from the somatosensory dataset only. Confirm the
# per-chromosome filtered files are identical across datasets.
chr_names = [*range(1, 20), "X", "Y"]
chrs = {
    label: pd.read_csv("somatosensory_converted_into_tpm/intermediate/chr{}_filtered.csv".format(label))
    for label in chr_names
}

In [None]:
def load_chromosome_wise(dataset, dich_type):
    """Load the stage-1 per-chromosome IC tables of one dataset into a single frame.

    Reads ``<dataset>/results/<dich_type>/stage1_chr<N>_IC.csv`` for
    chromosomes 1-19, X and Y, tags each table with a ``chromosome`` column,
    and concatenates them.

    The earlier version also read the per-chromosome filtered gene tables and
    the dichotomised gene list into locals that were never used. Those reads
    are gone, so the function no longer fails when those files are absent.

    Parameters
    ----------
    dataset : str
        Dataset directory.
    dich_type : str
        Dichotomisation subdirectory name, for example ``"geomean"``.

    Returns
    -------
    pandas.DataFrame
        Rows with ``n_genes >= stretch * 6 / 7``. The index is the
        post-concatenation RangeIndex, so it has gaps after filtering.
    """
    chr_names = list(range(1, 20)) + ["X", "Y"]
    # assign() returns a tagged copy rather than mutating the freshly read frame.
    ic_dfs = [
        pd.read_csv("{}/results/{}/stage1_chr{}_IC.csv".format(dataset, dich_type, name)).assign(chromosome=name)
        for name in chr_names
    ]

    ic_df = pd.concat(ic_dfs, sort=False).reset_index(drop=True)
    # Keep only stretches whose gene count is at least 6/7 of the stretch length.
    return ic_df[ic_df.n_genes >= ic_df.stretch * 6 / 7]

In [None]:
def get_genes(start, end, chromosome):
    """Return the ``Name`` values for index labels ``start`` through ``end`` on one chromosome.

    The lookup uses the module-level ``chrs`` tables and a label-based
    ``.loc`` slice, which includes both endpoints.
    """
    chromosome_table = chrs[chromosome]
    names = chromosome_table.loc[start:end, "Name"]
    return names.values

def get_all_genes(l):
    """Collect the unique gene names covered by a list of stretches.

    ``l`` holds ``(start, end, chromosome)`` rows, each forwarded to
    ``get_genes``. Returns a sorted list of unique names, or an empty set
    when ``l`` is None.
    """
    if l is None:
        return set()
    per_stretch = [get_genes(*stretch) for stretch in l]
    unique_names = set(np.concatenate(per_stretch))
    return sorted(unique_names)

In [None]:
# Stretch-level IC table per dataset, loaded with that dataset's own
# dichotomisation type; tqdm_notebook shows progress over the datasets.
chromosome_dfs = {
    name: load_chromosome_wise(name, dich_type)
    for name, _, dich_type in tqdm_notebook(datasets)
}

In [None]:
# Inspect the filtered stretch table loaded for one dataset.
chromosome_dfs["Gaublomme_GSE75109_TPM_clean"]

In [None]:
def get_lower(df):
    """Return ``[start, end, chromosome]`` rows of lower-tail stretches.

    The 2.5% ``ic`` quantile is computed separately for each stretch length
    (7, 14, 21), and rows at or below their own length's cutoff are kept.
    This definition replaces the family-wise ``get_lower`` from an earlier cell.
    """
    per_length = (df[df.stretch == length] for length in (7, 14, 21))
    tails = [part[part.ic <= part.ic.quantile(0.025)] for part in per_length]
    return pd.concat(tails)[["start", "end", "chromosome"]].values

def get_upper(df):
    """Return ``[start, end, chromosome]`` rows of upper-tail stretches.

    The 97.5% ``ic`` quantile is computed separately for each stretch length
    (7, 14, 21), and rows at or above their own length's cutoff are kept.
    This definition replaces the family-wise ``get_upper`` from an earlier cell.
    """
    per_length = (df[df.stretch == length] for length in (7, 14, 21))
    tails = [part[part.ic >= part.ic.quantile(0.975)] for part in per_length]
    return pd.concat(tails)[["start", "end", "chromosome"]].values

def get_intersection(l1, l2):
    """Return the rows of ``l1`` that also occur in ``l2``.

    Rows are compared element-wise as ``(start, end, chromosome)`` tuples.
    A set lookup replaces the earlier nested loop, so the cost grows with
    ``len(l1) + len(l2)`` rather than their product.

    Parameters
    ----------
    l1, l2 : iterable of row-like
        For example the 2-D arrays returned by ``get_lower`` / ``get_upper``.

    Returns
    -------
    numpy.ndarray or None
        Matching rows stacked in ``l1`` order, or None when nothing matches.

    Notes
    -----
    This definition replaces the family-wise ``get_intersection`` from an
    earlier cell.
    """
    wanted = {tuple(row) for row in l2}
    shared = [row for row in l1 if tuple(row) in wanted]
    if not shared:
        return None
    return np.stack(shared)


In [None]:
# Dataset names only, used as row and column labels of the overlap matrices.
dataset_names = [name for (name, _annotation, _dich_type) in datasets]

In [None]:
# Compute each dataset's lower-tail stretches once; the earlier one-liner
# recomputed them for both members of every dataset pair.
chromosome_lower_tails = {name: get_lower(chromosome_dfs[name]) for name in dataset_names}

# Pairwise overlap of lower-tail stretches. Each off-diagonal cell holds the
# shared [start, end, chromosome] rows, or None when nothing is shared; the
# diagonal is None.
chromosome_overlap_lower = pd.DataFrame(
    [
        [
            None
            if row_name == col_name
            else get_intersection(chromosome_lower_tails[row_name], chromosome_lower_tails[col_name])
            for col_name in dataset_names
        ]
        for row_name in dataset_names
    ],
    index=dataset_names,
    columns=dataset_names,
)

In [None]:
def format_stretches(x):
    """Render stretch rows as a newline-separated preview.

    Shows the ``str`` form of the first 5 rows and appends a
    ``+N stretches`` line when more exist. Returns an empty string for
    ``None`` or an empty collection.
    """
    if x is None or len(x) == 0:
        return ""
    rows = list(x)
    extra = len(rows) - 5
    preview = "\n".join(str(row) for row in rows[:5])
    return preview + "\n+{} stretches".format(extra) if extra > 0 else preview

def format_stretch_genes(x):
    """Render gene names as a newline-separated preview.

    Shows the first 25 names and appends a ``+N genes`` line when more
    exist. Returns an empty string for ``None`` or an empty collection.
    """
    if x is None or len(x) == 0:
        return ""
    names = list(x)
    extra = len(names) - 25
    preview = "\n".join(str(name) for name in names[:25])
    return preview + "\n+{} genes".format(extra) if extra > 0 else preview


In [None]:
import pickle

# Keep the raw overlap matrix (arrays of stretch rows) for later reuse.
with open("overlap_chromosome_lower_raw.pickle", "wb") as f:
    pickle.dump(chromosome_overlap_lower, f)

# Resolve stretches to gene names once; two sheets below share the result.
chromosome_genes_lower = chromosome_overlap_lower.applymap(get_all_genes)

# The context manager saves and closes the workbook; ExcelWriter.save() was
# removed in pandas 2.0. sheet_name is passed by keyword because positional
# use is deprecated in recent pandas.
with pd.ExcelWriter("overlap_chromosome_lower.xlsx") as writer:
    chromosome_genes_lower.applymap(format_stretch_genes).to_excel(writer, sheet_name="by genes")
    chromosome_overlap_lower.applymap(lambda x: len(x) if x is not None else np.nan).to_excel(writer, sheet_name="by count (stretches)")
    chromosome_genes_lower.applymap(lambda x: len(x) if x is not None else np.nan).to_excel(writer, sheet_name="by count (genes)")
    chromosome_overlap_lower.applymap(format_stretches).to_excel(writer, sheet_name="by locations")

In [None]:
# Preview the shared lower-tail stretch coordinates for every dataset pair.
chromosome_overlap_lower.applymap(format_stretches)

In [None]:
# Sanity check over every dataset pair: look for a shared stretch that spans
# another shared stretch on the same chromosome while being strictly smaller,
# and print the first hit per pair.
#
# The "size" column must be read with brackets. Attribute access (.size)
# returns the pandas element count of the frame or row, not the column, which
# previously turned the size comparison into a constant.
#
# NOTE(review): with size defined as end - start, a row that spans another
# cannot also be strictly smaller, so this is expected to print nothing. If
# the goal is to list stretches nested inside a larger one, the a/b
# inequalities need to be flipped; confirm the intent.
for row_pos in range(chromosome_overlap_lower.shape[0]):
    for col_pos in range(chromosome_overlap_lower.shape[1]):
        stretches = chromosome_overlap_lower.iloc[row_pos, col_pos]
        if stretches is None:
            continue
        # Append end - start as a fourth column and sort the largest stretches first.
        table = pd.DataFrame(
            np.hstack([stretches, np.reshape(stretches[:, 1] - stretches[:, 0], (-1, 1))]),
            columns=["a", "b", "chr", "size"],
        ).sort_values("size", ascending=False)
        for i in range(table.shape[0]):
            current = table.iloc[i]
            hits = table.loc[
                (table["chr"] == current["chr"])
                & (table["a"] <= current["a"])
                & (table["b"] >= current["b"])
                & (table["size"] < current["size"])
            ]
            if hits.shape[0] != 0:
                print(row_pos, col_pos, i)
                print(hits)
                break

In [None]:

# Preview the gene names covered by the shared lower-tail stretches of every dataset pair.
chromosome_overlap_lower.applymap(get_all_genes).applymap(format_stretch_genes)

In [None]:
# Number of shared lower-tail stretches per dataset pair (NaN where the cell is None).
chromosome_overlap_lower.applymap(lambda x: len(x) if x is not None else np.nan)

In [None]:
# Compute each dataset's upper-tail stretches once; the earlier one-liner
# recomputed them for both members of every dataset pair.
chromosome_upper_tails = {name: get_upper(chromosome_dfs[name]) for name in dataset_names}

# Pairwise overlap of upper-tail stretches. Each off-diagonal cell holds the
# shared [start, end, chromosome] rows, or None when nothing is shared; the
# diagonal is None.
chromosome_overlap_upper = pd.DataFrame(
    [
        [
            None
            if row_name == col_name
            else get_intersection(chromosome_upper_tails[row_name], chromosome_upper_tails[col_name])
            for col_name in dataset_names
        ]
        for row_name in dataset_names
    ],
    index=dataset_names,
    columns=dataset_names,
)

In [None]:
# Resolve stretches to gene names once; two sheets below share the result.
chromosome_genes_upper = chromosome_overlap_upper.applymap(get_all_genes)

# The context manager saves and closes the workbook; ExcelWriter.save() was
# removed in pandas 2.0. sheet_name is passed by keyword because positional
# use is deprecated in recent pandas.
with pd.ExcelWriter("overlap_chromosome_upper.xlsx") as writer:
    chromosome_genes_upper.applymap(format_stretch_genes).to_excel(writer, sheet_name="by genes")
    chromosome_overlap_upper.applymap(lambda x: len(x) if x is not None else np.nan).to_excel(writer, sheet_name="by count (stretches)")
    chromosome_genes_upper.applymap(lambda x: len(x) if x is not None else np.nan).to_excel(writer, sheet_name="by count (genes)")
    chromosome_overlap_upper.applymap(format_stretches).to_excel(writer, sheet_name="by locations")

In [None]:
# pd.options.display.max_colwidth = 1000
# Raise the printed-sequence cap so up to 100 items per cell are shown.
pd.options.display.max_seq_items = 100
# Gene names covered by the shared upper-tail stretches of every dataset pair.
chromosome_overlap_upper.applymap(get_all_genes)

In [None]:
# Number of shared upper-tail stretches per dataset pair (NaN where the cell is None).
chromosome_overlap_upper.applymap(lambda x: len(x) if x is not None else np.nan)