In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm
import seaborn as sns
from bs4 import BeautifulSoup
import requests
import time
from tqdm import tqdm_notebook
tqdm_notebook().pandas()
import pickle

In [None]:
# Apply one seaborn theme to every figure drawn in this notebook.
sns.set(
    font="Helvetica",
    style="ticks",
    context="notebook",
)

In [None]:
# Load the pre-computed overlap table; a later cell indexes it by dataset name
# on both axes.
# NOTE(review): pickle.load can execute arbitrary code while unpickling — only
# run this against a pickle produced by a trusted step of this pipeline.
with open("overlap_chromosome_lower_raw.pickle", "rb") as f:
    overlaps = pickle.load(f)

In [None]:
# Dataset identifiers analysed in this notebook. Each name is reused below as a
# CSV file stem ("<name>.csv"), as a directory prefix ("<name>/intermediate/...",
# "<name>/results/...") and as a row/column label of `overlaps`.
datasets = [
    "Gaublomme_GSE75109_TPM_clean",
    "Gaublomme_GSE75110_TPM_clean",
    "Gaublomme_GSE75111_TPM_clean",
    "kakadarov_tpm",
    "somatosensory_converted_into_tpm",
    "Dopaminergic_TPM_clean",
    "Rbp4_positive_cells",
    "Cheng_ES_TPM",
    "Alveolar_cells_Type_II_Merged_Batches",
    "Alveolar_cells_Type_I_Merged_Batches",
#     "Alveolar_cells_both_types",
    "klein",
    "hepat_TPM_yang_clean",
    "Yu_First_wave_endocrine_cells",
]

# Restrict (and reorder) the overlap table to the selected datasets on both
# axes; .loc raises KeyError if any listed name is missing from the table.
overlaps = overlaps.loc[datasets, datasets]

In [None]:
# Dichotomisation method per dataset, in the same order as `datasets`:
# "3max" for names containing "Alveolar" or "klein", "geomean" otherwise.
dich_methods = []
for dataset_name in datasets:
    if "Alveolar" in dataset_name or "klein" in dataset_name:
        dich_methods.append("3max")
    else:
        dich_methods.append("geomean")

In [None]:
# Flatten the upper triangle of `overlaps` into rows: every stretch stored for
# a dataset pair is emitted twice, once tagged with each dataset of the pair.
chromosome_map = []
n_datasets = len(datasets)
for first in range(n_datasets):
    for second in range(first + 1, n_datasets):
        shared_stretches = overlaps.iloc[first, second]
        if shared_stretches is not None:
            for stretch in shared_stretches:
                stretch_fields = list(stretch)
                chromosome_map.append(stretch_fields + [datasets[first]])
                chromosome_map.append(stretch_fields + [datasets[second]])
                

In [None]:
# One row per unique (stretch, dataset) combination; the same stretch can be
# emitted for several dataset pairs, so exact duplicates are dropped.
stretch_columns = ["start_gene", "end_gene", "chromosome", "dataset"]
chromosome_report = pd.DataFrame(chromosome_map, columns=stretch_columns)
chromosome_report = chromosome_report.drop_duplicates()

In [None]:
def get_unit(x):
    """Return the unit label for a report row.

    `x` must expose a `dataset` attribute holding the dataset name. Names
    containing "Alveolar" or "klein" map to "UMI"; everything else to "TPM".
    """
    umi_markers = ("Alveolar", "klein")
    if any(marker in x.dataset for marker in umi_markers):
        return "UMI"
    return "TPM"

# Label every row with the unit derived from its dataset name.
unit_per_row = chromosome_report.apply(get_unit, axis=1)
chromosome_report.loc[:, "units"] = unit_per_row

In [None]:
# Chromosome labels: the integers 1..19 followed by "X" and "Y".
chr_names = list(range(1, 20)) + ["X", "Y"]

# Per-chromosome gene tables, keyed by chromosome label. They are read from
# the somatosensory dataset's intermediate directory.
chrs = {}
for chr_label in chr_names:
    chr_path = "somatosensory_converted_into_tpm/intermediate/chr{}_filtered.csv".format(chr_label)
    chrs[chr_label] = pd.read_csv(chr_path)

In [None]:
def get_genes(start, end, chromosome):
    """Return the "Name" values of `chrs[chromosome]` for index labels start..end-1.

    `.loc` slicing is label-based and inclusive at both ends, so `end` itself
    is excluded by slicing up to `end - 1`.
    """
    gene_table = chrs[chromosome]
    last_label = end - 1
    selected_names = gene_table.loc[start:last_label, "Name"]
    return selected_names.values

def _genes_for_row(row):
    # Gene names covered by this row's stretch on its chromosome.
    return get_genes(row.start_gene, row.end_gene, row.chromosome)


all_genes_per_row = chromosome_report.apply(_genes_for_row, axis=1)
chromosome_report.loc[:, "all_genes"] = all_genes_per_row

In [None]:
# Stretch length: end_gene minus start_gene. Computed as a single vectorised
# column subtraction instead of a Python-level call per row.
chromosome_report.loc[:, "stretch_size"] = chromosome_report["end_gene"] - chromosome_report["start_gene"]

In [None]:
# Raw table of every dataset, read from "<dataset>.csv" and keyed by dataset name.
raw_dfs = {}
for dataset_name in datasets:
    raw_dfs[dataset_name] = pd.read_csv("{}.csv".format(dataset_name))

In [None]:
# Dichotomised table of every dataset, indexed by gene_id. Each is read from
# the directory of the dichotomisation method paired with that dataset.
dich_dfs = {}
for dataset_name, method in zip(datasets, dich_methods):
    dich_path = "{}/intermediate/{}/dichotomised_genes.csv".format(dataset_name, method)
    dich_dfs[dataset_name] = pd.read_csv(dich_path).set_index("gene_id")

In [None]:
def get_present_genes(dataset, genes):
    """Return the entries of `genes` that occur in the dataset's raw table.

    Parameters
    ----------
    dataset : key into the module-level `raw_dfs` dict.
    genes : iterable of gene identifiers.

    Returns
    -------
    list
        The genes found in `raw_dfs[dataset].gene_id`, without duplicates and
        in the order in which they first appear in `genes`. The previous
        set-intersection returned them in arbitrary order, which made the
        downstream per-gene columns and the exported gene list unstable
        between runs.
    """
    available = set(raw_dfs[dataset].gene_id)
    already_taken = set()
    present = []
    for gene in genes:
        if gene in available and gene not in already_taken:
            already_taken.add(gene)
            present.append(gene)
    return present

def _recorded_for_row(row):
    # Genes of this row's stretch that are present in this row's dataset.
    return get_present_genes(row.dataset, row.all_genes)


recorded_per_row = chromosome_report.apply(_recorded_for_row, axis=1)
chromosome_report.loc[:, "recorded_genes"] = recorded_per_row

In [None]:
def _count_recorded(row):
    # Number of recorded genes in this row.
    return len(row.recorded_genes)


chromosome_report.loc[:, "n_genes"] = chromosome_report.apply(_count_recorded, axis=1)

In [None]:
def load_chromosome_wise(dataset, dich_type):
    """Load and concatenate the per-chromosome stage-1 IC tables of a dataset.

    Parameters
    ----------
    dataset : directory prefix of the dataset.
    dich_type : name of the dichotomisation sub-directory (e.g. "geomean").

    Returns
    -------
    pandas.DataFrame
        Rows of all "<dataset>/results/<dich_type>/stage1_chr<N>_IC.csv"
        files (N = 1..19, "X", "Y") with an added "chromosome" column, keeping
        only rows where n_genes >= stretch * 6 / 7.

    Only the IC tables are read. The earlier version also loaded the
    per-chromosome filtered tables and the filtered dichotomised genes
    without using them, which cost 22 extra reads per dataset and failed
    when those unrelated files were absent.
    """
    chr_names = list(range(1, 20)) + ["X", "Y"]

    ic_dfs = []
    for name in chr_names:
        ic_table = pd.read_csv("{}/results/{}/stage1_chr{}_IC.csv".format(dataset, dich_type, name))
        ic_table.loc[:, "chromosome"] = name
        ic_dfs.append(ic_table)

    ic_df = pd.concat(ic_dfs, sort=False).reset_index(drop=True)
    # Keep stretches in which at least 6/7 of the positions have a gene counted.
    return ic_df[ic_df.n_genes >= ic_df.stretch * 6 / 7]

# IC table of every dataset, loaded with the dichotomisation method paired to it.
dfs = {}
for dataset_name, method in zip(datasets, dich_methods):
    dfs[dataset_name] = load_chromosome_wise(dataset_name, method)

In [None]:
# Index every IC table by (start, end, chromosome) so rows can be looked up by stretch.
# Re-running this cell fails, because the key columns have already moved into the index.
ic_index_columns = ["start", "end", "chromosome"]
dfs = {name: table.set_index(ic_index_columns) for name, table in dfs.items()}

In [None]:
def _lookup_ic(row):
    # Fetch the "ic" value stored for this row's stretch in its dataset's table.
    stretch_key = (row.start_gene, row.end_gene, row.chromosome)
    return dfs[row.dataset].loc[stretch_key, "ic"]


chromosome_report.loc[:, "ic"] = chromosome_report.apply(_lookup_ic, axis=1)

In [None]:
def get_aux_stats(dataset, genes):
    """Summarise raw and dichotomised values of `genes` in one dataset.

    Reads the module-level `raw_dfs` (re-indexed by "gene_id" on each call)
    and `dich_dfs`. Rows containing any missing value are dropped from both
    selections before the statistics are computed.

    Returns a pandas Series with:
      mean_on_cell_tpm   - mean over columns of the per-column mean of raw
                           values at positions where the dichotomised value is 1
      mean_gene_per_cell - mean of the per-column sums of dichotomised values
      std_gene_per_cell  - standard deviation of those per-column sums
    """
    raw_selection = raw_dfs[dataset].set_index("gene_id").loc[genes].dropna()
    dich_selection = dich_dfs[dataset].loc[genes].dropna()

    on_mask = dich_selection == 1
    on_genes_per_column = dich_selection.sum(axis=0)

    stats = {
        "mean_on_cell_tpm": raw_selection[on_mask].mean().mean(),
        "mean_gene_per_cell": on_genes_per_column.mean(),
        "std_gene_per_cell": on_genes_per_column.std(),
    }
    return pd.Series(stats)

In [None]:
def _row_aux_stats(row):
    # Summary statistics for this row's recorded genes in its dataset.
    return get_aux_stats(row.dataset, row.recorded_genes)


aux = chromosome_report.progress_apply(_row_aux_stats, axis=1)

In [None]:
# Attach the summary-statistic columns to the report, matching rows by index.
chromosome_report = pd.merge(
    chromosome_report,
    aux,
    left_index=True,
    right_index=True,
)

In [None]:
def get_aux_per_gene(dataset, genes, max_genes=21):
    """Build a fixed-width row of per-gene statistics for one dataset.

    Parameters
    ----------
    dataset : key into the module-level `raw_dfs` and `dich_dfs` dicts.
    genes : sequence of gene identifiers.
    max_genes : number of gene slots in the returned row (default 21, the
        width that was previously hard-coded).

    Returns
    -------
    pandas.Series
        Length 3 * max_genes, with the repeating index labels "gene_name",
        "gene_frequency", "gene_mean_on_cell". Slot i holds the i-th gene, the
        fraction of its non-missing dichotomised values that sum to "on", and
        the mean of its raw values where the dichotomised value is 1. Unused
        slots stay None.

    Raises
    ------
    IndexError
        If more than `max_genes` genes are passed. The same exception type
        was raised before, but only after partial work and with an opaque
        "list assignment index out of range" message.
    """
    genes = list(genes)
    if len(genes) > max_genes:
        raise IndexError(
            "got {} genes but the row only has {} slots".format(len(genes), max_genes)
        )

    names = ["gene_name", "gene_frequency", "gene_mean_on_cell"] * max_genes
    row = [None] * (max_genes * 3)
    raw_df = raw_dfs[dataset].set_index("gene_id")
    dich_df = dich_dfs[dataset]
    for i, gene in enumerate(genes):
        raw_x = raw_df.loc[gene].dropna()
        dich_x = dich_df.loc[gene].dropna()
        row[0 + 3 * i] = gene
        row[1 + 3 * i] = dich_x.sum() / dich_x.shape[0]
        row[2 + 3 * i] = raw_x[dich_x == 1].mean()
    return pd.Series(row, index=names)
    

In [None]:
def _row_aux_per_gene(row):
    # Per-gene statistics for this row's recorded genes in its dataset.
    return get_aux_per_gene(row.dataset, row.recorded_genes)


# Note: this rebinds `aux`, which previously held the summary statistics.
aux = chromosome_report.progress_apply(_row_aux_per_gene, axis=1)

In [None]:
# Count, per row, how many gene_frequency slots are exactly 0, and put that
# count in front of the per-gene columns as "zero_genes".
zero_frequency_count = (aux.gene_frequency == 0).sum(axis=1)
zero_genes_frame = pd.DataFrame(zero_frequency_count, columns=["zero_genes"])
aux = zero_genes_frame.merge(aux, left_index=True, right_index=True)

In [None]:
import sys
sys.path.insert(0,'./bin')
from pipeline_utils.ic_utils import bootstrap_IC
def get_bootstrapping(x):
    """Run `bootstrap_IC` on the dichotomised rows of a report row's genes.

    `x` must expose `dataset` (key into `dich_dfs`) and `recorded_genes`
    (row labels to select). Whatever `bootstrap_IC` returns is passed through.
    """
    dich_table = dich_dfs[x.dataset]
    return bootstrap_IC(dich_table.loc[x.recorded_genes])

# Bootstrap every report row, with a progress bar.
# NOTE(review): bootstrap_IC presumably resamples at random and no seed is set
# in this notebook — confirm whether results are reproducible between runs.
bootstraps = chromosome_report.progress_apply(get_bootstrapping, axis=1)

In [None]:
# Prefix every bootstrap column with "ic_". Re-running this cell prefixes again.
prefixed_columns = []
for column in bootstraps.columns:
    prefixed_columns.append("ic_" + column)
bootstraps.columns = prefixed_columns

In [None]:
# Attach the bootstrap columns to the report, matching rows by index.
chromosome_report = pd.merge(
    chromosome_report,
    bootstraps,
    left_index=True,
    right_index=True,
)

In [None]:
def _format_coordinates(row):
    # Text key "[chromosome start end]" identifying a stretch across datasets.
    return "[{} {} {}]".format(row.chromosome, row.start_gene, row.end_gene)


chromosome_report.loc[:, "coordinates"] = chromosome_report.apply(_format_coordinates, axis=1)

In [None]:
# Keep only the columns wanted in the export, in export order. Columns not
# listed here (e.g. all_genes) are dropped.
export_columns = [
    'start_gene',
    'end_gene',
    'chromosome',
    'coordinates',
    'dataset',
    'units',
    'mean_on_cell_tpm',
    'mean_gene_per_cell',
    'std_gene_per_cell',
    'ic',
    'ic_lower',
    'ic_upper',
    'stretch_size',
    'n_genes',
    'recorded_genes',
]
chromosome_report = chromosome_report.loc[:, export_columns]

In [None]:
# Collapse each gene list into one ", "-separated string for export.
# Re-running this cell would join the characters of the string instead.
chromosome_report["recorded_genes"] = chromosome_report["recorded_genes"].apply(", ".join)

In [None]:
# Attach zero_genes and the per-gene columns to the report, matching rows by index.
chromosome_report = pd.merge(
    chromosome_report,
    aux,
    left_index=True,
    right_index=True,
)

In [None]:
# Turn the count of zero-frequency genes into its complement (n_genes minus
# that count), in place. Running this cell twice restores the original count.
chromosome_report["zero_genes"] = chromosome_report["n_genes"] - chromosome_report["zero_genes"]

In [None]:
# Give the column a name that matches what it holds.
chromosome_report = chromosome_report.rename(columns={"zero_genes": "non_zero_genes"})

In [None]:
# Export the complete report (no row filtering) without the index column.
chromosome_report.to_csv("overlap_chromosome_lower_analysis_unfiltered.csv", index=False)

In [None]:
def filter_group(_df):
    """Keep a group only if it has more than one row and any ic_upper < 1.0.

    Returns `_df` unchanged when both conditions hold, otherwise None.
    """
    if not np.any(_df.ic_upper < 1.0):
        return None
    if _df.shape[0] <= 1:
        return None
    return _df

# Write the report to one workbook: the full table plus three filtered views.
# Each view first restricts the rows, then keeps only coordinate groups
# accepted by filter_group.
#
# The writer is used as a context manager so the workbook is saved and closed
# even if writing a sheet raises. This also replaces the explicit
# writer.save() call, which no longer exists in pandas >= 2.0. sheet_name is
# passed by keyword because passing it positionally is deprecated.
filtered_sheets = [
    ("SNP-configuration", chromosome_report),
    ("SNN-configuration", chromosome_report.loc[chromosome_report.ic < 1.0]),
    ("SSS-configuration", chromosome_report.loc[chromosome_report.ic_upper < 1.0]),
]

with pd.ExcelWriter("overlap_chromosome_lower_analysis.xlsx") as writer:
    chromosome_report.to_excel(writer, sheet_name="unfiltered", index=False)
    for sheet_title, sheet_rows in filtered_sheets:
        (
            sheet_rows
            .groupby("coordinates")
            .apply(filter_group)
            .dropna(how="all")
            .to_excel(writer, sheet_name=sheet_title, index=False)
        )

In [None]:
# Display the rows with ic_upper < 1.0 whose coordinate group passes filter_group.
rows_below_one = chromosome_report.loc[chromosome_report.ic_upper < 1.0]
rows_below_one.groupby("coordinates").apply(filter_group).dropna(how="all")

In [None]:
# Number of rows with ic_upper < 1.0 for each coordinates value.
ic_upper_below_one = chromosome_report["ic_upper"] < 1.0
coord_counts = chromosome_report.loc[ic_upper_below_one, "coordinates"].value_counts()

In [None]:
# Coordinates that occur in more than one of the counted rows.
repeated_counts = coord_counts.loc[coord_counts > 1]
coords = repeated_counts.index.values

In [None]:
# Display the rows at repeated coordinates that also have ic_upper < 1.
at_repeated_coords = chromosome_report.coordinates.isin(coords)
upper_below_one = chromosome_report.ic_upper < 1
chromosome_report.loc[at_repeated_coords & upper_below_one]

In [None]:
# Export the rows at repeated coordinates that also have ic_upper < 1.
export_mask = chromosome_report.coordinates.isin(coords) & (chromosome_report.ic_upper < 1)
shared_low_ic_rows = chromosome_report.loc[export_mask]
shared_low_ic_rows.to_csv("overlap_chromosome_lower_analysis.csv", index=False)