# Calculate confidence interval

This notebook generates the confidence intervals for the plot in the [stable_gene_relationships.ipynb](stable_gene_relationships.ipynb) notebook. Because these confidence intervals are based on bootstrapping, they take a while to compute, so the calculation lives in its own notebook.

Existing CI calculations either assume normality or use bootstrapping. Because we need to normalize our results after resampling, we cannot use the out-of-the-box bootstrapping and compute the intervals ourselves instead. See:
https://stackoverflow.com/questions/46125182/is-seaborn-confidence-interval-computed-correctly

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
import os
import random
import scipy
import pandas as pd
import numpy as np
import textwrap
import seaborn as sns
import matplotlib.pyplot as plt
from statsmodels.stats.multitest import multipletests
from scripts import utils, paths, gene_relationships, annotations

# Seed both Python's and NumPy's global RNGs so that the bootstrap resampling
# run later in the notebook is repeatable whichever generator the helper code
# draws from.
random.seed(1)
np.random.seed(1)

examples.directory is deprecated; in the future, examples will be found relative to the 'datapath' directory.
  "found relative to the 'datapath' directory.".format(key))


In [2]:
# User params

# Bootstrap settings: number of resampling rounds and the confidence level
# whose two-sided quantiles are reported at the end of the notebook
n_boot = 50
ci = 0.95

# Settings forwarded unchanged to the expression-relationship helper
offset_to_bin = 10
sum_increment_to_use = 1

# If False, None is handed to the helper in place of the operon annotations
use_operon = True

# Output filename
pao1_figure_filename = "PAO1_stablility_expression_relationships_operon_corrected_spell.svg"
pa14_figure_filename = "PA14_stability_expression_relationships_operon_corrected_spell.svg"

### Import gene ids


In [3]:
# Import correlation matrix to get gene ids
# Both files are tab-separated with a header row and the row labels in column 0
_corr_read_opts = {"sep": "\t", "index_col": 0, "header": 0}

pao1_corr_filename = paths.PAO1_CORR_LOG_SPELL
pao1_corr = pd.read_csv(pao1_corr_filename, **_corr_read_opts)

pa14_corr_filename = paths.PA14_CORR_LOG_SPELL
pa14_corr = pd.read_csv(pa14_corr_filename, **_corr_read_opts)

In [4]:
# Make a dataframe with gene ids
# The frame has no columns: only its index (taken from the PAO1 correlation
# matrix) carries information, which is why the printed shape is (n_genes, 0).
pao1_membership = pd.DataFrame(data=[], index=pao1_corr.index)
print(pao1_membership.shape)
pao1_membership.head()

(5563, 0)


PA0001
PA0002
PA0003
PA0004
PA0005


In [5]:
# Same construction for PA14: an empty, column-less frame whose index is the
# gene id index of the PA14 correlation matrix.
pa14_membership = pd.DataFrame(data=[], index=pa14_corr.index)
print(pa14_membership.shape)
pa14_membership.head()

(5887, 0)


PA14_55610
PA14_55600
PA14_55590
PA14_55580
PA14_55570


### Import and format operon data

In [6]:
# Operon annotation file locations, as defined in the project's `paths` module
pao1_operon_filename = paths.PAO1_OPERON
pa14_operon_filename = paths.PA14_OPERON

In [7]:
# Load the operon annotations for each strain with the project helper.
# NOTE(review): the structure returned by `load_format_operons` is not visible
# here; in this notebook it is only forwarded to the bootstrap helper.
pao1_operon = annotations.load_format_operons(pao1_operon_filename)
pa14_operon = annotations.load_format_operons(pa14_operon_filename)

In [8]:
# Hand the operon annotations to the bootstrap helper only when `use_operon`
# is switched on; otherwise pass None.
pao1_operon_expression_to_use = pao1_operon if use_operon else None
pa14_operon_expression_to_use = pa14_operon if use_operon else None

### Map core/accessory labels to genes

In [9]:
# Paths to the expression compendia. Nothing is read in this cell; the paths
# are handed to `annotations.map_core_acc_annot` further down.
pao1_expression_filename = paths.PAO1_COMPENDIUM
pa14_expression_filename = paths.PA14_COMPENDIUM

In [10]:
# Paths to the per-strain gene annotation files, also handed to
# `annotations.map_core_acc_annot` further down.
pao1_annot_filename = paths.GENE_PAO1_ANNOT
pa14_annot_filename = paths.GENE_PA14_ANNOT

In [11]:
# Label every gene as core or accessory using the project helper.
# Of the six returned values, this notebook uses:
#   - pao1_arr / pa14_arr: passed on to the bootstrap helper
#   - pao1_core, pao1_acc, pa14_core, pa14_acc: only their lengths are used,
#     to compute the genome-wide core/accessory baseline proportions
# NOTE(review): the exact types of the returned objects are defined by
# `map_core_acc_annot` and are not visible here.
(
    pao1_arr,
    pa14_arr,
    pao1_core,
    pao1_acc,
    pa14_core,
    pa14_acc,
) = annotations.map_core_acc_annot(
    pao1_membership,
    pa14_membership,
    pao1_expression_filename,
    pa14_expression_filename,
    pao1_annot_filename,
    pa14_annot_filename,
)

Number of PAO1 core genes: 5366
Number of PA14 core genes: 5363
Number of PAO1 core genes in my dataset: 5361
Number of PA14 core genes in my dataset: 5357
Number of PAO1-specific genes: 202
Number of PA14-specific genes: 530


## Find relationships using expression distance

In [12]:
# Correlation matrix files
# NOTE(review): these are the same `paths` constants already assigned to the
# same names in the "Import gene ids" section, so this re-assignment is redundant.
pao1_corr_filename = paths.PAO1_CORR_LOG_SPELL
pa14_corr_filename = paths.PA14_CORR_LOG_SPELL

In [13]:
# Load correlation data
# NOTE(review): `pao1_corr` and `pa14_corr` were already read from these same
# files, with the same arguments, in the "Import gene ids" section. This second
# read looks redundant; consider reusing the frames loaded there.
pao1_corr = pd.read_csv(pao1_corr_filename, sep="\t", index_col=0, header=0)
pa14_corr = pd.read_csv(pa14_corr_filename, sep="\t", index_col=0, header=0)

In [14]:
# Load transcriptional similarity df
# These are the subset of genes that we will consider
_similarity_read_opts = {"sep": "\t", "header": 0, "index_col": 0}

pao1_similarity_scores_filename = (
    "../3_core_core_analysis/pao1_core_similarity_associations_final_spell.tsv"
)
pao1_similarity_scores = pd.read_csv(
    pao1_similarity_scores_filename, **_similarity_read_opts
)

pa14_similarity_scores_filename = (
    "../3_core_core_analysis/pa14_core_similarity_associations_final_spell.tsv"
)
pa14_similarity_scores = pd.read_csv(
    pa14_similarity_scores_filename, **_similarity_read_opts
)

In [15]:
# Get most and least stable core genes
def _genes_with_label(similarity_scores, label):
    """Return, as a list, the index values of rows whose "label" column equals `label`."""
    has_label = similarity_scores["label"] == label
    return list(similarity_scores[has_label].index)


pao1_most_stable_genes = _genes_with_label(pao1_similarity_scores, "most stable")
pao1_least_stable_genes = _genes_with_label(pao1_similarity_scores, "least stable")

pa14_most_stable_genes = _genes_with_label(pa14_similarity_scores, "most stable")
pa14_least_stable_genes = _genes_with_label(pa14_similarity_scores, "least stable")

In [16]:
%%time
# Bootstrap counts for the PAO1 "most stable" core genes.
# The recorded result has one row per (offset, gene type) and one `total_<i>`
# column per bootstrap round (total_0 .. total_49 for n_boot = 50).
# This is slow: the recorded run took about 1h17min of wall time.
expression_dist_counts_pao1_most_ci = (
    gene_relationships.get_CI_expression_relationships(
        n_boot,
        pao1_corr,
        pao1_most_stable_genes,
        pao1_arr,
        offset_to_bin,
        pao1_operon_expression_to_use,
        sum_increment_to_use,
    )
)

CPU times: user 1h 17min 8s, sys: 7.76 s, total: 1h 17min 16s
Wall time: 1h 17min 3s


In [17]:
%%time
# Bootstrap counts for the PAO1 "least stable" core genes (same call as for the
# most stable genes, with a different gene list).
# This is slow: the recorded run took about 1h17min of wall time.
expression_dist_counts_pao1_least_ci = (
    gene_relationships.get_CI_expression_relationships(
        n_boot,
        pao1_corr,
        pao1_least_stable_genes,
        pao1_arr,
        offset_to_bin,
        pao1_operon_expression_to_use,
        sum_increment_to_use,
    )
)

CPU times: user 1h 17min 50s, sys: 7.06 s, total: 1h 17min 57s
Wall time: 1h 17min 44s


In [18]:
%%time
# Bootstrap counts for the PA14 "most stable" core genes, using the PA14
# correlation matrix, annotations and operon setting.
# This is slow: the recorded run took about 1h22min of wall time.
expression_dist_counts_pa14_most_ci = (
    gene_relationships.get_CI_expression_relationships(
        n_boot,
        pa14_corr,
        pa14_most_stable_genes,
        pa14_arr,
        offset_to_bin,
        pa14_operon_expression_to_use,
        sum_increment_to_use,
    )
)

CPU times: user 1h 22min 44s, sys: 14.2 s, total: 1h 22min 58s
Wall time: 1h 22min 35s


In [19]:
%%time
# Bootstrap counts for the PA14 "least stable" core genes.
# This is slow: the recorded run took about 1h23min of wall time.
expression_dist_counts_pa14_least_ci = (
    gene_relationships.get_CI_expression_relationships(
        n_boot,
        pa14_corr,
        pa14_least_stable_genes,
        pa14_arr,
        offset_to_bin,
        pa14_operon_expression_to_use,
        sum_increment_to_use,
    )
)

CPU times: user 1h 23min 13s, sys: 13.6 s, total: 1h 23min 27s
Wall time: 1h 23min 3s


In [20]:
# Raw bootstrap counts (PAO1, most stable). Later cells divide this frame in
# place, so this output reflects the values before any normalization.
expression_dist_counts_pao1_most_ci

Unnamed: 0,offset,gene type,total_0,total_1,total_2,total_3,total_4,total_5,total_6,total_7,...,total_40,total_41,total_42,total_43,total_44,total_45,total_46,total_47,total_48,total_49
0,1,acc,2.0,1.0,0.0,2.0,2.0,0.0,0.0,1.0,...,0.0,0.0,0.0,2.0,0.0,2.0,1.0,0.0,0.0,0.0
1,2,acc,0.0,0.0,0.0,0.0,2.0,1.0,1.0,2.0,...,1.0,0.0,2.0,0.0,0.0,1.0,1.0,1.0,2.0,1.0
2,3,acc,1.0,3.0,2.0,1.0,0.0,0.0,1.0,2.0,...,0.0,0.0,1.0,2.0,1.0,0.0,0.0,0.0,0.0,2.0
3,4,acc,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,acc,4.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0
5,6,acc,0.0,0.0,0.0,4.0,1.0,1.0,2.0,1.0,...,1.0,0.0,2.0,0.0,1.0,3.0,0.0,0.0,1.0,2.0
6,7,acc,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,8,acc,1.0,1.0,0.0,1.0,0.0,1.0,2.0,2.0,...,1.0,0.0,1.0,2.0,1.0,0.0,2.0,0.0,1.0,1.0
8,9,acc,3.0,0.0,0.0,0.0,3.0,1.0,3.0,4.0,...,1.0,3.0,1.0,1.0,4.0,1.0,3.0,3.0,3.0,0.0
9,10,acc,2.0,2.0,5.0,1.0,1.0,0.0,0.0,3.0,...,0.0,0.0,2.0,2.0,1.0,0.0,2.0,2.0,4.0,1.0


In [21]:
# Raw bootstrap counts (PAO1, least stable). Later cells divide this frame in
# place, so this output reflects the values before any normalization.
expression_dist_counts_pao1_least_ci

Unnamed: 0,offset,gene type,total_0,total_1,total_2,total_3,total_4,total_5,total_6,total_7,...,total_40,total_41,total_42,total_43,total_44,total_45,total_46,total_47,total_48,total_49
0,1,acc,25.0,22.0,19.0,21.0,27.0,28.0,21.0,22.0,...,24.0,16.0,17.0,25.0,16.0,23.0,18.0,29.0,18.0,22.0
1,2,acc,17.0,24.0,15.0,18.0,27.0,18.0,15.0,20.0,...,19.0,22.0,21.0,24.0,26.0,23.0,21.0,25.0,15.0,11.0
2,3,acc,24.0,28.0,19.0,17.0,34.0,23.0,13.0,19.0,...,26.0,20.0,18.0,21.0,26.0,28.0,21.0,20.0,18.0,19.0
3,4,acc,33.0,32.0,25.0,18.0,39.0,27.0,22.0,18.0,...,27.0,22.0,23.0,30.0,33.0,34.0,24.0,31.0,27.0,25.0
4,5,acc,25.0,32.0,25.0,18.0,26.0,21.0,19.0,22.0,...,27.0,19.0,25.0,18.0,28.0,27.0,20.0,28.0,15.0,14.0
5,6,acc,30.0,39.0,21.0,27.0,35.0,30.0,23.0,22.0,...,26.0,21.0,26.0,20.0,36.0,31.0,20.0,28.0,19.0,20.0
6,7,acc,15.0,29.0,17.0,22.0,26.0,21.0,13.0,16.0,...,12.0,20.0,11.0,24.0,24.0,23.0,17.0,20.0,14.0,11.0
7,8,acc,25.0,34.0,22.0,19.0,34.0,23.0,19.0,20.0,...,26.0,25.0,26.0,29.0,37.0,31.0,24.0,22.0,17.0,19.0
8,9,acc,22.0,32.0,20.0,18.0,32.0,23.0,16.0,16.0,...,25.0,20.0,22.0,20.0,27.0,28.0,19.0,21.0,15.0,13.0
9,10,acc,17.0,25.0,17.0,17.0,24.0,19.0,17.0,11.0,...,20.0,23.0,20.0,14.0,26.0,22.0,20.0,22.0,12.0,10.0


## Calculate percentages
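Here we take the number of co-expressed core or accessory genes in each bootstrap sample and divide it by the number of most or least stable genes, which gives the proportion (a fraction between 0 and 1) of stable genes that are co-expressed with a core or accessory gene.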

In [22]:
# Get only columns with counts from bootstrapping
# (the ones whose name contains "total"); the PAO1 most-stable frame serves as
# the template and the resulting list is reused for all four frames
sampling_cols = list(
    filter(lambda name: "total" in name, expression_dist_counts_pao1_most_ci.columns)
)

In [23]:
# Row labels of the accessory ("acc") and core rows in each PAO1 frame
_pao1_most_gene_type = expression_dist_counts_pao1_most_ci["gene type"]
pao1_acc_most_ids = _pao1_most_gene_type[_pao1_most_gene_type == "acc"].index
pao1_core_most_ids = _pao1_most_gene_type[_pao1_most_gene_type == "core"].index

_pao1_least_gene_type = expression_dist_counts_pao1_least_ci["gene type"]
pao1_acc_least_ids = _pao1_least_gene_type[_pao1_least_gene_type == "acc"].index
pao1_core_least_ids = _pao1_least_gene_type[_pao1_least_gene_type == "core"].index

In [24]:
# Row labels of the accessory ("acc") and core rows in each PA14 frame
_pa14_most_gene_type = expression_dist_counts_pa14_most_ci["gene type"]
pa14_acc_most_ids = _pa14_most_gene_type[_pa14_most_gene_type == "acc"].index
pa14_core_most_ids = _pa14_most_gene_type[_pa14_most_gene_type == "core"].index

_pa14_least_gene_type = expression_dist_counts_pa14_least_ci["gene type"]
pa14_acc_least_ids = _pa14_least_gene_type[_pa14_least_gene_type == "acc"].index
pa14_core_least_ids = _pa14_least_gene_type[_pa14_least_gene_type == "core"].index

In [25]:
# Most stable PAO1
# Counts -> proportion of the most stable genes.
# NOTE: the division happens in place, so re-running this cell divides again.
_n_pao1_most_stable = len(pao1_most_stable_genes)
for _row_ids in (pao1_acc_most_ids, pao1_core_most_ids):
    expression_dist_counts_pao1_most_ci.loc[
        _row_ids, sampling_cols
    ] /= _n_pao1_most_stable

In [26]:
# Least stable PAO1
# Counts -> proportion of the least stable genes.
# NOTE: the division happens in place, so re-running this cell divides again.
_n_pao1_least_stable = len(pao1_least_stable_genes)
for _row_ids in (pao1_acc_least_ids, pao1_core_least_ids):
    expression_dist_counts_pao1_least_ci.loc[
        _row_ids, sampling_cols
    ] /= _n_pao1_least_stable

In [27]:
# Most stable PA14
# Counts -> proportion of the most stable genes.
# NOTE: the division happens in place, so re-running this cell divides again.
_n_pa14_most_stable = len(pa14_most_stable_genes)
for _row_ids in (pa14_acc_most_ids, pa14_core_most_ids):
    expression_dist_counts_pa14_most_ci.loc[
        _row_ids, sampling_cols
    ] /= _n_pa14_most_stable

In [28]:
# Least stable PA14
# Counts -> proportion of the least stable genes.
# NOTE: the division happens in place, so re-running this cell divides again.
_n_pa14_least_stable = len(pa14_least_stable_genes)
for _row_ids in (pa14_acc_least_ids, pa14_core_least_ids):
    expression_dist_counts_pa14_least_ci.loc[
        _row_ids, sampling_cols
    ] /= _n_pa14_least_stable

In [29]:
# PAO1 most stable: counts after division by the number of most stable genes
expression_dist_counts_pao1_most_ci

Unnamed: 0,offset,gene type,total_0,total_1,total_2,total_3,total_4,total_5,total_6,total_7,...,total_40,total_41,total_42,total_43,total_44,total_45,total_46,total_47,total_48,total_49
0,1,acc,0.007491,0.003745,0.0,0.007491,0.007491,0.0,0.0,0.003745,...,0.0,0.0,0.0,0.007491,0.0,0.007491,0.003745,0.0,0.0,0.0
1,2,acc,0.0,0.0,0.0,0.0,0.007491,0.003745,0.003745,0.007491,...,0.003745,0.0,0.007491,0.0,0.0,0.003745,0.003745,0.003745,0.007491,0.003745
2,3,acc,0.003745,0.011236,0.007491,0.003745,0.0,0.0,0.003745,0.007491,...,0.0,0.0,0.003745,0.007491,0.003745,0.0,0.0,0.0,0.0,0.007491
3,4,acc,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,acc,0.014981,0.0,0.003745,0.0,0.003745,0.003745,0.0,0.0,...,0.0,0.003745,0.0,0.0,0.003745,0.0,0.003745,0.003745,0.003745,0.003745
5,6,acc,0.0,0.0,0.0,0.014981,0.003745,0.003745,0.007491,0.003745,...,0.003745,0.0,0.007491,0.0,0.003745,0.011236,0.0,0.0,0.003745,0.007491
6,7,acc,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,8,acc,0.003745,0.003745,0.0,0.003745,0.0,0.003745,0.007491,0.007491,...,0.003745,0.0,0.003745,0.007491,0.003745,0.0,0.007491,0.0,0.003745,0.003745
8,9,acc,0.011236,0.0,0.0,0.0,0.011236,0.003745,0.011236,0.014981,...,0.003745,0.011236,0.003745,0.003745,0.014981,0.003745,0.011236,0.011236,0.011236,0.0
9,10,acc,0.007491,0.007491,0.018727,0.003745,0.003745,0.0,0.0,0.011236,...,0.0,0.0,0.007491,0.007491,0.003745,0.0,0.007491,0.007491,0.014981,0.003745


In [30]:
# PAO1 least stable: counts after division by the number of least stable genes
expression_dist_counts_pao1_least_ci

Unnamed: 0,offset,gene type,total_0,total_1,total_2,total_3,total_4,total_5,total_6,total_7,...,total_40,total_41,total_42,total_43,total_44,total_45,total_46,total_47,total_48,total_49
0,1,acc,0.093633,0.082397,0.071161,0.078652,0.101124,0.104869,0.078652,0.082397,...,0.089888,0.059925,0.06367,0.093633,0.059925,0.086142,0.067416,0.108614,0.067416,0.082397
1,2,acc,0.06367,0.089888,0.05618,0.067416,0.101124,0.067416,0.05618,0.074906,...,0.071161,0.082397,0.078652,0.089888,0.097378,0.086142,0.078652,0.093633,0.05618,0.041199
2,3,acc,0.089888,0.104869,0.071161,0.06367,0.127341,0.086142,0.048689,0.071161,...,0.097378,0.074906,0.067416,0.078652,0.097378,0.104869,0.078652,0.074906,0.067416,0.071161
3,4,acc,0.123596,0.11985,0.093633,0.067416,0.146067,0.101124,0.082397,0.067416,...,0.101124,0.082397,0.086142,0.11236,0.123596,0.127341,0.089888,0.116105,0.101124,0.093633
4,5,acc,0.093633,0.11985,0.093633,0.067416,0.097378,0.078652,0.071161,0.082397,...,0.101124,0.071161,0.093633,0.067416,0.104869,0.101124,0.074906,0.104869,0.05618,0.052434
5,6,acc,0.11236,0.146067,0.078652,0.101124,0.131086,0.11236,0.086142,0.082397,...,0.097378,0.078652,0.097378,0.074906,0.134831,0.116105,0.074906,0.104869,0.071161,0.074906
6,7,acc,0.05618,0.108614,0.06367,0.082397,0.097378,0.078652,0.048689,0.059925,...,0.044944,0.074906,0.041199,0.089888,0.089888,0.086142,0.06367,0.074906,0.052434,0.041199
7,8,acc,0.093633,0.127341,0.082397,0.071161,0.127341,0.086142,0.071161,0.074906,...,0.097378,0.093633,0.097378,0.108614,0.138577,0.116105,0.089888,0.082397,0.06367,0.071161
8,9,acc,0.082397,0.11985,0.074906,0.067416,0.11985,0.086142,0.059925,0.059925,...,0.093633,0.074906,0.082397,0.074906,0.101124,0.104869,0.071161,0.078652,0.05618,0.048689
9,10,acc,0.06367,0.093633,0.06367,0.06367,0.089888,0.071161,0.06367,0.041199,...,0.074906,0.086142,0.074906,0.052434,0.097378,0.082397,0.074906,0.082397,0.044944,0.037453


## Normalize by base percentage
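Here we normalize the proportion of co-expressed genes by the baseline proportion of accessory or core genes in the genome. After this step each value is a ratio of observed to baseline proportion, so a value of 1 means the observed proportion equals the genome-wide baseline.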

In [31]:
# Baseline/expected proportions for PAO1
# i.e. the share of accessory and of core genes among all PAO1 genes
_n_pao1_acc = len(pao1_acc)
_n_pao1_core = len(pao1_core)
pao1_total = _n_pao1_core + _n_pao1_acc

pao1_acc_expected = _n_pao1_acc / pao1_total
pao1_core_expected = _n_pao1_core / pao1_total

print("total pao1 genes", pao1_total)
print("pao1 acc baseline", pao1_acc_expected)
print("pao1 core baseline", pao1_core_expected)

total pao1 genes 5563
pao1 acc baseline 0.036311342800647135
pao1 core baseline 0.9636886571993528


In [32]:
# Baseline/expected proportions for PA14
# i.e. the share of accessory and of core genes among all PA14 genes
_n_pa14_acc = len(pa14_acc)
_n_pa14_core = len(pa14_core)
pa14_total = _n_pa14_core + _n_pa14_acc

pa14_acc_expected = _n_pa14_acc / pa14_total
pa14_core_expected = _n_pa14_core / pa14_total

print("total pa14 genes", pa14_total)
print("pa14 acc baseline", pa14_acc_expected)
print("pa14 core baseline", pa14_core_expected)

total pa14 genes 5887
pa14 acc baseline 0.09002887718702225
pa14 core baseline 0.9099711228129778


In [33]:
# Most stable PAO1
# Each row group is divided by its own genome-wide baseline proportion.
# NOTE: the division happens in place, so re-running this cell divides again.
for _row_ids, _baseline in (
    (pao1_acc_most_ids, pao1_acc_expected),
    (pao1_core_most_ids, pao1_core_expected),
):
    expression_dist_counts_pao1_most_ci.loc[_row_ids, sampling_cols] /= _baseline

In [34]:
# Least stable PAO1
# Each row group is divided by its own genome-wide baseline proportion.
# NOTE: the division happens in place, so re-running this cell divides again.
for _row_ids, _baseline in (
    (pao1_acc_least_ids, pao1_acc_expected),
    (pao1_core_least_ids, pao1_core_expected),
):
    expression_dist_counts_pao1_least_ci.loc[_row_ids, sampling_cols] /= _baseline

In [35]:
# Most stable PA14
# Each row group is divided by its own genome-wide baseline proportion.
# NOTE: the division happens in place, so re-running this cell divides again.
for _row_ids, _baseline in (
    (pa14_acc_most_ids, pa14_acc_expected),
    (pa14_core_most_ids, pa14_core_expected),
):
    expression_dist_counts_pa14_most_ci.loc[_row_ids, sampling_cols] /= _baseline

In [36]:
# Least stable PA14
# Each row group is divided by its own genome-wide baseline proportion.
# NOTE: the division happens in place, so re-running this cell divides again.
for _row_ids, _baseline in (
    (pa14_acc_least_ids, pa14_acc_expected),
    (pa14_core_least_ids, pa14_core_expected),
):
    expression_dist_counts_pa14_least_ci.loc[_row_ids, sampling_cols] /= _baseline

In [37]:
# PAO1 most stable: proportions after division by the baseline proportion
expression_dist_counts_pao1_most_ci

Unnamed: 0,offset,gene type,total_0,total_1,total_2,total_3,total_4,total_5,total_6,total_7,...,total_40,total_41,total_42,total_43,total_44,total_45,total_46,total_47,total_48,total_49
0,1,acc,0.206289,0.103145,0.0,0.206289,0.206289,0.0,0.0,0.103145,...,0.0,0.0,0.0,0.206289,0.0,0.206289,0.103145,0.0,0.0,0.0
1,2,acc,0.0,0.0,0.0,0.0,0.206289,0.103145,0.103145,0.206289,...,0.103145,0.0,0.206289,0.0,0.0,0.103145,0.103145,0.103145,0.206289,0.103145
2,3,acc,0.103145,0.309434,0.206289,0.103145,0.0,0.0,0.103145,0.206289,...,0.0,0.0,0.103145,0.206289,0.103145,0.0,0.0,0.0,0.0,0.206289
3,4,acc,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,acc,0.412578,0.0,0.103145,0.0,0.103145,0.103145,0.0,0.0,...,0.0,0.103145,0.0,0.0,0.103145,0.0,0.103145,0.103145,0.103145,0.103145
5,6,acc,0.0,0.0,0.0,0.412578,0.103145,0.103145,0.206289,0.103145,...,0.103145,0.0,0.206289,0.0,0.103145,0.309434,0.0,0.0,0.103145,0.206289
6,7,acc,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,8,acc,0.103145,0.103145,0.0,0.103145,0.0,0.103145,0.206289,0.206289,...,0.103145,0.0,0.103145,0.206289,0.103145,0.0,0.206289,0.0,0.103145,0.103145
8,9,acc,0.309434,0.0,0.0,0.0,0.309434,0.103145,0.309434,0.412578,...,0.103145,0.309434,0.103145,0.103145,0.412578,0.103145,0.309434,0.309434,0.309434,0.0
9,10,acc,0.206289,0.206289,0.515723,0.103145,0.103145,0.0,0.0,0.309434,...,0.0,0.0,0.206289,0.206289,0.103145,0.0,0.206289,0.206289,0.412578,0.103145


In [38]:
# PAO1 least stable: proportions after division by the baseline proportion
expression_dist_counts_pao1_least_ci

Unnamed: 0,offset,gene type,total_0,total_1,total_2,total_3,total_4,total_5,total_6,total_7,...,total_40,total_41,total_42,total_43,total_44,total_45,total_46,total_47,total_48,total_49
0,1,acc,2.578615,2.269181,1.959747,2.166036,2.784904,2.888048,2.166036,2.269181,...,2.47547,1.650313,1.753458,2.578615,1.650313,2.372325,1.856603,2.991193,1.856603,2.269181
1,2,acc,1.753458,2.47547,1.547169,1.856603,2.784904,1.856603,1.547169,2.062892,...,1.959747,2.269181,2.166036,2.47547,2.681759,2.372325,2.166036,2.578615,1.547169,1.13459
2,3,acc,2.47547,2.888048,1.959747,1.753458,3.506916,2.372325,1.34088,1.959747,...,2.681759,2.062892,1.856603,2.166036,2.681759,2.888048,2.166036,2.062892,1.856603,1.959747
3,4,acc,3.403771,3.300627,2.578615,1.856603,4.022639,2.784904,2.269181,1.856603,...,2.784904,2.269181,2.372325,3.094338,3.403771,3.506916,2.47547,3.197482,2.784904,2.578615
4,5,acc,2.578615,3.300627,2.578615,1.856603,2.681759,2.166036,1.959747,2.269181,...,2.784904,1.959747,2.578615,1.856603,2.888048,2.784904,2.062892,2.888048,1.547169,1.444024
5,6,acc,3.094338,4.022639,2.166036,2.784904,3.61006,3.094338,2.372325,2.269181,...,2.681759,2.166036,2.681759,2.062892,3.713205,3.197482,2.062892,2.888048,1.959747,2.062892
6,7,acc,1.547169,2.991193,1.753458,2.269181,2.681759,2.166036,1.34088,1.650313,...,1.237735,2.062892,1.13459,2.47547,2.47547,2.372325,1.753458,2.062892,1.444024,1.13459
7,8,acc,2.578615,3.506916,2.269181,1.959747,3.506916,2.372325,1.959747,2.062892,...,2.681759,2.578615,2.681759,2.991193,3.81635,3.197482,2.47547,2.269181,1.753458,1.959747
8,9,acc,2.269181,3.300627,2.062892,1.856603,3.300627,2.372325,1.650313,1.650313,...,2.578615,2.062892,2.269181,2.062892,2.784904,2.888048,1.959747,2.166036,1.547169,1.34088
9,10,acc,1.753458,2.578615,1.753458,1.753458,2.47547,1.959747,1.753458,1.13459,...,2.062892,2.372325,2.062892,1.444024,2.681759,2.269181,2.062892,2.269181,1.237735,1.031446


## Get quantiles

In [39]:
# Two-sided interval: the probability mass outside the interval (alpha) is
# split evenly between the lower and the upper tail
alpha = 1 - ci
lower, upper = alpha / 2, 1 - (alpha / 2)

In [40]:
# Row-wise bootstrap quantiles for PAO1.
# Only the bootstrap columns (`sampling_cols`) are passed to `quantile`, so the
# "offset" and "gene type" annotation columns can never enter the calculation
# and the call no longer relies on pandas' `numeric_only` default (False since
# pandas 2.0, where the string "gene type" column is no longer dropped silently).
# Result: one row per quantile (lower, upper), one column per input row label.
pao1_most_ci_ranges = expression_dist_counts_pao1_most_ci[sampling_cols].quantile(
    [lower, upper], axis=1
)
pao1_least_ci_ranges = expression_dist_counts_pao1_least_ci[sampling_cols].quantile(
    [lower, upper], axis=1
)

In [41]:
# Row-wise bootstrap quantiles for PA14.
# Only the bootstrap columns (`sampling_cols`) are passed to `quantile`, so the
# "offset" and "gene type" annotation columns can never enter the calculation
# and the call no longer relies on pandas' `numeric_only` default (False since
# pandas 2.0, where the string "gene type" column is no longer dropped silently).
# Result: one row per quantile (lower, upper), one column per input row label.
pa14_most_ci_ranges = expression_dist_counts_pa14_most_ci[sampling_cols].quantile(
    [lower, upper], axis=1
)
pa14_least_ci_ranges = expression_dist_counts_pa14_least_ci[sampling_cols].quantile(
    [lower, upper], axis=1
)

## Format

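Merge the quantile ranges back onto the starting dataframe (rows are matched by index), then drop the per-bootstrap columns so that only `offset`, `gene type`, `ymin` and `ymax` remain.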

In [42]:
# Attach the transposed quantiles (one row per original row label) by index,
# remove the bootstrap columns, then rename the four remaining columns by position
_pao1_ci_column_names = ["offset", "gene type", "ymin", "ymax"]

pao1_most_ci = expression_dist_counts_pao1_most_ci.merge(
    pao1_most_ci_ranges.T, left_index=True, right_index=True
)
pao1_most_ci = pao1_most_ci.drop(sampling_cols, axis=1)
pao1_most_ci.columns = list(_pao1_ci_column_names)

pao1_least_ci = expression_dist_counts_pao1_least_ci.merge(
    pao1_least_ci_ranges.T, left_index=True, right_index=True
)
pao1_least_ci = pao1_least_ci.drop(sampling_cols, axis=1)
pao1_least_ci.columns = list(_pao1_ci_column_names)

In [43]:
# Attach the transposed quantiles (one row per original row label) by index,
# remove the bootstrap columns, then rename the four remaining columns by position
_pa14_ci_column_names = ["offset", "gene type", "ymin", "ymax"]

pa14_most_ci = expression_dist_counts_pa14_most_ci.merge(
    pa14_most_ci_ranges.T, left_index=True, right_index=True
)
pa14_most_ci = pa14_most_ci.drop(sampling_cols, axis=1)
pa14_most_ci.columns = list(_pa14_ci_column_names)

pa14_least_ci = expression_dist_counts_pa14_least_ci.merge(
    pa14_least_ci_ranges.T, left_index=True, right_index=True
)
pa14_least_ci = pa14_least_ci.drop(sampling_cols, axis=1)
pa14_least_ci.columns = list(_pa14_ci_column_names)

In [44]:
# Preview the PAO1 most-stable interval bounds (ymin / ymax per row)
pao1_most_ci.head()

Unnamed: 0,offset,gene type,ymin,ymax
0,1,acc,0.0,0.389371
1,2,acc,0.0,0.286226
2,3,acc,0.0,0.309434
3,4,acc,0.0,0.0
4,5,acc,0.0,0.412578


In [45]:
# Preview the PA14 most-stable interval bounds (ymin / ymax per row)
pa14_most_ci.head()

Unnamed: 0,offset,gene type,ymin,ymax
0,1,acc,0.0,0.115444
1,2,acc,0.041601,0.281849
2,3,acc,0.041601,0.32345
3,4,acc,0.0,0.249608
4,5,acc,0.092563,0.471135


In [46]:
# Save
# Each interval table is written as a tab-separated file in the working directory
for _out_path, _ci_table in (
    ("pao1_most_ci.tsv", pao1_most_ci),
    ("pao1_least_ci.tsv", pao1_least_ci),
    ("pa14_most_ci.tsv", pa14_most_ci),
    ("pa14_least_ci.tsv", pa14_least_ci),
):
    _ci_table.to_csv(_out_path, sep="\t")