# Explore highly co-expressed genes
In the previous [notebook](2_explore_data.ipynb) we observed that using 39 samples with 201 PAO1-specific genes, that the correlation of accessory-accessory genes is higher compared to the correlation of core-core and core-accessory genes.

Based on this finding, we want to know: *What can explain this difference in correlation distribution?*

This notebook performs a follow-up analysis. In particular this notebook performs a deeper examination of the correlation structure per group (core-core, core-accessory, accessory-accessory) by looking at the trends of the nearest neighbors (i.e. highly correlated genes) of each gene.

In [1]:
import pandas as pd
import os
import pickle
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

np.random.seed(123)

examples.directory is deprecated; in the future, examples will be found relative to the 'datapath' directory.
  "found relative to the 'datapath' directory.".format(key))


In [2]:
# Input
base_dir = os.path.abspath(os.path.join(os.getcwd(),"../"))

base_intermediate_dir = os.path.join(
    base_dir,
    "pilot_experiment",
    "data",
    "tmp")

core_gene_ids_file = os.path.join(
    base_intermediate_dir,
    "core_gene_ids.pickle")

acc_gene_ids_file = os.path.join(
    base_intermediate_dir,
    "acc_gene_ids.pickle")

real_all_corr_file = os.path.join(
    base_intermediate_dir,
    "real_all_corr.pickle")

shuffled_all_corr_file = os.path.join(
    base_intermediate_dir,
    "shuffled_all_corr.pickle")

# Import Pseudomonas operon annotations from ADAGE repo
# Original source of data is from DOOR
# https://github.com/greenelab/adage/blob/master/Genome_organization/operon_3.txt
# Operons containing at least 3 genes
operon_file = os.path.join(
    base_dir,
    "pilot_experiment",
    "data",
    "annotations",
    "DOOR_operon_3.txt")

In [3]:
# Read correlation data
core_gene_ids = pickle.load(open(core_gene_ids_file, "rb"))
acc_gene_ids = pickle.load(open(acc_gene_ids_file, "rb"))
real_all_corr = pickle.load(open(real_all_corr_file, "rb"))
shuffled_all_corr = pickle.load(open(shuffled_all_corr_file, "rb"))

In [4]:
# Read operon data
# Manually had to set names to be the max size operon
operon_data = pd.read_csv(
    operon_file,
    header=None,
    sep='\t',
    names=range(15)
)

num_operons = operon_data.shape[0]
operon_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,PA0001,PA0002,PA0003,PA0004,,,,,,,,,,,
1,PA0016,PA0017,PA0018,,,,,,,,,,,,
2,PA0026,PA0027,PA0028,,,,,,,,,,,,
3,PA0029,PA0030,PA0031,,,,,,,,,,,,
4,PA0054,PA0055,PA0056,,,,,,,,,,,,


In [5]:
# Get all gene ids
all_gene_ids = list(real_all_corr.index)

# Examine highly co-expressed gene clusters
For each core gene we will:
1. Extract the number of genes that are highly co-expressed with it
2. Determine the ratio of co-expressed genes that are core vs accessory

Repeat this for each accessory gene

In [6]:
# Define threshold for highly co-expressed genes
threshold = 0.75

In [7]:
# Apply threshold to identify which genes are co-expressed
real_all_coexpressed = real_all_corr>threshold
shuffled_all_coexpressed = shuffled_all_corr>threshold

## Get co-expressed genes using real data

In [8]:
real_all_coexpressed.head(10)

Unnamed: 0,PA0001,PA0002,PA0003,PA0004,PA0005,PA0006,PA0007,PA0008,PA0009,PA0010,...,PA5561,PA5562,PA5563,PA5564,PA5565,PA5566,PA5567,PA5568,PA5569,PA5570
PA0001,True,False,False,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
PA0002,False,True,False,True,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
PA0003,False,False,True,True,False,False,False,False,True,False,...,False,False,False,False,False,False,False,False,True,False
PA0004,False,True,True,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
PA0005,True,False,False,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
PA0006,False,True,False,False,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
PA0007,False,False,False,False,False,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
PA0008,False,False,False,False,False,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
PA0009,False,False,True,False,False,False,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False
PA0010,False,False,False,False,False,False,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False


In [9]:
# Get upper triangle of correlation matrix
real_all_coexpressed_triu = pd.DataFrame(data=np.triu(real_all_coexpressed,1),
                                         index=real_all_coexpressed.index,
                                         columns=real_all_coexpressed.columns)

real_all_coexpressed_triu.head(10)

Unnamed: 0,PA0001,PA0002,PA0003,PA0004,PA0005,PA0006,PA0007,PA0008,PA0009,PA0010,...,PA5561,PA5562,PA5563,PA5564,PA5565,PA5566,PA5567,PA5568,PA5569,PA5570
PA0001,False,False,False,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
PA0002,False,False,False,True,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
PA0003,False,False,False,True,False,False,False,False,True,False,...,False,False,False,False,False,False,False,False,True,False
PA0004,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
PA0005,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
PA0006,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
PA0007,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
PA0008,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
PA0009,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
PA0010,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


## Get number of co-expressed genes

In [10]:
# Get total number of genes that are co-expressed per gene
num_coexpressed_genes = real_all_coexpressed_triu.sum(axis=1)

num_coexpressed_genes.head()

PA0001      8
PA0002    167
PA0003     85
PA0004    145
PA0005      0
dtype: int64

## Get list of co-expressed genes per gene id
## Get number of core and accessory co-expressed genes in list

In [11]:
# Given the list of co-expressed genes
# we want to differentiate between those that are core and those that are accessory

name_coexpressed_genes = {}
num_coexpressed_core_genes = {}
num_coexpressed_acc_genes = {}

all_gene_ids = list(real_all_coexpressed_triu.index)

for gene_id in all_gene_ids:
    # Get row of correlation matrix
    # The values in the row corresponds to if there exists a gene is co-expressed with the gene_id
    coexpressed_gene_values = real_all_coexpressed_triu.loc[gene_id]
    
    # Check that our calculations are consistent
    assert(num_coexpressed_genes[gene_id] == sum(coexpressed_gene_values))
    
    if num_coexpressed_genes[gene_id] > 0:
        # Get list of co-expressed genes
        lst_coexpressed_genes = list(coexpressed_gene_values[coexpressed_gene_values].index)
        name_coexpressed_genes[gene_id] = lst_coexpressed_genes
        
        # Get the number of co-expressed genes that are core, accessory
        num_core_genes = len(set(lst_coexpressed_genes).intersection(core_gene_ids))
        num_acc_genes = len(set(lst_coexpressed_genes).intersection(acc_gene_ids))
        num_coexpressed_core_genes[gene_id] = num_core_genes
        num_coexpressed_acc_genes[gene_id] = num_acc_genes
        
    else:
        name_coexpressed_genes[gene_id] = []
        num_coexpressed_core_genes[gene_id] = 0
        num_coexpressed_acc_genes[gene_id] = 0

In [12]:
# Calculate ratio of core:accessory genes in co-expressed gene sets
coexpressed_core_prop = {}
coexpressed_acc_prop = {}

for gene_id in all_gene_ids:
    num_core_genes = num_coexpressed_core_genes[gene_id]
    num_acc_genes = num_coexpressed_acc_genes[gene_id]
    if (num_core_genes == 0 & num_acc_genes == 0):
        coexpressed_core_prop[gene_id] = 0
        coexpressed_acc_prop[gene_id] = 0
    else:
        coexpressed_core_prop[gene_id] = num_core_genes/(num_core_genes + num_acc_genes)
        coexpressed_acc_prop[gene_id] = num_acc_genes/(num_core_genes + num_acc_genes)

In [13]:
# Spot check that counts for what genes are core and accessroy are correct
#for i in ['PA0648', 'PA0980', 'PA1510', 'PA2037', 'PA2756', 'PA2794', 'PA3223', 'PA3793', 'PA4295', 'PA4451', 'PA4940', 'PA5086', 'PA5087']:
#    print(i in acc_gene_ids)

### Examine co-operonic co-expressed genes

In [14]:
# Associate operons with reference gene
name_cooperonic_genes = {}

for gene_id in all_gene_ids:
    # Get operons containing reference gene_id   
    # Search for gene_id in each operon
    operon_search = operon_data.where(operon_data == gene_id).dropna(how='all').dropna(axis=1)
    
    if operon_search.empty:
        name_cooperonic_genes[gene_id] = []
    else:
        row_id = operon_search.index[0]
        name_cooperonic_genes[gene_id] = list(operon_data.loc[row_id].dropna())
        
name_cooperonic_genes    

{'PA0001': ['PA0001', 'PA0002', 'PA0003', 'PA0004'],
 'PA0002': ['PA0001', 'PA0002', 'PA0003', 'PA0004'],
 'PA0003': ['PA0001', 'PA0002', 'PA0003', 'PA0004'],
 'PA0004': ['PA0001', 'PA0002', 'PA0003', 'PA0004'],
 'PA0005': [],
 'PA0006': [],
 'PA0007': [],
 'PA0008': [],
 'PA0009': [],
 'PA0010': [],
 'PA0011': [],
 'PA0012': [],
 'PA0013': [],
 'PA0014': [],
 'PA0015': [],
 'PA0016': ['PA0016', 'PA0017', 'PA0018'],
 'PA0017': ['PA0016', 'PA0017', 'PA0018'],
 'PA0018': ['PA0016', 'PA0017', 'PA0018'],
 'PA0019': [],
 'PA0020': [],
 'PA0021': [],
 'PA0022': [],
 'PA0023': [],
 'PA0024': [],
 'PA0025': [],
 'PA0026': ['PA0026', 'PA0027', 'PA0028'],
 'PA0027': ['PA0026', 'PA0027', 'PA0028'],
 'PA0028': ['PA0026', 'PA0027', 'PA0028'],
 'PA0029': ['PA0029', 'PA0030', 'PA0031'],
 'PA0030': ['PA0029', 'PA0030', 'PA0031'],
 'PA0031': ['PA0029', 'PA0030', 'PA0031'],
 'PA0032': [],
 'PA0033': [],
 'PA0034': [],
 'PA0035': [],
 'PA0036': [],
 'PA0037': [],
 'PA0038': [],
 'PA0039': [],
 'PA0040': 

In [15]:
# Compare co-expressed gene set and co-operonic genes per reference gene id
num_non_cooperonic_coexpressed_genes = {}
num_non_cooperonic_coexpressed_core_genes = {}
num_non_cooperonic_coexpressed_acc_genes = {}

for gene_id in all_gene_ids:
    # Get co-operonic gene list
    cooperonic_genes = name_cooperonic_genes[gene_id]
    
    # Get co-expressed gene list
    coexpressed_genes = name_coexpressed_genes[gene_id]
    
    # Find non co-operonic genes
    # Find genes that DO NOT intersect between co-operonic genes and co-expressed genes
    cooperonic_coexpressed_genes = set(coexpressed_genes).intersection(cooperonic_genes)
    
    non_cooperonic_coexpressed_genes = set(coexpressed_genes) - cooperonic_coexpressed_genes
    
    # Get number of non-co-operonic genes
    num_non_cooperonic_coexpressed_genes[gene_id] = len(non_cooperonic_coexpressed_genes)
    
    if num_non_cooperonic_coexpressed_genes[gene_id] > 0:        
        # Get the number of non co-operonic co-expressed genes that are core, accessory
        num_core_genes = len(non_cooperonic_coexpressed_genes.intersection(core_gene_ids))
        num_acc_genes = len(non_cooperonic_coexpressed_genes.intersection(acc_gene_ids))
        num_non_cooperonic_coexpressed_core_genes[gene_id] = num_core_genes
        num_non_cooperonic_coexpressed_acc_genes[gene_id] = num_acc_genes
        
    else:
        num_non_cooperonic_coexpressed_core_genes[gene_id] = 0
        num_non_cooperonic_coexpressed_acc_genes[gene_id] = 0

In [16]:
# Calculate ratio of core:accessory genes in co-expressed gene sets
non_cooperonic_coexpressed_core_prop = {}
non_cooperonic_coexpressed_acc_prop = {}

for gene_id in all_gene_ids:
    num_core_genes = num_non_cooperonic_coexpressed_core_genes[gene_id]
    num_acc_genes = num_non_cooperonic_coexpressed_acc_genes[gene_id]
    if (num_core_genes == 0 & num_acc_genes == 0):
        non_cooperonic_coexpressed_core_prop[gene_id] = 0
        non_cooperonic_coexpressed_acc_prop[gene_id] = 0
    else:
        non_cooperonic_coexpressed_core_prop[gene_id] = num_core_genes/(num_core_genes + num_acc_genes)
        non_cooperonic_coexpressed_acc_prop[gene_id] = num_acc_genes/(num_core_genes + num_acc_genes)

# Summary statistics

In [17]:
# Core gene stats
core_stats_df = pd.DataFrame(data={'ref_gene':core_gene_ids,
                                  'num_coexpressed_genes':num_coexpressed_genes[core_gene_ids],
                                   'num_coexpressed_core': [num_coexpressed_core_genes[k] for k in core_gene_ids],
                                   'num_coexpressed_acc': [num_coexpressed_acc_genes[k] for k in core_gene_ids],
                                   'percent_coexpressed_core': [coexpressed_core_prop[k] for k in core_gene_ids],
                                   'percent_coexpressed_acc': [coexpressed_acc_prop[k] for k in core_gene_ids],
                                   'num_non_cooperonic_coexpressed_genes':[num_non_cooperonic_coexpressed_genes[k] 
                                                                           for k in core_gene_ids],
                                   'num_non_cooperonic_coexpressed_core': [num_non_cooperonic_coexpressed_core_genes[k] 
                                                                           for k in core_gene_ids],
                                   'num_non_cooperonic_coexpressed_acc': [num_non_cooperonic_coexpressed_acc_genes[k] 
                                                                          for k in core_gene_ids],
                                   'percent_non_cooperonic_coexpressed_core': [non_cooperonic_coexpressed_core_prop[k] 
                                                                               for k in core_gene_ids],
                                   'percent_non_cooperonic_coexpressed_acc': [non_cooperonic_coexpressed_acc_prop[k] 
                                                                              for k in core_gene_ids]
                                  }
                            )
core_stats_df.head()

Unnamed: 0,ref_gene,num_coexpressed_genes,num_coexpressed_core,num_coexpressed_acc,percent_coexpressed_core,percent_coexpressed_acc,num_non_cooperonic_coexpressed_genes,num_non_cooperonic_coexpressed_core,num_non_cooperonic_coexpressed_acc,percent_non_cooperonic_coexpressed_core,percent_non_cooperonic_coexpressed_acc
PA0001,PA0001,8,8,0,1.0,0.0,8,8,0,1.0,0.0
PA0002,PA0002,167,166,1,0.994012,0.005988,166,165,1,0.993976,0.006024
PA0003,PA0003,85,85,0,1.0,0.0,84,84,0,1.0,0.0
PA0004,PA0004,145,145,0,1.0,0.0,145,145,0,1.0,0.0
PA0005,PA0005,0,0,0,0.0,0.0,0,0,0,0.0,0.0


In [18]:
# Accessory gene stats
acc_stats_df = pd.DataFrame(data={'ref_gene':acc_gene_ids,
                                  'num_coexpressed_genes':num_coexpressed_genes[acc_gene_ids],
                                  'num_coexpressed_core': [num_coexpressed_core_genes[a] for a in acc_gene_ids],
                                  'num_coexpressed_acc': [num_coexpressed_acc_genes[a] for a in acc_gene_ids],
                                  'percent_coexpressed_core': [coexpressed_core_prop[a] for a in acc_gene_ids],
                                  'percent_coexpressed_acc': [coexpressed_acc_prop[a] for a in acc_gene_ids],
                                  'num_non_cooperonic_coexpressed_genes':[num_non_cooperonic_coexpressed_genes[a] 
                                                                          for a in acc_gene_ids],
                                  'num_non_cooperonic_coexpressed_core': [num_non_cooperonic_coexpressed_core_genes[a] 
                                                                          for a in acc_gene_ids],
                                  'num_non_cooperonic_coexpressed_acc': [num_non_cooperonic_coexpressed_acc_genes[a] 
                                                                         for a in acc_gene_ids],
                                  'percent_non_cooperonic_coexpressed_core': [non_cooperonic_coexpressed_core_prop[a] 
                                                                               for a in acc_gene_ids],
                                  'percent_non_cooperonic_coexpressed_acc': [non_cooperonic_coexpressed_acc_prop[a] 
                                                                              for a in acc_gene_ids]
                                  }
                            )
acc_stats_df.head()

Unnamed: 0,ref_gene,num_coexpressed_genes,num_coexpressed_core,num_coexpressed_acc,percent_coexpressed_core,percent_coexpressed_acc,num_non_cooperonic_coexpressed_genes,num_non_cooperonic_coexpressed_core,num_non_cooperonic_coexpressed_acc,percent_non_cooperonic_coexpressed_core,percent_non_cooperonic_coexpressed_acc
PA0053,PA0053,195,145,50,0.74359,0.25641,195,145,50,0.74359,0.25641
PA0095,PA0095,8,7,1,0.875,0.125,8,7,1,0.875,0.125
PA0100,PA0100,103,59,44,0.572816,0.427184,102,58,44,0.568627,0.431373
PA0135,PA0135,14,6,8,0.428571,0.571429,14,6,8,0.428571,0.571429
PA0187,PA0187,1,0,1,0.0,0.0,1,0,1,0.0,0.0


### Core gene statistics

In [19]:
# Print statistics about core genes
print('For a given CORE gene and using a threshold of {} to define co-expression: \n'.
     format(threshold))
print('- There is a median of {} co-expressed  genes'.
      format(np.median([num_coexpressed_genes[k] for k in core_gene_ids])))
print('- Of the co-expressed genes, the median percent of core genes is {}% and accessory genes is {}%'.
      format(np.median([coexpressed_core_prop[k] for k in core_gene_ids])*100,
             np.median([coexpressed_acc_prop[k] for k in core_gene_ids])*100))

For a given CORE gene and using a threshold of 0.75 to define co-expression: 

- There is a median of 18.0 co-expressed  genes
- Of the co-expressed genes, the median percent of core genes is 99.28057553956835% and accessory genes is 0.0%


### Accessory gene statistics

In [20]:
# Print statistics about core genes
print('For a given ACCESSORY gene and using a threshold of {} to define co-expression: \n'.
     format(threshold))
print('- There is a median of {} co-expressed  genes'.
      format(np.median([num_coexpressed_genes[a] for a in acc_gene_ids])))
print('- Of the co-expressed genes, the median percent of core genes is {}% and accessory genes is {}%'.
      format(np.median([coexpressed_core_prop[a] for a in acc_gene_ids])*100,
             np.median([coexpressed_acc_prop[a] for a in acc_gene_ids])*100))

For a given ACCESSORY gene and using a threshold of 0.75 to define co-expression: 

- There is a median of 22.0 co-expressed  genes
- Of the co-expressed genes, the median percent of core genes is 77.89473684210526% and accessory genes is 15.384615384615385%


## Using shuffled data

In [21]:
# Apply threshold to identify which genes are co-expressed
shuffled_all_coexpressed = shuffled_all_corr>threshold

In [22]:
# Get upper triangle of correlation matrix
shuffled_all_coexpressed_triu = pd.DataFrame(data=np.triu(shuffled_all_coexpressed,1),
                                         index=shuffled_all_coexpressed.index,
                                         columns=shuffled_all_coexpressed.columns)

shuffled_all_coexpressed_triu.head(10)

Unnamed: 0,PA0001,PA0002,PA0003,PA0004,PA0005,PA0006,PA0007,PA0008,PA0009,PA0010,...,PA5561,PA5562,PA5563,PA5564,PA5565,PA5566,PA5567,PA5568,PA5569,PA5570
PA0001,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
PA0002,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
PA0003,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
PA0004,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
PA0005,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
PA0006,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
PA0007,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
PA0008,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
PA0009,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
PA0010,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [23]:
# Get total number of genes that are co-expressed per gene
num_coexpressed_genes = shuffled_all_coexpressed_triu.sum(axis=1)

In [24]:
# Given the list of co-expressed genes
# we want to differentiate between those that are core and those that are accessory

name_coexpressed_genes = {}
num_coexpressed_core_genes = {}
num_coexpressed_acc_genes = {}

all_gene_ids = list(real_all_coexpressed_triu.index)

for gene_id in all_gene_ids:
    # Get row of correlation matrix
    # The values in the row corresponds to if there exists a gene is co-expressed with the gene_id
    coexpressed_gene_values = shuffled_all_coexpressed_triu.loc[gene_id]
    
    # Check that our calculations are consistent
    assert(num_coexpressed_genes[gene_id] == sum(coexpressed_gene_values))
    
    if num_coexpressed_genes[gene_id] > 0:
        # Get list of co-expressed genes
        lst_coexpressed_genes = list(coexpressed_gene_values[coexpressed_gene_values].index)
        name_coexpressed_genes[gene_id] = lst_coexpressed_genes
        
        # Get the number of co-expressed genes that are core, accessory
        num_core_genes = len(set(lst_coexpressed_genes).intersection(core_gene_ids))
        num_acc_genes = len(set(lst_coexpressed_genes).intersection(acc_gene_ids))
        num_coexpressed_core_genes[gene_id] = num_core_genes
        num_coexpressed_acc_genes[gene_id] = num_acc_genes
        
    else:
        name_coexpressed_genes[gene_id] = []
        num_coexpressed_core_genes[gene_id] = 0
        num_coexpressed_acc_genes[gene_id] = 0

In [25]:
# Calculate ratio of core:accessory genes in co-expressed gene sets
coexpressed_core_prop = {}
coexpressed_acc_prop = {}

for gene_id in all_gene_ids:
    num_core_genes = num_coexpressed_core_genes[gene_id]
    num_acc_genes = num_coexpressed_acc_genes[gene_id]
    if (num_core_genes == 0 & num_acc_genes == 0):
        coexpressed_core_prop[gene_id] = 0
        coexpressed_acc_prop[gene_id] = 0
    else:
        coexpressed_core_prop[gene_id] = num_core_genes/(num_core_genes + num_acc_genes)
        coexpressed_acc_prop[gene_id] = num_acc_genes/(num_core_genes + num_acc_genes)

In [27]:
# Compare co-expressed gene set and co-operonic genes per reference gene id
num_non_cooperonic_coexpressed_genes = {}
num_non_cooperonic_coexpressed_core_genes = {}
num_non_cooperonic_coexpressed_acc_genes = {}

for gene_id in all_gene_ids:
    # Get co-operonic gene list
    cooperonic_genes = name_cooperonic_genes[gene_id]
    
    # Get co-expressed gene list
    coexpressed_genes = name_coexpressed_genes[gene_id]
    
    # Find non co-operonic genes
    # Find genes that DO NOT intersect between co-operonic genes and co-expressed genes
    cooperonic_coexpressed_genes = set(coexpressed_genes).intersection(cooperonic_genes)
    
    non_cooperonic_coexpressed_genes = set(coexpressed_genes) - cooperonic_coexpressed_genes
    
    # Get number of non-co-operonic genes
    num_non_cooperonic_coexpressed_genes[gene_id] = len(non_cooperonic_coexpressed_genes)
    
    if num_non_cooperonic_coexpressed_genes[gene_id] > 0:        
        # Get the number of non co-operonic co-expressed genes that are core, accessory
        num_core_genes = len(non_cooperonic_coexpressed_genes.intersection(core_gene_ids))
        num_acc_genes = len(non_cooperonic_coexpressed_genes.intersection(acc_gene_ids))
        num_non_cooperonic_coexpressed_core_genes[gene_id] = num_core_genes
        num_non_cooperonic_coexpressed_acc_genes[gene_id] = num_acc_genes
        
    else:
        num_non_cooperonic_coexpressed_core_genes[gene_id] = 0
        num_non_cooperonic_coexpressed_acc_genes[gene_id] = 0

In [28]:
# Calculate ratio of core:accessory genes in co-expressed gene sets
non_cooperonic_coexpressed_core_prop = {}
non_cooperonic_coexpressed_acc_prop = {}

for gene_id in all_gene_ids:
    num_core_genes = num_non_cooperonic_coexpressed_core_genes[gene_id]
    num_acc_genes = num_non_cooperonic_coexpressed_acc_genes[gene_id]
    if (num_core_genes == 0 & num_acc_genes == 0):
        non_cooperonic_coexpressed_core_prop[gene_id] = 0
        non_cooperonic_coexpressed_acc_prop[gene_id] = 0
    else:
        non_cooperonic_coexpressed_core_prop[gene_id] = num_core_genes/(num_core_genes + num_acc_genes)
        non_cooperonic_coexpressed_acc_prop[gene_id] = num_acc_genes/(num_core_genes + num_acc_genes)

In [29]:
# Core gene stats
core_stats_df = pd.DataFrame(data={'ref_gene':core_gene_ids,
                                  'num_coexpressed_genes':num_coexpressed_genes[core_gene_ids],
                                   'num_coexpressed_core': [num_coexpressed_core_genes[k] for k in core_gene_ids],
                                   'num_coexpressed_acc': [num_coexpressed_acc_genes[k] for k in core_gene_ids],
                                   'percent_coexpressed_core': [coexpressed_core_prop[k] for k in core_gene_ids],
                                   'percent_coexpressed_acc': [coexpressed_acc_prop[k] for k in core_gene_ids],
                                   'num_non_cooperonic_coexpressed_genes':[num_non_cooperonic_coexpressed_genes[k] 
                                                                           for k in core_gene_ids],
                                   'num_non_cooperonic_coexpressed_core': [num_non_cooperonic_coexpressed_core_genes[k] 
                                                                           for k in core_gene_ids],
                                   'num_non_cooperonic_coexpressed_acc': [num_non_cooperonic_coexpressed_acc_genes[k] 
                                                                          for k in core_gene_ids],
                                   'percent_non_cooperonic_coexpressed_core': [non_cooperonic_coexpressed_core_prop[k] 
                                                                               for k in core_gene_ids],
                                   'percent_non_cooperonic_coexpressed_acc': [non_cooperonic_coexpressed_acc_prop[k] 
                                                                              for k in core_gene_ids]
                                  }
                            )
core_stats_df.head()

Unnamed: 0,ref_gene,num_coexpressed_genes,num_coexpressed_core,num_coexpressed_acc,percent_coexpressed_core,percent_coexpressed_acc,num_non_cooperonic_coexpressed_genes,num_non_cooperonic_coexpressed_core,num_non_cooperonic_coexpressed_acc,percent_non_cooperonic_coexpressed_core,percent_non_cooperonic_coexpressed_acc
PA0001,PA0001,0,0,0,0,0,0,0,0,0,0
PA0002,PA0002,0,0,0,0,0,0,0,0,0,0
PA0003,PA0003,0,0,0,0,0,0,0,0,0,0
PA0004,PA0004,0,0,0,0,0,0,0,0,0,0
PA0005,PA0005,0,0,0,0,0,0,0,0,0,0


In [30]:
# Accessory gene stats
acc_stats_df = pd.DataFrame(data={'ref_gene':acc_gene_ids,
                                  'num_coexpressed_genes':num_coexpressed_genes[acc_gene_ids],
                                  'num_coexpressed_core': [num_coexpressed_core_genes[a] for a in acc_gene_ids],
                                  'num_coexpressed_acc': [num_coexpressed_acc_genes[a] for a in acc_gene_ids],
                                  'percent_coexpressed_core': [coexpressed_core_prop[a] for a in acc_gene_ids],
                                  'percent_coexpressed_acc': [coexpressed_acc_prop[a] for a in acc_gene_ids],
                                  'num_non_cooperonic_coexpressed_genes':[num_non_cooperonic_coexpressed_genes[a] 
                                                                          for a in acc_gene_ids],
                                  'num_non_cooperonic_coexpressed_core': [num_non_cooperonic_coexpressed_core_genes[a] 
                                                                          for a in acc_gene_ids],
                                  'num_non_cooperonic_coexpressed_acc': [num_non_cooperonic_coexpressed_acc_genes[a] 
                                                                         for a in acc_gene_ids],
                                  'percent_non_cooperonic_coexpressed_core': [non_cooperonic_coexpressed_core_prop[a] 
                                                                               for a in acc_gene_ids],
                                  'percent_non_cooperonic_coexpressed_acc': [non_cooperonic_coexpressed_acc_prop[a] 
                                                                              for a in acc_gene_ids]
                                  }
                            )
acc_stats_df.head()

Unnamed: 0,ref_gene,num_coexpressed_genes,num_coexpressed_core,num_coexpressed_acc,percent_coexpressed_core,percent_coexpressed_acc,num_non_cooperonic_coexpressed_genes,num_non_cooperonic_coexpressed_core,num_non_cooperonic_coexpressed_acc,percent_non_cooperonic_coexpressed_core,percent_non_cooperonic_coexpressed_acc
PA0053,PA0053,0,0,0,0,0,0,0,0,0,0
PA0095,PA0095,0,0,0,0,0,0,0,0,0,0
PA0100,PA0100,0,0,0,0,0,0,0,0,0,0
PA0135,PA0135,0,0,0,0,0,0,0,0,0,0
PA0187,PA0187,0,0,0,0,0,0,0,0,0,0


In [31]:
# Print statistics about core genes
print('For a given CORE gene and using a threshold of {} to define co-expression: \n'.
     format(threshold))
print('- There is a median of {} co-expressed  genes'.
      format(np.median([num_coexpressed_genes[k] for k in core_gene_ids])))
print('- Of the co-expressed genes, the median percent of core genes is {}% and accessory genes is {}%'.
      format(np.median([coexpressed_core_prop[k] for k in core_gene_ids])*100,
             np.median([coexpressed_acc_prop[k] for k in core_gene_ids])*100))

For a given CORE gene and using a threshold of 0.75 to define co-expression: 

- There is a median of 0.0 co-expressed  genes
- Of the co-expressed genes, the median percent of core genes is 0.0% and accessory genes is 0.0%


In [33]:
# Print statistics about core genes
print('For a given ACCESSORY gene and using a threshold of {} to define co-expression: \n'.
     format(threshold))
print('- There is a median of {} co-expressed  genes'.
      format(np.median([num_coexpressed_genes[a] for a in acc_gene_ids])))
print('- Of the co-expressed genes, the median percent of core genes is {}% and accessory genes is {}%'.
      format(np.median([coexpressed_core_prop[a] for a in acc_gene_ids])*100,
             np.median([coexpressed_acc_prop[a] for a in acc_gene_ids])*100))

print('- There is a median of {} co-expressed genes that are NOT in a shared operon')
# Print same for core, acc

For a given ACCESSORY gene and using a threshold of 0.75 to define co-expression: 

- There is a median of 0.0 co-expressed  genes
- Of the co-expressed genes, the median percent of core genes is 0.0% and accessory genes is 0.0%
- There is a median of {} co-expressed genes that are in the same operon


# Plot number of co-expressed genes

In [None]:
# Distribution of number of co-expressed genes
#sns.distplot(core_stats_df['num_coexpressed_genes'].tolist(), label='core', color='red')
#sns.distplot(acc_stats_df['num_coexpressed_genes'].tolist(), label='accessory', color='blue')

#plt.legend(prop={'size': 12})
#plt.title('Distribution of number of co-expressed genes')
#plt.xlabel('Number of co-expressed genes')
#plt.ylabel('Density')

**Observation:**
* Looks like both core and accessory genes do not tend to be co-expressed with many genes
* Core genes are co-expressed with a median of 18 genes
* Accessory genes are co-expressed with a median of 22 genes
* Using 0.75 threshold to define co-expression
* If using a more strict definition...

In [None]:
# Distribution of number of non-operonic co-expressed genes
#sns.distplot(core_stats_df['num_coexpressed_genes'].tolist(), label='core', color='red')
#sns.distplot(acc_stats_df['num_coexpressed_genes'].tolist(), label='accessory', color='blue')

#plt.legend(prop={'size': 12})
#plt.title('Distribution of number of co-expressed genes')
#plt.xlabel('Number of co-expressed genes')
#plt.ylabel('Density')

# Plot distribution of core, accessory co-expressed genes

In [None]:
# Distribution plot for percent of core co-expressed genes
#sns.distplot(core_stats_df['percent_coexpressed_core'].tolist(), label='core', color='red')
#sns.distplot(acc_stats_df['percent_coexpressed_core'].tolist(), label='accessory', color='blue')

#plt.legend(prop={'size': 12})
#plt.title('Distribution of core co-expressed genes')
#plt.xlabel('Percent of core co-expressed genes')
#plt.ylabel('Density')

In [None]:
# Distribution plot for percent of accessory co-expressed genes
#sns.distplot(core_stats_df['percent_coexpressed_acc'].tolist(), label='core', color='red')
#sns.distplot(acc_stats_df['percent_coexpressed_acc'].tolist(), label='accessory', color='blue')

#plt.legend(prop={'size': 12})
#plt.title('Distribution of accessory co-expressed genes')
#plt.xlabel('Percent of accessory co-expressed genes')
#plt.ylabel('Density')

**Observation:**
* Core genes tend to be co-expressed with only other core genes
* Accessory genes tend to be co-expressed with some percent of core genes and accessory genes