# Site 734 correlations
This notebook plots the correlation of codon counts at site 734 between libraries

## Notebook setup
Imports

In [83]:
import itertools
import pandas as pd
import plotnine as p9

# print imports and versions
# copied from https://stackoverflow.com/questions/40428931/package-for-listing-version-of-packages-used-in-a-jupyter-notebook
print('\n'.join(f'{m.__name__}=={m.__version__}' \
                for m in globals().values() if getattr(m, '__version__', None)))

%run functions/enrichment.py.ipynb import calculate_enrichment

pandas==0.25.3
plotnine==0.6.0
numpy==1.19.1


Input data

In [28]:
site_734_counts = 'results/site_734/site_734_counts.csv'
wt_codon = "GAT"

Output data

In [3]:
site_734_dir = 'results/site_734/'

Notebook aesthetics

In [4]:
p9.theme_set(p9.theme_classic())
CBPALETTE_RICH = ['#648FFF', '#FFB000', '#DC267F', '#785EF0', '#FE6100']

## Load data
Load site 734 frequencies

In [5]:
codon_counts = pd.read_csv(site_734_counts)
display(codon_counts)

Unnamed: 0,name,library,source,stage,sorted,site,wildtype,codon,count,letter
0,wt-plasmid-noStage-notSorted,wt,plasmid,noStage,notSorted,734,GAT,AAA,0,K
1,wt-gDNA-noStage-notSorted,wt,gDNA,noStage,notSorted,734,GAT,AAA,0,K
2,lib1-plasmid-noStage-notSorted,lib1,plasmid,noStage,notSorted,734,GAT,AAA,0,K
3,lib2-plasmid-noStage-notSorted,lib2,plasmid,noStage,notSorted,734,GAT,AAA,0,K
4,lib3-plasmid-noStage-notSorted,lib3,plasmid,noStage,notSorted,734,GAT,AAA,0,K
...,...,...,...,...,...,...,...,...,...,...
1083,lib2-gDNA-late-notSorted,lib2,gDNA,late,notSorted,734,GAT,TTT,0,F
1084,lib3-gDNA-late-notSorted,lib3,gDNA,late,notSorted,734,GAT,TTT,0,F
1085,lib1-gDNA-late-sorted,lib1,gDNA,late,sorted,734,GAT,TTT,0,F
1086,lib2-gDNA-late-sorted,lib2,gDNA,late,sorted,734,GAT,TTT,0,F


In [6]:
codon_counts['name'].unique()

array(['wt-plasmid-noStage-notSorted', 'wt-gDNA-noStage-notSorted',
       'lib1-plasmid-noStage-notSorted', 'lib2-plasmid-noStage-notSorted',
       'lib3-plasmid-noStage-notSorted', 'lib1-gDNA-early-notSorted',
       'lib2-gDNA-early-notSorted', 'lib3-gDNA-early-notSorted',
       'lib1-gDNA-mid-notSorted', 'lib2-gDNA-mid-notSorted',
       'lib3-gDNA-mid-notSorted', 'lib1-gDNA-late-notSorted',
       'lib2-gDNA-late-notSorted', 'lib3-gDNA-late-notSorted',
       'lib1-gDNA-late-sorted', 'lib2-gDNA-late-sorted',
       'lib3-gDNA-late-sorted'], dtype=object)

In [10]:
codon_counts_wide = (
    codon_counts
    .pivot(index='codon',
           columns='name',
           values='count'))
display(codon_counts_wide)

name,lib1-gDNA-early-notSorted,lib1-gDNA-late-notSorted,lib1-gDNA-late-sorted,lib1-gDNA-mid-notSorted,lib1-plasmid-noStage-notSorted,lib2-gDNA-early-notSorted,lib2-gDNA-late-notSorted,lib2-gDNA-late-sorted,lib2-gDNA-mid-notSorted,lib2-plasmid-noStage-notSorted,lib3-gDNA-early-notSorted,lib3-gDNA-late-notSorted,lib3-gDNA-late-sorted,lib3-gDNA-mid-notSorted,lib3-plasmid-noStage-notSorted,wt-gDNA-noStage-notSorted,wt-plasmid-noStage-notSorted
codon,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
AAA,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
AAC,82,38,32,47,129,85,70,37,49,54,77,53,20,72,83,33,0
AAG,88,59,71,126,64,71,16,64,216,65,71,53,36,56,88,0,0
AAT,75,58,62,59,28,32,53,72,35,6,38,65,27,62,10,205,20
ACA,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TGT,0,0,0,0,0,5,0,0,0,0,1,1,0,0,0,0,0
TTA,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
TTC,42,80,68,51,65,14,128,48,52,62,60,150,35,61,64,0,0
TTG,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


## Plot correlations
First, define function to plot correlations.

In [71]:
def plot_correlation(lib_a, lib_b, source, stage, sort_status):
    r_value = (codon_counts_wide
           .query(f'codon != "{wt_codon}"')
           [[f"{lib_a}-{source}-{stage}-{sort_status}",
             f"{lib_b}-{source}-{stage}-{sort_status}"]]
           .corr()
           .iloc[0,-1])
    plot = (
        p9.ggplot((codon_counts_wide
                  .query(f'codon != "{wt_codon}"')),
                  p9.aes(x=f"{lib_a}-{source}-{stage}-{sort_status}",
                         y=f"{lib_b}-{source}-{stage}-{sort_status}")) +
        p9.geom_point(alpha=0.5) +
        p9.stat_smooth(method="lm",
                       se=False,
                       linetype='dashed',
                       color='gray') +
        p9.labs(x=f'{lib_a}',
                y=f'{lib_b}') +
        p9.ggtitle(f"{source} {stage} {sort_status} Library Correlation") +
        p9.annotate('text',
                    x=.1*(codon_counts_wide
                          .query(f'codon != "{wt_codon}"')
                          [f"{lib_a}-{source}-{stage}-{sort_status}"]
                          .max()),
                    y=(codon_counts_wide
                          .query(f'codon != "{wt_codon}"')
                          [f"{lib_b}-{source}-{stage}-{sort_status}"]
                          .max()),
                    label=f'r = {r_value:.2g}') +
        p9.theme(figure_size=(6, 2)))
    return plot

Make list of libs to iterate through:

In [76]:
libs = ['lib1', 'lib2', 'lib3']

## Plot plasmid correlations

In [82]:
source = "plasmid"
stage = None
sort_status = None

for lib in libs:
    
#     plot = plot_correlation(libs[n], libs[n+1], source, stage, sort_status)
#     display(plot)

TypeError: 'int' object is not iterable

In [85]:
print(list(itertools.permutations(libs, 2)))

[('lib1', 'lib2'), ('lib1', 'lib3'), ('lib2', 'lib1'), ('lib2', 'lib3'), ('lib3', 'lib1'), ('lib3', 'lib2')]
