# Counts of variants

This notebook analyzes the counts of the different variants.

Import Python modules:

In [1]:
import os

import Bio.SeqIO

import altair as alt

import dms_variants.codonvarianttable

import pandas as pd

import yaml

In [2]:
os.chdir('../../')

In [3]:
# allow more rows for Altair
_ = alt.data_transformers.disable_max_rows()

Get configuration information:

In [4]:
# If you are running notebook interactively rather than in pipeline that handles
# working directories, you may have to first `os.chdir` to appropriate directory.

with open("config.yaml") as f:
    config = yaml.safe_load(f)

Read information on the barcode runs:

In [5]:
barcode_runs = pd.read_csv(config["processed_barcode_runs"])

assert len(barcode_runs) == barcode_runs["library_sample"].nunique()

In [6]:
counts = pd.concat(
    [
        pd.read_csv(os.path.join(subdir, f"{library_sample}.csv")).assign(valid=valid)
        for library_sample in barcode_runs["library_sample"]
        for (subdir, valid) in [
            (config["barcode_counts_dir"], True),
            (config["barcode_counts_invalid_dir"], False),
        ]
    ]
)

In [7]:
fates = (
    pd.concat(
        [
            pd.read_csv(
                os.path.join(config["barcode_fates_dir"], f"{library_sample}.csv")
            )
            for library_sample in barcode_runs["library_sample"]
        ]
    )
    .merge(barcode_runs, on=["library", "sample"], validate="many_to_one")
    .drop(columns=["fastq_R1", "notes"])
    .assign(
        valid=lambda x: x["fate"] == "valid barcode",
        not_valid=lambda x: ~x["valid"],
    )
)

selection_cols = [
    "exclude_after_counts",
    "antibody",
    "virus_batch",
    "sample_type",
    "date",
    "library",
]

selections = [
    alt.selection_point(
        fields=[col],
        bind=alt.binding_select(
            options=[None] + fates[col].dropna().unique().tolist(),
            labels=["all"] + [str(x) for x in fates[col].dropna().unique()],
            name=col,
        ),
    )
    for col in selection_cols
]

Get which libraries each barcode maps to:

In [8]:
barcodes_by_library = (
    pd.read_csv(config["codon_variants"])
    .groupby(["barcode", "target"], as_index=False)
    .aggregate(
        libraries_w_barcode=pd.NamedAgg("library", lambda s: ", ".join(s.unique())),
        n_libraries_w_barcode=pd.NamedAgg("library", "nunique"),
    )
)

display(
    barcodes_by_library.groupby(["target", "libraries_w_barcode"]).aggregate(
        n_barcodes=pd.NamedAgg("barcode", "count")
    )
)

Unnamed: 0_level_0,Unnamed: 1_level_0,n_barcodes
target,libraries_w_barcode,Unnamed: 2_level_1
H6,"libA, libB",4
RNA_spike-in,"libA, libB",127
gene,libA,66580
gene,"libA, libB",1
gene,libB,85133


Now look at the overall barcode counts for each sample and see how many map to the expected library or to some other library.
Having many barcodes that map to a different library can be an indication of contamination unless there is a lot of expected overlap between the two libraries (which would be indicated in table above):

In [9]:
counts_by_library = (
    counts.merge(barcodes_by_library, on="barcode", validate="many_to_one")
    .groupby(
        ["library", "sample", "libraries_w_barcode", "target", "n_libraries_w_barcode"],
        as_index=False,
    )
    .aggregate(n_counts=pd.NamedAgg("count", "sum"))
    .assign(
        frac_counts=lambda x: x["n_counts"]
        / x.groupby(["library", "sample"])["n_counts"].transform("sum"),
    )
    .merge(barcode_runs)
    .assign(
        category=lambda x: x["libraries_w_barcode"].where(
            x["target"] == "gene", x["target"]
        )
    )
    .drop(
        columns=[
            "fastq_R1",
            "notes",
            "antibody_concentration",
            "target",
            "libraries_w_barcode",
        ]
    )
)

Plot which libraries overall barcode counts map to for each sample:

In [10]:
ordered_cats = (
    counts_by_library.sort_values(["n_libraries_w_barcode", "category"])["category"]
    .unique()
    .tolist()
)

category_selection = alt.selection_point(fields=["category"], bind="legend")

counts_by_library_chart = (
    alt.Chart(
        counts_by_library.assign(
            order=lambda x: x["category"].map(lambda s: ordered_cats.index(s))
        )
    )
    .encode(
        x=alt.X("frac_counts", scale=alt.Scale(domain=[0, 1])),
        y=alt.Y("library_sample", title=None),
        color=alt.Color("category", scale=alt.Scale(domain=ordered_cats)),
        order="order",
        tooltip=[
            alt.Tooltip(c, format=".2g") if c in {"n_counts", "frac_counts"} else c
            for c in counts_by_library.columns
            if c not in {"library_sample"}
        ],
    )
    .mark_bar()
    .properties(width=250, height=alt.Step(15))
    .configure_axis(labelLimit=500)
    .add_params(*selections, category_selection)
    .transform_filter(category_selection)
)
for selection in selections:
    counts_by_library_chart = counts_by_library_chart.transform_filter(selection)

counts_by_library_chart

In [44]:
counts_by_library

Unnamed: 0,library,sample,n_libraries_w_barcode,n_counts,frac_counts,date,virus_batch,sample_type,antibody,replicate,exclude_after_counts,neut_standard_name,library_sample,category
0,libA,221021_1_antibody_1C04_0.05_1,1,8086436,0.997654,221021,1,antibody,1C04,1,yes,H6,libA_221021_1_antibody_1C04_0.05_1,libA
1,libA,221021_1_antibody_1C04_0.05_1,2,19009,0.002345,221021,1,antibody,1C04,1,yes,H6,libA_221021_1_antibody_1C04_0.05_1,H6
2,libA,221021_1_antibody_1C04_0.05_1,2,0,0.0,221021,1,antibody,1C04,1,yes,H6,libA_221021_1_antibody_1C04_0.05_1,RNA_spike-in
3,libA,221021_1_antibody_1C04_0.05_1,2,0,0.0,221021,1,antibody,1C04,1,yes,H6,libA_221021_1_antibody_1C04_0.05_1,"libA, libB"
4,libA,221021_1_antibody_1C04_0.05_1,1,5,0.000001,221021,1,antibody,1C04,1,yes,H6,libA_221021_1_antibody_1C04_0.05_1,libB
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
990,libB,230126_1_no-antibody_control_1,1,4,0.0,230126,1,no-antibody_control,,1,yes,H6,libB_230126_1_no-antibody_control_1,libA
991,libB,230126_1_no-antibody_control_1,2,12448,0.00116,230126,1,no-antibody_control,,1,yes,H6,libB_230126_1_no-antibody_control_1,H6
992,libB,230126_1_no-antibody_control_1,2,0,0.0,230126,1,no-antibody_control,,1,yes,H6,libB_230126_1_no-antibody_control_1,RNA_spike-in
993,libB,230126_1_no-antibody_control_1,2,31,0.000003,230126,1,no-antibody_control,,1,yes,H6,libB_230126_1_no-antibody_control_1,"libA, libB"


In [69]:
neut_counts = counts_by_library.loc[(counts_by_library['category'] == 'RNA_spike-in') &
                                    (counts_by_library['date'] == 230312) &
                                    (counts_by_library['sample_type'] == 'antibody')
                                   ]

neut_counts['selection_concentration'] = neut_counts['sample'].apply(lambda x: x.split('_')[4]).astype(float)

neut_counts = neut_counts[['frac_counts', 'replicate', 'selection_concentration', 'category']]

neut_counts = neut_counts.rename(columns={'category': 'neut_std'})


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [70]:
spikein_mapping = {
    1: 5,
    2: 0.5,
    3: 0.05,
    4: 0.005
}

neut_counts['replicate'] = neut_counts['replicate'].map(spikein_mapping)

neut_counts = neut_counts.rename(columns={'replicate': 'ng_spike-in'})
neut_counts

Unnamed: 0,frac_counts,ng_spike-in,selection_concentration,neut_std
381,0.769593,5.0,0.00034,RNA_spike-in
385,0.245307,0.5,0.00034,RNA_spike-in
390,0.034558,0.05,0.00034,RNA_spike-in
395,0.003323,0.005,0.00034,RNA_spike-in
399,0.848303,5.0,0.00068,RNA_spike-in
403,0.356716,0.5,0.00068,RNA_spike-in
408,0.054491,0.05,0.00068,RNA_spike-in
413,0.005829,0.005,0.00068,RNA_spike-in
418,0.94227,5.0,0.00136,RNA_spike-in
422,0.631616,0.5,0.00136,RNA_spike-in


In [71]:
h6_2367_frac_counts = [0.061, 0.19, 0.60, 0.79]
h6_2367_ab_conc = [0.00034, 0.00068, 0.00136, 0.00272]

h6_2367 = pd.DataFrame({'selection_concentration': h6_2367_ab_conc,
                        'frac_counts': h6_2367_frac_counts,
                        'ng_spike-in': 0,
                        'neut_std': 'H6'
                       }    
                      )

h6_2367

Unnamed: 0,selection_concentration,frac_counts,ng_spike-in,neut_std
0,0.00034,0.061,0,H6
1,0.00068,0.19,0,H6
2,0.00136,0.6,0,H6
3,0.00272,0.79,0,H6


In [72]:
neut_counts_full = pd.concat([neut_counts, h6_2367])
# neut_counts_full.to_csv('scratch_notebooks/230313_get-spike-in-barcodes/neut_counts_h6_spikein.csv', index=False)

In [64]:
spikein_mapping = {
    1: 5,
    2: 0.5,
    3: 0.05,
    4: 0.005
}

neut_counts_full['replicate'] = neut_counts_full['replicate'].map(spikein_mapping)

In [65]:
neut_counts_full = neut_counts_full.rename(columns={'replicate': 'ng_spike-in'})
neut_counts_full

Unnamed: 0,frac_counts,ng_spike-in,selection_concentration,neut_std
381,0.769593,5.0,0.00034,RNA_spike-in
385,0.245307,0.5,0.00034,RNA_spike-in
390,0.034558,0.05,0.00034,RNA_spike-in
395,0.003323,0.005,0.00034,RNA_spike-in
399,0.848303,5.0,0.00068,RNA_spike-in
403,0.356716,0.5,0.00068,RNA_spike-in
408,0.054491,0.05,0.00068,RNA_spike-in
413,0.005829,0.005,0.00068,RNA_spike-in
418,0.94227,5.0,0.00136,RNA_spike-in
422,0.631616,0.5,0.00136,RNA_spike-in


In [76]:
# neut_counts_spikein = neut_counts.loc[neut_counts['neut_std'] == 'spike-in']

frac_neut_standard_chart = (
    alt.Chart(neut_counts_full, title='Fraction counts from neut std')
    .encode(
        x=alt.X("selection_concentration", 
                title="2367 serum selection concentration",
                scale=alt.Scale(type="log"),
               ),
        y=alt.Y(
            "frac_counts",
            title="fraction counts from neutralization standard",
            scale=alt.Scale(type="log"),
        ),
        column='neut_std:N',
        color=alt.Color('ng_spike-in:N', 
                        legend=alt.Legend(orient="right", title='ng spike-in per well'))
    )
    .mark_point(filled=True, size=50, opacity=0.7)
    .properties(width=250, height=250)
)

frac_neut_standard_chart

In [97]:
h6_2367

Unnamed: 0,selection_concentration,frac_counts,replicate,neut_std
0,0.00034,0.061,1,H6
1,0.00068,0.19,1,H6
2,0.00136,0.6,1,H6
3,0.00272,0.79,1,H6


In [57]:
frac_h6 = (
    alt.Chart(h6_2367, title='H6 as neut standard')
    .encode(
        x=alt.X("selection_concentration", 
                title="2367 serum selection concentration",
                scale=alt.Scale(type="log"),
               ),
        y=alt.Y(
            "frac_counts",
            title="fraction counts from neutralization standard",
            scale=alt.Scale(type="log"),
        ),
    )
    .mark_point(filled=True, size=50, opacity=0.7)
    .properties(width=250, height=250)
)

frac_h6

In [102]:
h6_1c04_frac_counts = [0.0023, 0.0095, 0.055, 0.14, 0.24]
h6_1c04_ab_conc = [0.05, 0.1, 0.2, 0.4, 0.8]

h6_1c04 = pd.DataFrame({'selection_concentration': h6_1c04_ab_conc,
                        'frac_counts': h6_1c04_frac_counts,
                        'replicate': 1,
                        'neut_std': 'H6'
                       }    
                      )

h6_1c04

Unnamed: 0,selection_concentration,frac_counts,replicate,neut_std
0,0.05,0.0023,1,H6
1,0.1,0.0095,1,H6
2,0.2,0.055,1,H6
3,0.4,0.14,1,H6
4,0.8,0.24,1,H6


In [103]:
frac_h6 = (
    alt.Chart(h6_1c04, title='H6 as neut standard')
    .encode(
        x=alt.X("selection_concentration", 
                title="1C04 mAb selection concentration"
               ),
        y=alt.Y(
            "frac_counts",
            title="fraction counts from neutralization standard",
            scale=alt.Scale(type="symlog", constant=0.02, domainMax=1),
        ),
    )
    .mark_point(filled=True, size=50, opacity=0.7)
    .properties(width=250, height=250)
)

frac_h6

### 3-20-23: 15-18yo sera analysis

In [77]:
neut_counts = counts_by_library.loc[((counts_by_library['category'] == 'H6') | (counts_by_library['category'] == 'RNA_spike-in')) &
                                    (counts_by_library['date'] == 230317) &
                                    (counts_by_library['sample_type'] == 'antibody')
                                   ]

neut_counts['selection_concentration'] = neut_counts['sample'].apply(lambda x: x.split('_')[4]).astype(float)

neut_counts = neut_counts[['frac_counts', 'category', 'selection_concentration', 'antibody']]

# neut_counts['neut_std'] = 'spike-in'

neut_counts = neut_counts.rename(columns={
    'category': 'neut_standard'})

neut_counts

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,frac_counts,neut_standard,selection_concentration,antibody
478,0.012457,H6,0.001564,2343
479,0.022749,RNA_spike-in,0.001564,2343
483,0.012457,H6,0.001564,2343
484,0.022749,RNA_spike-in,0.001564,2343
488,0.025152,H6,0.003129,2343
...,...,...,...,...
645,0.036702,RNA_spike-in,0.001737,3866
648,0.234359,H6,0.003474,3866
649,0.133396,RNA_spike-in,0.003474,3866
653,0.234359,H6,0.003474,3866


In [78]:
# neut_counts_rna = neut_counts.loc[neut_counts['neut_standard'] == 'RNA_spike-in']

frac_neut_standard_chart = (
    alt.Chart(neut_counts, title='neut std fraction in multiple sera selections')
    .encode(
        x=alt.X("selection_concentration", 
                title="serum selection concentration",
                scale=alt.Scale(type="log"),
               ),
        y=alt.Y(
            "frac_counts",
            title="frac counts",
            scale=alt.Scale(type="log", constant=0.02, domainMax=1),
        ),
        column='antibody:N',
        row='neut_standard:N',
        color=alt.Color('antibody:N', 
                        legend=alt.Legend(orient="right", title='antibody'))
    )
    .mark_point(filled=True, size=50, opacity=0.7)
    .properties(width=200, height=150)
    # .facet(
    #     row='antibody:N',
    #     column='neut_standard:N',)
    .resolve_scale(x='independent')
)

frac_neut_standard_chart

In [14]:
counts_by_library

Unnamed: 0,library,sample,n_libraries_w_barcode,n_counts,frac_counts,date,virus_batch,sample_type,antibody,replicate,exclude_after_counts,neut_standard_name,library_sample,category
0,libA,221021_1_antibody_1C04_0.05_1,1,8086436,0.997654,221021,1,antibody,1C04,1,yes,H6,libA_221021_1_antibody_1C04_0.05_1,libA
1,libA,221021_1_antibody_1C04_0.05_1,2,19009,0.002345,221021,1,antibody,1C04,1,yes,H6,libA_221021_1_antibody_1C04_0.05_1,H6
2,libA,221021_1_antibody_1C04_0.05_1,2,0,0.0,221021,1,antibody,1C04,1,yes,H6,libA_221021_1_antibody_1C04_0.05_1,RNA_spike-in
3,libA,221021_1_antibody_1C04_0.05_1,2,0,0.0,221021,1,antibody,1C04,1,yes,H6,libA_221021_1_antibody_1C04_0.05_1,"libA, libB"
4,libA,221021_1_antibody_1C04_0.05_1,1,5,0.000001,221021,1,antibody,1C04,1,yes,H6,libA_221021_1_antibody_1C04_0.05_1,libB
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
990,libB,230126_1_no-antibody_control_1,1,4,0.0,230126,1,no-antibody_control,,1,yes,H6,libB_230126_1_no-antibody_control_1,libA
991,libB,230126_1_no-antibody_control_1,2,12448,0.00116,230126,1,no-antibody_control,,1,yes,H6,libB_230126_1_no-antibody_control_1,H6
992,libB,230126_1_no-antibody_control_1,2,0,0.0,230126,1,no-antibody_control,,1,yes,H6,libB_230126_1_no-antibody_control_1,RNA_spike-in
993,libB,230126_1_no-antibody_control_1,2,31,0.000003,230126,1,no-antibody_control,,1,yes,H6,libB_230126_1_no-antibody_control_1,"libA, libB"


In [15]:
h6_neut_counts = counts_by_library.loc[(counts_by_library['category'] == 'H6') &
                                    (counts_by_library['date'] == 221223) &
                                    (counts_by_library['sample_type'] == 'antibody')
                                   ]

h6_neut_counts['selection_concentration'] = h6_neut_counts['sample'].apply(
    lambda x: x.split('_')[4]).astype(float)

h6_neut_counts = h6_neut_counts[['frac_counts', 'category', 'selection_concentration', 'antibody']]

# neut_counts['neut_std'] = 'spike-in'

h6_neut_counts = h6_neut_counts.rename(columns={
    'category': 'neut_standard'})

h6_neut_counts

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,frac_counts,neut_standard,selection_concentration,antibody
166,0.010571,H6,0.004916,AUSAB-05
171,0.034605,H6,0.007374,AUSAB-05
176,0.112462,H6,0.011062,AUSAB-05
181,0.139439,H6,0.016593,AUSAB-05
186,0.070133,H6,0.024889,AUSAB-05
191,0.026385,H6,0.037333,AUSAB-05
196,0.011701,H6,0.056,AUSAB-05
201,0.00724,H6,0.000681,AUSAB-07
206,0.014861,H6,0.001022,AUSAB-07
211,0.042153,H6,0.001533,AUSAB-07


In [30]:
# neut_counts_rna = neut_counts.loc[neut_counts['neut_standard'] == 'RNA_spike-in']

frac_neut_standard_chart = (
    alt.Chart(h6_neut_counts, title='H6 neut std fraction in adult sera selections')
    .encode(
        x=alt.X("selection_concentration", 
                title="serum selection concentration",
                scale=alt.Scale(type="log"),
               ),
        y=alt.Y(
            "frac_counts",
            title="frac counts",
            scale=alt.Scale(type="log", constant=0.02, domainMax=1),
        ),
        column='antibody:N',
        color=alt.Color('antibody:N', 
                        legend=alt.Legend(orient="right", title='antibody'))
    )
    .mark_point(filled=True, size=50, opacity=0.7)
    .properties(width=200, height=150)
    # .facet(
    #     row='antibody:N',
    #     column='neut_standard:N',)
    .resolve_scale(x='independent')
)

frac_neut_standard_chart