# Error-correct clone barcodes and sum counts per consensus barcode

# Introduction

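This notebook maps each raw clone barcode to its error-corrected consensus barcode (taken from starcode cluster files), sums the counts of all barcodes that share a consensus within each library and library barcode, spot-checks the result, and exports the error-corrected count table.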

## Imports

In [1]:
import sys
import os
import time
import copy
import random
import math
import itertools

import numpy as np
import scipy
import pandas as pd

# Set pandas display options
pd.set_option('display.max_rows', 30)
pd.set_option('display.max_columns', 100)
pd.set_option('display.width', 1000)

%matplotlib inline
from matplotlib import pyplot as plt
import matplotlib as mpl
import seaborn as sns

# Set plot display options
params = {
    # Uniform 12 pt text for every figure element
    'font.size': 12,
    'axes.titlesize': 12,
    'axes.labelsize': 12,
    'legend.fontsize': 12,
    'xtick.labelsize': 12,
    'ytick.labelsize': 12,
    # NOTE(review): presumably Helvetica is installed on the analysis machine — confirm
    'font.family': "Helvetica",
    # Font type 42 stores text as TrueType in PDF/PS output (instead of Type 3)
    'pdf.fonttype': 42,
    'ps.fonttype': 42,
    'figure.dpi': 300
   }
# Apply the settings globally for every figure created in this notebook
mpl.rcParams.update(params)
# Seaborn "ticks" style with the axes grid switched off
sns.set_style("ticks", {'axes.grid' : False})

# Set plot output options
# These values are bound as the default arguments of save_figure() when it is defined below.
output_dir = "outs/"  # destination directory for saved figures
output_suffix = ""  # appended to the figure name, before the file extension
output_formats = [".png", ".pdf"]  # each figure is written once per extension

# Default resolution for savefig (savefig_args below also passes dpi=300 explicitly)
mpl.rc('savefig', dpi=300)

# Define convenience function for saving figures

# Switch read by save_figure() at call time; set to False to skip writing files
savefig = True
# Keyword arguments forwarded to fig.savefig()
savefig_args = {"dpi": 300, "bbox_inches": "tight", "pad_inches": 0.1}

def save_figure(fig, name, output_dir=output_dir, output_suffix=output_suffix, output_formats=output_formats, savefig_args=savefig_args):
    """Save a figure once for every configured output format.

    Nothing is written when the notebook-level ``savefig`` flag is false.

    Parameters
    ----------
    fig : object
        Anything with a ``savefig(path, **kwargs)`` method (e.g. a matplotlib Figure).
    name : str
        Base file name, without suffix or extension.
    output_dir : str
        Destination directory. It is created if it does not exist; an empty
        string means the current working directory.
    output_suffix : str
        Text appended to ``name`` before the extension.
    output_formats : iterable of str
        File extensions including the leading dot (e.g. ``".png"``).
    savefig_args : dict
        Keyword arguments forwarded to ``fig.savefig``.

    Returns
    -------
    None
    """
    if not savefig:
        return None

    # Create the destination on demand so a fresh checkout does not fail on the first save.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    for output_format in output_formats:
        # os.path.join avoids the doubled separator produced by "outs/" + "/" + name,
        # and avoids writing to the filesystem root when output_dir is empty.
        path = os.path.join(output_dir, name + output_suffix + output_format)
        fig.savefig(path, **savefig_args)
    return None

In [2]:
# import additional libraries and settings here
import glob

# Load data

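Load the raw clone barcode counts: a gzip-compressed, tab-separated table with one row per library (`lib`), library barcode call (`library_barcode_call`), and clone barcode (`clone_barcode`), together with its read `count`.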

In [3]:
%%time

# NOTE(review): absolute, machine-specific path — adjust when running elsewhere
infile = "/scratch/CellFreeReporter/analysis/Demo6/clones.tsv.gz"

# Read the tab-separated, gzip-compressed table of raw counts
counts_raw = pd.read_csv(infile, sep="\t", compression="gzip")

# Report (rows, columns), then preview the first rows as the cell output
print(counts_raw.shape)
counts_raw.head()

(5453959, 4)
CPU times: user 4.85 s, sys: 518 ms, total: 5.37 s
Wall time: 5.37 s


Unnamed: 0,lib,library_barcode_call,clone_barcode,count
0,sFH11-1,LB1,AGACAGTGTGACAGTGTGTGACTCTGA,552
1,sFH11-1,LB1,TGAGAGTGTCTGACTCAGTCTCTGTGT,510
2,sFH11-1,LB1,AGAGTCTGACTCACTCAGTGTCTGTCT,465
3,sFH11-1,LB1,TGTCTGAGTCTGTCTGTGTCACTGTCT,461
4,sFH11-1,LB1,ACTGTCTGTCAGACACTCTGTCACTCT,440


# Subsample data for rapid iteration during development

In [4]:
# counts_raw = counts_raw.sample(n=100000, random_state=0)

# Build map from barcode to error-corrected consensus barcode

In [5]:
# Collect the starcode cluster output files (*.out), sorted by path

infiles_dir = "/scratch/CellFreeReporter/pipelines/220520_Demo6_consensus_error_correct_starcode"

infiles = glob.glob(os.path.join(infiles_dir, "*.out"))
infiles.sort()

print("{} files".format(len(infiles)))

54 files


In [6]:
%%time

# Build mapping from (sample, library barcode, clone barcode) to consensus barcode
#
# Every input file is read as tab-separated cluster records, one per line:
#   <consensus> <TAB> <count> <TAB> <comma-separated cluster members>
# The library and the library barcode are the 2nd and 3rd dot-separated tokens
# of the file name.

barcode_to_consensus = {}

# A dedicated loop variable keeps the `infile` path set by the data-loading cell intact.
for cluster_file in infiles:

    name_parts = os.path.basename(cluster_file).split(".")  # parse the file name once
    lib = name_parts[1]  # Get library from filename
    library_barcode = name_parts[2]  # Get viral library barcode from filename

    with open(cluster_file) as f:
        for line_number, line in enumerate(f, start=1):

            # Tolerate blank lines (e.g. a trailing newline at the end of the file)
            if not line.strip():
                continue

            fields = line.rstrip("\r\n").split("\t")

            # Fail with file/line context instead of a bare IndexError
            if len(fields) < 3:
                raise ValueError(
                    "{}:{}: expected at least 3 tab-separated fields, found {}".format(
                        cluster_file, line_number, len(fields)))

            consensus = fields[0]  # consensus sequence of cluster
            # count = int(fields[1])
            members = fields[2].rstrip().split(",")  # members of cluster

            # The consensus always maps to itself, even if it is not listed as a member
            key = (lib, library_barcode, consensus)
            barcode_to_consensus[key] = consensus

            for member in members:

                key = (lib, library_barcode, member)
                barcode_to_consensus[key] = consensus

print(len(barcode_to_consensus), "items")

5434262 items
CPU times: user 8.05 s, sys: 712 ms, total: 8.76 s
Wall time: 8.76 s


# Map clone barcode to consensus error-corrected barcode

In [7]:
# Index rows by (lib, library_barcode_call, clone_barcode) so each index entry is a
# tuple with the same layout as the keys of barcode_to_consensus.
# NOTE: this rebinds counts_raw, so the cell is not idempotent — running it a second
# time raises KeyError because the columns have already been moved into the index.
counts_raw = counts_raw.set_index(["lib", "library_barcode_call", "clone_barcode"])

In [8]:
# Iterating this index yields one (lib, library_barcode_call, clone_barcode) tuple per row
keys = counts_raw.index

In [9]:
%%time

# Map each clone barcode to consensus error-corrected clone barcode (actually do the mapping)
#
# `keys` yields (lib, library_barcode_call, clone_barcode) tuples. A barcode that is
# absent from the mapping is kept unchanged (key[2] is the barcode itself).
# dict.get() is used instead of try / bare except: the bare except also swallowed
# unrelated errors, including KeyboardInterrupt when interrupting this long loop.

clone_barcode_consensus = [barcode_to_consensus.get(key, key[2]) for key in keys]

CPU times: user 7.28 s, sys: 537 ms, total: 7.82 s
Wall time: 7.8 s


In [10]:
# The list was built by iterating counts_raw.index in order, so it lines up row by row
counts_raw["clone_barcode_consensus"] = clone_barcode_consensus

In [11]:
counts_raw

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,count,clone_barcode_consensus
lib,library_barcode_call,clone_barcode,Unnamed: 3_level_1,Unnamed: 4_level_1
sFH11-1,LB1,AGACAGTGTGACAGTGTGTGACTCTGA,552,AGACAGTGTGACAGTGTGTGACTCTGA
sFH11-1,LB1,TGAGAGTGTCTGACTCAGTCTCTGTGT,510,TGAGAGTGTCTGACTCAGTCTCTGTGT
sFH11-1,LB1,AGAGTCTGACTCACTCAGTGTCTGTCT,465,AGAGTCTGACTCACTCAGTGTCTGTCT
sFH11-1,LB1,TGTCTGAGTCTGTCTGTGTCACTGTCT,461,TGTCTGAGTCTGTCTGTGTCACTGTCT
sFH11-1,LB1,ACTGTCTGTCAGACACTCTGTCACTCT,440,ACTGTCTGTCAGACACTCTGTCACTCT
...,...,...,...,...
sFH12-9,STD,TGTTTGGGAGTGGCAAGAACGGCATCG,1,TGTTTGGGAGTGGCAAGAACGGCATCG
sFH12-9,STD,TGTTTGGGTATTGTAAGACTATCCCTG,1,TGTTTGGGTATTGTAAGACTATCCCTG
sFH12-9,STD,TGTTTGTGCGTGGTCCGGCGGTCGCCT,1,TGTTTGTGCGTGGTCCGGCGGTCGCCT
sFH12-9,STD,TTAGGCGGGACTGTGAGAAAGAGAGGG,1,TTAGGCGGGACTGTGAGAAAGAGAGGG


# Sum counts for consensus error-corrected barcodes

In [12]:
# Turn the index levels back into columns, then total the counts of all raw barcodes
# that share the same (lib, library barcode, consensus barcode). The result is a
# Series named "count" with a three-level index.
counts = counts_raw.reset_index().groupby(["lib", "library_barcode_call", "clone_barcode_consensus"])["count"].sum()

In [13]:
counts.sort_values(ascending=False)

lib       library_barcode_call  clone_barcode_consensus    
sFH11-3   STD                   GACTGAGTCACTGTCAGACTGTCACTG    29569
sFH11-2   STD                   GACTGAGTCACTGTCAGACTGTCACTG    24221
sFH11-8   STD                   GACTGAGTCACTGTCAGACTGTCACTG    20619
sFH11-9   STD                   GACTGAGTCACTGTCAGACTGTCACTG    17361
sFH11-14  LB2                   TGTCAGACTGTCAGTGTCTGAGACTCT    17185
                                                               ...  
sFH11-21  LB2                   TGAGAGTCTCTGTGTGACAGTGAGTGT        1
                                TGAGAGTCTCTGTGTGAGTCTCACTGT        1
                                TGAGAGTCTCTGTGTGAGTGTCTCTGA        1
                                TGAGAGTCTCTGTGTGTGTCAGACAGT        1
sFH12-9   STD                   TTCCGAGCGCCGTTAAGACGATTACTC        1
Name: count, Length: 4249646, dtype: int64

## Validate result

Confirm that counts are the sum of cluster member counts.

In [14]:
# Index counts by consensus sequence for easy lookup for checking
counts_raw_index_by_consensus = counts_raw.reset_index().set_index(["lib", "library_barcode_call", "clone_barcode_consensus"])

In [15]:
# Spot check: take the cluster with the 7th-largest summed count (position 6 after
# sorting in descending order) as the example to inspect in the following cells
index_query = counts.sort_values(ascending=False).index[6]
index_query

('sFH11-14', 'LB2', 'AGTCTCAGAGAGTGACTGAGACTCTGA')

In [16]:
# Sum of cluster
counts.loc[index_query]

16173

In [17]:
# Canonical sequence of cluster
counts_raw.loc[index_query]

count                                            14481
clone_barcode_consensus    AGTCTCAGAGAGTGACTGAGACTCTGA
Name: (sFH11-14, LB2, AGTCTCAGAGAGTGACTGAGACTCTGA), dtype: object

In [18]:
# Cluster members
counts_raw_index_by_consensus.loc[index_query]

  


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,clone_barcode,count
lib,library_barcode_call,clone_barcode_consensus,Unnamed: 3_level_1,Unnamed: 4_level_1
sFH11-14,LB2,AGTCTCAGAGAGTGACTGAGACTCTGA,AGTCTCAGAGAGTGACTGAGACTCTGA,14481
sFH11-14,LB2,AGTCTCAGAGAGTGACTGAGACTCTGA,AATCTCAGAGAGTGACTGAGACTCTGA,116
sFH11-14,LB2,AGTCTCAGAGAGTGACTGAGACTCTGA,AGTCTCAGAGAGTGATTGAGACTCTGA,90
sFH11-14,LB2,AGTCTCAGAGAGTGACTGAGACTCTGA,AGTCACAGAGAGTGACTGAGACTCTGA,73
sFH11-14,LB2,AGTCTCAGAGAGTGACTGAGACTCTGA,AGTCTCAGAGAGTGACTGAGACTGTGA,72
sFH11-14,LB2,...,...,...
sFH11-14,LB2,AGTCTCAGAGAGTGACTGAGACTCTGA,TGTCTCAGAGAGTGTCTGAGACTGTGA,1
sFH11-14,LB2,AGTCTCAGAGAGTGACTGAGACTCTGA,TGTCTCAGAGTGTGACTGAGACTCTGA,1
sFH11-14,LB2,AGTCTCAGAGAGTGACTGAGACTCTGA,TGTGACAGAGAGTGACTGAGACTCTGA,1
sFH11-14,LB2,AGTCTCAGAGAGTGACTGAGACTCTGA,TGTGTCAGAGAGTGACTGAGACTCTGA,1


In [19]:
# Looks reasonable

# Reformat counts (sort rows)

In [20]:
# Convert the indexed Series into a flat DataFrame (index levels become columns).
# NOTE: this rebinds counts, so re-running the cell applies reset_index() again.
counts = counts.reset_index()

In [21]:
# Set row order

# Libraries in numeric order: sFH11-1 ... sFH11-21, followed by sFH12-9 ... sFH12-14
index_ordered = (
    ["sFH11-" + str(number) for number in range(1, 22)]
    + ["sFH12-" + str(number) for number in range(9, 15)]
)

In [22]:
# Convert lib to categorical for sorting
#
# pd.Categorical silently replaces any value that is not listed in `categories`
# with NaN, which would blank out the library name in the sorted/exported table.
# Fail loudly instead when the data contain a library missing from index_ordered.
unexpected_libs = set(counts["lib"].dropna().unique()) - set(index_ordered)
assert not unexpected_libs, "libraries missing from index_ordered: {}".format(sorted(unexpected_libs))
counts["lib"] = pd.Categorical(counts["lib"], categories=index_ordered, ordered=True)

In [26]:
# Sort rows: libraries in the order given by the categorical `lib` column, then
# library barcode ascending, then count descending (largest clones first)
counts = counts.sort_values(by=["lib", "library_barcode_call", "count"], ascending=[True, True, False])

# Export to file

In [27]:
%%time

# NOTE(review): absolute, machine-specific path — adjust when running elsewhere
outfile = "/scratch/CellFreeReporter/analysis/Demo6/clones.error_corrected.tsv.gz"

# Request gzip explicitly, mirroring the explicit compression="gzip" used when the
# input was read, so the ".gz" file is really compressed regardless of whether the
# installed pandas infers compression from the file extension.
counts.to_csv(outfile, sep="\t", header=True, index=False, compression="gzip")

CPU times: user 2min 11s, sys: 367 ms, total: 2min 12s
Wall time: 2min 12s
