# Export clone barcodes for Starcode error correction

# Introduction

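This notebook loads clone barcode counts for each library, keeps the library barcodes of interest (LB1 and LB2), and writes the clone barcodes with their counts to one tab-separated file per library and library barcode, in the input format expected by Starcode.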

## Imports

In [1]:
import sys
import os
import time
import copy
import random
import math
import itertools

import numpy as np
import scipy
import pandas as pd

# Configure how pandas renders DataFrames in cell output
pd.options.display.max_rows = 30
pd.options.display.max_columns = 100
pd.options.display.width = 1000

%matplotlib inline
from matplotlib import pyplot as plt
import matplotlib as mpl
import seaborn as sns

# Set plot display options
# Every text element shares one size (12); the remaining entries set the
# font family, the PDF/PS font type and the on-screen figure resolution.
params = {
    size_key: 12
    for size_key in (
        'font.size',
        'axes.titlesize',
        'axes.labelsize',
        'legend.fontsize',
        'xtick.labelsize',
        'ytick.labelsize',
    )
}
params.update({
    'font.family': "Helvetica",
    'pdf.fonttype': 42,
    'ps.fonttype': 42,
    'figure.dpi': 300,
})
mpl.rcParams.update(params)

# seaborn "ticks" theme with the axes grid switched off
sns.set_style("ticks", {'axes.grid': False})

# Set plot output options: destination directory, file-name suffix, and
# the extensions to write for every saved figure
output_dir, output_suffix = "outs/", ""
output_formats = [".png", ".pdf"]

# Default resolution used by matplotlib when saving figures
mpl.rcParams['savefig.dpi'] = 300

# Define convenience function for saving figures

# Notebook-level switch: set to False to make save_figure a no-op
savefig = True
savefig_args = {"dpi": 300, "bbox_inches": "tight", "pad_inches": 0}

def save_figure(fig, name, output_dir=output_dir, output_suffix=output_suffix, output_formats=output_formats, savefig_args=savefig_args):
    """Save `fig` once per output format when the notebook-level `savefig` flag is true.

    The defaults are captured when this cell runs, so rebinding the
    notebook-level `output_dir` (or the other option variables) afterwards
    does not change where figures are written unless this cell is re-run.

    Parameters
    ----------
    fig : object
        Anything exposing ``savefig(path, **kwargs)``.
    name : str
        Base file name, without suffix or extension.
    output_dir : str
        Directory that receives the files. It is created if missing. An empty
        string means the current working directory.
    output_suffix : str
        Text appended to `name` before the extension.
    output_formats : iterable of str
        Extensions including the leading dot; one file is written per entry.
    savefig_args : dict
        Keyword arguments forwarded to ``fig.savefig``.

    Returns
    -------
    None
    """
    if not savefig:
        return None

    for output_format in output_formats:
        # os.path.join avoids the doubled "/" that plain concatenation produces
        # when output_dir already ends with a separator (the default "outs/").
        path = os.path.join(output_dir, name + output_suffix + output_format)

        # Create the target directory on first use so a fresh checkout can save.
        target_dir = os.path.dirname(path)
        if target_dir:
            os.makedirs(target_dir, exist_ok=True)

        fig.savefig(path, **savefig_args)
    return None

In [2]:
# import additional libraries and settings here

# Load data

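The input is a gzipped, tab-separated table of clone barcode counts with one row per clone barcode and the columns `lib`, `library_barcode_call`, `clone_barcode`, and `count`.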

In [3]:
%%time

# Gzipped, tab-separated table of clone barcode counts
infile = "/scratch/CellFreeReporter/analysis/Demo6/clones.tsv.gz"

# read_table uses a tab separator by default
counts_raw = pd.read_table(infile, compression="gzip")

# Report the table size, then preview the first rows
print(counts_raw.shape)
counts_raw.head()

(5453959, 4)
CPU times: user 4.87 s, sys: 527 ms, total: 5.4 s
Wall time: 5.4 s


Unnamed: 0,lib,library_barcode_call,clone_barcode,count
0,sFH11-1,LB1,AGACAGTGTGACAGTGTGTGACTCTGA,552
1,sFH11-1,LB1,TGAGAGTGTCTGACTCAGTCTCTGTGT,510
2,sFH11-1,LB1,AGAGTCTGACTCACTCAGTGTCTGTCT,465
3,sFH11-1,LB1,TGTCTGAGTCTGTCTGTGTCACTGTCT,461
4,sFH11-1,LB1,ACTGTCTGTCAGACACTCTGTCACTCT,440


# Preprocess barcodes

## Filter for library barcodes of interest (LB1, LB2)

In [4]:
library_barcode_calls_to_keep = ["LB1", "LB2"]

# Index by (lib, library_barcode_call, clone_barcode), then keep only the rows
# whose second index level is one of the requested library barcode calls.
# The row indexer is spelled out with pd.IndexSlice and the column axis is
# given explicitly, so the selection cannot be read as indexing both axes.
counts = (
    counts_raw
    .set_index(["lib", "library_barcode_call", "clone_barcode"])
    .loc[pd.IndexSlice[:, library_barcode_calls_to_keep, :], :]
)

counts

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,count
lib,library_barcode_call,clone_barcode,Unnamed: 3_level_1
sFH11-1,LB1,AGACAGTGTGACAGTGTGTGACTCTGA,552
sFH11-1,LB1,TGAGAGTGTCTGACTCAGTCTCTGTGT,510
sFH11-1,LB1,AGAGTCTGACTCACTCAGTGTCTGTCT,465
sFH11-1,LB1,TGTCTGAGTCTGTCTGTGTCACTGTCT,461
sFH11-1,LB1,ACTGTCTGTCAGACACTCTGTCACTCT,440
...,...,...,...
sFH12-9,LB2,TTGGAGAGGCAGTGAGACAGTGAGAGT,1
sFH12-9,LB2,TTTCAGTGAGACTCAGTCTGTGTGAGA,1
sFH12-9,LB2,TTTCTGTGTGTCTCTCTCAGAGTGTGA,1
sFH12-9,LB2,TTTGAGACTCACAGTCTGACTCACTGT,1


## Reset index

In [5]:
# Turn the (lib, library_barcode_call, clone_barcode) index levels back into
# ordinary columns. The guard keeps this cell idempotent: running it a second
# time on the already-flattened frame would otherwise insert a spurious
# "index" column.
if isinstance(counts.index, pd.MultiIndex):
    counts = counts.reset_index()

In [6]:
# Inspect the table after the index has been reset
counts

Unnamed: 0,lib,library_barcode_call,clone_barcode,count
0,sFH11-1,LB1,AGACAGTGTGACAGTGTGTGACTCTGA,552
1,sFH11-1,LB1,TGAGAGTGTCTGACTCAGTCTCTGTGT,510
2,sFH11-1,LB1,AGAGTCTGACTCACTCAGTGTCTGTCT,465
3,sFH11-1,LB1,TGTCTGAGTCTGTCTGTGTCACTGTCT,461
4,sFH11-1,LB1,ACTGTCTGTCAGACACTCTGTCACTCT,440
...,...,...,...,...
5434257,sFH12-9,LB2,TTGGAGAGGCAGTGAGACAGTGAGAGT,1
5434258,sFH12-9,LB2,TTTCAGTGAGACTCAGTCTGTGTGAGA,1
5434259,sFH12-9,LB2,TTTCTGTGTGTCTCTCTCAGAGTGTGA,1
5434260,sFH12-9,LB2,TTTGAGACTCACAGTCTGACTCACTGT,1


# Export clone barcodes in format for Starcode

In [7]:
# Report number of barcodes per group

# One row count per (lib, library_barcode_call) pair, in sorted key order
barcodes_per_group = counts.groupby(["lib", "library_barcode_call"]).size()

for (lib, library_barcode_call), n_barcodes in barcodes_per_group.items():
    print(lib, "\t", library_barcode_call, "\t", n_barcodes)

sFH11-1 	 LB1 	 11335
sFH11-1 	 LB2 	 33614
sFH11-10 	 LB1 	 5132
sFH11-10 	 LB2 	 129428
sFH11-11 	 LB1 	 5372
sFH11-11 	 LB2 	 85094
sFH11-12 	 LB1 	 1598
sFH11-12 	 LB2 	 75038
sFH11-13 	 LB1 	 2237
sFH11-13 	 LB2 	 58917
sFH11-14 	 LB1 	 3965
sFH11-14 	 LB2 	 208115
sFH11-15 	 LB1 	 10519
sFH11-15 	 LB2 	 28189
sFH11-16 	 LB1 	 15736
sFH11-16 	 LB2 	 59740
sFH11-17 	 LB1 	 38371
sFH11-17 	 LB2 	 110290
sFH11-18 	 LB1 	 55247
sFH11-18 	 LB2 	 141169
sFH11-19 	 LB1 	 60502
sFH11-19 	 LB2 	 146507
sFH11-2 	 LB1 	 25140
sFH11-2 	 LB2 	 33076
sFH11-20 	 LB1 	 71045
sFH11-20 	 LB2 	 159611
sFH11-21 	 LB1 	 115970
sFH11-21 	 LB2 	 251097
sFH11-3 	 LB1 	 28975
sFH11-3 	 LB2 	 11400
sFH11-4 	 LB1 	 24809
sFH11-4 	 LB2 	 4650
sFH11-5 	 LB1 	 43649
sFH11-5 	 LB2 	 6528
sFH11-6 	 LB1 	 54976
sFH11-6 	 LB2 	 9517
sFH11-7 	 LB1 	 52979
sFH11-7 	 LB2 	 2970
sFH11-8 	 LB1 	 14733
sFH11-8 	 LB2 	 32846
sFH11-9 	 LB1 	 3113
sFH11-9 	 LB2 	 66813
sFH12-10 	 LB1 	 448755
sFH12-10 	 LB2 	 14870
sFH12-1

In [9]:
%%time

# Directory that receives one Starcode input file per (lib, library_barcode_call) group.
# NOTE: this rebinds the notebook-level `output_dir` that the imports cell set to "outs/"
# for figures. save_figure keeps its original default because that default was bound
# when the function was defined.
output_dir = "/scratch/CellFreeReporter/pipelines/220520_Demo6_consensus_error_correct_starcode/"

# Create the directory up front so the first write cannot fail on a missing path
os.makedirs(output_dir, exist_ok=True)

for (lib, library_barcode_call), group in counts.groupby(["lib", "library_barcode_call"]):

    # os.path.join avoids the doubled "/" that concatenation produces because
    # output_dir already ends with a separator
    outfile = os.path.join(output_dir, "clone_barcodes." + lib + "." + library_barcode_call + ".tsv.tmp")

    # Keep only the barcode and its count; written below without header or index
    group_clean = group[["clone_barcode", "count"]]

    print(lib, "\t", library_barcode_call, "\t", group.shape[0])
    print("Writing... " + outfile)

    group_clean.to_csv(outfile, sep="\t", index=False, header=False)

    print("Done!")
    print()


sFH11-1 	 LB1 	 11335
Writing... /scratch/CellFreeReporter/pipelines/220520_Demo6_consensus_error_correct_starcode//clone_barcodes.sFH11-1.LB1.tsv.tmp
Done!

sFH11-1 	 LB2 	 33614
Writing... /scratch/CellFreeReporter/pipelines/220520_Demo6_consensus_error_correct_starcode//clone_barcodes.sFH11-1.LB2.tsv.tmp
Done!

sFH11-10 	 LB1 	 5132
Writing... /scratch/CellFreeReporter/pipelines/220520_Demo6_consensus_error_correct_starcode//clone_barcodes.sFH11-10.LB1.tsv.tmp
Done!

sFH11-10 	 LB2 	 129428
Writing... /scratch/CellFreeReporter/pipelines/220520_Demo6_consensus_error_correct_starcode//clone_barcodes.sFH11-10.LB2.tsv.tmp
Done!

sFH11-11 	 LB1 	 5372
Writing... /scratch/CellFreeReporter/pipelines/220520_Demo6_consensus_error_correct_starcode//clone_barcodes.sFH11-11.LB1.tsv.tmp
Done!

sFH11-11 	 LB2 	 85094
Writing... /scratch/CellFreeReporter/pipelines/220520_Demo6_consensus_error_correct_starcode//clone_barcodes.sFH11-11.LB2.tsv.tmp
Done!

sFH11-12 	 LB1 	 1598
Writing... /scratch/Cel

The next step is to run Starcode on these barcode files using `starcode.sh`.