In [1]:
import getpass

import fsspec
import pandas as pd

In [2]:
import barqs
import fasta
import fastq

In [3]:
# Optional remote access (currently disabled). Uncommenting the two lines below
# prompts for an SSH password and binds `fs` to an fsspec "ssh" filesystem; the
# commented-out `fs.open(...)` alternatives in the FASTQ-loading cells need it.
# _password = getpass.getpass(prompt="Enter password for ssh host: ")
# fs = fsspec.filesystem("ssh", host="gesu.nygenome.org", username="collinsd", password=_password)

In [4]:
# Sizes handed to barqs.extract() for every read-1 record.
# NOTE(review): presumably lengths in bases of the cell barcode and the UMI;
# the column labels of the resulting count table are 16 characters long, which
# matches barcode_size — confirm the exact semantics against barqs.extract.
barcode_size, umi_size = 16, 12

In [5]:
# Feature sequences to quantify: 14 entries, each 15 nucleotides long.
# NOTE(review): the input/output files are named HTO_*, so these are presumably
# hashtag-oligo sequences — confirm.
raw_features = [
    "TGGTATCAGTCTCGT", "ACGTAGTGCACTAGA",
    "ATTGCCTCAGGGTTT", "AGAGCGACAGAATAC",
    "TGAGAGAAGGGACTA", "GCATCTCTCAAGCAA",
    "CACCCAATAGTAAGG", "TTAACCACTGTGTAC",
    "TCGTTATAACATCTG", "GCAGTTAAAGTCGGG",
    "AGAATAGGAGCCGCA", "ATACACAGTTTACCG",
    "CATCTAGACCTAACT", "GAGATACTGCCATAG",
]

In [6]:
# (name, sequence) pairs for the downstream lookup and quantification steps.
# No separate names are defined, so each sequence doubles as its own name.
features = list(zip(raw_features, raw_features))

In [7]:
# Lookup table from feature sequence to feature name, built from the
# (name, sequence) pairs in `features`; passed as feature_lookup to the trimmers.
feature_map = dict((sequence, label) for label, sequence in features)

In [8]:
# Read-1 input; `identifiers` is derived from these records further down.
# The commented-out line is the remote alternative and needs the ssh filesystem
# `fs`. NOTE(review): the local file is presumably a copy of that remote FASTQ
# (the file names differ) — confirm.
# read1_file = fs.open("/brahms/kowalskim/data/hsc/multiome_072924/fastq/Multiome-HTO_S5_L001_R1_001.fastq.gz")
read1_file = "/Users/dcollins/workspace/data/jyun/HTO_R1.fastq.gz"
read1 = fastq.load(read1_file)

In [9]:
# Read-2 input; these records are the ones that get tagged and trimmed below.
# The commented-out line is the remote alternative and needs the ssh filesystem
# `fs`. NOTE(review): the local file is presumably a copy of that remote FASTQ
# (the file names differ) — confirm.
# read2_file = fs.open("/brahms/kowalskim/data/hsc/multiome_072924/fastq/Multiome-HTO_S5_L001_R2_001.fastq.gz")
read2_file = "/Users/dcollins/workspace/data/jyun/HTO_R2.fastq.gz"
read2 = fastq.load(read2_file)

In [10]:
# Lazy stream with one barqs.extract() result per read-1 record. Nothing is
# extracted until a consumer iterates; each item is later unpacked as
# (barcode, umi) when it is zipped with read 2.
identifiers = (
    barqs.extract(record, barcode_size=barcode_size, umi_size=umi_size)
    for record in read1
)

In [11]:
# Tag every read-2 record with the (barcode, umi) pair taken from the read-1
# record at the same position; trimming is deferred to the later stages.
# NOTE(review): zip() stops at the shorter input, so both FASTQ streams are
# assumed to have the same order and length — confirm for these files.
reads = (
    barqs.tag(record, barcode, umi, trim=False)
    for record, (barcode, umi) in zip(read2, identifiers)
)

In [12]:
trimmed_reads = (
    barqs.trim_by_index(
        read, 
        feature_lookup=feature_map,
        region=(0, 15)
    ) 
    for read in reads
)

In [13]:
trimmed_reads = (
    barqs.trim_by_regex(
        read, 
        feature_lookup=feature_map,
        tolerance=1,
    )
    for read in trimmed_reads
)

In [14]:
# Remove duplicate observations from the trimmed stream.
# NOTE(review): what counts as a duplicate is decided inside
# barqs.filter_duplicates (presumably the barcode/UMI tags) — confirm.
observed_features = barqs.filter_duplicates(trimmed_reads)

In [15]:
# Tally the de-duplicated observations against the declared `features`; the
# result is handed straight to pd.DataFrame in the next cell.
# NOTE(review): the upstream stages are generator expressions, so this is
# presumably where the FASTQ streams are actually consumed; re-running this cell
# alone would then see exhausted inputs — confirm.
counts = barqs.quantify(observed_features, features)

In [16]:
# Tabulate the counts and replace missing cells (NaN) with 0. In the rendered
# table the rows are feature sequences and the columns are barcode strings.
counts_df = pd.DataFrame(data=counts).fillna(value=0)

In [17]:
# Persist this run's table as "v3". The index is written as an unlabeled first
# column, which pandas names "Unnamed: 0" when the file is read back.
counts_df.to_csv(path_or_buf="/Users/dcollins/workspace/data/jyun/HTO_counts_v3.csv")

In [18]:
# Show the count table (rich DataFrame display of the cell's last expression).
counts_df

Unnamed: 0,CATCANTCAGGCTTGT,CAAGGNTTCCCTGTTA,ATTACNCGTAAGCACC,GAAAGNCTCGACAAAG,GCCCTNATCAATAGCC,GCCTGNTGTACTTCAC,CGATTNCTCCTAATGA,GGCATNAGTTTGTTGC,GGTTGNCGTGTTGTAG,ACAGGNTGTAATCGGC,...,TTAAAACAGAAAGCAT,AAGCATATCGTTAACA,CGTGCTGCATCCAGGT,GGATACTTCATGGCCA,TACGGATTCGCAGGCT,GCAGGCAAGCTTTTTT,TACAAGCTCTTTTACG,AGGTTACTCATCACTT,TTAGATGTTTACCGTT,GATGCGGGTGTTTGAG
AGAGCGACAGAATAC,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
CATCTAGACCTAACT,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
TCGTTATAACATCTG,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
GAGATACTGCCATAG,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ATACACAGTTTACCG,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
TTAACCACTGTGTAC,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
GCAGTTAAAGTCGGG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
TGGTATCAGTCTCGT,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
CACCCAATAGTAAGG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
GCATCTCTCAAGCAA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
prev_counts = pd.read_csv("/Users/dcollins/workspace/data/jyun/HTO_counts_v2.csv")

In [21]:
prev_counts = prev_counts.set_index("Unnamed: 0")

In [27]:
# Inspect the row labels (feature sequences) of the previous v2 table.
prev_counts.index

Index(['AGAGCGACAGAATAC', 'CATCTAGACCTAACT', 'GAGATACTGCCATAG',
       'TCGTTATAACATCTG', 'ATACACAGTTTACCG', 'TTAACCACTGTGTAC',
       'GCAGTTAAAGTCGGG', 'TGGTATCAGTCTCGT', 'CACCCAATAGTAAGG',
       'GCATCTCTCAAGCAA', 'AGAATAGGAGCCGCA', 'ATTGCCTCAGGGTTT',
       'ACGTAGTGCACTAGA', 'TGAGAGAAGGGACTA'],
      dtype='object', name='Unnamed: 0')

In [None]:
# Row labels of the freshly computed table, to compare with prev_counts.index.
# (The attribute name was truncated to `.inde`, which raises AttributeError.)
counts_df.index