In [1]:
import os
import sys
import gzip
import argparse
import numpy as np
import pandas as pd
from collections import Counter
from Bio.SeqIO.QualityIO import FastqGeneralIterator as FGI
from Bio.Seq import reverse_complement

In [2]:
# '/home/yike/phd/cancer_cells_img_seq/scripts/NextSeq_pipeline/create_sample_sheet.ipynb'

In [3]:
def read_designed_indices(idx_fn=None):
    """Load the designed index pairs from the sample-sheet table.

    Parameters
    ----------
    idx_fn : str or path-like, optional
        Comma-separated file containing at least the columns ``Index_ID``,
        ``index`` and ``index2``; its first column is used as the row index.
        When omitted, the 20220201 NextSeq ``SampleSheet_content.csv`` is read.

    Returns
    -------
    dict
        Maps every ``Index_ID`` to the string ``'<index>+<index2>'``.
    """
    if idx_fn is None:
        idx_fn = '/home/yike/phd/cancer_cells_img_seq/data/20220201_NextSeq/SampleSheet_content.csv'
    df = pd.read_csv(idx_fn, sep=',', index_col=0)
    # Join the two index sequences with '+', the same separator used for the
    # observed 'combo' keys built from the I1/I2 reads.
    return {
        ID: index + '+' + index2
        for ID, index, index2 in zip(df['Index_ID'], df['index'], df['index2'])
    }


In [8]:
def get_read_lengths():
    """Return the length of the first read of each undetermined FASTQ file.

    The four gzipped files (I1, I2, R1, R2) are looked up in the hard-coded
    ``muxed`` folder. Only the first record of each file is parsed; a file
    that yields no record contributes no entry.

    Returns
    -------
    dict
        Maps 'i1', 'i2', 'r1', 'r2' to the sequence length of the first read.
    """
    muxed_fdn = '/home/yike/phd/cancer_cells_img_seq/data/20220201_NextSeq/muxed/'
    fastq_by_read = {
        'i1': f'{muxed_fdn}Undetermined_S0_I1_001.fastq.gz',
        'i2': f'{muxed_fdn}Undetermined_S0_I2_001.fastq.gz',
        'r1': f'{muxed_fdn}Undetermined_S0_R1_001.fastq.gz',
        'r2': f'{muxed_fdn}Undetermined_S0_R2_001.fastq.gz',
    }
    read_lengths = {}
    for read_key, fastq_fn in fastq_by_read.items():
        with gzip.open(fastq_fn, 'rt') as handle:
            # Each record is a (name, sequence, quality) tuple; only the
            # first one is needed.
            first_record = next(FGI(handle), None)
        if first_record is not None:
            read_lengths[read_key] = len(first_record[1])
    return read_lengths

In [9]:
if __name__ == '__main__':

    # Read lengths per FASTQ file and the designed index pairs.
    read_length_dict = get_read_lengths()
    idx_dict = read_designed_indices()

    # Maximum number of index-read pairs to tally. The script version of this
    # code took the value from an argparse option (--maxreads, int,
    # default 100000); in the notebook it is fixed here.
    maxreads = 100000

    muxed_fdn = '/home/yike/phd/cancer_cells_img_seq/data/20220201_NextSeq/muxed/'
    fn_i1 = f'{muxed_fdn}Undetermined_S0_I1_001.fastq.gz'
    fn_i2 = f'{muxed_fdn}Undetermined_S0_I2_001.fastq.gz'

    # One tally per index read plus one for the 'seq1+seq2' combination.
    counts = {key: Counter() for key in ('i1', 'i2', 'combo')}

    with gzip.open(fn_i1, 'rt') as f1, gzip.open(fn_i2, 'rt') as f2:
        # The two files are walked in lockstep, pairing records by position.
        for ir, (idx1, idx2) in enumerate(zip(FGI(f1), FGI(f2))):
            if ir == maxreads:
                break

            # FIXME: add quality filter?
            # Records are (name, sequence, quality); only the sequence is used.
            seq1 = idx1[1]
            seq2 = idx2[1]

            counts['i1'][seq1] += 1
            counts['i2'][seq2] += 1
            counts['combo'][f'{seq1}+{seq2}'] += 1

    # Turn every Counter into a frame sorted by read count, with an empty
    # 'designed' column to be filled in by a later cell.
    for key in counts:
        frame = pd.Series(counts[key]).sort_values(ascending=False).to_frame(name='n_reads')
        frame['designed'] = ''
        counts[key] = frame

In [10]:
    # Count tables to annotate; only the combined 'i1+i2' table is enabled.
    idx_corr = {
        #'i1': 'i7',
        #'i2': 'i5',
        'combo': 'combo',
    }
    # Label every observed sequence that equals a designed index pair, either
    # as designed or as its reverse complement (suffix ' (RC)').
    for key in idx_corr:
        count = counts[key]
        for name, seq in idx_dict.items():
            if seq in count.index:
                count.at[seq, 'designed'] = name
                continue
            seqrc = reverse_complement(seq)
            if seqrc in count.index:
                count.at[seqrc, 'designed'] = name + ' (RC)'

    # Index 2: all good, mostly our i7 but ~25% Shruti's, extended by 4 bases
    # Index 1: all good except for 1 barcode (TCTTTCCCTACA), let's figure out if it's
    # at close Hamming distance from another barcode
    # NOTE(review): the notes above talk about Index 1, but the matrix below is
    # built from counts['i2'] -- confirm which read is intended.

    # Pairwise Hamming distances between the most frequent sequences. The
    # matrix size is capped at the number of available sequences so that a
    # run with fewer than 130 distinct sequences does not fail on a shape
    # mismatch when the labelled DataFrame is built.
    n_top = min(130, len(counts['i2']))
    top_seqs = counts['i2'].index[:n_top]
    # One row per sequence, one column per base.
    i1mat = np.array([list(s) for s in top_seqs])
    dmat = np.zeros((n_top, n_top), np.int32)
    for i, seq in enumerate(i1mat):
        for j, seq2 in enumerate(i1mat[:i]):
            # The distance is symmetric, so both triangles are filled at once.
            dmat[i, j] = dmat[j, i] = (seq != seq2).sum()
    dmat = pd.DataFrame(dmat, index=top_seqs, columns=top_seqs)
    # The intruder index is not close to anything... ??? not a biggie, but what the heck?


In [159]:
# Observed index combinations ('<i1>+<i2>' -> n_reads) and the designed indices.
count = pd.read_csv('/home/yike/phd/cancer_cells_img_seq/scripts/NextSeq_pipeline/data/qc_count.tsv', sep='\t', index_col=0)
idx_fn = '/home/yike/phd/cancer_cells_img_seq/data/20220201_NextSeq/SampleSheet_content.csv'
idx_df = pd.read_csv(idx_fn, sep=',', index_col=0)

# Map every designed index sequence to the unique name(s) carrying it.
# Index_ID is split on '_': parts 0-1 form the i7 name, parts 2-3 the i5 name.
# Several sample-sheet rows can share an index, so names are stored once
# (dict keys act as an insertion-ordered set).
i7_names = {}
i5_names = {}
for i, row in idx_df.iterrows():
    id_parts = row['Index_ID'].split('_')
    i7_name = id_parts[0] + '_' + id_parts[1]
    i5_name = id_parts[2] + '_' + id_parts[3]
    i7_names.setdefault(row['index'], {})[i7_name] = None
    i5_names.setdefault(row['index2'], {})[i5_name] = None

# For the 100 most frequent combinations, record one [name, n_reads, sequence]
# entry per matching designed index. Because each combination contributes at
# most once per name, no de-duplication is needed afterwards. The previous
# `DataFrame.duplicated()` filter also discarded distinct combinations that
# shared an index and happened to have the same read count.
i5_ls = []
i7_ls = []
for idx in count[:100].index:
    i1, i2 = idx.split('+')
    n_reads = count.at[idx, 'n_reads']
    for i7_name in i7_names.get(i1, ()):
        i7_ls.append([i7_name, n_reads, i1])
    for i5_name in i5_names.get(i2, ()):
        i5_ls.append([i5_name, n_reads, i2])

i5_df = pd.DataFrame(i5_ls, columns=['i5_name', 'n_reads', 'index_seq'])

# Total reads per i5 name, in order of first appearance; 'min' picks the
# alphabetically first sequence for a name, as np.unique(...)[0] did.
i5_df_new = (
    i5_df
    .groupby('i5_name', sort=False)
    .agg(n_reads=('n_reads', 'sum'), index_seq=('index_seq', 'min'))
    .reset_index()
)

In [160]:
# Show the `count` table: reads per observed index combination.
count

Unnamed: 0,n_reads,designed
GGGGGGGGGGGG+AGATCTCGGTGG,23393,
GGGGGGGGGGGG+TGCTGTTATTTG,1342,
AGGCAGAAATGG+GAGGATACACAT,1088,
GGGGGGGGGGGG+TATGACTCGGAA,1022,
GGGGGGGGGGGG+CAACTGATACAG,1016,
...,...,...
CTCGCTACCTCT+TCCGCTCGGTAA,1,
AGGCAGATCTCG+CGCTGCGGTTCC,1,
CTGTGTCGATCG+AAGATACAAGAG,1,
GGTGAGGAATGG+GAGGATACACAT,1,


In [134]:
# Rank the per-i5 read totals from highest to lowest.
i5_df_new.sort_values('n_reads', ascending=False)

Unnamed: 0,i5_name,n_reads,index_seq
0,FZi5_4,2436,TGCTGTTATTTG
4,FZi5_9,2156,TCCGCTCGGTAA
2,FZi5_6,2012,TATGACTCGGAA
1,FZi5_5,1939,GAGGATACACAT
3,FZi5_12,1778,CAACTGATACAG
5,FZi5_11,1565,CCTCACGCATCG
8,FZi5_8,1508,CGCTGCGGTTCC
6,FZi5_10,1493,CCCGCTTCGGTG
7,FZi5_3,1253,AAGATACAAGAG
12,FZi5_7,1234,CCGCCCGCTCTA


In [None]:
# Index 1 (i7): all good.
# Index 2 (i5): all good except for one barcode (AGATCTCGGTGG), which is not one of our designed indices.

In [None]:
#### Pilot experiment

In [157]:
# Pilot run: observed index combinations and the MiSeq sample sheet
# (the table header sits on line 16 of the sheet, 0-based).
count2 = pd.read_csv('/home/yike/phd/resources/cell_celector/scripts/image-seq/imseqpy/pilots/qc_count.tsv', sep='\t', index_col=0)
idx_fn2 = '/home/yike/phd/resources/cell_celector/data/imaging-sequencing/202106_MCF7_30cells/MiSeq_sequencing/SampleSheet.csv'
idx_df2 = pd.read_csv(idx_fn2, sep=',', index_col=0, header=16)

# Map every designed index sequence to the unique name(s) carrying it.
# The pilot tables (`count2`, `idx_df2`) are used here; the cell previously
# iterated over `count` / `idx_df`, which belong to the NextSeq run and do not
# provide the I7_Index_ID / I5_Index_ID columns.
i7_names = {}
i5_names = {}
for i, row in idx_df2.iterrows():
    i7_names.setdefault(row['index'], {})[row['I7_Index_ID']] = None
    i5_names.setdefault(row['index2'], {})[row['I5_Index_ID']] = None

# For the 100 most frequent combinations, record one [name, n_reads, sequence]
# entry per matching designed index. Each combination contributes at most once
# per name, so no `duplicated()` filter is needed; that filter also removed
# distinct combinations that shared an index and had equal read counts.
i5_ls = []
i7_ls = []
for idx in count2[:100].index:
    i1, i2 = idx.split('+')
    n_reads = count2.at[idx, 'n_reads']
    for i7_name in i7_names.get(i1, ()):
        i7_ls.append([i7_name, n_reads, i1])
    for i5_name in i5_names.get(i2, ()):
        i5_ls.append([i5_name, n_reads, i2])

# Total reads per i7 name, in order of first appearance; 'min' picks the
# alphabetically first sequence for a name, as np.unique(...)[0] did.
i7_df = pd.DataFrame(i7_ls, columns=['i7_name', 'n_reads', 'index_seq'])
i7_df_new = (
    i7_df
    .groupby('i7_name', sort=False)
    .agg(n_reads=('n_reads', 'sum'), index_seq=('index_seq', 'min'))
    .reset_index()
)

# Same summary for the i5 names.
i5_df = pd.DataFrame(i5_ls, columns=['i5_name', 'n_reads', 'index_seq'])
i5_df_new = (
    i5_df
    .groupby('i5_name', sort=False)
    .agg(n_reads=('n_reads', 'sum'), index_seq=('index_seq', 'min'))
    .reset_index()
)

In [158]:
# Show the `count2` table: reads per observed index combination in the pilot run.
count2

Unnamed: 0,n_reads,designed
CCACACAAGAGA+TCTTTCCCTACA,3211,
NNNNNNNNNNNN+NNNNNNNNNNNN,966,
CGTACTAGATCT+TCTTTCCCTACA,624,
CGTACTAATCTC+TTCCGAGTCATA,394,
CACACAAGAGAA+CGACTACACTGT,346,
...,...,...
TTCCTTCCCGCT+TCTTTCCCTACA,1,
CCACACAAGAGA+CGCGGAGTCCGG,1,
TGCTTTCCCCTC+TCTTTCCCTACA,1,
GAGTTGCTCAAA+TCTTTCCCTACA,1,


In [154]:
# Show the per-i5 read totals.
i5_df_new

Unnamed: 0,i5_name,n_reads,index_seq
0,FZi5_6,1025.0,TTCCGAGTCATA
1,FZi5_28,1087.0,CGACTACACTGT
2,FZi5_31,819.0,ATGGTTAGTGTA
3,FZi5_1,596.0,TCAATGACTAAA
4,FZi5_25,1107.0,TCTAAAGCACTT
5,FZi5_8,445.0,GGAACCGCAGCG
6,FZi5_18,437.0,TAGTGAGTTTCG
7,FZi5_32,717.0,ATCCAGTTGGGA
8,FZi5_19,569.0,CCTCCGCCTCAT
9,FZi5_9,284.0,TTACCGAGCGGA


In [155]:
# Show the per-i7 read totals.
i7_df_new 

Unnamed: 0,i7_name,n_reads,index_seq
0,i7_test,3679.0,CCACACAAGAGA
1,I7_2.2,624.0,CGTACTAGATCT
