In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import glob
import os

import pandas as pd
from tqdm import tqdm

# Single-cell signatures

In [3]:
pwd

'/home/olga/code/immune-evolution--olgabot/analyze-kmermaid-bladder/notebooks'

In [4]:
# Interpreter and script that every generated command line starts with.
PYTHON = "/home/olga/miniconda3/envs/immune-evolution/bin/python"
PWD = "/home/olga/code/immune-evolution--olgabot/analyze-kmermaid-bladder/notebooks"
SIG2KMER = f"{PWD}/sig2kmer.py"

# Command template: the {placeholders} stay literal here (raw, non-f string) and are
# filled in later, once per input fasta, with str.format.
sig2kmer_template = " ".join(
    [
        PYTHON,
        SIG2KMER,
        r"--quiet {moltype_flags} --output-sequences {output_fasta} --output-kmers {output_kmer_csv} {query_sig} {input_fasta}",
    ]
)
sig2kmer_template

'/home/olga/miniconda3/envs/immune-evolution/bin/python /home/olga/code/immune-evolution--olgabot/analyze-kmermaid-bladder/notebooks/sig2kmer.py --quiet {moltype_flags} --output-sequences {output_fasta} --output-kmers {output_kmer_csv} {query_sig} {input_fasta}'

In [5]:
# Sketch ids to process, grouped by the kind of input fasta they are computed from.
# Insertion order matters: it is the order in which commands get generated.
fasta_type_to_sketch_ids = dict(
    [
        ("nucleotides", ["alphabet-DNA__ksize-21__scaled-10"]),
        (
            "peptides",
            [
                "alphabet-dayhoff__ksize-51__scaled-10",
                "alphabet-protein__ksize-30__scaled-10",
            ],
        ),
    ]
)

# Command-line flags substituted for {moltype_flags} in the sig2kmer command, per sketch id.
sketch_id_to_flags = dict(
    [
        ("alphabet-DNA__ksize-21__scaled-10", "--dna --ksize 21"),
        ("alphabet-dayhoff__ksize-51__scaled-10", "--dayhoff --no-dna --ksize 51 --input-is-protein"),
        ("alphabet-protein__ksize-30__scaled-10", "--protein --no-dna --ksize 30 --input-is-protein"),
    ]
)

In [6]:
%%time


def clean_fasta_name(basename, strings_to_remove=("__aligned", '__possorted_genome_bam', '_possorted_genome_bam', '__unaligned')):
    """Reduce an input fasta basename to the cell id used to name signatures and outputs.

    Parameters
    ----------
    basename : str
        File name (no directory) of an input fasta.
    strings_to_remove : iterable of str
        Substrings deleted from ``basename``, applied in the given order. May be empty,
        in which case only the 'coding_reads' trimming below is applied.

    Returns
    -------
    str
        ``basename`` with every listed substring removed, truncated at the first
        occurrence of 'coding_reads', and with leading/trailing underscores stripped.
    """
    # Start from the original name so an empty `strings_to_remove` still yields a string
    new_name = basename
    for to_remove in strings_to_remove:
        new_name = new_name.replace(to_remove, '')

    # Everything from 'coding_reads' onwards (suffix + extension) is not part of the id
    return new_name.split('coding_reads')[0].strip("_")


def fasta_file_contains_read_names(output_fasta):
    """Return True if ``output_fasta`` exists and any of its lines contains 'GN:Z:'.

    Parameters
    ----------
    output_fasta : str
        Path to a fasta file that may or may not exist yet.

    Returns
    -------
    bool
        False when the path does not exist or no line contains the 'GN:Z:'
        (gene name) tag; True as soon as one such line is found.
    """
    if not os.path.exists(output_fasta):
        return False
    with open(output_fasta) as f:
        # Iterate the file lazily: stops at the first tagged line and never holds
        # the whole fasta in memory (readlines() did both)
        return any('GN:Z:' in line for line in f)

def csv_is_valid(output_csv):
    """Return True if ``output_csv`` can be parsed by ``pandas.read_csv``.

    Parameters
    ----------
    output_csv : str
        Path to the csv file to check.

    Returns
    -------
    bool
        False if reading fails for any reason (missing file, empty file,
        malformed content, ...), True otherwise.
    """
    try:
        # Read the path that was passed in (previously an undefined name was used,
        # so the check failed unconditionally)
        pd.read_csv(output_csv)
    except Exception:
        # Best-effort validity check: any read failure means "not valid".
        # `Exception` rather than a bare except, so Ctrl-C still interrupts a long scan.
        return False
    return True


# Add "--t" to glob command for train and test data
for species_folder in glob.glob(
#     "/home/olga/data_lg/data_sm_copy/immune-evolution/kmer-signatures/*--t*/"
 "/home/olga/data_lg/data_sm_copy/immune-evolution/kmer-signatures/*lemur*/"
):
    fasta_base = os.path.join(species_folder, "0--fastas")
    sig_base = os.path.join(species_folder, "1--single-cell-sigs")
    output_base = os.path.join(species_folder, "2--single-cell-kmers")
    if not os.path.exists(output_base):
        os.makedirs(output_base)
    
    
    command_txt = os.path.join(output_base, 'sig2kmer_commands.txt')
    
    with open(command_txt, 'w') as f:
        for fasta_type, sketch_ids in fasta_type_to_sketch_ids.items():
            for sketch_id in sketch_ids:
                print(sketch_id)
                globber = os.path.join(fasta_base, fasta_type, '*')
                input_fastas = glob.iglob(globber)
                moltype_flags = sketch_id_to_flags[sketch_id]
                sketch_id_sig_folder = os.path.join(sig_base, sketch_id)

                sketch_id_output_base = os.path.join(output_base, sketch_id)
                fasta_output_base = os.path.join(sketch_id_output_base, "fastas")
                csv_output_base = os.path.join(sketch_id_output_base, "csvs")
                
#                 if not os.path.exists(csv_output_base):
#                     os.makedirs(fasta_output_base)
#                     os.makedirs(csv_output_base)
                total = sum(1 for _ in glob.iglob(globber))
                for input_fasta in tqdm(input_fastas, total=total):
                    basename = os.path.basename(input_fasta)
                    alignment_status = 'aligned' if '_aligned_' in basename else 'unaligned'
                    fasta_alignment_status_base = os.path.join(fasta_output_base, alignment_status)
                    csv_alignment_status_base = os.path.join(csv_output_base, alignment_status)
                    
                    if not os.path.exists(csv_alignment_status_base):
                        os.makedirs(fasta_alignment_status_base)
                        os.makedirs(csv_alignment_status_base)
                    
                    cell_id = clean_fasta_name(basename)
                

                    # Create inputs
                    query_sig = os.path.join(sketch_id_sig_folder, f"{cell_id}.sig")

                    if not os.path.exists(query_sig):
                        continue
                    output_fasta = os.path.join(fasta_alignment_status_base, f"{cell_id}.fasta")
                    output_kmer_csv = os.path.join(csv_alignment_status_base, f"{cell_id}.csv")

#                     # If we already found k-mers for this one, skip
#                     if os.path.exists(output_kmer_csv) and os.path.getsize(output_fasta) > 0:
#                         continue
                    

#                     if fasta_file_contains_read_names(output_fasta) and csv_is_valid(output_kmer_csv):
#                         # If the fasta file already has read names, skip it!
#                         continue

                    command = sig2kmer_template.format(
                        moltype_flags=moltype_flags,
                        output_fasta=output_fasta,
                        output_kmer_csv=output_kmer_csv,
                        query_sig=query_sig,
                        input_fasta=input_fasta,
                    )
                    f.write(command + '\n')

    print(f'parallel --progress --eta --jobs 96 < {command_txt}')
    ! wc -l $command_txt
    print('---')
    ! head -n 1 $command_txt
    print('---')
                


alphabet-DNA__ksize-21__scaled-10


100%|██████████| 1964/1964 [00:06<00:00, 326.47it/s]


alphabet-dayhoff__ksize-51__scaled-10


100%|██████████| 1964/1964 [00:04<00:00, 463.81it/s] 
  0%|          | 2/1964 [00:00<01:46, 18.34it/s]

alphabet-protein__ksize-30__scaled-10


100%|██████████| 1964/1964 [00:03<00:00, 578.60it/s] 


parallel --progress --eta --jobs 96 < /home/olga/data_lg/data_sm_copy/immune-evolution/kmer-signatures/4--test-lemur/2--single-cell-kmers/sig2kmer_commands.txt
4824 /home/olga/data_lg/data_sm_copy/immune-evolution/kmer-signatures/4--test-lemur/2--single-cell-kmers/sig2kmer_commands.txt
---
/home/olga/miniconda3/envs/immune-evolution/bin/python /home/olga/code/immune-evolution--olgabot/analyze-kmermaid-bladder/notebooks/sig2kmer.py --quiet --dna --ksize 21 --output-sequences /home/olga/data_lg/data_sm_copy/immune-evolution/kmer-signatures/4--test-lemur/2--single-cell-kmers/alphabet-DNA__ksize-21__scaled-10/fastas/aligned/ANTOINE_LUNG_P3__AATCGGTCACGGCTAC.fasta --output-kmers /home/olga/data_lg/data_sm_copy/immune-evolution/kmer-signatures/4--test-lemur/2--single-cell-kmers/alphabet-DNA__ksize-21__scaled-10/csvs/aligned/ANTOINE_LUNG_P3__AATCGGTCACGGCTAC.csv /home/olga/data_lg/data_sm_copy/immune-evolution/kmer-signatures/4--test-lemur/1--single-cell-sigs/alphabet-DNA__ksize-21__scaled-10

In [14]:
! grep dayhoff $command_txt | head -n 1

/home/olga/miniconda3/envs/immune-evolution/bin/python /home/olga/code/immune-evolution--olgabot/analyze-kmermaid-bladder/notebooks/sig2kmer.py --quiet --dayhoff --no-dna --ksize 51 --input-is-protein --output-sequences /home/olga/data_lg/data_sm_copy/immune-evolution/kmer-signatures/4--test-lemur/2--single-cell-kmers/alphabet-dayhoff__ksize-51__scaled-10/fastas/unaligned/ANTOINE_LUNG_P3__GCACATATCGGAATCT.fasta --output-kmers /home/olga/data_lg/data_sm_copy/immune-evolution/kmer-signatures/4--test-lemur/2--single-cell-kmers/alphabet-dayhoff__ksize-51__scaled-10/csvs/unaligned/ANTOINE_LUNG_P3__GCACATATCGGAATCT.csv /home/olga/data_lg/data_sm_copy/immune-evolution/kmer-signatures/4--test-lemur/1--single-cell-sigs/alphabet-dayhoff__ksize-51__scaled-10/ANTOINE_LUNG_P3__GCACATATCGGAATCT.sig /home/olga/data_lg/data_sm_copy/immune-evolution/kmer-signatures/4--test-lemur/0--fastas/peptides/ANTOINE_LUNG_P3__possorted_genome_bam__unaligned__unaligned__GCACATATCGGAATCT__coding_reads_peptides.fasta

In [7]:
    # Re-print the GNU parallel invocation and inspect the command file without regenerating it.
    # Relies on `command_txt` still being set by the species loop in an earlier cell, so it
    # refers to the last species folder that loop processed.
    print(f'parallel --progress --eta --jobs 96 < {command_txt}')
    # Number of lines, i.e. number of commands written (one per line)
    ! wc -l $command_txt
    print('---')
    # First command, as a spot check of the formatted template
    ! head -n 1 $command_txt
    print('---')

parallel --progress --eta --jobs 96 < /home/olga/data_lg/data_sm_copy/immune-evolution/kmer-signatures/4--test-lemur/2--single-cell-kmers/sig2kmer_commands.txt
4824 /home/olga/data_lg/data_sm_copy/immune-evolution/kmer-signatures/4--test-lemur/2--single-cell-kmers/sig2kmer_commands.txt
---
/home/olga/miniconda3/envs/immune-evolution/bin/python /home/olga/code/immune-evolution--olgabot/analyze-kmermaid-bladder/notebooks/sig2kmer.py --quiet --dna --ksize 21 --output-sequences /home/olga/data_lg/data_sm_copy/immune-evolution/kmer-signatures/4--test-lemur/2--single-cell-kmers/alphabet-DNA__ksize-21__scaled-10/fastas/aligned/ANTOINE_LUNG_P3__AATCGGTCACGGCTAC.fasta --output-kmers /home/olga/data_lg/data_sm_copy/immune-evolution/kmer-signatures/4--test-lemur/2--single-cell-kmers/alphabet-DNA__ksize-21__scaled-10/csvs/aligned/ANTOINE_LUNG_P3__AATCGGTCACGGCTAC.csv /home/olga/data_lg/data_sm_copy/immune-evolution/kmer-signatures/4--test-lemur/1--single-cell-sigs/alphabet-DNA__ksize-21__scaled-10

In [8]:
1+1

2

In [9]:
! grep -c unaligned /home/olga/data_lg/data_sm_copy/immune-evolution/kmer-signatures/1--train-mouse/2--single-cell-kmers/sig2kmer_commands.txt

15028


In [10]:
basename

'ANTOINE_LUNG_P3__possorted_genome_bam__aligned__aligned__TCAATCTAGCTACCGC__coding_reads_peptides.fasta'

## Are any of the output fasta files non-empty?

In [11]:
sketch_ids

['alphabet-dayhoff__ksize-51__scaled-10',
 'alphabet-protein__ksize-30__scaled-10']

In [12]:
# Every sketch id that has sig2kmer flags defined, in declaration order
# (DNA, dayhoff, protein). Derived from `sketch_id_to_flags` rather than
# retyped, so the two cannot drift apart.
nucleotide_protein_sketch_ids = list(sketch_id_to_flags)

In [13]:
# Output sub-folders that the k-mer fastas are split into
aligned_unaligned = 'aligned', 'unaligned'

# For every train/test species folder, report how many output fastas are non-empty
# per sketch id and alignment status.
for species_dir in glob.glob('/home/olga/data_lg/data_sm_copy/immune-evolution/kmer-signatures/*--t*'):
    for sketch_id in nucleotide_protein_sketch_ids:
        for alignment_status in aligned_unaligned:
            fastas = glob.iglob(os.path.join(species_dir, f'2--single-cell-kmers/{sketch_id}/fastas/{alignment_status}/*.fasta'))
            # Count the files whose size on disk is greater than zero bytes
            n = sum(1 for fasta in fastas if os.path.getsize(fasta) > 0)
            print(f'{os.path.basename(species_dir)}, {sketch_id},\t{alignment_status} has\t{n} nonzero byte fasta files')

2--test-human, alphabet-DNA__ksize-21__scaled-10,	aligned has	14153 nonzero byte fasta files
2--test-human, alphabet-DNA__ksize-21__scaled-10,	unaligned has	13046 nonzero byte fasta files
2--test-human, alphabet-dayhoff__ksize-51__scaled-10,	aligned has	14154 nonzero byte fasta files
2--test-human, alphabet-dayhoff__ksize-51__scaled-10,	unaligned has	12261 nonzero byte fasta files
2--test-human, alphabet-protein__ksize-30__scaled-10,	aligned has	14154 nonzero byte fasta files
2--test-human, alphabet-protein__ksize-30__scaled-10,	unaligned has	12181 nonzero byte fasta files
3--test-bat, alphabet-DNA__ksize-21__scaled-10,	aligned has	7695 nonzero byte fasta files


KeyboardInterrupt: 