In [6]:
import os
import glob
import re

from tqdm import tqdm
import pandas as pd

from sig2kmer import read_kmer_csv
from sig_utils import SKETCH_INFO_PATTERN

import sqlite3

In [7]:
# Maps each sketch id (the "alphabet-<alphabet>__ksize-<k>__scaled-<n>" directory name used on disk)
# to a string of command-line flags for that alphabet/ksize.
# NOTE(review): presumably these are flags for a sourmash/kmermaid sketching command — confirm which tool.
sketch_id_to_flags = {
    "alphabet-DNA__ksize-21__scaled-10": "--dna --ksize 21",
    "alphabet-dayhoff__ksize-51__scaled-10": "--dayhoff --no-dna --ksize 51 --input-is-protein",
    "alphabet-protein__ksize-30__scaled-10": "--protein --no-dna --ksize 30 --input-is-protein",
}

# Live keys view of the dict above; iterated by later cells to visit every sketch id
sketch_ids = sketch_id_to_flags.keys()

In [4]:
ls -lha /home/olga/data_sm/immune-evolution/kmer-signatures/4--test-lemur/5--celltype-kmers--merged-celltype-remove-common-kmers--min-kmer-count--10-percent/alphabet-dayhoff__ksize-51__scaled-10/fastas/aligned | head

total 1.0K
drwxr-xr-x 2 olga czb 4.0K Apr 21 09:49 [0m[01;34m.[0m/
drwxr-xr-x 3 olga czb 4.0K Apr 21 09:49 [01;34m..[0m/


In [5]:
# Subdirectories expected under each sketch's "fastas" folder
aligned_unaligned = 'aligned', 'unaligned'

# For every species / sketch id / alignment status combination, report how many
# fasta files contain at least one byte.
for species_dir in glob.glob('/home/olga/data_sm/immune-evolution/kmer-signatures/*--t*'):
    for sketch_id in sketch_ids:
        for alignment_status in aligned_unaligned:
            fasta_globber = os.path.join(species_dir, f'2--single-cell-kmers/{sketch_id}/fastas/{alignment_status}/*.fasta')
            # iglob yields matches lazily, so the file list is never held in memory
            n = sum(
                1
                for fasta_path in glob.iglob(fasta_globber)
                if os.path.getsize(fasta_path) > 0
            )
            print(f'{os.path.basename(species_dir)}\t {sketch_id}\t{alignment_status} has\t{n} nonzero byte fasta files')

2--test-human	 alphabet-DNA__ksize-21__scaled-10	aligned has	14153 nonzero byte fasta files
2--test-human	 alphabet-DNA__ksize-21__scaled-10	unaligned has	13046 nonzero byte fasta files
2--test-human	 alphabet-dayhoff__ksize-51__scaled-10	aligned has	14154 nonzero byte fasta files
2--test-human	 alphabet-dayhoff__ksize-51__scaled-10	unaligned has	12261 nonzero byte fasta files
2--test-human	 alphabet-protein__ksize-30__scaled-10	aligned has	14154 nonzero byte fasta files
2--test-human	 alphabet-protein__ksize-30__scaled-10	unaligned has	12181 nonzero byte fasta files
3--test-bat	 alphabet-DNA__ksize-21__scaled-10	aligned has	7695 nonzero byte fasta files


KeyboardInterrupt: 

# Single-cell signatures

In [None]:
# ll /home/olga/data_sm/immune-evolution/kmer-signatures/2--test-human/2--single-cell-kmers/alphabet-DNA__ksize-21__scaled-10/csvs/

In [None]:
1+1

In [None]:
# Glob matching every per-cell hash2kmer csv, laid out as
# <base>/<species dir>/2--single-cell-kmers/<sketch id>/csvs/<aligned|unaligned>/<cell id>.csv
globber = os.path.join(
    "/home/olga/data_sm/immune-evolution/kmer-signatures/",
    "*",
    "2--single-cell-kmers",
    "*",
    "csvs",
    "*",
    "*.csv",
)
# A first pass over the glob only counts the files, so tqdm can display a total
total = sum(1 for _ in glob.iglob(globber))


dfs = []
for csv in tqdm(glob.iglob(globber), total=total):
    try:
        df = read_kmer_csv(csv)
    except pd.errors.EmptyDataError:
        print(f"Empty file: {csv}")
        continue

    split = csv.split('/')
    # Raw string: '\w' in a regular string literal is an invalid escape sequence.
    # findall returns (outer group, inner group) tuples; [0][0] is e.g. "test-human".
    test_species = re.findall(r'((test|train)-\w+)', csv)[0][0]
    species = test_species.split('-')[-1]
    df['species'] = species
    df['cell_id'] = os.path.basename(csv).split('.')[0]
    mol_or_alpha, moltype, ksize, style, value = re.findall(SKETCH_INFO_PATTERN, csv)[0]
    # Path components counted from the end: [-1] file, [-2] alignment dir, [-3] "csvs", [-4] sketch id
    df['sketch_id'] = split[-4]
    df['moltype'] = moltype
    df['ksize'] = ksize
    df[style] = int(value)
    alignment_status = split[-2]
    df['alignment_status'] = alignment_status
    # Append exactly once per file, after all metadata columns are set. Appending
    # twice would duplicate every cell's rows in the concatenated table.
    dfs.append(df)
kmers = pd.concat(dfs)
print(kmers.shape)
kmers.head()

In [None]:
kmers.query('species == "bat"')

In [19]:
# Sanity check of the species regex on the last csv path from the loading loop.
# Raw string avoids the invalid '\w' escape in a regular string literal.
re.findall(r'((test|train)-\w+)', csv)[0][0]

'test-human'

'/home/olga/data_sm/immune-evolution/kmer-signatures/2--test-human/2--single-cell-kmers/alphabet-dayhoff__ksize-51__scaled-10/csvs/unaligned/P3_4__GAGCAGAAGTGGACGT.csv'

In [23]:
# import pandas as pd
# import sqlite3

# con = sqlite3.connect("data/portal_mammals.sqlite")

# # Load the data into a DataFrame
# surveys_df = pd.read_sql_query("SELECT * from surveys", con)

# # Select only data for 2002
# surveys2002 = surveys_df[surveys_df.year == 2002]

# # Write the new DataFrame to a new SQLite table
# surveys2002.to_sql("surveys2002", con, if_exists="replace")

# con.close()

# Write as a command line tool per species

In [22]:
%%file aggregate_per_species_sig2kmer.py

import argparse
import os
import glob
import re

from tqdm import tqdm
import pandas as pd
from joblib import Parallel, delayed
from sig2kmer import read_kmer_csv
from sig_utils import SKETCH_INFO_PATTERN
from sourmash.logging import error, notify, set_quiet
import sqlite3


def process_single_kmer_csv(csv):
    """Read one sig2kmer csv and annotate it with metadata parsed from its path.

    Two directory layouts are supported (both described in the
    ``--no-aligned-unaligned-subdir`` help text of ``main``):

    * ``.../{sketch_id}/csvs/{aligned|unaligned}/{cell_id}.csv``
    * ``.../{sketch_id}/csvs/{cell_id}.csv``

    Parameters
    ----------
    csv : str
        "/"-separated path to the csv. It must contain a ``test-<species>`` or
        ``train-<species>`` component and text matching ``SKETCH_INFO_PATTERN``.

    Returns
    -------
    pandas.DataFrame or None
        The dataframe returned by ``read_kmer_csv`` with the added columns
        ``species``, ``cell_id``, ``sketch_id``, ``moltype``, ``ksize``,
        ``alignment_status`` and one column named after the sketch style group
        of ``SKETCH_INFO_PATTERN`` holding its integer value.
        ``alignment_status`` is None when the csv sits directly inside ``csvs/``.
        Returns None when the file is empty or cannot be parsed.

    Raises
    ------
    IndexError
        If the path contains no species component or no sketch information.
    """
    try:
        df = read_kmer_csv(csv)
    except (pd.errors.EmptyDataError, pd.errors.ParserError):
        # Empty or malformed file: nothing to contribute for this cell
        return None

    split = csv.split("/")
    # Raw string: "\w" in a regular string literal is an invalid escape sequence.
    # findall returns (outer group, inner group) tuples; [0][0] is e.g. "test-human".
    test_species = re.findall(r"((test|train)-\w+)", csv)[0][0]
    species = test_species.split("-")[-1]
    df["species"] = species
    df["cell_id"] = os.path.basename(csv).split(".")[0]
    mol_or_alpha, moltype, ksize, style, value = re.findall(SKETCH_INFO_PATTERN, csv)[0]

    parent_dir = split[-2]
    if parent_dir == "csvs":
        # Flat layout: the sketch id directory is the direct parent of "csvs"
        # and there is no aligned/unaligned level to report.
        sketch_id = split[-3]
        alignment_status = None
    else:
        # Nested layout: .../{sketch_id}/csvs/{alignment_status}/{file}
        sketch_id = split[-4]
        alignment_status = parent_dir

    df["sketch_id"] = sketch_id
    df["moltype"] = moltype
    df["ksize"] = ksize
    df[style] = int(value)
    df["alignment_status"] = alignment_status
    return df


def main():
    """Aggregate per-cell hash2kmer csvs into one parquet file per sketch id.

    Parses the command line, then for every ``alphabet-*ksize-*`` directory
    under ``{species_base_dir}/{kmer_subdir}`` reads all matching csvs with
    ``process_single_kmer_csv`` (in parallel via joblib), concatenates the
    resulting dataframes and writes them to ``hash2kmer.parquet`` inside that
    sketch directory. Sketch directories that yield no readable csv are
    skipped with a notification.
    """
    p = argparse.ArgumentParser()
    # base directory containing a 2--single-cell-kmers folder which contains sketch id directories with sig2kmer csvs
    p.add_argument("species_base_dir")
    p.add_argument(
        "--n-jobs",
        type=int,
        default=16,
        help="Number of processes to use",
    )
    p.add_argument(
        "--kmer-subdir",
        default="2--single-cell-kmers",
        type=str,
        help="Subdirectory containing csvs within each per-sketch id subdirectory",
    )
    p.add_argument(
        "--no-aligned-unaligned-subdir",
        action="store_true",
        help=(
            "If not set, looks for files in {species_base_dir}/{kmer_subdir}/{sketch_id}/csvs/aligned and "
            "{species_base_dir}/{kmer_subdir}/{sketch_id}/csvs/unaligned. Otherwise, "
            "looks in {species_base_dir}/{kmer_subdir}/{sketch_id}/csvs/"
        ),
    )
    args = p.parse_args()

    kmer_dir = os.path.join(args.species_base_dir, args.kmer_subdir)

    sketch_globber = os.path.join(
        kmer_dir,
        "alphabet-*ksize-*",
    )

    for sketch_dir in glob.glob(sketch_globber):
        notify(f"Reading hash2kmer csvs from {sketch_dir}")
        if args.no_aligned_unaligned_subdir:
            csv_globber = os.path.join(
                sketch_dir,
                "csvs",
                "*.csv",
            )
        else:
            csv_globber = os.path.join(
                sketch_dir,
                "csvs",
                "*",  # "aligned" or "unaligned" directory
                "*.csv",
            )

        # The files are globbed only once: a separate counting pass would walk
        # the whole directory tree a second time for a number nothing uses.
        results = Parallel(n_jobs=args.n_jobs)(
            delayed(process_single_kmer_csv)(csv) for csv in glob.iglob(csv_globber)
        )
        # process_single_kmer_csv returns None for empty/unparseable files
        dfs = [df for df in results if df is not None]
        if not dfs:
            # Nothing to concatenate. Checking explicitly, rather than catching
            # ValueError around pd.concat, lets unrelated errors surface.
            notify("No hash2kmer files found, skipping")
            continue
        kmers = pd.concat(dfs)
        parquet = os.path.join(sketch_dir, "hash2kmer.parquet")
        notify(f"Writing {parquet}")
        kmers.to_parquet(parquet)


if __name__ == "__main__":
    main()

Overwriting aggregate_per_species_sig2kmer.py


## Write out commands

In [8]:
# Python interpreter inside the "immune-evolution" conda environment
PYTHON = '/home/olga/miniconda3/envs/immune-evolution/bin/python'
# NOTE(review): presumably the directory this notebook runs from, where the %%file cell wrote the script — confirm
PWD = '/home/olga/code/immune-evolution--olgabot/analyze-kmermaid-bladder/notebooks'
AGGREGATE_SIG2KMER = f"{PWD}/aggregate_per_species_sig2kmer.py"
# The f-string half substitutes PYTHON and the script path immediately; the raw-string half
# keeps a literal "{species_dir}" placeholder to be filled in later with str.format().
sig2kmer_template = f"{PYTHON} {AGGREGATE_SIG2KMER} " + r"{species_dir} --n-jobs 16"
sig2kmer_template

'/home/olga/miniconda3/envs/immune-evolution/bin/python /home/olga/code/immune-evolution--olgabot/analyze-kmermaid-bladder/notebooks/aggregate_per_species_sig2kmer.py {species_dir} --n-jobs 16'

In [9]:
# Species directories look like "1--train-mouse" or "2--test-human", hence the "*--t*" pattern
species_globber = os.path.join('/home/olga/data_sm/immune-evolution/kmer-signatures/', '*--t*')

# Emit one ready-to-run aggregation command per species directory
species_dirs = glob.glob(species_globber)
for species_dir in species_dirs:
    command = sig2kmer_template.format(species_dir=species_dir)
    print(command)

/home/olga/miniconda3/envs/immune-evolution/bin/python /home/olga/code/immune-evolution--olgabot/analyze-kmermaid-bladder/notebooks/aggregate_per_species_sig2kmer.py /home/olga/data_sm/immune-evolution/kmer-signatures/2--test-human --n-jobs 16
/home/olga/miniconda3/envs/immune-evolution/bin/python /home/olga/code/immune-evolution--olgabot/analyze-kmermaid-bladder/notebooks/aggregate_per_species_sig2kmer.py /home/olga/data_sm/immune-evolution/kmer-signatures/3--test-bat --n-jobs 16
/home/olga/miniconda3/envs/immune-evolution/bin/python /home/olga/code/immune-evolution--olgabot/analyze-kmermaid-bladder/notebooks/aggregate_per_species_sig2kmer.py /home/olga/data_sm/immune-evolution/kmer-signatures/4--test-lemur --n-jobs 16
/home/olga/miniconda3/envs/immune-evolution/bin/python /home/olga/code/immune-evolution--olgabot/analyze-kmermaid-bladder/notebooks/aggregate_per_species_sig2kmer.py /home/olga/data_sm/immune-evolution/kmer-signatures/1--train-mouse --n-jobs 16


###  --> ran these on the command line

### For mouse2mouse, have to add additional `1--mouse2mouse/` column

In [32]:
ll /home/olga/data_sm/immune-evolution/kmer-signatures/0--mouse2mouse

total 1
drwxr-xr-x  8 olga 4096 Mar  5 11:46 [0m[01;34m0--self2self-bootstrapped[0m/
drwxr-xr-x 18 olga 4096 Mar 29 06:53 [01;34m1--mouse2mouse[0m/


## Per-celltype commands

In [16]:
# Interpreter and script used to build the per-celltype aggregation commands
PYTHON = '/home/olga/miniconda3/envs/immune-evolution/bin/python'
PWD = '/home/olga/code/immune-evolution--olgabot/analyze-kmermaid-bladder/notebooks'
AGGREGATE_SIG2KMER = f"{PWD}/aggregate_per_species_sig2kmer.py"

species_dir = '/home/olga/data_sm/immune-evolution/kmer-signatures/1--train-mouse'
# Everything except the kmer subdirectory is filled in now; "{kmer_subdir}" stays a
# literal placeholder (raw string) for str.format() in the loop below.
sig2kmer_db_template = (
    f"{PYTHON} {AGGREGATE_SIG2KMER} {species_dir} --n-jobs 16 --no-aligned-unaligned-subdir"
    + r" --kmer-subdir {kmer_subdir}"
)

kmer_db_globber = os.path.join('/home/olga/data_sm/immune-evolution/kmer-signatures/1--train-mouse/', '5--celltype-kmers--merged*')

# One command per merged celltype k-mer directory; the script takes the directory's
# name relative to the species directory, so only the basename is passed.
for kmer_subdir in glob.glob(kmer_db_globber):
    subdir_name = os.path.basename(kmer_subdir)
    print(sig2kmer_db_template.format(kmer_subdir=subdir_name))

/home/olga/miniconda3/envs/immune-evolution/bin/python /home/olga/code/immune-evolution--olgabot/analyze-kmermaid-bladder/notebooks/aggregate_per_species_sig2kmer.py /home/olga/data_sm/immune-evolution/kmer-signatures/1--train-mouse --n-jobs 16 --no-aligned-unaligned-subdir --kmer-subdir 5--celltype-kmers--merged-celltype-remove-common-kmers--min-kmer-count--10-percent
/home/olga/miniconda3/envs/immune-evolution/bin/python /home/olga/code/immune-evolution--olgabot/analyze-kmermaid-bladder/notebooks/aggregate_per_species_sig2kmer.py /home/olga/data_sm/immune-evolution/kmer-signatures/1--train-mouse --n-jobs 16 --no-aligned-unaligned-subdir --kmer-subdir 5--celltype-kmers--merged-celltype-remove-common-kmers--min-kmer-count--5-percent


In [None]:
1+1