In [2]:
import itertools
import os
import re

import screed
import pandas as pd

# Filter fastas for for dissociation associated genes

In [3]:
dissociation_genes = set(pd.read_csv('/home/olga/data_sm/immune-evolution/databases/nuisance-genes/dissociation_genes_3Nov2020.csv', index_col=0, squeeze=True))
len(dissociation_genes)

1880

In [4]:
! ls -lha /home/olga/data_sm/immune-evolution/rawdata/ensembl/v101/orthologs

total 3.9M
drwxrwxr-x 2 olga czb 4.0K Sep 25 08:46 .
drwxrwxr-x 3 olga czb 4.0K Sep 25 08:38 ..
-rwxrwxr-x 1 olga czb 842K Sep 25 08:39 mRhiFer1_v1__gene_info.tsv.gz
-rwxrwxr-x 1 olga czb 3.0M Sep 25 08:39 mRhiFer1_v1__human__mouse-lemur__mouse__orthologs.tsv.gz


In [5]:
human_protein_fasta = '/home/olga/data_lg/czbiohub-reference/gencode/human/v30/gencode.v30.pc_translations.fa.gz'
mouse_protein_fasta = '/home/olga/data_lg/czbiohub-reference/gencode/mouse/vM21/gencode.vM21.pc_translations.fa.gz'

In [6]:
! zcat $human_protein_fasta |head

>ENSP00000493376.2|ENST00000641515.2|ENSG00000186092.6|OTTHUMG00000001094.4|OTTHUMT00000003223.4|OR4F5-202|OR4F5|326
MKKVTAEAISWNESTSETNNSMVTEFIFLGLSDSQELQTFLFMLFFVFYGGIVFGNLLIV
ITVVSDSHLHSPMYFLLANLSLIDLSLSSVTAPKMITDFFSQRKVISFKGCLVQIFLLHF
FGGSEMVILIAMGFDRYIAICKPLHYTTIMCGNACVGIMAVTWGIGFLHSVSQLAFAVHL
LFCGPNEVDSFYCDLPRVIKLACTDTYRLDIMVIANSGVLTVCSFVLLIISYTIILMTIQ
HRPLDKSSKALSTLTAHITVVLLFFGPCVFIYAWPFPIKSLDKFLAVFYSVITPLLNPII
YTLRNKDMKTAIRQLRKWDAHSSVKF
>ENSP00000334393.3|ENST00000335137.4|ENSG00000186092.6|OTTHUMG00000001094.4|-|OR4F5-201|OR4F5|305
MVTEFIFLGLSDSQELQTFLFMLFFVFYGGIVFGNLLIVITVVSDSHLHSPMYFLLANLS
LIDLSLSSVTAPKMITDFFSQRKVISFKGCLVQIFLLHFFGGSEMVILIAMGFDRYIAIC

gzip: stdout: Broken pipe


## Constants (ksizes)

In [2]:
ksizes_str = ','.join(map( str, range(21, 91, 3)))
ksizes_str

'21,24,27,30,33,36,39,42,45,48,51,54,57,60,63,66,69,72,75,78,81,84,87,90'

In [7]:
# dissociation_associated_gene_ids

## Utility functions

In [25]:
def write_records_to_fasta(records, fasta):
    with open(fasta, 'w') as f:
        for record in records:
            f.write(f'>{record["name"]}\n{record["sequence"]}\n')

def filter_records(fasta, pattern,):
    filtered_records = []
    with screed.open(fasta) as records:
        for record in records:
            name = record['name']
            if re.findall(pattern, name, flags=re.I):
                filtered_records.append(record)
    return filtered_records


def check_gene_label_in_ids_or_names(gene_name, gene_names, seqtype_id, seqtype_ids):
    try:
        in_seqtype_ids = seqtype_id in seqtype_ids
    except TypeError:
        in_seqtype_ids = False
    
    try:
        in_gene_names = gene_name in gene_names
    except TypeError:
        in_gene_names = False
        
    return in_gene_names | in_seqtype_ids


def filter_records_by_ids(fasta, seqtype_ids=None, gene_names=None, seqtype='protein'):
    filtered_records = []
    with screed.open(fasta) as records:
        for record in records:
            name = record['name']
            split = record['name'].split('|')
            if seqtype == 'protein':
                seqtype_id = gene_id = split[0].split('.')[0]
                gene_name = split[-2]
            elif seqtype == 'transcript':
                transcript_id = split[0].split('.')[0]
                seqtype_id = transcript_id
                gene_id = split[1].split('.')[0]
                gene_name = split[-4]
#             import pdb; pdb.set_trace()
            if check_gene_label_in_ids_or_names(gene_name, gene_names, seqtype_id, seqtype_ids):
                filtered_records.append(record)
    return filtered_records


def filter_fasta_with_regex(fasta_to_filter, out_fasta, regex=None, gene_ids=None, gene_names=None, seqtype=None):
    if regex is None:
        record_subset = filter_records_by_ids(fasta_to_filter, gene_ids, gene_names, seqtype)
    else:
        record_subset = filter_records(fasta_to_filter, regex)
    write_records_to_fasta(record_subset, out_fasta)
    
    


In [11]:
# ! head $human_transcript_fasta

# Human

## Filter human records for dissociation associated records

In [12]:
dissociation_associated_records = []
dissociation_associated_gene_ids = []

with screed.open(human_protein_fasta) as records:
    for record in records:
        split = record['name'].split('|')
        gene_id = split[2].split('.')[0]
        gene_name = split[-2]
        if gene_name in dissociation_genes:
            dissociation_associated_records.append(record)
            dissociation_associated_gene_ids.append(gene_id)
            
len(dissociation_associated_records)

9068

### Write to fasta

In [13]:
human_dissociation_fasta = human_protein_fasta.replace('.fa.gz', '__dissociation_genes_3Nov2020.fasta')
write_records_to_fasta(dissociation_associated_records, human_dissociation_fasta)

## Filter human sequences for mitochondrial sequences

In [14]:
human_mitochondrial_fasta = human_protein_fasta.replace('.fa.gz', '__mitochondrial_genes.fasta')
filter_fasta_with_regex(human_protein_fasta, human_mitochondrial_fasta, 'MT-\w+')
! wc -l $human_mitochondrial_fasta
! grep -c '>' $human_mitochondrial_fasta

362 /home/olga/data_lg/czbiohub-reference/gencode/human/v30/gencode.v30.pc_translations__mitochondrial_genes.fasta
181


# Mouse

## Get mouse orthologs

In [15]:
human_mouse_orthologs = pd.read_csv('/home/olga/googledrive/TabulaMicrocebus/data/orthologous-genes/ensembl98__human__to__mouse__orthologs.txt.gz', sep='\t')
print(human_mouse_orthologs.shape)
human_mouse_orthologs.head()

(265341, 20)


  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,Gene stable ID,Gene stable ID version,Transcript stable ID,Transcript stable ID version,Mouse gene stable ID,Mouse gene name,Mouse protein or transcript stable ID,Mouse chromosome/scaffold name,Mouse chromosome/scaffold start (bp),Mouse chromosome/scaffold end (bp),Query protein or transcript ID,Last common ancestor with Mouse,Mouse homology type,%id. target Mouse gene identical to query gene,%id. query gene identical to target Mouse gene,Mouse Gene-order conservation score,Mouse Whole-genome alignment coverage,dN with Mouse,dS with Mouse,"Mouse orthology confidence [0 low, 1 high]"
0,ENSG00000210049,ENSG00000210049.1,ENST00000387314,ENST00000387314.1,,,,,,,,,,,,,,,,
1,ENSG00000211459,ENSG00000211459.2,ENST00000389680,ENST00000389680.2,,,,,,,,,,,,,,,,
2,ENSG00000210077,ENSG00000210077.1,ENST00000387342,ENST00000387342.1,,,,,,,,,,,,,,,,
3,ENSG00000210082,ENSG00000210082.2,ENST00000387347,ENST00000387347.2,,,,,,,,,,,,,,,,
4,ENSG00000209082,ENSG00000209082.1,ENST00000386347,ENST00000386347.1,,,,,,,,,,,,,,,,


In [16]:
rows = human_mouse_orthologs['Gene stable ID'].isin(dissociation_associated_gene_ids)
human_mouse_orthologs_dissociation = human_mouse_orthologs.loc[rows]
# human_mouse_orthologs_dissociation = human_mouse_orthologs_dissociation.drop_duplicates()
human_mouse_orthologs_dissociation.shape

(14505, 20)

In [17]:
human_mouse_orthologs_dissociation

Unnamed: 0,Gene stable ID,Gene stable ID version,Transcript stable ID,Transcript stable ID version,Mouse gene stable ID,Mouse gene name,Mouse protein or transcript stable ID,Mouse chromosome/scaffold name,Mouse chromosome/scaffold start (bp),Mouse chromosome/scaffold end (bp),Query protein or transcript ID,Last common ancestor with Mouse,Mouse homology type,%id. target Mouse gene identical to query gene,%id. query gene identical to target Mouse gene,Mouse Gene-order conservation score,Mouse Whole-genome alignment coverage,dN with Mouse,dS with Mouse,"Mouse orthology confidence [0 low, 1 high]"
138,ENSG00000067992,ENSG00000067992.15,ENST00000379162,ENST00000379162.8,ENSMUSG00000035232,Pdk3,ENSMUSP00000036604,X,93764607.0,93832201.0,ENSP00000498864,Euarchontoglires,ortholog_one2one,97.1084,97.1084,50.0,100.00,0.0151,0.5213,1.0
139,ENSG00000067992,ENSG00000067992.15,ENST00000648777,ENST00000648777.1,ENSMUSG00000035232,Pdk3,ENSMUSP00000036604,X,93764607.0,93832201.0,ENSP00000498864,Euarchontoglires,ortholog_one2one,97.1084,97.1084,50.0,100.00,0.0151,0.5213,1.0
140,ENSG00000067992,ENSG00000067992.15,ENST00000568479,ENST00000568479.2,ENSMUSG00000035232,Pdk3,ENSMUSP00000036604,X,93764607.0,93832201.0,ENSP00000498864,Euarchontoglires,ortholog_one2one,97.1084,97.1084,50.0,100.00,0.0151,0.5213,1.0
141,ENSG00000067992,ENSG00000067992.15,ENST00000493226,ENST00000493226.1,ENSMUSG00000035232,Pdk3,ENSMUSP00000036604,X,93764607.0,93832201.0,ENSP00000498864,Euarchontoglires,ortholog_one2one,97.1084,97.1084,50.0,100.00,0.0151,0.5213,1.0
674,ENSG00000138606,ENSG00000138606.19,ENST00000560540,ENST00000560540.5,ENSMUSG00000033256,Shf,ENSMUSP00000106160,2,122348892.0,122369162.0,ENSP00000290894,Eutheria,ortholog_one2one,54.1371,96.2185,75.0,94.31,0.0167,0.3423,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
250870,ENSG00000184867,ENSG00000184867.14,ENST00000431597,ENST00000431597.5,ENSMUSG00000033436,Armcx2,ENSMUSP00000127305,X,134804145.0,134809221.0,ENSP00000349281,Euarchontoglires,ortholog_one2one,78.0063,62.8827,75.0,95.19,0.1401,0.5306,1.0
250871,ENSG00000184867,ENSG00000184867.14,ENST00000479333,ENST00000479333.5,ENSMUSG00000033436,Armcx2,ENSMUSP00000127305,X,134804145.0,134809221.0,ENSP00000349281,Euarchontoglires,ortholog_one2one,78.0063,62.8827,75.0,95.19,0.1401,0.5306,1.0
250872,ENSG00000184867,ENSG00000184867.14,ENST00000496581,ENST00000496581.1,ENSMUSG00000033436,Armcx2,ENSMUSP00000127305,X,134804145.0,134809221.0,ENSP00000349281,Euarchontoglires,ortholog_one2one,78.0063,62.8827,75.0,95.19,0.1401,0.5306,1.0
250873,ENSG00000184867,ENSG00000184867.14,ENST00000488982,ENST00000488982.1,ENSMUSG00000033436,Armcx2,ENSMUSP00000127305,X,134804145.0,134809221.0,ENSP00000349281,Euarchontoglires,ortholog_one2one,78.0063,62.8827,75.0,95.19,0.1401,0.5306,1.0


In [18]:
mouse_dissociation_gene_names = set(human_mouse_orthologs_dissociation['Mouse gene name'])
mouse_dissociation_protein_ids = set(human_mouse_orthologs_dissociation['Mouse protein or transcript stable ID'])

## Get records for mouse protein sequences of dissociation genes

In [None]:
mouse_dissociation_associated_records = []

with screed.open(mouse_protein_fasta) as records:
    for record in records:
        split = record['name'].split('|')
        protein_id = split[0].split('.')[0]
        gene_name = split[-2]
        if gene_name in mouse_dissociation_gene_names or protein_id in mouse_dissociation_protein_ids:
            mouse_dissociation_associated_records.append(record)
            
len(mouse_dissociation_associated_records)

In [None]:
mouse_dissociation_fasta = human_protein_fasta.replace('.fa.gz', '__dissociation_genes_3Nov2020.fasta')
write_records_to_fasta(mouse_dissociation_associated_records, mouse_dissociation_fasta)

## Filter mouse sequences for mitochondrial sequences

In [None]:
mouse_mitochondrial_fasta = mouse_protein_fasta.replace('.fa.gz', '__mitochondrial_genes.fasta')
filter_fasta_with_regex(mouse_protein_fasta, mouse_mitochondrial_fasta, 'mt-\w+')
! wc -l $mouse_mitochondrial_fasta
! grep -c '>' $mouse_mitochondrial_fasta

# Make signatures

## Human dissociation genes

In [None]:
human_dissociation_sig = human_dissociation_fasta.replace('.fasta', '.sig')
! sourmash compute --protein --dayhoff --input-is-protein --no-dna -k $ksizes_str -o $human_dissociation_sig --scaled 10 $human_dissociation_fasta 

## Mouse dissociation genes

In [None]:
mouse_dissociation_sig = mouse_dissociation_fasta.replace('.fasta', '.sig')
! sourmash compute --protein --dayhoff --input-is-protein --no-dna -k $ksizes_str -o $mouse_dissociation_sig --scaled 10 $mouse_dissociation_fasta 

## Concatenate dissociation fastas together

In [None]:
catted_dissociation_fasta = '/home/olga/data_sm/immune-evolution/databases/nuisance-genes/mouse-human__dissociation_genes_3Nov2020.fasta'

In [None]:
! cat $mouse_dissociation_fasta $human_dissociation_fasta > $catted_dissociation_fasta
! wc -l $catted_dissociation_fasta

## Concatenate mitochondrial fastas together

In [None]:
catted_mitochondrial_fasta = '/home/olga/data_sm/immune-evolution/databases/nuisance-genes/mouse-human__mitochondrial_genes.fasta'
! cat $mouse_mitochondrial_fasta $human_mitochondrial_fasta > $catted_mitochondrial_fasta
! wc -l $catted_mitochondrial_fasta

## Concatenate newly made dissociation, mitochondrial fastas ~with old refseq mammalian ribosomal  fasta~

In [None]:
ribosomal_fasta = '/home/olga/data_lg/czbiohub-reference/ncbi/refseq/releases/refseq-release98-2020-02-06/vertebrate_mammalian/vertebrate_mammalian_concatenated__ribosomal.faa'

# catted_dissociation_with_mitochondrial_fasta = '/home/olga/data_sm/immune-evolution/databases/nuisance-genes/mouse-human__dissociation__ribosomal__refseq_mammalian_ribosomal.fasta'
catted_dissociation_with_mitochondrial_fasta = '/home/olga/data_sm/immune-evolution/databases/nuisance-genes/mouse-human__dissociation__mitochondrial.fasta'

! cat $catted_dissociation_fasta $catted_mitochondrial_fasta > $catted_dissociation_with_mitochondrial_fasta
! wc -l $catted_dissociation_with_mitochondrial_fasta

## Iterate over protein fastas, dissociation genes, mitochondrial and ribosomal regexes

In [26]:
protein_fastas = {'mouse': mouse_protein_fasta, 'human': human_protein_fasta}

genetype_kwargs = {
    'mitochondrial': {
        'mouse': dict(regex='mt-\w+'), 'human': dict(regex='MT-\w+')
    },
    'dissociation': {
        'mouse': dict(
            gene_names=mouse_dissociation_gene_names, 
            gene_ids=mouse_dissociation_protein_ids,
            seqtype='protein'
        ), 
        'human': dict(gene_names=dissociation_genes, seqtype='protein')
    },
    'ribosomal': {
        # Flags to re.findall ignore case so we can use the same regex for both
        'mouse': dict(regex='M*RP[LS]\d+[[\w\d-]*]?', ),
        # e.g. RPL14, RPS24, RPL3L2
        'human': dict(regex='M*RP[LS]\d+[[\w\d-]*]?')
    }
}

def filter_fastas_with_regex(species_fastas, genetype_kwargs, concatenated_output_fasta):

    filtered_fastas = []
    for (species, fasta), (genetype, kwarg_dict) in itertools.product(species_fastas.items(), genetype_kwargs.items()):
        root, extension = os.path.splitext(fasta)
        filtered_fasta = fasta.replace(extension, f'__{genetype}_genes.fasta')
        filtered_fastas.append(filtered_fasta)

        kwargs = kwarg_dict[species]
        filter_fasta_with_regex(fasta, filtered_fasta, **kwargs)
        ! wc -l $filtered_fasta
        ! grep -c '>' $filtered_fasta
    fastas_to_cat = ' '.join(filtered_fastas)
    ! cat $fastas_to_cat > $concatenated_output_fasta
    ! wc -l $concatenated_output_fasta
    ! grep -c '>' $concatenated_output_fasta
    
    
catted_protein_fasta = '/home/olga/data_sm/immune-evolution/databases/nuisance-genes/mouse-human__dissociation3Nov2020__ribosomal__mitochondrial__protein.fasta'
filter_fastas_with_regex(protein_fastas, genetype_kwargs, catted_protein_fasta)

172 /home/olga/data_lg/czbiohub-reference/gencode/mouse/vM21/gencode.vM21.pc_translations.fa__mitochondrial_genes.fasta
86
10300 /home/olga/data_lg/czbiohub-reference/gencode/mouse/vM21/gencode.vM21.pc_translations.fa__dissociation_genes.fasta
5150
1034 /home/olga/data_lg/czbiohub-reference/gencode/mouse/vM21/gencode.vM21.pc_translations.fa__ribosomal_genes.fasta
517
362 /home/olga/data_lg/czbiohub-reference/gencode/human/v30/gencode.v30.pc_translations.fa__mitochondrial_genes.fasta
181
18136 /home/olga/data_lg/czbiohub-reference/gencode/human/v30/gencode.v30.pc_translations.fa__dissociation_genes.fasta
9068
1870 /home/olga/data_lg/czbiohub-reference/gencode/human/v30/gencode.v30.pc_translations.fa__ribosomal_genes.fasta
935
31874 /home/olga/data_sm/immune-evolution/databases/nuisance-genes/mouse-human__dissociation3Nov2020__ribosomal__mitochondrial__protein.fasta
15937


# Build protein signatures

In [4]:
! sourmash info

[Ksourmash version 2.1.1.dev5+g878540f
[K- loaded from path: /usr/local/lib/python3.6/dist-packages/sourmash-2.1.1.dev5+g878540f-py3.6-linux-x86_64.egg/sourmash
[K


In [5]:
! which -a sourmash

/usr/local/bin/sourmash
/usr/local/bin/sourmash
/home/olga/miniconda3/envs/immune-evolution/bin/sourmash
/usr/local/bin/sourmash


In [7]:
catted_protein_fasta = '/home/olga/data_sm/immune-evolution/databases/nuisance-genes/mouse-human__dissociation3Nov2020__ribosomal__mitochondrial__protein.fasta'
catted_protein_fasta_sig = catted_protein_fasta.replace('.fasta', '.sig')
! /home/olga/miniconda3/envs/immune-evolution/bin/sourmash \
    compute \
    --protein --dayhoff --hp --input-is-protein --no-dna \
    -k $ksizes_str \
    -o $catted_protein_fasta_sig \
    --scaled 10 \
    $catted_protein_fasta 

[K
== This is sourmash version 3.5.0. ==
[K== Please cite Brown and Irber (2016), doi:10.21105/joss.00027. ==

[Ksetting num_hashes to 0 because --scaled is set
[Kcomputing signatures for files: /home/olga/data_sm/immune-evolution/databases/nuisance-genes/mouse-human__dissociation3Nov2020__ribosomal__mitochondrial__protein.fasta
[KComputing signature for ksizes: [21, 24, 27, 30, 33, 36, 39, 42, 45, 48, 51, 54, 57, 60, 63, 66, 69, 72, 75, 78, 81, 84, 87, 90]
[KComputing only protein (and not nucleotide) signatures.
[KComputing a total of 24 signature(s).
[K... reading sequences from /home/olga/data_sm/immune-evolution/databases/nuisance-genes/mouse-human__dissociation3Nov2020__ribosomal__mitochondrial__protein.fasta
[Kcalculated 1 signatures for 15937 sequences in /home/olga/data_sm/immune-evolution/databases/nuisance-genes/mouse-human__dissociation3Nov2020__ribosomal__mitochondrial__protein.fasta
[Ksaved signature(s) to /home/olga/data_sm/immune-evolution/databases/nuisance-g

In [43]:
catted_dissociation_with_ribosomal_sig

'/home/olga/data_sm/immune-evolution/databases/nuisance-genes/mouse-human__dissociation_genes__refseq_mammalian_ribosomal.sig'

# Make DNA signatures of dissociation and mitochondrial genes

## Iterate over trancript fastas and dissociation genes

In [29]:
human_transcript_fasta = '/home/olga/data_lg/czbiohub-reference/gencode/human/v30/gencode.v30.transcripts.fa'
mouse_transcript_fasta = '/home/olga/data_lg/czbiohub-reference/gencode/mouse/vM21/gencode.vM21.transcripts.fa'

transcript_fastas = {'mouse': mouse_transcript_fasta, 'human': human_transcript_fasta}

genetype_kwargs = {
    'mitochondrial': {
        'mouse': dict(regex='mt-\w+'), 'human': dict(regex='MT-\w+')
    },
    'dissociation': {
        'mouse': dict(
            gene_names=mouse_dissociation_gene_names, 
            gene_ids=mouse_dissociation_protein_ids,
            seqtype='transcript'
        ), 
        'human': dict(gene_names=dissociation_genes, seqtype='transcript')
    },
    'ribosomal': {
        # Flags to re.findall ignore case so we can use the same regex for both
        'mouse': dict(regex='M*RP[LS]\d+[[\w\d-]*]?', ),
        # e.g. RPL14, RPS24, RPL3L2
        'human': dict(regex='M*RP[LS]\d+[[\w\d-]*]?')
    }
}

filtered_fastas = []
for (species, transcript_fasta), (genetype, kwarg_dict) in itertools.product(transcript_fastas.items(), genetype_kwargs.items()):
    filtered_fasta = transcript_fasta.replace('.fa', f'__{genetype}_genes.fasta')
    filtered_fastas.append(filtered_fasta)

    if genetype == 'mitochondrial':
        continue

    kwargs = kwarg_dict[species]
    filter_fasta_with_regex(transcript_fasta, filtered_fasta, **kwargs)
    ! wc -l $filtered_fasta
    ! grep -c '>' $filtered_fasta

16542 /home/olga/data_lg/czbiohub-reference/gencode/mouse/vM21/gencode.vM21.transcripts__dissociation_genes.fasta
8271
2486 /home/olga/data_lg/czbiohub-reference/gencode/mouse/vM21/gencode.vM21.transcripts__ribosomal_genes.fasta
1243
28332 /home/olga/data_lg/czbiohub-reference/gencode/human/v30/gencode.v30.transcripts__dissociation_genes.fasta
14166
5364 /home/olga/data_lg/czbiohub-reference/gencode/human/v30/gencode.v30.transcripts__ribosomal_genes.fasta
2682


## Concatenate all the fastas together

In [31]:
catted_dissociation_with_mitochondrial_nucleotide_fasta = '/home/olga/data_sm/immune-evolution/databases/nuisance-genes/mouse-human__dissociation3Nov2020__ribosomal__mitochondrial__nucleotide.fasta'

In [37]:
filtered_fastas

['/home/olga/data_lg/czbiohub-reference/gencode/mouse/vM21/gencode.vM21.transcripts__mitochondrial_genes.fasta',
 '/home/olga/data_lg/czbiohub-reference/gencode/mouse/vM21/gencode.vM21.transcripts__dissociation_genes.fasta',
 '/home/olga/data_lg/czbiohub-reference/gencode/mouse/vM21/gencode.vM21.transcripts__ribosomal_genes.fasta',
 '/home/olga/data_lg/czbiohub-reference/gencode/human/v30/gencode.v30.transcripts__mitochondrial_genes.fasta',
 '/home/olga/data_lg/czbiohub-reference/gencode/human/v30/gencode.v30.transcripts__dissociation_genes.fasta',
 '/home/olga/data_lg/czbiohub-reference/gencode/human/v30/gencode.v30.transcripts__ribosomal_genes.fasta']

In [36]:
fastas_to_cat

'/home/olga/data_lg/czbiohub-reference/gencode/mouse/vM21/gencode.vM21.transcripts__mitochondrial_genes.fasta /home/olga/data_lg/czbiohub-reference/gencode/mouse/vM21/gencode.vM21.transcripts__dissociation_genes.fasta /home/olga/data_lg/czbiohub-reference/gencode/mouse/vM21/gencode.vM21.transcripts__ribosomal_genes.fasta /home/olga/data_lg/czbiohub-reference/gencode/human/v30/gencode.v30.transcripts__mitochondrial_genes.fasta /home/olga/data_lg/czbiohub-reference/gencode/human/v30/gencode.v30.transcripts__dissociation_genes.fasta /home/olga/data_lg/czbiohub-reference/gencode/human/v30/gencode.v30.transcripts__ribosomal_genes.fasta'

In [33]:
fastas_to_cat = ' '.join(filtered_fastas)
! cat $fastas_to_cat > $catted_dissociation_with_mitochondrial_nucleotide_fasta

In [35]:
! wc -l $catted_dissociation_with_mitochondrial_nucleotide_fasta

53664 /home/olga/data_sm/immune-evolution/databases/nuisance-genes/mouse-human__dissociation3Nov2020__ribosomal__mitochondrial__nucleotide.fasta


## Build DNA signatures

In [34]:
catted_dissociation_with_mitochondrial_nucleotide_sig = catted_dissociation_with_mitochondrial_nucleotide_fasta.replace('.fasta', '.sig')
! sourmash compute \
    --dna \
    -k $ksizes_str \
    -o $catted_dissociation_with_mitochondrial_nucleotide_sig \
    --scaled 10 \
    $catted_dissociation_with_mitochondrial_nucleotide_fasta 

[K== This is sourmash version 2.1.1.dev5+g878540f. ==
[K== Please cite Brown and Irber (2016), doi:10.21105/joss.00027. ==

[Ksetting num_hashes to 0 because --scaled is set
[Kcomputing signatures for files: /home/olga/data_sm/immune-evolution/databases/nuisance-genes/mouse-human__dissociation3Nov2020__ribosomal__mitochondrial__nucleotide.fasta
[KComputing signature for ksizes: [21, 24, 27, 30, 33, 36, 39, 42, 45, 48, 51, 54, 57, 60, 63, 66, 69, 72, 75, 78, 81, 84, 87, 90]
[KComputing only nucleotide (and not protein) signatures.
[KComputing a total of 24 signature(s).
[K... reading sequences from /home/olga/data_sm/immune-evolution/databases/nuisance-genes/mouse-human__dissociation3Nov2020__ribosomal__mitochondrial__nucleotide.fasta
[Kcalculated 24 signatures for 26832 sequences in /home/olga/data_sm/immune-evolution/databases/nuisance-genes/mouse-human__dissociation3Nov2020__ribosomal__mitochondrial__nucleotide.fasta
[Ksaved signature(s) to /home/olga/data_sm/immune-evoluti

In [77]:
# mouse_dissociation_gene_names