In [25]:
import os

DATA_FOLDER = os.path.join('..', 'data')
FIGURE_FOLDER = os.path.join('..', 'figures')

notebook_name = '021_get_nucleus_cytoplasm_sequences'

data_folder = os.path.join(DATA_FOLDER, notebook_name)
figure_folder = os.path.join(FIGURE_FOLDER, notebook_name)

! mkdir -p $data_folder
! mkdir -p $figure_folder

input_folder = os.path.join(DATA_FOLDER, '020_get_nucleus_cytoplasm_genes')

In [2]:
import pandas as pd

csv = os.path.join(input_folder, 'nucleus_cytoplasm_single_ensg.csv')
nucleus_cytoplasm_single_ensg = pd.read_csv(csv)
print(nucleus_cytoplasm_single_ensg.shape)
nucleus_cytoplasm_single_ensg.head()

(4553, 5)


Unnamed: 0.1,Unnamed: 0,antibody_id,ensg_id,level_c,prest_id
0,0,243,ENSG00000036473,cytoplasm,230049
1,3,244,ENSG00000049769,nucleus,230064
2,15,266,ENSG00000156504,nucleus,231887
3,18,285,ENSG00000102096,cytoplasm,231642
4,21,287,ENSG00000068394,nucleus,230080


## Download and filter sequence data

In [3]:
cd /mnt/data

/mnt/data


In [4]:
mkdir -p genome/hg38/ensembl/v92

In [5]:
cd genome/hg38/ensembl/v92/

/mnt/data/genome/hg38/ensembl/v92


Download ENSEMBL fasta files. I don't understand the difference between `cds` and `cdna` so I downloaded them both. `pep` is the protein sequence

In [12]:
# %%bash

# wget ftp://ftp.ensembl.org/pub/release-92/fasta/homo_sapiens/cds/Homo_sapiens.GRCh38.cds.all.fa.gz
# wget ftp://ftp.ensembl.org/pub/release-92/fasta/homo_sapiens/cdna/Homo_sapiens.GRCh38.cdna.all.fa.gz
# wget ftp://ftp.ensembl.org/pub/release-92/fasta/homo_sapiens/pep/Homo_sapiens.GRCh38.pep.all.fa.gz

In [13]:
ls

Homo_sapiens.GRCh38.cdna.all.fa       [0m[01;31mHomo_sapiens.GRCh38.cds.all.fa.gz[0m
[01;31mHomo_sapiens.GRCh38.cdna.all.fa.gz[0m    Homo_sapiens.GRCh38.cds.all.fa.gz.1
Homo_sapiens.GRCh38.cdna.all.fa.gz.1  Homo_sapiens.GRCh38.pep.all.fa
Homo_sapiens.GRCh38.cds.all.fa        [01;31mHomo_sapiens.GRCh38.pep.all.fa.gz[0m


Unzip the files

In [14]:
# ! gunzip --keep *.gz

In [15]:
ls -lha

total 702M
drwxrwxr-x 2 ubuntu ubuntu 4.0K May 16 21:31 [0m[01;34m.[0m/
drwxrwxr-x 3 ubuntu ubuntu 4.0K May 16 21:20 [01;34m..[0m/
-rw-rw-r-- 1 ubuntu ubuntu 341M May 16 21:21 Homo_sapiens.GRCh38.cdna.all.fa
-rw-rw-r-- 1 ubuntu ubuntu  64M May 16 21:21 [01;31mHomo_sapiens.GRCh38.cdna.all.fa.gz[0m
-rw-rw-r-- 1 ubuntu ubuntu  31M May 16 21:32 Homo_sapiens.GRCh38.cdna.all.fa.gz.1
-rw-rw-r-- 1 ubuntu ubuntu 145M May 16 21:21 Homo_sapiens.GRCh38.cds.all.fa
-rw-rw-r-- 1 ubuntu ubuntu  21M May 16 21:21 [01;31mHomo_sapiens.GRCh38.cds.all.fa.gz[0m
-rw-rw-r-- 1 ubuntu ubuntu  21M May 16 21:31 Homo_sapiens.GRCh38.cds.all.fa.gz.1
-rw-rw-r-- 1 ubuntu ubuntu  69M May 16 21:21 Homo_sapiens.GRCh38.pep.all.fa
-rw-rw-r-- 1 ubuntu ubuntu  14M May 16 21:21 [01;31mHomo_sapiens.GRCh38.pep.all.fa.gz[0m


In [16]:
! head *.fa

==> Homo_sapiens.GRCh38.cdna.all.fa <==
>ENST00000434970.2 cdna chromosome:GRCh38:14:22439007:22439015:1 gene:ENSG00000237235.2 gene_biotype:TR_D_gene transcript_biotype:TR_D_gene gene_symbol:TRDD2 description:T cell receptor delta diversity 2 [Source:HGNC Symbol;Acc:HGNC:12255]
CCTTCCTAC
>ENST00000448914.1 cdna chromosome:GRCh38:14:22449113:22449125:1 gene:ENSG00000228985.1 gene_biotype:TR_D_gene transcript_biotype:TR_D_gene gene_symbol:TRDD3 description:T cell receptor delta diversity 3 [Source:HGNC Symbol;Acc:HGNC:12256]
ACTGGGGGATACG
>ENST00000415118.1 cdna chromosome:GRCh38:14:22438547:22438554:1 gene:ENSG00000223997.1 gene_biotype:TR_D_gene transcript_biotype:TR_D_gene gene_symbol:TRDD1 description:T cell receptor delta diversity 1 [Source:HGNC Symbol;Acc:HGNC:12254]
GAAATAGT
>ENST00000632684.1 cdna chromosome:GRCh38:7:142786213:142786224:1 gene:ENSG00000282431.1 gene_biotype:TR_D_gene transcript_biotype:TR_D_gene gene_symbol:TRBD1 description:T cell receptor beta diversit

Good, the fasta descriptione includes the ENSG ID

In [17]:
fastas = ['Homo_sapiens.GRCh38.cdna.all.fa', 'Homo_sapiens.GRCh38.cds.all.fa', 'Homo_sapiens.GRCh38.pep.all.fa']

In [19]:
%%time

from Bio import SeqIO

for fasta in fastas:
    filtered_fasta = f"/mnt/data/{fasta}.nuclear_or_cytoplasmic"

    nuclear_or_cytoplasmic = []
    for record in SeqIO.parse(fasta, "fasta"):
        for ensg_id in nucleus_cytoplasm_single_ensg['ensg_id']:
            if ensg_id in record.description:
                nuclear_or_cytoplasmic.append(record)
    SeqIO.write(nuclear_or_cytoplasmic, filtered_fasta, "fasta")


CPU times: user 5min 34s, sys: 384 ms, total: 5min 34s
Wall time: 6min 24s


In [20]:
ls -lha /mnt/data

total 130M
drwxrwxrwx 10 ubuntu root   4.0K May 16 21:38 [0m[34;42m.[0m/
drwxrwxrwx  4 root   root   4.0K Apr 24 23:53 [34;42m..[0m/
drwxr-xr-x  3 root   root   4.0K Apr 25 00:38 [01;34mdata[0m/
drwxrwxrwx  3 ubuntu ubuntu 4.0K Apr 24 23:25 [34;42mfastq[0m/
drwxrwxr-x  2 ubuntu ubuntu 4.0K May 16 20:45 [01;34mfastq_dump_v2[0m/
drwxrwxr-x  4 ubuntu ubuntu 4.0K May 16 21:20 [01;34mgenome[0m/
drwxr-xr-x  2 root   root   4.0K Apr 24 23:53 [01;34mhca[0m/
-rw-rw-r--  1 ubuntu ubuntu  80M May 16 21:35 Homo_sapiens.GRCh38.cdna.all.fa.nuclear_or_cytoplasmic
-rw-rw-r--  1 ubuntu ubuntu  34M May 16 21:37 Homo_sapiens.GRCh38.cds.all.fa.nuclear_or_cytoplasmic
-rw-rw-r--  1 ubuntu ubuntu  17M May 16 21:38 Homo_sapiens.GRCh38.pep.all.fa.nuclear_or_cytoplasmic
drwxrwxr-x 25 ubuntu ubuntu 4.0K May 16 20:18 [01;34mrawdata[0m/
drwxrwxr-x 14 ubuntu ubuntu 4.0K May 14 17:17 [01;34msourmash[0m/
drwxrwxr-x  3 ubuntu ubuntu 4.0K Apr 25 17:20 [01;34mtrinity-output[0m/


In [28]:
! cp /mnt/data/*nuclear_or_cytoplasmic $data_folder

In [34]:
! ls -lha $data_folder

total 130M
drwxrwxr-x 2 ubuntu ubuntu 4.0K May 16 21:44 .
drwxrwxr-x 3 ubuntu ubuntu 4.0K May 16 21:43 ..
-rw-rw-r-- 1 ubuntu ubuntu  80M May 16 21:43 Homo_sapiens.GRCh38.cdna.all.fa.nuclear_or_cytoplasmic
-rw-rw-r-- 1 ubuntu ubuntu  34M May 16 21:44 Homo_sapiens.GRCh38.cds.all.fa.nuclear_or_cytoplasmic
-rw-rw-r-- 1 ubuntu ubuntu  17M May 16 21:44 Homo_sapiens.GRCh38.pep.all.fa.nuclear_or_cytoplasmic


In [32]:
! head /mnt/data/*nuclear_or_cytoplasmic

==> /mnt/data/Homo_sapiens.GRCh38.cdna.all.fa.nuclear_or_cytoplasmic <==
>ENST00000419783.2 cdna chromosome:GRCh38:3:49357171:49358600:-1 gene:ENSG00000233276.4 gene_biotype:polymorphic_pseudogene transcript_biotype:protein_coding gene_symbol:GPX1 description:glutathione peroxidase 1 [Source:HGNC Symbol;Acc:HGNC:4553]
GAGCCCTCGAGGGCCCCAGCCCTTGGAAGGGTAACCTGGACCGCTGCCGCCTGGTTGCCT
GGGCCAGACCAGACATGCCTGCTGCTCCTTCCGGCTTAGGAGGAGCACGCGTCCCGCTCG
GGCGCACTCTCCAGCCTTTTCCTGGCTGAGGAGGGGCCGAGCCCTCCGGGTAGGGCGGGG
GCCGGATGAGGCGGGACCCTCAGGCCCGGAAAACTGCCTGTGCCACGTGACCCGCCGCCG
GCCAGTTAAAAGGAGGCGCCTGCTGGCCTCCCCTTACAGTGCTTGTTCGGGGCGCTCCGC
TGGCTTCTTGGACAATTGCGCCATGTGTGCTGCTCGGCTAGCGGCGGCGGCGGCGGCGGC
CCAGTCGGTGTATGCCTTCTCGGCGCGCCCGCTGGCCGGCGGGGAGCCTGTGAGCCTGGG
CTCCCTGCGGGGCAAGGTACTACTTATCGAGAATGTGGCGTCCCTCTGAGGCACCACGGT
CCGGGACTACACCCAGATGAACGAGCTGCAGCGGCGCCTCGGACCCCGGGGCCTGGTGGT

==> /mnt/data/Homo_sapiens.GRCh38.cds.all.fa.nuclear_or_cytoplasmic <==
>ENST00000419783.2 cds chromosome:GRCh38:3:49

In [35]:
! grep '>' /mnt/data/*nuclear_or_cytoplasmic | head

/mnt/data/Homo_sapiens.GRCh38.cdna.all.fa.nuclear_or_cytoplasmic:>ENST00000419783.2 cdna chromosome:GRCh38:3:49357171:49358600:-1 gene:ENSG00000233276.4 gene_biotype:polymorphic_pseudogene transcript_biotype:protein_coding gene_symbol:GPX1 description:glutathione peroxidase 1 [Source:HGNC Symbol;Acc:HGNC:4553]
/mnt/data/Homo_sapiens.GRCh38.cdna.all.fa.nuclear_or_cytoplasmic:>ENST00000419349.2 cdna chromosome:GRCh38:3:49357178:49358312:-1 gene:ENSG00000233276.4 gene_biotype:polymorphic_pseudogene transcript_biotype:protein_coding gene_symbol:GPX1 description:glutathione peroxidase 1 [Source:HGNC Symbol;Acc:HGNC:4553]
/mnt/data/Homo_sapiens.GRCh38.cdna.all.fa.nuclear_or_cytoplasmic:>ENST00000643797.1 cdna chromosome:GRCh38:3:49357201:49358325:-1 gene:ENSG00000233276.4 gene_biotype:polymorphic_pseudogene transcript_biotype:protein_coding gene_symbol:GPX1 description:glutathione peroxidase 1 [Source:HGNC Symbol;Acc:HGNC:4553]
/mnt/data/Homo_sapiens.GRCh38.cdna.all.fa.nuclear_or_cytoplas