In [19]:
import os

DATA_FOLDER = os.path.abspath(os.path.join('..', 'data'))
FIGURE_FOLDER = os.path.abspath(os.path.join('..', 'figures'))

notebook_name = '021_get_nucleus_cytoplasm_sequences'

data_folder = os.path.join(DATA_FOLDER, notebook_name)
figure_folder = os.path.join(FIGURE_FOLDER, notebook_name)

! mkdir -p $data_folder
! mkdir -p $figure_folder

input_folder = os.path.join(DATA_FOLDER, '020_get_nucleus_cytoplasm_genes')

In [20]:
data_folder

'/src/myhome/code/sequence-localization/notebooks/genome/hg38/ensembl/data/021_get_nucleus_cytoplasm_sequences'

In [4]:
import pandas as pd

csv = os.path.join(input_folder, 'nucleus_cytoplasm_single_ensg.csv')
nucleus_cytoplasm_single_ensg = pd.read_csv(csv)
print(nucleus_cytoplasm_single_ensg.shape)
nucleus_cytoplasm_single_ensg.head()

(2383, 2)


Unnamed: 0,ensg_id,level_b
0,ENSG00000049769,nucleoplasm
1,ENSG00000156504,nucleoplasm
2,ENSG00000102096,cytosol
3,ENSG00000068394,nucleoplasm
4,ENSG00000133131,nucleoplasm


In [5]:
nucleus_cytoplasm_single_ensg.empty

False

In [6]:
LEVEL = 'level_b'

nucleus_cytoplasm_single_ensg[f'{LEVEL}_bool'] = 1

In [7]:
target_df = nucleus_cytoplasm_single_ensg.pivot(index='ensg_id', columns=LEVEL)
target_df = target_df.fillna(0)
target_df = target_df.astype(int)
target_df.head()

Unnamed: 0_level_0,level_b_bool,level_b_bool
level_b,cytosol,nucleoplasm
ensg_id,Unnamed: 1_level_2,Unnamed: 2_level_2
ENSG00000000003,1,0
ENSG00000001167,0,1
ENSG00000001460,0,1
ENSG00000001461,0,1
ENSG00000002746,1,0


### Check that it's correct with the `head` of the initial data

In [8]:
target_df.loc[nucleus_cytoplasm_single_ensg.ensg_id.head()]

Unnamed: 0_level_0,level_b_bool,level_b_bool
level_b,cytosol,nucleoplasm
ensg_id,Unnamed: 1_level_2,Unnamed: 2_level_2
ENSG00000049769,0,1
ENSG00000156504,0,1
ENSG00000102096,1,0
ENSG00000068394,0,1
ENSG00000133131,0,1


## Download and filter sequence data

In [9]:
cd /mnt/data

[Errno 2] No such file or directory: '/mnt/data'
/src/myhome/code/sequence-localization/notebooks


In [10]:
mkdir -p genome/hg38/ensembl/v92

In [11]:
cd genome/hg38/ensembl/v92/

/src/myhome/code/sequence-localization/notebooks/genome/hg38/ensembl/v92


Download ENSEMBL fasta files. I don't understand the difference between `cds` and `cdna` so I downloaded them both. `pep` is the protein sequence

In [12]:
# %%bash

# wget ftp://ftp.ensembl.org/pub/release-92/fasta/homo_sapiens/cds/Homo_sapiens.GRCh38.cds.all.fa.gz
# wget ftp://ftp.ensembl.org/pub/release-92/fasta/homo_sapiens/cdna/Homo_sapiens.GRCh38.cdna.all.fa.gz
# wget ftp://ftp.ensembl.org/pub/release-92/fasta/homo_sapiens/pep/Homo_sapiens.GRCh38.pep.all.fa.gz

In [13]:
ls

Unzip the files

In [14]:
# ! gunzip --keep *.gz

In [15]:
ls -lha

total 8.0K
drwxr-xr-x 2 root root 4.0K May 26 02:35 [0m[01;34m.[0m/
drwxr-xr-x 3 root root 4.0K May 26 02:35 [01;34m..[0m/


In [16]:
! head *.fa

head: cannot open '*.fa' for reading: No such file or directory


Good, the fasta descriptione includes the ENSG ID

In [17]:
fastas = ['Homo_sapiens.GRCh38.cdna.all.fa', 'Homo_sapiens.GRCh38.cds.all.fa', 'Homo_sapiens.GRCh38.pep.all.fa']

In [18]:
%%time

from Bio import SeqIO



for fasta in fastas:
    print(fasta)
#     input_fasta = os.path.join('/mnt/data/', fasta)
    output_fasta = os.path.join(data_folder, f"{fasta}.nuclear_or_cytoplasmic")
    ensgs = []
    lengths = pd.Series()

    nuclear_or_cytoplasmic = []
    for record in SeqIO.parse(fasta, "fasta"):
        for ensg_id in nucleus_cytoplasm_single_ensg['ensg_id']:
            if ensg_id in record.description:
                nuclear_or_cytoplasmic.append(record)
                ensgs.append(ensg_id)
                lengths[record.id] = len(record)
    print(f'\t{len(nuclear_or_cytoplasmic)} sequences found for {len(set(ensgs))} ENSEMBL ids') 
    csv = os.path.join(data_folder, f"{fasta}.nuclear_or_cytoplasmic.target.tsv")
    target_df.loc[ensgs].to_csv(csv, index=False, header=False, sep='\t')
    SeqIO.write(nuclear_or_cytoplasmic, output_fasta, "fasta")



Homo_sapiens.GRCh38.cdna.all.fa


FileNotFoundError: [Errno 2] No such file or directory: 'Homo_sapiens.GRCh38.cdna.all.fa'

In [17]:
target_df.head()

Unnamed: 0_level_0,level_b_bool,level_b_bool
level_b,cytosol,nucleoplasm
ensg_id,Unnamed: 1_level_2,Unnamed: 2_level_2
ENSG00000000003,1,0
ENSG00000001167,0,1
ENSG00000001460,0,1
ENSG00000001461,0,1
ENSG00000002746,1,0


In [20]:
ls -lha /mnt/data

total 130M
drwxrwxrwx 13 ubuntu root   4.0K May 18 19:07 [0m[34;42m.[0m/
drwxrwxrwx  4 root   root   4.0K Apr 24 23:53 [34;42m..[0m/
drwxr-xr-x  3 root   root   4.0K Apr 25 00:38 [01;34mdata[0m/
drwxrwxrwx  3 ubuntu ubuntu 4.0K Apr 24 23:25 [34;42mfastq[0m/
drwxrwxr-x  2 ubuntu ubuntu 4.0K May 18 18:55 [01;34mfastq_dump_v2[0m/
drwxrwxr-x  2 ubuntu ubuntu 4.0K May 18 18:54 [01;34mfastq_dump_v3[0m/
drwxrwxr-x  3 ubuntu ubuntu 4.0K May 18 19:07 [01;34mfigures[0m/
drwxrwxr-x  4 ubuntu ubuntu 4.0K May 16 21:20 [01;34mgenome[0m/
drwxr-xr-x  2 root   root   4.0K Apr 24 23:53 [01;34mhca[0m/
-rw-rw-r--  1 ubuntu ubuntu  80M May 16 21:35 Homo_sapiens.GRCh38.cdna.all.fa.nuclear_or_cytoplasmic
-rw-rw-r--  1 ubuntu ubuntu  34M May 16 21:37 Homo_sapiens.GRCh38.cds.all.fa.nuclear_or_cytoplasmic
-rw-rw-r--  1 ubuntu ubuntu  17M May 16 21:38 Homo_sapiens.GRCh38.pep.all.fa.nuclear_or_cytoplasmic
drwxrwxr-x 25 ubuntu ubuntu 4.0K May 16 20:18 [01;34mrawdata[0m/
drwxrwxr-

In [21]:
! echo $data_folder

/home/ubuntu/code/sequence-localization/data/021_get_nucleus_cytoplasm_sequences


In [22]:
! cp /mnt/data/*nuclear_or_cytoplasmic $data_folder

In [23]:
! ls -lha $data_folder

total 226M
drwxrwxr-x 2 ubuntu ubuntu 4.0K May 16 23:10 .
drwxrwxr-x 8 ubuntu ubuntu 4.0K May 16 21:31 ..
-rw-rw-r-- 1 ubuntu ubuntu  80M May 18 19:54 Homo_sapiens.GRCh38.cdna.all.fa.nuclear_or_cytoplasmic
-rw-rw-r-- 1 ubuntu ubuntu 169K May 18 19:53 Homo_sapiens.GRCh38.cdna.all.fa.nuclear_or_cytoplasmic.target.tsv
-rw-rw-r-- 1 ubuntu ubuntu  25K May 16 23:09 Homo_sapiens.GRCh38.cdna.all.fa.nuclear_or_cytoplasmic.target.tsv.test
-rw-rw-r-- 1 ubuntu ubuntu  97K May 16 23:09 Homo_sapiens.GRCh38.cdna.all.fa.nuclear_or_cytoplasmic.target.tsv.train
-rw-rw-r-- 1 ubuntu ubuntu  12M May 16 23:09 Homo_sapiens.GRCh38.cdna.all.fa.nuclear_or_cytoplasmic.test
-rw-rw-r-- 1 ubuntu ubuntu  48M May 16 23:09 Homo_sapiens.GRCh38.cdna.all.fa.nuclear_or_cytoplasmic.train
-rw-rw-r-- 1 ubuntu ubuntu  34M May 18 19:54 Homo_sapiens.GRCh38.cds.all.fa.nuclear_or_cytoplasmic
-rw-rw-r-- 1 ubuntu ubuntu 106K May 18 19:53 Homo_sapiens.GRCh38.cds.all.fa.nuclear_or_cytoplasmic.target.tsv
-rw-rw-r-- 1 ubuntu

In [24]:
! head $data_folder/*

==> /home/ubuntu/code/sequence-localization/data/021_get_nucleus_cytoplasm_sequences/Homo_sapiens.GRCh38.cdna.all.fa.nuclear_or_cytoplasmic <==
>ENST00000419783.2 cdna chromosome:GRCh38:3:49357171:49358600:-1 gene:ENSG00000233276.4 gene_biotype:polymorphic_pseudogene transcript_biotype:protein_coding gene_symbol:GPX1 description:glutathione peroxidase 1 [Source:HGNC Symbol;Acc:HGNC:4553]
GAGCCCTCGAGGGCCCCAGCCCTTGGAAGGGTAACCTGGACCGCTGCCGCCTGGTTGCCT
GGGCCAGACCAGACATGCCTGCTGCTCCTTCCGGCTTAGGAGGAGCACGCGTCCCGCTCG
GGCGCACTCTCCAGCCTTTTCCTGGCTGAGGAGGGGCCGAGCCCTCCGGGTAGGGCGGGG
GCCGGATGAGGCGGGACCCTCAGGCCCGGAAAACTGCCTGTGCCACGTGACCCGCCGCCG
GCCAGTTAAAAGGAGGCGCCTGCTGGCCTCCCCTTACAGTGCTTGTTCGGGGCGCTCCGC
TGGCTTCTTGGACAATTGCGCCATGTGTGCTGCTCGGCTAGCGGCGGCGGCGGCGGCGGC
CCAGTCGGTGTATGCCTTCTCGGCGCGCCCGCTGGCCGGCGGGGAGCCTGTGAGCCTGGG
CTCCCTGCGGGGCAAGGTACTACTTATCGAGAATGTGGCGTCCCTCTGAGGCACCACGGT
CCGGGACTACACCCAGATGAACGAGCTGCAGCGGCGCCTCGGACCCCGGGGCCTGGTGGT

==> /home/ubuntu/code/sequence-localization/dat