# Create QMAP file from all chromosomes in TUMOR samples

In [None]:
## This notebook works from the clonal mutations obtained in the previous steps

In [None]:
# Create a single QMAP file containing the VEP jobs for all samples

In [1]:
import os, sys

# Directories holding the per-chromosome VEP input files for each tumor type.
# The trailing "/" is required: later cells build full file paths by plain
# string concatenation (path + file name), not with os.path.join.
path_sar = "/workspace/projects/sjd_melos/vep/vep_input_files/sarcoma/"
path_lung = "/workspace/projects/sjd_melos/vep/vep_input_files/lung/"

# os.listdir returns the names of all entries in a directory as a list.
# The order of that list is arbitrary (not guaranteed to be sorted), so code
# that slices it by position may pick different files on another machine.
files_sar = os.listdir(path_sar)
files_lung = os.listdir(path_lung)

# Display the sarcoma file names as the cell output
files_sar

['chr1.tsv.gz',
 'chr2.tsv.gz',
 'chr3.tsv.gz',
 'chr4.tsv.gz',
 'chr5.tsv.gz',
 'chr6.tsv.gz',
 'chr7.tsv.gz',
 'chr8.tsv.gz',
 'chr9.tsv.gz',
 'chr10.tsv.gz',
 'chr11.tsv.gz',
 'chr12.tsv.gz',
 'chr13.tsv.gz',
 'chr14.tsv.gz',
 'chr15.tsv.gz',
 'chr16.tsv.gz',
 'chr17.tsv.gz',
 'chr18.tsv.gz',
 'chr19.tsv.gz',
 'chr20.tsv.gz',
 'chr21.tsv.gz',
 'chr22.tsv.gz',
 'chrX.tsv.gz',
 'chrY.tsv.gz']

In [2]:
# Directories that receive the VEP output, one per tumor type.
# The trailing "/" is required because the output file name is appended by
# plain string concatenation when the command lines are built.
# NOTE(review): presumably these directories must already exist when the jobs run — confirm.

path_sar_out = "/workspace/projects/sjd_melos/vep/vep_output_files/sarcoma/"
path_lung_out = "/workspace/projects/sjd_melos/vep/vep_output_files/lung/"

In [3]:
# Fragments that are concatenated into each VEP command line
# (option reference: https://www.ensembl.org/info/docs/tools/vep/script/vep_options.html)

# Fixed VEP options shared by every job. The leading and trailing spaces matter:
# this string is glued directly between the output path and the --custom option.
# NOTE(review): the option list documents "--tab"; the single-dash "-tab" used here
# is presumably accepted as well — confirm.
# NOTE(review): no --compress_output option is set, yet the -o file name built in
# later cells ends in ".tsv.gz"; the output is presumably plain text — confirm.
params = " -tab --assembly GRCh38 --no_stats --cache --symbol --protein --canonical --mane --offline --af_1kg "
# Value passed to VEP's --dir option
vep_dir = "/workspace/datasets/vep/"
# Start of the --custom argument: path prefix of the gnomAD v4.0 per-chromosome
# files. The chromosome label, ".vcf.bgz" and the field list are appended later.
gnomad = "--custom /workspace/datasets/gnomad/data/v4.0/hg38/gnomad.genomes.v4.0.sites."
# Input format flag. Note: this name shadows Python's built-in format() for the
# rest of the notebook; it is kept because later cells refer to it by this name.
format = " --format vcf"

# Executable part of each job: the ensembl-vep 111.0 .sif image followed by "vep"
command = "/workspace/datasets/vep/homo_sapiens/ensembl-vep_111.0.sif vep "

In [4]:
# Expected input file names in karyotype order: autosomes 1-22, then X and Y
autosomes = [str(number) for number in range(1, 23)]
chrom = ['chr' + label + '.tsv.gz' for label in autosomes + ['X', 'Y']]
chrom

['chr1.tsv.gz',
 'chr2.tsv.gz',
 'chr3.tsv.gz',
 'chr4.tsv.gz',
 'chr5.tsv.gz',
 'chr6.tsv.gz',
 'chr7.tsv.gz',
 'chr8.tsv.gz',
 'chr9.tsv.gz',
 'chr10.tsv.gz',
 'chr11.tsv.gz',
 'chr12.tsv.gz',
 'chr13.tsv.gz',
 'chr14.tsv.gz',
 'chr15.tsv.gz',
 'chr16.tsv.gz',
 'chr17.tsv.gz',
 'chr18.tsv.gz',
 'chr19.tsv.gz',
 'chr20.tsv.gz',
 'chr21.tsv.gz',
 'chr22.tsv.gz',
 'chrX.tsv.gz',
 'chrY.tsv.gz']

In [7]:
# Debug check: confirm that the directory listing is a plain Python list
print(type(files_sar))

<class 'list'>


In [6]:
# Debug check: confirm that the expected-file-name collection is a plain Python list
print(type(chrom))

<class 'list'>


In [9]:
# Trial run: build the VEP command lines for only the first two entries of the
# sarcoma directory listing, to inspect the result before processing everything.
prova = files_sar[0:2]

# One command per expected chromosome file that is part of the trial subset.
# Iterating over `chrom` (not `prova`) keeps the jobs in karyotype order.
# The gnomAD file is named after the bare chromosome label ("chr1"), so the
# label is taken from the part of the file name before the first dot instead
# of appending the whole file name and patching the text afterwards.
serie = [
    command + '--dir ' + vep_dir
    + ' -i ' + path_sar + c + format
    + ' -o ' + path_sar_out + c + params
    # --custom value: <file>,<short name>,<file type>,<match type>,<coords flag>,<fields...>
    + gnomad + c.split('.', 1)[0] + '.vcf.bgz' + ',gnomADg,vcf,exact,0,AF,NFE'
    for c in chrom
    if c in prova
]
serie

['/workspace/datasets/vep/homo_sapiens/ensembl-vep_111.0.sif vep --dir /workspace/datasets/vep/ -i /workspace/projects/sjd_melos/vep/vep_input_files/sarcoma/chr1.tsv.gz --format vcf -o /workspace/projects/sjd_melos/vep/vep_output_files/sarcoma/chr1.tsv.gz -tab --assembly GRCh38 --no_stats --cache --symbol --protein --canonical --mane --offline --af_1kg --custom /workspace/datasets/gnomad/data/v4.0/hg38/gnomad.genomes.v4.0.sites.chr1.vcf.bgz,gnomADg,vcf,exact,0,AF,NFE',
 '/workspace/datasets/vep/homo_sapiens/ensembl-vep_111.0.sif vep --dir /workspace/datasets/vep/ -i /workspace/projects/sjd_melos/vep/vep_input_files/sarcoma/chr2.tsv.gz --format vcf -o /workspace/projects/sjd_melos/vep/vep_output_files/sarcoma/chr2.tsv.gz -tab --assembly GRCh38 --no_stats --cache --symbol --protein --canonical --mane --offline --af_1kg --custom /workspace/datasets/gnomad/data/v4.0/hg38/gnomad.genomes.v4.0.sites.chr2.vcf.bgz,gnomADg,vcf,exact,0,AF,NFE']

## Sarcoma VEP input files

In [10]:
# Build one VEP command line for every expected chromosome file that is actually
# present in the sarcoma input directory. Iterating over `chrom` keeps the jobs
# in karyotype order regardless of the order of the directory listing.
# The gnomAD file is named after the bare chromosome label ("chr1"), so the
# label is taken from the part of the file name before the first dot instead
# of appending the whole file name and patching the text afterwards.
serie_sar = [
    command + '--dir ' + vep_dir
    + ' -i ' + path_sar + c + format
    + ' -o ' + path_sar_out + c + params
    # --custom value: <file>,<short name>,<file type>,<match type>,<coords flag>,<fields...>
    + gnomad + c.split('.', 1)[0] + '.vcf.bgz' + ',gnomADg,vcf,exact,0,AF,NFE'
    for c in chrom
    if c in files_sar
]

In [11]:
# Collapse the list of sarcoma commands into one string with one command per line.
# str.join puts "\n" between consecutive elements (none after the last one).
sar_result = '\n'.join(serie_sar)
print(sar_result)

/workspace/datasets/vep/homo_sapiens/ensembl-vep_111.0.sif vep --dir /workspace/datasets/vep/ -i /workspace/projects/sjd_melos/vep/vep_input_files/sarcoma/chr1.tsv.gz --format vcf -o /workspace/projects/sjd_melos/vep/vep_output_files/sarcoma/chr1.tsv.gz -tab --assembly GRCh38 --no_stats --cache --symbol --protein --canonical --mane --offline --af_1kg --custom /workspace/datasets/gnomad/data/v4.0/hg38/gnomad.genomes.v4.0.sites.chr1.vcf.bgz,gnomADg,vcf,exact,0,AF,NFE
/workspace/datasets/vep/homo_sapiens/ensembl-vep_111.0.sif vep --dir /workspace/datasets/vep/ -i /workspace/projects/sjd_melos/vep/vep_input_files/sarcoma/chr2.tsv.gz --format vcf -o /workspace/projects/sjd_melos/vep/vep_output_files/sarcoma/chr2.tsv.gz -tab --assembly GRCh38 --no_stats --cache --symbol --protein --canonical --mane --offline --af_1kg --custom /workspace/datasets/gnomad/data/v4.0/hg38/gnomad.genomes.v4.0.sites.chr2.vcf.bgz,gnomADg,vcf,exact,0,AF,NFE
/workspace/datasets/vep/homo_sapiens/ensembl-vep_111.0.sif v

## Lung VEP input files

In [12]:
# Build one VEP command line for every expected chromosome file that is actually
# present in the lung input directory. Iterating over `chrom` keeps the jobs
# in karyotype order regardless of the order of the directory listing.
# The gnomAD file is named after the bare chromosome label ("chr1"), so the
# label is taken from the part of the file name before the first dot instead
# of appending the whole file name and patching the text afterwards.
serie_lung = [
    command + '--dir ' + vep_dir
    + ' -i ' + path_lung + c + format
    + ' -o ' + path_lung_out + c + params
    # --custom value: <file>,<short name>,<file type>,<match type>,<coords flag>,<fields...>
    + gnomad + c.split('.', 1)[0] + '.vcf.bgz' + ',gnomADg,vcf,exact,0,AF,NFE'
    for c in chrom
    if c in files_lung
]

In [13]:
# Collapse the list of lung commands into one string with one command per line.
# str.join puts "\n" between consecutive elements (none after the last one).
lung_result = '\n'.join(serie_lung)
print(lung_result)

/workspace/datasets/vep/homo_sapiens/ensembl-vep_111.0.sif vep --dir /workspace/datasets/vep/ -i /workspace/projects/sjd_melos/vep/vep_input_files/lung/chr1.tsv.gz --format vcf -o /workspace/projects/sjd_melos/vep/vep_output_files/lung/chr1.tsv.gz -tab --assembly GRCh38 --no_stats --cache --symbol --protein --canonical --mane --offline --af_1kg --custom /workspace/datasets/gnomad/data/v4.0/hg38/gnomad.genomes.v4.0.sites.chr1.vcf.bgz,gnomADg,vcf,exact,0,AF,NFE
/workspace/datasets/vep/homo_sapiens/ensembl-vep_111.0.sif vep --dir /workspace/datasets/vep/ -i /workspace/projects/sjd_melos/vep/vep_input_files/lung/chr2.tsv.gz --format vcf -o /workspace/projects/sjd_melos/vep/vep_output_files/lung/chr2.tsv.gz -tab --assembly GRCh38 --no_stats --cache --symbol --protein --canonical --mane --offline --af_1kg --custom /workspace/datasets/gnomad/data/v4.0/hg38/gnomad.genomes.v4.0.sites.chr2.vcf.bgz,gnomADg,vcf,exact,0,AF,NFE
/workspace/datasets/vep/homo_sapiens/ensembl-vep_111.0.sif vep --dir /wo

## Add header and join all samples

In [14]:
# Preamble of the QMAP file, one entry per line: the [pre] and [params] section
# markers, the cores/memory settings, and the [jobs] marker that precedes the commands.
header_lines = ('[pre]', '[params]', 'cores = 1', 'memory = 8G', '[jobs]')
header = '\n'.join(header_lines)
print(header)

[pre]
[params]
cores = 1
memory = 8G
[jobs]


In [15]:
# Assemble the complete QMAP text: preamble first, then the sarcoma jobs, then
# the lung jobs. The newline separators are needed so that the [jobs] marker and
# the last sarcoma command do not run into the line that follows them.
qmap = '\n'.join((header, sar_result, lung_result))
print(qmap)

[pre]
[params]
cores = 1
memory = 8G
[jobs]
/workspace/datasets/vep/homo_sapiens/ensembl-vep_111.0.sif vep --dir /workspace/datasets/vep/ -i /workspace/projects/sjd_melos/vep/vep_input_files/sarcoma/chr1.tsv.gz --format vcf -o /workspace/projects/sjd_melos/vep/vep_output_files/sarcoma/chr1.tsv.gz -tab --assembly GRCh38 --no_stats --cache --symbol --protein --canonical --mane --offline --af_1kg --custom /workspace/datasets/gnomad/data/v4.0/hg38/gnomad.genomes.v4.0.sites.chr1.vcf.bgz,gnomADg,vcf,exact,0,AF,NFE
/workspace/datasets/vep/homo_sapiens/ensembl-vep_111.0.sif vep --dir /workspace/datasets/vep/ -i /workspace/projects/sjd_melos/vep/vep_input_files/sarcoma/chr2.tsv.gz --format vcf -o /workspace/projects/sjd_melos/vep/vep_output_files/sarcoma/chr2.tsv.gz -tab --assembly GRCh38 --no_stats --cache --symbol --protein --canonical --mane --offline --af_1kg --custom /workspace/datasets/gnomad/data/v4.0/hg38/gnomad.genomes.v4.0.sites.chr2.vcf.bgz,gnomADg,vcf,exact,0,AF,NFE
/workspace/datas

## Save as a QMAP file

In [16]:
# Write the assembled QMAP text to disk.
# `qmap` is a single string, so it is written in one call; looping over it would
# iterate character by character and issue one write() per character.
# The file content is exactly the text of `qmap` (no newline is added at the end).
with open('/workspace/projects/sjd_melos/vep/VEP_analysis_repeat_sarlung_singularity.qmap', 'w') as qmap_file:
    qmap_file.write(qmap)