
Commit

Merge 2e554bf into 9221a19
kcotto committed May 6, 2020
2 parents 9221a19 + 2e554bf commit 8a3d6ed
Showing 2 changed files with 68 additions and 16 deletions.
37 changes: 37 additions & 0 deletions scripts/run_stats_modified.py
@@ -0,0 +1,37 @@
import subprocess
import os
import glob
import shutil
import sys

def run(cmd: str) -> None:
    """Run a shell command, raising if it exits nonzero."""
    subprocess.run(cmd, shell=True, check=True, stdout=sys.stdout)

cohorts = ['SKCM', 'GBM', 'READ', 'ESCA', 'PAAD', 'SARC',
           'OV', 'KIRP', 'CESC', 'KIRC', 'LIHC', 'STAD', 'BLCA',
           'COAD', 'LUSC', 'HNSC', 'LGG', 'LUAD', 'UCEC', 'BRCA']

for cohort in cohorts:
    # Set up per-cohort working directories.
    os.makedirs(f'{cohort}/samples', exist_ok=True)
    os.makedirs(f'{cohort}/compare_junctions/hist', exist_ok=True)
    bed_files = glob.glob(f'{cohort}*modified.bed')
    for bed_file in bed_files:
        shutil.copy(bed_file, f'{cohort}/{bed_file}')
    # Download this cohort's per-sample results from S3 and unpack them.
    os.chdir(f'{cohort}/samples')
    run(f'aws s3 cp s3://regtools-results-unstranded/{cohort}/ . --recursive')
    tar_files = glob.glob('*.tar.gz')
    for tar_file in tar_files:
        run(f'tar xzf {tar_file}')
        os.remove(tar_file)
    run('rm -rf all*; rm -rf compare_junctions*')
    os.chdir('..')
    run('ls samples/ > dir_names.tsv')
    # Rename each variant BED file to the name stats_wrapper.py expects,
    # run the stats and filtering steps, and push the results back to S3.
    bed_files = glob.glob('*modified.bed')
    for bed_file in bed_files:
        tag = bed_file.split('_')[1]
        os.rename(bed_file, f'all_splicing_variants_{tag}.bed')
        run(f'python3 /home/ec2-user/workspace/regtools/scripts/stats_wrapper.py {tag}')
        run(f'Rscript --vanilla /home/ec2-user/workspace/regtools/scripts/filter_and_BH.R {tag}')
        run(f'aws s3 cp compare_junctions/ s3://regtools-results-unstranded/{cohort}/compare_junctions3/ --recursive')
    os.chdir('..')
    shutil.rmtree(cohort)
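
A quick illustration of the renaming step above (not part of the commit): tag = bed_file.split('_')[1] assumes BED file names of the form <cohort>_<tag>_modified.bed, so the second underscore-delimited field is the regtools tag. With a hypothetical filename:

# Hypothetical filename; the <cohort>_<tag>_modified.bed pattern is an assumption,
# not something the commit itself states.
bed_file = 'SKCM_default_modified.bed'
tag = bed_file.split('_')[1]
print(tag)                                  # default
print(f'all_splicing_variants_{tag}.bed')   # all_splicing_variants_default.bed
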
47 changes: 31 additions & 16 deletions scripts/stats_wrapper.py
@@ -17,21 +17,37 @@
tag = args.tag
cwd = os.getcwd()

-lines_per_file = 25000
-smallfile = None
-with open(f'all_splicing_variants_{tag}.bed', 'r') as bigfile:
-    header = bigfile.readline()
-    for lineno, line in enumerate(bigfile):
-        if lineno % lines_per_file == 0:
-            if smallfile:
-                smallfile.close()
-            small_filename = 'small_file_{}.txt'.format(lineno + lines_per_file)
-            smallfile = open(small_filename, "w")
-            smallfile.write(header)
+target_lines_per_file = 25000
+lines_per_file = 0
+input_file = f'all_splicing_variants_{tag}.bed'
+lines = open(input_file).readlines()
+count = len(lines)
+if count <= lines_per_file:
+    subprocess.run(f'Rscript --vanilla /home/ec2-user/workspace/data/compare_junctions_hist_v2.R {tag} {input_file}')
+else:
+    header = lines[0]
+    lines.pop(0)
+    lines.sort()
+    filenum = 1
+    small_filename = f'small_file_{filenum}.txt'
+    smallfile = open(small_filename, "w")
+    smallfile.write(header)
+    lines_per_file += target_lines_per_file
+    for lineno, line in enumerate(lines):
        smallfile.write(line)
-    if smallfile:
-        smallfile.close()
-#get chunks
+        if lineno >= lines_per_file:
+            fields1 = line.split('\t')
+            variant1 = f'{fields1[0]}_{fields1[1]}_{fields1[2]}'
+            fields2 = lines[lineno+1].split('\t')
+            variant2 = f'{fields2[0]}_{fields2[1]}_{fields2[2]}'
+            if variant1 != variant2:
+                smallfile.close()
+                filenum += 1
+                small_filename = f'small_file_{filenum}.txt'
+                smallfile = open(small_filename, "w")
+                smallfile.write(header)
+                lines_per_file += target_lines_per_file
+# get chunks
files = glob.glob('small_file_*')
files.sort()
number_of_in_files = len(files)
@@ -53,5 +69,4 @@
print("Number of output files doesn't match the number of input files that should have been processed")
files = glob.glob('small_file_*')
for file in files:
-    os.remove(file)
-
+    os.remove(file)

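The new splitting logic in stats_wrapper.py sorts the variant lines and, once the running line count passes the 25,000-line target, only starts the next small_file_N.txt where the variant key (the first three tab-separated BED fields) changes, so all lines for one variant land in the same chunk. A minimal in-memory sketch of that rule (not the commit's code: it returns lists instead of writing files, takes a tiny target, and adds a bounds check on the last line):

def chunk_by_variant(lines, target):
    """Split sorted, tab-delimited lines into chunks of roughly `target` lines,
    cutting only where the variant key (first three fields) changes."""
    chunks = [[]]
    cutoff = target
    for lineno, line in enumerate(lines):
        chunks[-1].append(line)
        # Past the cutoff: start a new chunk only at a variant boundary.
        if lineno >= cutoff and lineno + 1 < len(lines):
            variant = '_'.join(line.split('\t')[:3])
            next_variant = '_'.join(lines[lineno + 1].split('\t')[:3])
            if variant != next_variant:
                chunks.append([])
                cutoff += target
    return chunks

toy = [
    'chr1\t100\t200\tA\n',
    'chr1\t100\t200\tB\n',
    'chr1\t100\t200\tC\n',
    'chr1\t100\t200\tD\n',  # past the target, but the next line is the same variant: no cut
    'chr1\t100\t200\tE\n',  # the variant changes after this line, so the chunk ends here
    'chr2\t300\t400\tF\n',
    'chr2\t300\t400\tG\n',
]
print([len(c) for c in chunk_by_variant(toy, target=3)])  # [5, 2]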