# Exploring Other Sources of Reports 



In [1]:
import json
import sys
import pandas as pd
from glob import glob
from os.path import join

def get_family_participants(report_paths:list) -> list:
    mapping_list = []
    for report in report_paths:

        if report.endswith(".csv"):
            sep = ","
        elif report.endswith(".tsv"):
            sep = "\t"
        else:
            print("Unknown fileformat and delimiter")
            sys.exit(1)

        try:
            df = pd.read_csv(report, sep=sep)
        except UnicodeDecodeError:
            print("UnicodeDecodeError on %s. Trying latin-1 decoding." % report)
            df = pd.read_csv(report, encoding="latin-1", sep=sep)
        except Exception as e:
            print(
                "Report '{}' could not be read in, please double check this is a valid csv!".format(
                    report
                )
            )
            sys.exit(1)

        # these columns are 'wide' wrt the variants
        d = {"Zygosity": [], "Burden": [], "Alt_depths": []}

        # get all columns with Zygosity, Burden, or Alt_depths - these are unique for each participant
        for key in d:
            d[key].extend([col for col in df.columns if col.startswith(key)])

        # get sample names - preserved order for genotype and trio coverage
        samples = [col.replace("Zygosity.", "").strip() for col in d["Zygosity"]]

        family = basename(report).split(".")[0]

        mapping_list.append({'family': family, 'samples': samples, 'report_name': report})
        
    return mapping_list
    

## Remaining Reports from 2022-05-03

In [2]:
folders = ['/hpf/largeprojects/ccmbio/ccmmarvin_shared/exomes/in_progress//1071', '/hpf/largeprojects/ccm_dccforge/dccforge/results/17x/1761', '/hpf/largeprojects/ccmbio/ccmmarvin_shared/exomes/in_progress//1851', '/hpf/largeprojects/ccmbio/ccmmarvin_shared/exomes/in_progress//1869', '/hpf/largeprojects/ccm_dccforge/dccforge/results/19x/1900', '/hpf/largeprojects/ccm_dccforge/dccforge/results/18x/1843', '/hpf/largeprojects/ccm_dccforge/dccforge/results/16x/1673', '/hpf/largeprojects/ccm_dccforge/dccforge/results/20x/2000', '/hpf/largeprojects/ccm_dccforge/dccforge/results/20x/2036', '/hpf/largeprojects/ccm_dccforge/dccforge/results/21x/2151', '/hpf/largeprojects/ccmbio/ccmmarvin_shared/exomes/in_progress//2099', '/hpf/largeprojects/ccm_dccforge/dccforge/results/11x/1110R', '/hpf/largeprojects/ccm_dccforge/dccforge/results/2x/207;/hpf/largeprojects/ccm_dccforge/dccforge/results/10x/1032R', '/hpf/largeprojects/ccm_dccforge/dccforge/results/2x/207', '/hpf/largeprojects/ccm_dccforge/dccforge/results/15x/1548', '/hpf/largeprojects/ccm_dccforge/dccforge/results/15x/1557', '/hpf/largeprojects/ccm_dccforge/dccforge/results/15x/1556', '/hpf/largeprojects/ccm_dccforge/dccforge/results/15x/1581', '/hpf/largeprojects/ccm_dccforge/dccforge/results/16x/1602', '/hpf/largeprojects/ccm_dccforge/dccforge/results/16x/1639', '/hpf/largeprojects/ccmbio/ccmmarvin_shared/exomes/in_progress//2388', '/hpf/largeprojects/ccmbio/ccmmarvin_shared/exomes/in_progress//2344', '/hpf/largeprojects/ccmbio/ccmmarvin_shared/exomes/in_progress//2367', '/hpf/largeprojects/ccmbio/ccmmarvin_shared/exomes/in_progress//2379', '/hpf/largeprojects/ccmbio/ccmmarvin_shared/exomes/in_progress//2384', '/hpf/largeprojects/ccmbio/ccmmarvin_shared/exomes/in_progress//2385', '/hpf/largeprojects/ccmbio/ccmmarvin_shared/exomes/in_progress//2484', '/hpf/largeprojects/ccmbio/ccmmarvin_shared/exomes/in_progress//2486', '/hpf/largeprojects/ccmbio/ccmmarvin_shared/exomes/in_progress//2487', '/hpf/largeprojects/ccmbio/ccmmarvin_shared/exomes/in_progress//2488', '/hpf/largeprojects/ccmbio/ccmmarvin_shared/exomes/in_progress//2489', '/hpf/largeprojects/ccmbio/ccmmarvin_shared/exomes/in_progress//2491', '/hpf/largeprojects/ccmbio/ccmmarvin_shared/exomes/in_progress//2492', '/hpf/largeprojects/ccmbio/ccmmarvin_shared/exomes/in_progress//2493', '/hpf/largeprojects/ccmbio/ccmmarvin_shared/exomes/in_progress//2494', '/hpf/largeprojects/ccmbio/ccmmarvin_shared/exomes/in_progress//2495', '/hpf/largeprojects/ccmbio/ccmmarvin_shared/exomes/in_progress//2496', '/hpf/largeprojects/ccmbio/ccmmarvin_shared/exomes/in_progress//2497', '/hpf/largeprojects/ccmbio/ccmmarvin_shared/exomes/in_progress//2502', '/hpf/largeprojects/ccmbio/ccmmarvin_shared/exomes/in_progress//2507', '/hpf/largeprojects/ccmbio/ccmmarvin_shared/exomes/in_progress//2555', '/hpf/largeprojects/ccmbio/ccmmarvin_shared/exomes/in_progress//2601', '/hpf/largeprojects/ccmbio/ccmmarvin_shared/exomes/in_progress//2627', '/hpf/largeprojects/ccmbio/ccmmarvin_shared/exomes/in_progress//2636', '/hpf/largeprojects/ccmbio/ccmmarvin_shared/exomes/in_progress//2638', '/hpf/largeprojects/ccmbio/ccmmarvin_shared/exomes/in_progress//2639', '/hpf/largeprojects/ccmbio/ccmmarvin_shared/exomes/in_progress//2646', '/hpf/largeprojects/ccmbio/ccmmarvin_shared/exomes/in_progress//2650', '/hpf/largeprojects/ccmbio/ccmmarvin_shared/exomes/in_progress//2667', '/hpf/largeprojects/ccmbio/ccmmarvin_shared/exomes/in_progress//2677', '/hpf/largeprojects/ccmbio/ccmmarvin_shared/exomes/in_progress//2689', '/hpf/largeprojects/ccmbio/ccmmarvin_shared/exomes/in_progress//2692', '/hpf/largeprojects/ccmbio/ccmmarvin_shared/exomes/in_progress//2696', '/hpf/largeprojects/ccmbio/ccmmarvin_shared/exomes/in_progress//2709', '/hpf/largeprojects/ccmbio/ccmmarvin_shared/exomes/in_progress//2711', '/hpf/largeprojects/ccmbio/ccmmarvin_shared/exomes/in_progress//2714']

In [3]:
from pprint import pprint
report_paths = []
for path in folders:
    
    reports = glob(join(path, '**/*wes*csv'), recursive = True)
    reports = sorted([x for x in reports if 'synonymous' not in x and 'clinical' not in x and 'old' not in x])
    if len(reports) > 1:
        print(path)
        pprint(reports)
    elif len(reports) == 0:
        print(f'No reports found for {path}')
    else:
        report_paths.append({pareports[-1])

No reports found for /hpf/largeprojects/ccmbio/ccmmarvin_shared/exomes/in_progress//1071
No reports found for /hpf/largeprojects/ccmbio/ccmmarvin_shared/exomes/in_progress//1851
No reports found for /hpf/largeprojects/ccm_dccforge/dccforge/results/16x/1673
No reports found for /hpf/largeprojects/ccmbio/ccmmarvin_shared/exomes/in_progress//2099
/hpf/largeprojects/ccm_dccforge/dccforge/results/11x/1110R
['/hpf/largeprojects/ccm_dccforge/dccforge/results/11x/1110R/1110R.wes.2018-09-29.csv',
 '/hpf/largeprojects/ccm_dccforge/dccforge/results/11x/1110R/1110R.wes.2018-09-30.csv',
 '/hpf/largeprojects/ccm_dccforge/dccforge/results/11x/1110R/1110R.wes.2018-11-27.csv',
 '/hpf/largeprojects/ccm_dccforge/dccforge/results/11x/1110R/1110R.wes.2019-07-24.csv']
No reports found for /hpf/largeprojects/ccm_dccforge/dccforge/results/2x/207;/hpf/largeprojects/ccm_dccforge/dccforge/results/10x/1032R
/hpf/largeprojects/ccm_dccforge/dccforge/results/2x/207
['/hpf/largeprojects/ccm_dccforge/dccforge/results/

In [4]:
report_paths

['/hpf/largeprojects/ccm_dccforge/dccforge/results/17x/1761/1761.wes.regular.2020-07-22.csv',
 '/hpf/largeprojects/ccmbio/ccmmarvin_shared/exomes/in_progress//1869/1869.wes.regular.2021-01-08.csv',
 '/hpf/largeprojects/ccm_dccforge/dccforge/results/19x/1900/1900.wes.regular.2020-06-01.csv',
 '/hpf/largeprojects/ccm_dccforge/dccforge/results/18x/1843/1843.wes.regular.2020-06-03.csv',
 '/hpf/largeprojects/ccm_dccforge/dccforge/results/20x/2000/2000.wes.regular.2021-07-21.csv',
 '/hpf/largeprojects/ccm_dccforge/dccforge/results/20x/2036/2036.wes.regular.2021-02-22.csv',
 '/hpf/largeprojects/ccm_dccforge/dccforge/results/21x/2151/2151.wes.regular.2021-10-22.csv',
 '/hpf/largeprojects/ccm_dccforge/dccforge/results/15x/1557/1557.wes.2019-09-27.csv',
 '/hpf/largeprojects/ccm_dccforge/dccforge/results/15x/1556/1556.wes.2019-09-27.csv',
 '/hpf/largeprojects/ccm_dccforge/dccforge/results/15x/1581/1581.wes.2019-09-27.csv',
 '/hpf/largeprojects/ccm_dccforge/dccforge/results/16x/1602/1602.wes.regul

## Madeline's expected matches


In [2]:
missing = pd.read_csv('madeline-missing-2022-04-24.csv')
missing

Unnamed: 0,family_folder
0,/hpf/largeprojects/ccm_dccforge/dccforge/resul...
1,/hpf/largeprojects/ccm_dccforge/dccforge/resul...
2,/hpf/largeprojects/ccmbio/ccmmarvin_shared/exo...
3,/hpf/largeprojects/ccm_dccforge/dccforge/resul...
4,/hpf/largeprojects/ccm_dccforge/dccforge/resul...
5,/hpf/largeprojects/ccm_dccforge/dccforge/resul...
6,/hpf/largeprojects/ccm_dccforge/dccforge/resul...
7,/hpf/largeprojects/ccm_dccforge/dccforge/resul...
8,/hpf/largeprojects/ccm_dccforge/dccforge/resul...
9,/hpf/largeprojects/ccm_dccforge/dccforge/resul...


In [17]:
from pprint import pprint
report_paths = []
for path in missing['family_folder'].tolist():
    
    reports = glob(join(path, '**/*wes*csv'), recursive = True)
    reports = sorted([x for x in reports if 'synonymous' not in x and 'clinical' not in x and 'old' not in x])
    if len(reports) > 1:
        print(path)
        pprint(reports)
    elif len(reports) == 0:
        print(f'No reports found for {path}')
    else:
        report_paths.append(reports[-1])

No reports found for /hpf/largeprojects/ccm_dccforge/dccforge/results/16x/1673
No reports found for /hpf/largeprojects/ccmbio/ccmmarvin_shared/exomes/in_progress//2099
/hpf/largeprojects/ccm_dccforge/dccforge/results/12x/1220R
['/hpf/largeprojects/ccm_dccforge/dccforge/results/12x/1220R/1220R.wes.2018-09-30.csv',
 '/hpf/largeprojects/ccm_dccforge/dccforge/results/12x/1220R/1220R.wes.2018-11-12.csv',
 '/hpf/largeprojects/ccm_dccforge/dccforge/results/12x/1220R/1220R.wes.2018-11-27.csv',
 '/hpf/largeprojects/ccm_dccforge/dccforge/results/12x/1220R/1220R.wes.2019-07-25.csv']
/hpf/largeprojects/ccm_dccforge/dccforge/results/10x/1087R
['/hpf/largeprojects/ccm_dccforge/dccforge/results/10x/1087R/1087R.wes.2018-09-25.csv',
 '/hpf/largeprojects/ccm_dccforge/dccforge/results/10x/1087R/1087R.wes.2018-09-29.csv',
 '/hpf/largeprojects/ccm_dccforge/dccforge/results/10x/1087R/1087R.wes.2018-11-27.csv',
 '/hpf/largeprojects/ccm_dccforge/dccforge/results/10x/1087R/1087R.wes.2019-07-24.csv']
/hpf/large

In [18]:
len(report_paths)

36

In [19]:
report_paths

['/hpf/largeprojects/ccm_dccforge/dccforge/results/20x/2036/2036.wes.regular.2021-02-22.csv',
 '/hpf/largeprojects/ccm_dccforge/dccforge/results/23x/2321/report/coding/2321/2321.wes.regular.2022-02-05.csv',
 '/hpf/largeprojects/ccm_dccforge/dccforge/results/24x/2403/report/coding/2403/2403.wes.regular.2022-02-05.csv',
 '/hpf/largeprojects/ccm_dccforge/dccforge/results/24x/2439/report/coding/2439/2439.wes.regular.2022-02-08.csv',
 '/hpf/largeprojects/ccm_dccforge/dccforge/results/24x/2440/report/coding/2440/2440.wes.regular.2022-02-05.csv',
 '/hpf/largeprojects/ccm_dccforge/dccforge/results/24x/2442/report/coding/2442/2442.wes.regular.2022-02-05.csv',
 '/hpf/largeprojects/ccm_dccforge/dccforge/results/24x/2443/report/coding/2443/2443.wes.regular.2022-02-05.csv',
 '/hpf/largeprojects/ccm_dccforge/dccforge/results/24x/2446/report/coding/2446/2446.wes.regular.2022-02-09.csv',
 '/hpf/largeprojects/ccm_dccforge/dccforge/results/24x/2448/report/coding/2448/2448.wes.regular.2022-02-05.csv',
 '

## In Progress Reports

In [29]:
from glob import glob
from os.path import basename, join, dirname
from datetime import datetime
from pprint import pprint

In [79]:
RESULTS_PATH = '/hpf/largeprojects/ccmbio/ccmmarvin_shared/exomes/in_progress/'


In [80]:
# use same code as current genome

result_paths = []
for family in glob(join(RESULTS_PATH, "*/")):
    
    if basename(dirname(family))[0].isdigit():
        print(family)
#         no specific pattern in directory structure since it's changed over time, looks like the folder containing reports can be be either 'report' or 'reports'

        reports = glob(join(family, "**/*wes*"), recursive = True)
        reports = [x for x in reports if "clinical" not in x and 'synonymous' not in x] # changed here

        if len(reports) > 1:
            print("More than one report found.")
            pprint(reports) 
            result_paths.append(sorted(reports)[-1]) # changed here
        elif len(reports) == 1:
            result_paths.append(sorted(reports)[-1]) # changed here
        else:
            print("\t\t\tNo report files found for %s" % family)

/hpf/largeprojects/ccmbio/ccmmarvin_shared/exomes/in_progress/2453/
/hpf/largeprojects/ccmbio/ccmmarvin_shared/exomes/in_progress/1167/
/hpf/largeprojects/ccmbio/ccmmarvin_shared/exomes/in_progress/2709/
			No report files found for /hpf/largeprojects/ccmbio/ccmmarvin_shared/exomes/in_progress/2709/
/hpf/largeprojects/ccmbio/ccmmarvin_shared/exomes/in_progress/2638/
			No report files found for /hpf/largeprojects/ccmbio/ccmmarvin_shared/exomes/in_progress/2638/
/hpf/largeprojects/ccmbio/ccmmarvin_shared/exomes/in_progress/2677/
			No report files found for /hpf/largeprojects/ccmbio/ccmmarvin_shared/exomes/in_progress/2677/
/hpf/largeprojects/ccmbio/ccmmarvin_shared/exomes/in_progress/1740/
More than one report found.
['/hpf/largeprojects/ccmbio/ccmmarvin_shared/exomes/in_progress/1740/report/coding/1740/1740.wes.regular.2022-04-23.csv',
 '/hpf/largeprojects/ccmbio/ccmmarvin_shared/exomes/in_progress/1740/report/coding/1740/1740.wes.regular.2022-04-23.w_hpoterms.tsv']
/hpf/largeprojects

In [81]:
len(result_paths)

63

### Old DCCForge
This should be a one-time thing..

In [3]:
from glob import glob
from os.path import basename, join, dirname
from datetime import datetime
from pprint import pprint

In [34]:
RESULTS_PATH = "/hpf/largeprojects/ccmbio/naumenko/project_cheo/DCC_Samples_part1/"

In [35]:
report_paths = []
for family_prefix_dir in glob(join(RESULTS_PATH, "*x/")):
    print(family_prefix_dir)
    
    # sorted - ascending order
    for family in glob(join(family_prefix_dir, "*/")): 
        print('\t' + family)
        # filters out config files and files ending in {create|merge}_report.csv 
        reports = glob(join(family, "**/*csv"), recursive = True)
        reports = [x for x in reports if "config" not in x and "report" not in x and "data_versions" not in x]
        print('\t\t', reports)

        # reports are not date timestamped, so when sorted the first one should be identical to the family dir, see 568
        if len(reports) > 1:
            print("More than one report found.")
            report_paths.append(sorted(reports)[0])
        elif len(reports) == 1:
            report_paths.append(sorted(reports)[0]) 
        else:
            print("No report files found for %s" % family)

/hpf/largeprojects/ccmbio/naumenko/project_cheo/DCC_Samples_part1/2x/
	/hpf/largeprojects/ccmbio/naumenko/project_cheo/DCC_Samples_part1/2x/282_3/
		 ['/hpf/largeprojects/ccmbio/naumenko/project_cheo/DCC_Samples_part1/2x/282_3/282_3.csv']
	/hpf/largeprojects/ccmbio/naumenko/project_cheo/DCC_Samples_part1/2x/269_19/
		 ['/hpf/largeprojects/ccmbio/naumenko/project_cheo/DCC_Samples_part1/2x/269_19/269_19.csv']
	/hpf/largeprojects/ccmbio/naumenko/project_cheo/DCC_Samples_part1/2x/228/
		 ['/hpf/largeprojects/ccmbio/naumenko/project_cheo/DCC_Samples_part1/2x/228/228.csv']
	/hpf/largeprojects/ccmbio/naumenko/project_cheo/DCC_Samples_part1/2x/291/
		 ['/hpf/largeprojects/ccmbio/naumenko/project_cheo/DCC_Samples_part1/2x/291/291.csv']
	/hpf/largeprojects/ccmbio/naumenko/project_cheo/DCC_Samples_part1/2x/255_3/
		 ['/hpf/largeprojects/ccmbio/naumenko/project_cheo/DCC_Samples_part1/2x/255_3/255_3.csv']
	/hpf/largeprojects/ccmbio/naumenko/project_cheo/DCC_Samples_part1/2x/290/
		 ['/hpf/largeproj

In [38]:
len(report_paths)

477

In [9]:
old_exome_mapping = get_family_participants(report_paths)
    

UnicodeDecodeError on /hpf/largeprojects/ccmbio/naumenko/project_cheo/DCC_Samples_part1/2x/282_3/282_3.csv. Trying latin-1 decoding.
UnicodeDecodeError on /hpf/largeprojects/ccmbio/naumenko/project_cheo/DCC_Samples_part1/2x/269_19/269_19.csv. Trying latin-1 decoding.
UnicodeDecodeError on /hpf/largeprojects/ccmbio/naumenko/project_cheo/DCC_Samples_part1/2x/228/228.csv. Trying latin-1 decoding.
UnicodeDecodeError on /hpf/largeprojects/ccmbio/naumenko/project_cheo/DCC_Samples_part1/2x/291/291.csv. Trying latin-1 decoding.
UnicodeDecodeError on /hpf/largeprojects/ccmbio/naumenko/project_cheo/DCC_Samples_part1/2x/255_3/255_3.csv. Trying latin-1 decoding.
UnicodeDecodeError on /hpf/largeprojects/ccmbio/naumenko/project_cheo/DCC_Samples_part1/2x/290/290.csv. Trying latin-1 decoding.
UnicodeDecodeError on /hpf/largeprojects/ccmbio/naumenko/project_cheo/DCC_Samples_part1/2x/279/279.csv. Trying latin-1 decoding.
UnicodeDecodeError on /hpf/largeprojects/ccmbio/naumenko/project_cheo/DCC_Samples_p

In [10]:
old_exome_mapping

[{'family': '282_3',
  'samples': ['282_IN0069', '282_IN0070', '282_IN0071'],
  'report_name': '/hpf/largeprojects/ccmbio/naumenko/project_cheo/DCC_Samples_part1/2x/282_3/282_3.csv'},
 {'family': '269_19',
  'samples': ['269_S18'],
  'report_name': '/hpf/largeprojects/ccmbio/naumenko/project_cheo/DCC_Samples_part1/2x/269_19/269_19.csv'},
 {'family': '228',
  'samples': ['228_10.8529', '228_10.8530'],
  'report_name': '/hpf/largeprojects/ccmbio/naumenko/project_cheo/DCC_Samples_part1/2x/228/228.csv'},
 {'family': '291',
  'samples': ['291_VAN.01.001.01',
   '291_VAN.01.002.01A',
   '291_VAN.01.003.01B',
   '291_VAN.01.004.01A'],
  'report_name': '/hpf/largeprojects/ccmbio/naumenko/project_cheo/DCC_Samples_part1/2x/291/291.csv'},
 {'family': '255_3',
  'samples': ['255.LH.13', '255.MH.14'],
  'report_name': '/hpf/largeprojects/ccmbio/naumenko/project_cheo/DCC_Samples_part1/2x/255_3/255_3.csv'},
 {'family': '290',
  'samples': ['290_WI04', '290_WI05'],
  'report_name': '/hpf/largeprojects

In [6]:
with open(f"old-exome-family-participant-reports-{datetime.today().strftime('%Y-%m-%d')}.json", 'w') as f:
    json.dump(old_exome_mapping,f, indent = 4)

UnicodeDecodeError on /hpf/largeprojects/ccmbio/naumenko/project_cheo/DCC_Samples_part1/2x/282_3/282_3.csv. Trying latin-1 decoding.
UnicodeDecodeError on /hpf/largeprojects/ccmbio/naumenko/project_cheo/DCC_Samples_part1/2x/269_19/269_19.csv. Trying latin-1 decoding.
UnicodeDecodeError on /hpf/largeprojects/ccmbio/naumenko/project_cheo/DCC_Samples_part1/2x/228/228.csv. Trying latin-1 decoding.
UnicodeDecodeError on /hpf/largeprojects/ccmbio/naumenko/project_cheo/DCC_Samples_part1/2x/291/291.csv. Trying latin-1 decoding.
UnicodeDecodeError on /hpf/largeprojects/ccmbio/naumenko/project_cheo/DCC_Samples_part1/2x/255_3/255_3.csv. Trying latin-1 decoding.
UnicodeDecodeError on /hpf/largeprojects/ccmbio/naumenko/project_cheo/DCC_Samples_part1/2x/290/290.csv. Trying latin-1 decoding.
UnicodeDecodeError on /hpf/largeprojects/ccmbio/naumenko/project_cheo/DCC_Samples_part1/2x/279/279.csv. Trying latin-1 decoding.
UnicodeDecodeError on /hpf/largeprojects/ccmbio/naumenko/project_cheo/DCC_Samples_p

In [25]:
# import pandas as pd

# report = '/hpf/largeprojects/ccmbio/naumenko/project_cheo/DCC_Samples_part1/5x/527/527.csv'

# try:
#     df = pd.read_csv(report, sep=',')
# except UnicodeDecodeError:
#     print("UnicodeDecodeError on %s. Trying latin-1 decoding." % report)
#     df = pd.read_csv(report, encoding='latin-1', sep=',')
        

UnicodeDecodeError on /hpf/largeprojects/ccmbio/naumenko/project_cheo/DCC_Samples_part1/5x/527/527.csv. Trying latin-1 decoding.


In [26]:
df.head(5)

Unnamed: 0,Position,UCSC_Link,Ref,Alt,Zygosity.527_CH0052,Gene,Burden.527_CH0052,gts,Variation,Info_ensembl,...,Exac_missense_score,Exac_het,Exac_hom_alt,Conserved_in_29_mammals,Sift_score,Polyphen_score,Cadd_score,Imprinting_status,Imprinting_expressed_allele,Pseudoautosomal
0,chr1:1007424,"=HYPERLINK(""http://genome.ucsc.edu/cgi-bin/hgT...",G,A,Het,RNF223,1,G/A,missense_variant,RNF223:exonNA:ENST00000453464.2:c.523C>T:ENSP0...,...,,0,0,0,0.04,0.328,8.03,,,
1,chr1:109707313,"=HYPERLINK(""http://genome.ucsc.edu/cgi-bin/hgT...",C,T,Het,KIAA1324,1,C/T,missense_variant,KIAA1324:exon3/22:ENST00000369939.3:c.467C>T:E...,...,0.284556,8,0,1,0.35,0.003,16.05,,,
2,chr1:11150714,"=HYPERLINK(""http://genome.ucsc.edu/cgi-bin/hgT...",C,G,Het,EXOSC10,1,C/G,missense_variant,EXOSC10:exon6/24:ENST00000304457.7:c.655G>C:EN...,...,,0,0,1,0.12,0.042,12.23,,,
3,chr1:117644038,"=HYPERLINK(""http://genome.ucsc.edu/cgi-bin/hgT...",A,T,Het,TTF2,1,A/T,missense_variant,TTF2:exon23/23:ENST00000369466.4:c.3381A>T:ENS...,...,0.117307,60,1,1,0.11,0.899,17.23,,,
4,chr1:118629411,"=HYPERLINK(""http://genome.ucsc.edu/cgi-bin/hgT...",T,C,Het,SPAG17,2,T/C,splice_region_variant,"SPAG17:exonNA:ENST00000336338.5:c.1498-4A>G:,S...",...,-2.051721,86,1,1,,,0.04,,,


## Genome Reports

In [22]:
from glob import glob
from os.path import basename, join, dirname
from datetime import datetime
from pprint import pprint

In [23]:
GENOME_PATHS = ['/hpf/largeprojects/ccmbio/ccmmarvin_shared/genomes', '/hpf/largeprojects/ccm_dccforge/dccdipg/c4r_wgs/results']

### Old DCCForge Reports

In [27]:

old_genome_paths = []
for family_prefix_dir in glob(join(GENOME_PATHS[1], "*/")):
    print(family_prefix_dir)
    
    # no specific pattern in directory structure since it's changed over time, looks like the folder containg reports can be be either 'report' or 'reports'
    for folder in glob(join(family_prefix_dir, "*/")):

        # these are time stamped so when sorted we take the latest one 
        if 'report' in folder:
            print('\t' + folder)
            reports = glob(join(folder, "**/*wes*csv"), recursive = True)
            reports = sorted([x for x in reports if "clinical" not in x])
            
            if len(reports) > 1:
                print("More than one report found.")
                pprint(reports)
                old_genome_paths.append((reports)[-1])
            elif len(reports) == 1:
                old_genome_paths.append((reports)[-1]) 
            else:
                print("\t\t\tNo report files found for %s" % family_prefix_dir)


/hpf/largeprojects/ccm_dccforge/dccdipg/c4r_wgs/results/1544/
	/hpf/largeprojects/ccm_dccforge/dccdipg/c4r_wgs/results/1544/reports/
/hpf/largeprojects/ccm_dccforge/dccdipg/c4r_wgs/results/2203/
	/hpf/largeprojects/ccm_dccforge/dccdipg/c4r_wgs/results/2203/report/
/hpf/largeprojects/ccm_dccforge/dccdipg/c4r_wgs/results/441/
	/hpf/largeprojects/ccm_dccforge/dccdipg/c4r_wgs/results/441/reports/
More than one report found.
['/hpf/largeprojects/ccm_dccforge/dccdipg/c4r_wgs/results/441/reports/441.wes.regular.2020-07-27.csv',
 '/hpf/largeprojects/ccm_dccforge/dccdipg/c4r_wgs/results/441/reports/441.wes.regular.2020-10-16.csv']
/hpf/largeprojects/ccm_dccforge/dccdipg/c4r_wgs/results/808/
	/hpf/largeprojects/ccm_dccforge/dccdipg/c4r_wgs/results/808/reports/
More than one report found.
['/hpf/largeprojects/ccm_dccforge/dccdipg/c4r_wgs/results/808/reports/808.wes.regular.2020-08-24.csv',
 '/hpf/largeprojects/ccm_dccforge/dccdipg/c4r_wgs/results/808/reports/808.wes.regular.2020-10-16.csv']
/hpf/

In [25]:
len(old_genome_paths)

129

In [137]:
current_genome_paths[:20]

['/hpf/largeprojects/ccmbio/ccmmarvin_shared/genomes/1629/report/coding/1629/1629.wes.regular.2021-05-14.csv',
 '/hpf/largeprojects/ccmbio/ccmmarvin_shared/genomes/1933/report/coding/1933/1933.wes.regular.2021-04-10.csv',
 '/hpf/largeprojects/ccmbio/ccmmarvin_shared/genomes/1899/reports/1899.wes.regular.2021-01-30.csv',
 '/hpf/largeprojects/ccmbio/ccmmarvin_shared/genomes/1730rerun/report/coding/1730/1730.wes.regular.2021-12-02.csv',
 '/hpf/largeprojects/ccmbio/ccmmarvin_shared/genomes/166_crg2/report/coding/166/166.wes.regular.2022-01-15.csv',
 '/hpf/largeprojects/ccmbio/ccmmarvin_shared/genomes/1441/reports/1441.wes.regular.2021-01-29.csv',
 '/hpf/largeprojects/ccmbio/ccmmarvin_shared/genomes/1974/report/coding/1974/1974.wes.regular.2021-09-23.csv',
 '/hpf/largeprojects/ccmbio/ccmmarvin_shared/genomes/1744/report/coding/1744/1744.wes.regular.2021-06-27.csv',
 '/hpf/largeprojects/ccmbio/ccmmarvin_shared/genomes/1201/reports/1201.wes.regular.2021-02-10.csv',
 '/hpf/largeprojects/ccmbio

In [138]:
old_genome_paths[:20]

['/hpf/largeprojects/ccm_dccforge/dccdipg/c4r_wgs/results/1544/reports/1544.wes.regular.2020-11-04.csv',
 '/hpf/largeprojects/ccm_dccforge/dccdipg/c4r_wgs/results/2203/report/coding/2203/2203.wes.regular.2021-07-18.csv',
 '/hpf/largeprojects/ccm_dccforge/dccdipg/c4r_wgs/results/441/reports/441.wes.regular.2020-10-16.csv',
 '/hpf/largeprojects/ccm_dccforge/dccdipg/c4r_wgs/results/808/reports/808.wes.regular.2020-10-16.csv',
 '/hpf/largeprojects/ccm_dccforge/dccdipg/c4r_wgs/results/1691/reports/1691.wes.regular.2020-12-13.csv',
 '/hpf/largeprojects/ccm_dccforge/dccdipg/c4r_wgs/results/1590/report/coding/1590/1590.wes.regular.2021-11-23.csv',
 '/hpf/largeprojects/ccm_dccforge/dccdipg/c4r_wgs/results/1740/reports/1740.wes.regular.2020-11-09.csv',
 '/hpf/largeprojects/ccm_dccforge/dccdipg/c4r_wgs/results/1504/reports/1504.wes.regular.2021-02-25.csv',
 '/hpf/largeprojects/ccm_dccforge/dccdipg/c4r_wgs/results/207/reports/207.wes.regular.2021-03-03.csv',
 '/hpf/largeprojects/ccm_dccforge/dccdi

In [29]:
[x for x in old_genome_paths if 'coding' in x]

['/hpf/largeprojects/ccm_dccforge/dccdipg/c4r_wgs/results/2203/report/coding/2203/2203.wes.regular.2021-07-18.csv',
 '/hpf/largeprojects/ccm_dccforge/dccdipg/c4r_wgs/results/1590/report/coding/1590/1590.wes.regular.2021-11-23.csv',
 '/hpf/largeprojects/ccm_dccforge/dccdipg/c4r_wgs/results/846/report/coding/846/846.wes.regular.2022-04-07.csv',
 '/hpf/largeprojects/ccm_dccforge/dccdipg/c4r_wgs/results/1746/report/coding/1746/1746.wes.regular.2021-07-25.csv',
 '/hpf/largeprojects/ccm_dccforge/dccdipg/c4r_wgs/results/2262/report/coding/2262/2262.wes.regular.2021-11-20.csv',
 '/hpf/largeprojects/ccm_dccforge/dccdipg/c4r_wgs/results/2332/report/coding/2332/2332.wes.regular.2021-11-24.csv',
 '/hpf/largeprojects/ccm_dccforge/dccdipg/c4r_wgs/results/1887/report/coding/1887/1887.wes.regular.2021-11-13.csv',
 '/hpf/largeprojects/ccm_dccforge/dccdipg/c4r_wgs/results/1634/report/coding/1634/1634.wes.regular.2022-04-09.csv',
 '/hpf/largeprojects/ccm_dccforge/dccdipg/c4r_wgs/results/261/report/coding

In [11]:
old_genome_fam_ptps = get_family_participants(old_genome_paths)

UnicodeDecodeError on /hpf/largeprojects/ccm_dccforge/dccdipg/c4r_wgs/results/1590/report/coding/1590/1590.wes.regular.2021-11-23.csv. Trying latin-1 decoding.
UnicodeDecodeError on /hpf/largeprojects/ccm_dccforge/dccdipg/c4r_wgs/results/207/reports/207.wes.regular.2021-03-03.csv. Trying latin-1 decoding.
UnicodeDecodeError on /hpf/largeprojects/ccm_dccforge/dccdipg/c4r_wgs/results/667/reports/667.wes.regular.2020-07-31.csv. Trying latin-1 decoding.
UnicodeDecodeError on /hpf/largeprojects/ccm_dccforge/dccdipg/c4r_wgs/results/C1003/reports/C1003.wes.regular.2020-08-29.csv. Trying latin-1 decoding.
UnicodeDecodeError on /hpf/largeprojects/ccm_dccforge/dccdipg/c4r_wgs/results/1342/reports/1342.wes.regular.2021-02-25.csv. Trying latin-1 decoding.
UnicodeDecodeError on /hpf/largeprojects/ccm_dccforge/dccdipg/c4r_wgs/results/1856/reports/1856.wes.regular.2021-02-20.csv. Trying latin-1 decoding.
UnicodeDecodeError on /hpf/largeprojects/ccm_dccforge/dccdipg/c4r_wgs/results/64/reports/64.wes.r

In [12]:
with open(f"old-genome-family-participant-reports-{datetime.today().strftime('%Y-%m-%d')}.json", 'w') as f:
    json.dump(old_genome_fam_ptps,f, indent = 4)

### Current (on-going) DCCForge Reports

In [30]:
current_genome_paths = []
for family_prefix_dir in glob(join(GENOME_PATHS[0], "*/")):
    
    if basename(dirname(family_prefix_dir))[0].isdigit():
        print(family_prefix_dir)
        # no specific pattern in directory structure since it's changed over time, looks like the folder containing reports can be be either 'report' or 'reports'
        for folder in glob(join(family_prefix_dir, "*/")):
            
            if 'report' in folder:
                print('\t' + folder)
                reports = glob(join(folder, "**/*wes*csv"), recursive = True)
                reports = [x for x in reports if "clinical" not in x]
#                 print('\t\t', reports)

                if len(reports) > 1:
                    print("More than one report found.")
                    pprint(reports)
                    current_genome_paths.append(sorted(reports)[0])
                elif len(reports) == 1:
                    current_genome_paths.append(sorted(reports)[0]) 
                else:
                    print("\t\t\tNo report files found for %s" % family_prefix_dir)

/hpf/largeprojects/ccmbio/ccmmarvin_shared/genomes/1629/
	/hpf/largeprojects/ccmbio/ccmmarvin_shared/genomes/1629/report/
		 ['/hpf/largeprojects/ccmbio/ccmmarvin_shared/genomes/1629/report/coding/1629/1629.wes.regular.2021-05-14.csv']
/hpf/largeprojects/ccmbio/ccmmarvin_shared/genomes/1933/
	/hpf/largeprojects/ccmbio/ccmmarvin_shared/genomes/1933/report/
		 ['/hpf/largeprojects/ccmbio/ccmmarvin_shared/genomes/1933/report/coding/1933/1933.wes.regular.2021-04-10.csv']
/hpf/largeprojects/ccmbio/ccmmarvin_shared/genomes/1899/
	/hpf/largeprojects/ccmbio/ccmmarvin_shared/genomes/1899/reports/
		 ['/hpf/largeprojects/ccmbio/ccmmarvin_shared/genomes/1899/reports/1899.wes.regular.2021-01-30.csv']
/hpf/largeprojects/ccmbio/ccmmarvin_shared/genomes/1730rerun/
	/hpf/largeprojects/ccmbio/ccmmarvin_shared/genomes/1730rerun/report/
		 ['/hpf/largeprojects/ccmbio/ccmmarvin_shared/genomes/1730rerun/report/coding/1730/1730.wes.regular.2021-12-02.csv']
/hpf/largeprojects/ccmbio/ccmmarvin_shared/genomes/

In [14]:
len(current_genome_paths)

103

In [108]:
[x for x in current_genome_paths if 'synonymous' in x]

[]

In [15]:
current_genome_fam_ptps = get_family_participants(current_genome_paths)

UnicodeDecodeError on /hpf/largeprojects/ccmbio/ccmmarvin_shared/genomes/1201/reports/1201.wes.regular.2021-02-10.csv. Trying latin-1 decoding.
UnicodeDecodeError on /hpf/largeprojects/ccmbio/ccmmarvin_shared/genomes/1737/report/coding/1737/1737.wes.regular.2021-05-30.csv. Trying latin-1 decoding.
UnicodeDecodeError on /hpf/largeprojects/ccmbio/ccmmarvin_shared/genomes/1922/report/coding/1922/1922.wes.regular.2021-11-23.csv. Trying latin-1 decoding.


In [16]:
with open(f"genome-family-participant-reports-{datetime.today().strftime('%Y-%m-%d')}.json", 'w') as f:
    json.dump(current_genome_fam_ptps, f, indent = 4)

### Test 

In [122]:
input_dict = {
    'current_exome': '/hpf/largeprojects/ccm_dccforge/dccforge/results',
    'old_exome' : '/hpf/largeprojects/ccmbio/naumenko/project_cheo/DCC_Samples_part1',
    'current_genome': '/hpf/largeprojects/ccmbio/ccmmarvin_shared/genomes',
    'old_genome': '/hpf/largeprojects/ccm_dccforge/dccdipg/c4r_wgs/results'
    
}

In [84]:
family = '/hpf/largeprojects/ccm_dccforge/dccforge/results/23x/2370/'
# family = '/hpf/largeprojects/ccmbio/ccmmarvin_shared/genomes/1629/'
family = '/hpf/largeprojects/ccmbio/ccmmarvin_shared/genomes/1850/'
reports = glob(join(family, "**/*wes*"), recursive = True)

In [87]:
filter_reports(reports, 'current_genome')

['/hpf/largeprojects/ccmbio/ccmmarvin_shared/genomes/1850/bcbio-small-variants/1850/1850.wes.regular.2020-11-23.csv']

In [159]:
%%time
report_paths = []
from typing import List
def filter_reports(reports:List[str], report_type:str) -> List[str]:
        # filtering regardless of report type
        
        if report_type == 'old_exome':
            filtered_reports = sorted([x for x in reports 
                                           # for old exomes, filters out config files and files ending in {create|merge}_report.csv 
                                           if "config" not in x and "report" not in x and "data_versions" not in x])
        elif report_type == 'current_exome':
            filtered_reports = sorted([x for x in reports if "clinical" not in x and 'synonymous' not in x])
        elif 'genome' in report_type:
            filtered_reports = sorted([x for x in reports if "clinical" not in x])
        else:
            raise ValueError("report_type must contain either genome or exome")
        return filtered_reports
    
for report_type, root_path in input_dict.items():
    print(report_type, root_path)
    
    if 'exome' in report_type:
        continue
    if 'exome' in report_type:
        globbed_root_path = glob(join(root_path, "*x/"))
    elif 'genome' in report_type:
        globbed_root_path = glob(join(root_path, "*/"))
    else:
        raise ValueError("report_type must contain either genome or exome")

    # traversing the root path
    # this will be 2x, 3x, etc for exomes, or the family_id for genomes
    # additional logic to parse out family subdirs in exomes
    for subfolder in globbed_root_path:

        if 'exome' in report_type:
            
            for family in glob(join(subfolder, "*/")):
                if 'bams' in family:
                    continue
                    
                if 'old_exome' in report_type:
                    pattern = "**/*sv"
                elif 'current_exome' in report_type:
                    pattern = "**/*wes*sv"
                    
                reports = glob(join(family, pattern), recursive = True)
                reports = filter_reports(reports, report_type)

                if len(reports) > 1:
#                     print("\tMore than one report found for %s" % family)
#                     pprint(reports)
                    report_paths.append({"folder": family,"report": reports[-1], "report_type": report_type, "all_reports": reports})
                elif len(reports) == 1:
                    report_paths.append({"folder": family, "report": reports[-1], "report_type": report_type, "all_reports": None})
                else:
                    report_paths.append({"folder": family, "report": None, "report_type": report_type, "all_reports": None})
                    print("\tNo report files found for %s" % family)
                
        elif 'genome' in report_type:
            # unlike exomes, the family_ids sit directly in the root path so we need to filter out some noise
            if basename(dirname(subfolder))[0].isdigit():

                for folder in glob(join(subfolder, "*/")): # recursively searching for 'report' is very slow
                    if 'report' in folder:
                        reports = glob(join(folder, "**/*wes*sv"), recursive = True)

                        reports = filter_reports(reports, report_type)

                        if len(reports) > 1:
            #                 print("\tMore than one report found for %s" % subfolder)
            #                 pprint(reports)
                            report_paths.append({"folder": subfolder, "report": reports[-1], "report_type": report_type, "all_reports": reports})
                        elif len(reports) == 1:
                            report_paths.append({"folder": subfolder, "report": reports[-1], "report_type": report_type, "all_reports": None})
                        else:
                            report_paths.append({"folder": subfolder, "report": None, "report_type": report_type, "all_reports": None})
                            print("\tNo report files found for %s" % folder)

current_exome /hpf/largeprojects/ccm_dccforge/dccforge/results
old_exome /hpf/largeprojects/ccmbio/naumenko/project_cheo/DCC_Samples_part1
current_genome /hpf/largeprojects/ccmbio/ccmmarvin_shared/genomes
	No report files found for /hpf/largeprojects/ccmbio/ccmmarvin_shared/genomes/1850/reports/
	No report files found for /hpf/largeprojects/ccmbio/ccmmarvin_shared/genomes/1436/reports/
	No report files found for /hpf/largeprojects/ccmbio/ccmmarvin_shared/genomes/159/reports/
	No report files found for /hpf/largeprojects/ccmbio/ccmmarvin_shared/genomes/2026/report/
	No report files found for /hpf/largeprojects/ccmbio/ccmmarvin_shared/genomes/338/reports/
	No report files found for /hpf/largeprojects/ccmbio/ccmmarvin_shared/genomes/319/reports/
	No report files found for /hpf/largeprojects/ccmbio/ccmmarvin_shared/genomes/418/report/
	No report files found for /hpf/largeprojects/ccmbio/ccmmarvin_shared/genomes/407/reports/
	No report files found for /hpf/largeprojects/ccmbio/ccmmarvin_sha

In [160]:
import pandas as pd
df = pd.json_normalize(report_paths)
# how many folders did not have a report
df['report'].isnull().sum()

10

In [161]:
# how many folders did not have a report
print(df['report'].isnull().sum())
df[df['report'].isnull()]['folder'].tolist()

10


['/hpf/largeprojects/ccmbio/ccmmarvin_shared/genomes/1850/',
 '/hpf/largeprojects/ccmbio/ccmmarvin_shared/genomes/1436/',
 '/hpf/largeprojects/ccmbio/ccmmarvin_shared/genomes/159/',
 '/hpf/largeprojects/ccmbio/ccmmarvin_shared/genomes/2026/',
 '/hpf/largeprojects/ccmbio/ccmmarvin_shared/genomes/338/',
 '/hpf/largeprojects/ccmbio/ccmmarvin_shared/genomes/319/',
 '/hpf/largeprojects/ccmbio/ccmmarvin_shared/genomes/418/',
 '/hpf/largeprojects/ccmbio/ccmmarvin_shared/genomes/407/',
 '/hpf/largeprojects/ccmbio/ccmmarvin_shared/genomes/1639/',
 '/hpf/largeprojects/ccmbio/ccmmarvin_shared/genomes/2184/']

In [162]:
df[(~df['report'].isnull())]['report_type'].value_counts()

old_genome        128
current_genome    103
Name: report_type, dtype: int64

In [None]:
df[(df['report_type'] == 'current_genome') & (~df['report'].isnull())]['report'].tolist()

## Debugging 

In [None]:
# current exome

    globbed_dir =  glob(join(RESULTS_PATH, "*x/"))
    

    for family_prefix_dir in globbed_dir:
        if family_prefix_dir in IGNORE_FOLDERS:
            print("\tIgnoring folder it's in an ignored folder..")
            continue

        # sorted - ascending order
        for family in glob(join(family_prefix_dir, "*/")):

            # non-clinical and synonymous reports
            reports = sorted([report for report in glob(join(family, "**/*wes*"), recursive = True) if "clinical" not in report and "synonymous" not in report])

            if len(reports) >= 1:
                report_paths.append(reports[-1]) #latest report since prefix is: yyyy-mm-dd
            else:
                print("No report files found for %s" % family)



In [None]:
# old exome
report_paths = []
for family_prefix_dir in glob(join(RESULTS_PATH, "*x/")):
    print(family_prefix_dir)
    
    for family in glob(join(family_prefix_dir, "*/")): 
        print('\t' + family)
        reports = glob(join(family, "**/*csv"), recursive = True)
         # filters out config files and files ending in {create|merge}_report.csv 
        reports = [x for x in reports if "config" not in x and "report" not in x and "data_versions" not in x]
        print('\t\t', reports)

        # reports are not date timestamped, so when sorted the first one should be identical to the family dir, see 568
        if len(reports) > 1:
            print("More than one report found.")
            report_paths.append(sorted(reports)[0])
        elif len(reports) == 1:
            report_paths.append(sorted(reports)[0]) 
        else:
            print("No report files found for %s" % family)

In [36]:
# old genome
# old_genome_paths = []
for family_prefix_dir in glob(join(GENOME_PATHS[1], "*/")):
    if basename(dirname(family_prefix_dir))[0].isdigit():"print(family_prefix_dir)
    
    # no specific pattern in directory structure since it's changed over time, looks like the folder containg reports can be be either 'report' or 'reports'
#     for folder in glob(join(family_prefix_dir, "*/")):

#         # these are time stamped so when sorted we take the latest one 
#         if 'report' in folder:
#             print('\t' + folder)
#             reports = glob(join(folder, "**/*wes*csv"), recursive = True)
#             reports = sorted([x for x in reports if "clinical" not in x])
            
#             if len(reports) > 1:
#                 print("More than one report found.")
#                 pprint(reports)
#                 old_genome_paths.append((reports)[-1])
#             elif len(reports) == 1:
#                 old_genome_paths.append((reports)[-1]) 
#             else:
#                 print("\t\t\tNo report files found for %s" % family_prefix_dir)



/hpf/largeprojects/ccm_dccforge/dccdipg/c4r_wgs/results/1544/
/hpf/largeprojects/ccm_dccforge/dccdipg/c4r_wgs/results/2203/
/hpf/largeprojects/ccm_dccforge/dccdipg/c4r_wgs/results/441/
/hpf/largeprojects/ccm_dccforge/dccdipg/c4r_wgs/results/808/
/hpf/largeprojects/ccm_dccforge/dccdipg/c4r_wgs/results/1691/
/hpf/largeprojects/ccm_dccforge/dccdipg/c4r_wgs/results/1590/
/hpf/largeprojects/ccm_dccforge/dccdipg/c4r_wgs/results/1740/
/hpf/largeprojects/ccm_dccforge/dccdipg/c4r_wgs/results/1504/
/hpf/largeprojects/ccm_dccforge/dccdipg/c4r_wgs/results/207/
/hpf/largeprojects/ccm_dccforge/dccdipg/c4r_wgs/results/1316/
/hpf/largeprojects/ccm_dccforge/dccdipg/c4r_wgs/results/1630/
/hpf/largeprojects/ccm_dccforge/dccdipg/c4r_wgs/results/1631/
/hpf/largeprojects/ccm_dccforge/dccdipg/c4r_wgs/results/482/
/hpf/largeprojects/ccm_dccforge/dccdipg/c4r_wgs/results/667/
/hpf/largeprojects/ccm_dccforge/dccdipg/c4r_wgs/results/1633/
/hpf/largeprojects/ccm_dccforge/dccdipg/c4r_wgs/results/846/
/hpf/largeproj

In [76]:
# current genome

current_genome_paths = []
for family_prefix_dir in glob(join(GENOME_PATHS[0], "*/")):
    
    if basename(dirname(family_prefix_dir))[0].isdigit():
        print(family_prefix_dir)
#         no specific pattern in directory structure since it's changed over time, looks like the folder containing reports can be be either 'report' or 'reports'
        for folder in glob(join(family_prefix_dir, "*/")):
            
            if 'report' in folder:
                print('\t' + folder)
                reports = glob(join(folder, "**/*wes*"), recursive = True)
                reports = [x for x in reports if "clinical" not in x]
#                 print('\t\t', reports)

                if len(reports) > 1:
                    print("More than one report found.")
                    pprint(reports)
                    current_genome_paths.append(sorted(reports)[0])
                elif len(reports) == 1:
                    current_genome_paths.append(sorted(reports)[0]) 
                else:
                    print("\t\t\tNo report files found for %s" % family_prefix_dir)

/hpf/largeprojects/ccmbio/ccmmarvin_shared/genomes/1629/
	/hpf/largeprojects/ccmbio/ccmmarvin_shared/genomes/1629/report/
More than one report found.
['/hpf/largeprojects/ccmbio/ccmmarvin_shared/genomes/1629/report/hpo_annotated/1629.wes.regular.2021-05-14.w_hpoterms.tsv',
 '/hpf/largeprojects/ccmbio/ccmmarvin_shared/genomes/1629/report/coding/1629/1629.wes.regular.2021-05-14.csv']
/hpf/largeprojects/ccmbio/ccmmarvin_shared/genomes/1933/
	/hpf/largeprojects/ccmbio/ccmmarvin_shared/genomes/1933/report/
/hpf/largeprojects/ccmbio/ccmmarvin_shared/genomes/1899/
	/hpf/largeprojects/ccmbio/ccmmarvin_shared/genomes/1899/reports/
/hpf/largeprojects/ccmbio/ccmmarvin_shared/genomes/1730rerun/
	/hpf/largeprojects/ccmbio/ccmmarvin_shared/genomes/1730rerun/report/
More than one report found.
['/hpf/largeprojects/ccmbio/ccmmarvin_shared/genomes/1730rerun/report/hpo_annotated/1730.wes.regular.2021-12-02.w_hpoterms.tsv',
 '/hpf/largeprojects/ccmbio/ccmmarvin_shared/genomes/1730rerun/report/coding/1730

In [77]:
len(current_genome_paths)

103

In [None]:
pprint(current_genome_paths)

## Copying script

In [139]:
df = pd.read_csv('all-report-paths-2022-04-20.csv')

In [169]:
df[~df['report'].isnull()]['report_type'].value_counts()

current_exome     1840
old_exome          477
old_genome         128
current_genome     103
Name: report_type, dtype: int64

In [145]:
import os
os.getcwd()

'/home/delvinso/variant_report'

In [147]:
[ os.path.basename(x) for x in df['report'].tolist()[:5]]

['269xS27.wes.2019-07-24.csv',
 '269xS19.wes.2019-07-24.csv',
 '228.wes.regular.2020-05-15.csv',
 '235.wes.2019-07-29.csv',
 '290.wes.2019-07-24.csv']

In [163]:
import os 
import pandas as pd
import shutil

df = pd.read_csv('all-report-paths-2022-04-20.csv')
to_move_dir = f"/home/delvinso/variant_report/all_reports-{datetime.today().strftime('%Y-%m-%d')}"

ss_df = df[(~df['report'].isnull())]

for report_type, subdf in ss_df.groupby('report_type'):
    
    report_type_folder = os.path.join(to_move_dir, report_type)

    if not os.path.exists(report_type_folder):
        os.makedirs(report_type_folder)
        
    for report in subdf['report'].tolist():
        # preserves the metadata
        shutil.copy2(report, os.path.join(report_type_folder, os.path.basename(report)))
    


In [156]:
report

nan

In [56]:
report_paths

[]