# Exploring Other Sources of Reports 



In [3]:
import json
import sys
import pandas as pd
def get_family_participants(report_paths:list) -> list:
    mapping_list = []
    for report in report_paths:

        if report.endswith(".csv"):
            sep = ","
        elif report.endswith(".tsv"):
            sep = "\t"
        else:
            print("Unknown fileformat and delimiter")
            sys.exit(1)

        try:
            df = pd.read_csv(report, sep=sep)
        except UnicodeDecodeError:
            print("UnicodeDecodeError on %s. Trying latin-1 decoding." % report)
            df = pd.read_csv(report, encoding="latin-1", sep=sep)
        except Exception as e:
            print(
                "Report '{}' could not be read in, please double check this is a valid csv!".format(
                    report
                )
            )
            sys.exit(1)

        # these columns are 'wide' wrt the variants
        d = {"Zygosity": [], "Burden": [], "Alt_depths": []}

        # get all columns with Zygosity, Burden, or Alt_depths - these are unique for each participant
        for key in d:
            d[key].extend([col for col in df.columns if col.startswith(key)])

        # get sample names - preserved order for genotype and trio coverage
        samples = [col.replace("Zygosity.", "").strip() for col in d["Zygosity"]]

        family = basename(report).split(".")[0]

        mapping_list.append({'family': family, 'samples': samples, 'report_name': report})
        
    return mapping_list
    

### Old DCCForge
This should be a one-time thing..

In [13]:
from glob import glob
from os.path import basename, join, dirname
from datetime import datetime
from pprint import pprint

In [14]:
RESULTS_PATH = "/hpf/largeprojects/ccmbio/naumenko/project_cheo/DCC_Samples_part1/"

In [17]:
report_paths = []
for family_prefix_dir in glob(join(RESULTS_PATH, "*x/")):
    print(family_prefix_dir)
    
    # sorted - ascending order
    for family in glob(join(family_prefix_dir, "*/")):
        print('\t' + family)
        # filters out config files and files ending in {create|merge}_report.csv 
        reports = glob(join(family, "**/*csv"), recursive = True)
        reports = [x for x in reports if "config" not in x and "report" not in x and "data_versions" not in x]
#         print('\t\t', reports)

        # reports are not date timestamped, so when sorted the first one should be identical to the family dir, see 568
        if len(reports) > 1:
            print("More than one report found.")
            report_paths.append(sorted(reports)[0])
        elif len(reports) == 1:
            report_paths.append(sorted(reports)[0]) 
        else:
            print("No report files found for %s" % family)

/hpf/largeprojects/ccmbio/naumenko/project_cheo/DCC_Samples_part1/2x/
	/hpf/largeprojects/ccmbio/naumenko/project_cheo/DCC_Samples_part1/2x/282_3/
	/hpf/largeprojects/ccmbio/naumenko/project_cheo/DCC_Samples_part1/2x/269_19/
	/hpf/largeprojects/ccmbio/naumenko/project_cheo/DCC_Samples_part1/2x/228/
	/hpf/largeprojects/ccmbio/naumenko/project_cheo/DCC_Samples_part1/2x/291/
	/hpf/largeprojects/ccmbio/naumenko/project_cheo/DCC_Samples_part1/2x/255_3/
	/hpf/largeprojects/ccmbio/naumenko/project_cheo/DCC_Samples_part1/2x/290/
	/hpf/largeprojects/ccmbio/naumenko/project_cheo/DCC_Samples_part1/2x/279/
	/hpf/largeprojects/ccmbio/naumenko/project_cheo/DCC_Samples_part1/2x/299/
	/hpf/largeprojects/ccmbio/naumenko/project_cheo/DCC_Samples_part1/2x/269_18/
	/hpf/largeprojects/ccmbio/naumenko/project_cheo/DCC_Samples_part1/2x/277/
	/hpf/largeprojects/ccmbio/naumenko/project_cheo/DCC_Samples_part1/2x/284_3/
	/hpf/largeprojects/ccmbio/naumenko/project_cheo/DCC_Samples_part1/2x/289/
	/hpf/largeproject

In [19]:
len(report_paths)

477

In [20]:

old_exome_mapping = get_family_participants(report_paths)
    
with open(f"old-exome-family-participant-reports-{datetime.today().strftime('%Y-%m-%d')}.json", 'w') as f:
    json.dump(old_exome_mapping,f, indent = 4)

UnicodeDecodeError on /hpf/largeprojects/ccmbio/naumenko/project_cheo/DCC_Samples_part1/2x/282_3/282_3.csv. Trying latin-1 decoding.
UnicodeDecodeError on /hpf/largeprojects/ccmbio/naumenko/project_cheo/DCC_Samples_part1/2x/269_19/269_19.csv. Trying latin-1 decoding.
UnicodeDecodeError on /hpf/largeprojects/ccmbio/naumenko/project_cheo/DCC_Samples_part1/2x/228/228.csv. Trying latin-1 decoding.
UnicodeDecodeError on /hpf/largeprojects/ccmbio/naumenko/project_cheo/DCC_Samples_part1/2x/291/291.csv. Trying latin-1 decoding.
UnicodeDecodeError on /hpf/largeprojects/ccmbio/naumenko/project_cheo/DCC_Samples_part1/2x/255_3/255_3.csv. Trying latin-1 decoding.
UnicodeDecodeError on /hpf/largeprojects/ccmbio/naumenko/project_cheo/DCC_Samples_part1/2x/290/290.csv. Trying latin-1 decoding.
UnicodeDecodeError on /hpf/largeprojects/ccmbio/naumenko/project_cheo/DCC_Samples_part1/2x/279/279.csv. Trying latin-1 decoding.
UnicodeDecodeError on /hpf/largeprojects/ccmbio/naumenko/project_cheo/DCC_Samples_p

In [25]:
# import pandas as pd

# report = '/hpf/largeprojects/ccmbio/naumenko/project_cheo/DCC_Samples_part1/5x/527/527.csv'

# try:
#     df = pd.read_csv(report, sep=',')
# except UnicodeDecodeError:
#     print("UnicodeDecodeError on %s. Trying latin-1 decoding." % report)
#     df = pd.read_csv(report, encoding='latin-1', sep=',')
        

UnicodeDecodeError on /hpf/largeprojects/ccmbio/naumenko/project_cheo/DCC_Samples_part1/5x/527/527.csv. Trying latin-1 decoding.


In [26]:
df.head(5)

Unnamed: 0,Position,UCSC_Link,Ref,Alt,Zygosity.527_CH0052,Gene,Burden.527_CH0052,gts,Variation,Info_ensembl,...,Exac_missense_score,Exac_het,Exac_hom_alt,Conserved_in_29_mammals,Sift_score,Polyphen_score,Cadd_score,Imprinting_status,Imprinting_expressed_allele,Pseudoautosomal
0,chr1:1007424,"=HYPERLINK(""http://genome.ucsc.edu/cgi-bin/hgT...",G,A,Het,RNF223,1,G/A,missense_variant,RNF223:exonNA:ENST00000453464.2:c.523C>T:ENSP0...,...,,0,0,0,0.04,0.328,8.03,,,
1,chr1:109707313,"=HYPERLINK(""http://genome.ucsc.edu/cgi-bin/hgT...",C,T,Het,KIAA1324,1,C/T,missense_variant,KIAA1324:exon3/22:ENST00000369939.3:c.467C>T:E...,...,0.284556,8,0,1,0.35,0.003,16.05,,,
2,chr1:11150714,"=HYPERLINK(""http://genome.ucsc.edu/cgi-bin/hgT...",C,G,Het,EXOSC10,1,C/G,missense_variant,EXOSC10:exon6/24:ENST00000304457.7:c.655G>C:EN...,...,,0,0,1,0.12,0.042,12.23,,,
3,chr1:117644038,"=HYPERLINK(""http://genome.ucsc.edu/cgi-bin/hgT...",A,T,Het,TTF2,1,A/T,missense_variant,TTF2:exon23/23:ENST00000369466.4:c.3381A>T:ENS...,...,0.117307,60,1,1,0.11,0.899,17.23,,,
4,chr1:118629411,"=HYPERLINK(""http://genome.ucsc.edu/cgi-bin/hgT...",T,C,Het,SPAG17,2,T/C,splice_region_variant,"SPAG17:exonNA:ENST00000336338.5:c.1498-4A>G:,S...",...,-2.051721,86,1,1,,,0.04,,,


## Genome Reports

In [1]:
from glob import glob
from os.path import basename, join, dirname
from datetime import datetime
from pprint import pprint

In [2]:
GENOME_PATHS = ['/hpf/largeprojects/ccmbio/ccmmarvin_shared/genomes', '/hpf/largeprojects/ccm_dccforge/dccdipg/c4r_wgs/results']

### Old DCCForge Reports

In [5]:

old_genome_paths = []
for family_prefix_dir in glob(join(GENOME_PATHS[1], "*/")):
    print(family_prefix_dir)
    
    # no specific pattern in directory structure since it's changed over time, looks like the folder containg reports can be be either 'report' or 'reports'
    for folder in glob(join(family_prefix_dir, "*/")):

        # these are time stamped so when sorted we take the latest one 
        if 'report' in folder:
            print('\t' + folder)
            reports = glob(join(folder, "**/*wes*csv"), recursive = True)
            reports = sorted([x for x in reports if "clinical" not in x])
#             print('\t\t', reports)
            
            if len(reports) > 1:
                print("More than one report found.")
                old_genome_paths.append((reports)[-1])
            elif len(reports) == 1:
                old_genome_paths.append((reports)[-1]) 
            else:
                print("\t\t\tNo report files found for %s" % family_prefix_dir)


/hpf/largeprojects/ccm_dccforge/dccdipg/c4r_wgs/results/1544/
	/hpf/largeprojects/ccm_dccforge/dccdipg/c4r_wgs/results/1544/reports/
/hpf/largeprojects/ccm_dccforge/dccdipg/c4r_wgs/results/2203/
	/hpf/largeprojects/ccm_dccforge/dccdipg/c4r_wgs/results/2203/report/
/hpf/largeprojects/ccm_dccforge/dccdipg/c4r_wgs/results/441/
	/hpf/largeprojects/ccm_dccforge/dccdipg/c4r_wgs/results/441/reports/
More than one report found.
/hpf/largeprojects/ccm_dccforge/dccdipg/c4r_wgs/results/808/
	/hpf/largeprojects/ccm_dccforge/dccdipg/c4r_wgs/results/808/reports/
More than one report found.
/hpf/largeprojects/ccm_dccforge/dccdipg/c4r_wgs/results/1691/
	/hpf/largeprojects/ccm_dccforge/dccdipg/c4r_wgs/results/1691/reports/
/hpf/largeprojects/ccm_dccforge/dccdipg/c4r_wgs/results/1590/
	/hpf/largeprojects/ccm_dccforge/dccdipg/c4r_wgs/results/1590/report/
/hpf/largeprojects/ccm_dccforge/dccdipg/c4r_wgs/results/1740/
	/hpf/largeprojects/ccm_dccforge/dccdipg/c4r_wgs/results/1740/reports/
/hpf/largeprojects/

In [6]:
len(old_genome_paths)

127

In [7]:
old_genome_fam_ptps = get_family_participants(old_genome_paths)

UnicodeDecodeError on /hpf/largeprojects/ccm_dccforge/dccdipg/c4r_wgs/results/1590/report/coding/1590/1590.wes.regular.2021-11-23.csv. Trying latin-1 decoding.
UnicodeDecodeError on /hpf/largeprojects/ccm_dccforge/dccdipg/c4r_wgs/results/207/reports/207.wes.regular.2021-03-03.csv. Trying latin-1 decoding.
UnicodeDecodeError on /hpf/largeprojects/ccm_dccforge/dccdipg/c4r_wgs/results/667/reports/667.wes.regular.2020-07-31.csv. Trying latin-1 decoding.
UnicodeDecodeError on /hpf/largeprojects/ccm_dccforge/dccdipg/c4r_wgs/results/C1003/reports/C1003.wes.regular.2020-08-29.csv. Trying latin-1 decoding.
UnicodeDecodeError on /hpf/largeprojects/ccm_dccforge/dccdipg/c4r_wgs/results/1342/reports/1342.wes.regular.2021-02-25.csv. Trying latin-1 decoding.
UnicodeDecodeError on /hpf/largeprojects/ccm_dccforge/dccdipg/c4r_wgs/results/1856/reports/1856.wes.regular.2021-02-20.csv. Trying latin-1 decoding.
UnicodeDecodeError on /hpf/largeprojects/ccm_dccforge/dccdipg/c4r_wgs/results/64/reports/64.wes.r

In [8]:
with open(f"old-genome-family-participant-reports-{datetime.today().strftime('%Y-%m-%d')}.json", 'w') as f:
    json.dump(old_genome_fam_ptps,f, indent = 4)

### Current (on-going) DCCForge Reports

In [9]:
current_genome_paths = []
for family_prefix_dir in glob(join(GENOME_PATHS[0], "*/")):
    
    if basename(dirname(family_prefix_dir))[0].isdigit():
        print(family_prefix_dir)
        # no specific pattern in directory structure since it's changed over time, looks like the folder containg reports can be be either 'report' or 'reports'
        for folder in glob(join(family_prefix_dir, "*/")):
            
            if 'report' in folder:
                print('\t' + folder)
                reports = glob(join(folder, "**/*wes*csv"), recursive = True)
                reports = [x for x in reports if "clinical" not in x]
                print('\t\t', reports)

                if len(reports) > 1:
                    print("More than one report found.")
                    current_genome_paths.append(sorted(reports)[0])
                elif len(reports) == 1:
                    current_genome_paths.append(sorted(reports)[0]) 
                else:
                    print("\t\t\tNo report files found for %s" % family_prefix_dir)

/hpf/largeprojects/ccmbio/ccmmarvin_shared/genomes/1629/
	/hpf/largeprojects/ccmbio/ccmmarvin_shared/genomes/1629/report/
		 ['/hpf/largeprojects/ccmbio/ccmmarvin_shared/genomes/1629/report/coding/1629/1629.wes.regular.2021-05-14.csv']
/hpf/largeprojects/ccmbio/ccmmarvin_shared/genomes/1933/
	/hpf/largeprojects/ccmbio/ccmmarvin_shared/genomes/1933/report/
		 ['/hpf/largeprojects/ccmbio/ccmmarvin_shared/genomes/1933/report/coding/1933/1933.wes.regular.2021-04-10.csv']
/hpf/largeprojects/ccmbio/ccmmarvin_shared/genomes/1899/
	/hpf/largeprojects/ccmbio/ccmmarvin_shared/genomes/1899/reports/
		 ['/hpf/largeprojects/ccmbio/ccmmarvin_shared/genomes/1899/reports/1899.wes.regular.2021-01-30.csv']
/hpf/largeprojects/ccmbio/ccmmarvin_shared/genomes/1730rerun/
	/hpf/largeprojects/ccmbio/ccmmarvin_shared/genomes/1730rerun/report/
		 ['/hpf/largeprojects/ccmbio/ccmmarvin_shared/genomes/1730rerun/report/coding/1730/1730.wes.regular.2021-12-02.csv']
/hpf/largeprojects/ccmbio/ccmmarvin_shared/genomes/

In [10]:
len(current_genome_paths)

101

In [11]:
current_genome_fam_ptps = get_family_participants(current_genome_paths)

UnicodeDecodeError on /hpf/largeprojects/ccmbio/ccmmarvin_shared/genomes/1201/reports/1201.wes.regular.2021-02-10.csv. Trying latin-1 decoding.
UnicodeDecodeError on /hpf/largeprojects/ccmbio/ccmmarvin_shared/genomes/1737/report/coding/1737/1737.wes.regular.2021-05-30.csv. Trying latin-1 decoding.
UnicodeDecodeError on /hpf/largeprojects/ccmbio/ccmmarvin_shared/genomes/1922/report/coding/1922/1922.wes.regular.2021-11-23.csv. Trying latin-1 decoding.


In [12]:
with open(f"genome-family-participant-reports-{datetime.today().strftime('%Y-%m-%d')}.json", 'w') as f:
    json.dump(current_genome_fam_ptps, f, indent = 4)