In [1]:
import numpy as np
import pandas as pd
import os

In [2]:
# Input and output locations used by the cells below
## Directory holding the metadata tables (trailing "/" kept: file names are appended directly)
metadata_dir = "/u/project/ngarud/michaelw/hsiao_data/tables/"
## Sample report that is read in
sample_report_path = metadata_dir + "sample_report.csv"
## Annotated sample report that is written out
read_mapping_out_path = metadata_dir + "sample_report_Mv3_ReadMapping.csv"
## MIDASv3 output directory (trailing "/" kept: the accession name is appended directly)
MIDASv3_output_dir = "/u/project/ngarud/Garud_lab/hsiao_data/MIDASv3_output/"




In [3]:
# Read the sample report table; its first column becomes the row index
sample_report = pd.read_csv(filepath_or_buffer=sample_report_path, index_col=0)

In [4]:
# One entry per sample-report row: the part of the Accession value before the first "_"
accession_list = list(map(lambda full_name: full_name.split("_")[0], sample_report["Accession"]))

In [5]:
# Build mapped_reads_dict: accession -> {"genes": total, "snps": total}, where each
# total is the sum of the mapped_reads column of that sample's MIDASv3 summary table.
mapped_reads_dict = {}

# dict.fromkeys drops repeated accessions (keeping first-seen order), so the summary
# files of a sample that appears on several report rows are only read once.
for accession in dict.fromkeys(accession_list):
    # summary paths: <MIDASv3_output_dir><accession>/{genes,snps}/<name>_summary.tsv
    genes_summary_path = "%s%s/genes/genes_summary.tsv" % (MIDASv3_output_dir, accession)
    snps_summary_path = "%s%s/snps/snps_summary.tsv" % (MIDASv3_output_dir, accession)

    # load summary files (tab-separated)
    genes_summary = pd.read_csv(genes_summary_path, sep="\t")
    snps_summary = pd.read_csv(snps_summary_path, sep="\t")

    # Total mapped reads: sum only the mapped_reads column. Summing the whole frame
    # first would also add up unrelated columns (e.g. species_id) and coerce the
    # result to a common dtype; summing the column keeps its own dtype.
    genes_mapped_reads = genes_summary["mapped_reads"].sum()
    snps_mapped_reads = snps_summary["mapped_reads"].sum()

    # add to dictionary
    mapped_reads_dict[accession] = {"genes": genes_mapped_reads, "snps": snps_mapped_reads}


    



In [6]:
# Add the MIDASv3 mapped-read totals to the sample report.
# Lookup key for mapped_reads_dict: the part of each Accession before the first "_".
accession_keys = sample_report["Accession"].str.split("_").str[0]
sample_report["MIDASv3_Mapped_Reads_genes"] = accession_keys.map(lambda key: mapped_reads_dict[key]["genes"])
sample_report["MIDASv3_Mapped_Reads_snps"] = accession_keys.map(lambda key: mapped_reads_dict[key]["snps"])

# Mapped reads divided by the Total_coverage column.
# NOTE(review): the columns are named "PercentMapped" but hold the plain ratio
# (no multiplication by 100) — confirm the naming and that Total_coverage is the
# intended denominator.
total_coverage = sample_report["Total_coverage"]
sample_report["MIDASv3_PercentMapped_genes"] = sample_report["MIDASv3_Mapped_Reads_genes"] / total_coverage
sample_report["MIDASv3_PercentMapped_snps"] = sample_report["MIDASv3_Mapped_Reads_snps"] / total_coverage




In [8]:
# Write the annotated sample report to read_mapping_out_path.
# NOTE(review): sample_report was loaded with index_col=0, and index=False leaves the
# index out of the file, so that first input column is not written — confirm this is intended.
sample_report.to_csv(path_or_buf=read_mapping_out_path, index=False)

In [11]:
## TEST: load the two summary tables of one hard-coded sample for inspection
accession = "35-2mo-3-hTau-B2-Female"

# where this sample's genes and SNPs summaries live
genes_summary_path = "%s%s/genes/genes_summary.tsv" % (MIDASv3_output_dir, accession)
snps_summary_path = "%s%s/snps/snps_summary.tsv" % (MIDASv3_output_dir, accession)

# both files are tab-separated, which is read_table's default delimiter
genes_summary = pd.read_table(genes_summary_path)
snps_summary = pd.read_table(snps_summary_path)

In [14]:
# Show the genes summary table currently held in genes_summary
genes_summary

Unnamed: 0,species_id,pangenome_size,covered_genes,fraction_covered,aligned_reads,mapped_reads,mean_depth,marker_depth
0,142728,3961,3606,0.91,207824,177516,6.886,6.102
1,141329,1687,706,0.418,83546,73888,16.533,4.883
2,141274,811,691,0.852,120723,107997,20.426,18.437
3,138779,5176,4550,0.879,693015,604124,18.777,15.869
4,137981,3491,3080,0.882,164046,135182,6.411,3.285
5,137434,2035,1897,0.932,185924,154037,10.265,9.306
6,132831,3971,3439,0.866,619029,557685,21.129,19.156
7,130685,2863,2802,0.979,153947,110762,5.875,4.514
8,129214,4704,3890,0.827,182932,153125,5.895,5.107
9,128931,2226,2052,0.922,465422,424369,30.023,31.121
