-
Notifications
You must be signed in to change notification settings - Fork 39
/
sample.py
123 lines (103 loc) · 4.32 KB
/
sample.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
#!/usr/bin/env python
""" Sample object """
from collections import OrderedDict
import pandas as pd
import numpy as np
from ipyrad.assemble.utils import ObjDict
class Sample(object):
    """
    ipyrad Sample object.

    Holds the name, barcode, file links and statistics tables that belong
    to one individual sample. Samples are combined into Assembly objects.
    """

    def __init__(self, name=""):
        """Create a Sample called *name* with empty file lists and stats."""

        def blank_series(labels):
            # object-dtype Series indexed by `labels`, no values supplied
            return pd.Series(index=labels, dtype=object)

        self.name = name
        self.barcode = ""

        # link to files: every category starts with its own empty list
        file_kinds = [
            "fastqs",
            "edits",
            "mapped_reads",
            "unmapped_reads",
            "clusters",
            "consens",
            "database",
        ]
        self.files = ObjDict({kind: [] for kind in file_kinds})

        # summary stats across the whole assembly
        self.stats = blank_series([
            "state",
            "reads_raw",
            "reads_passed_filter",
            "reads_merged",
            "refseq_mapped_reads",
            "refseq_unmapped_reads",
            "clusters_total",
            "clusters_hidepth",
            "hetero_est",
            "error_est",
            "reads_consens",
        ])

        # stats for each step, keyed by step label
        step_fields = [
            ("s1", [
                "reads_raw",
            ]),
            ("s2", [
                "reads_raw",
                "trim_adapter_bp_read1",
                "trim_adapter_bp_read2",
                "trim_quality_bp_read1",
                "trim_quality_bp_read2",
                "reads_filtered_by_Ns",
                "reads_filtered_by_minlen",
                "reads_passed_filter",
            ]),
            # entries left disabled in the original source:
            # "filtered_by_qscore", "filtered_by_adapter"
            ("s3", [
                "merged_pairs",
                "clusters_total",
                "hidepth_min",
                "clusters_hidepth",
                "avg_depth_total",
                "avg_depth_mj",
                "avg_depth_stat",
                "sd_depth_total",
                "sd_depth_mj",
                "sd_depth_stat",
                "filtered_bad_align",
            ]),
            ("s4", [
                "hetero_est",
                "error_est",
            ]),
            ("s5", [
                "clusters_total",
                "filtered_by_depth",
                "filtered_by_maxH",
                "filtered_by_maxAlleles",
                "filtered_by_maxN",
                "reads_consens",
                "nsites",
                "nhetero",
                "heterozygosity",
            ]),
        ]
        self.stats_dfs = ObjDict(
            {step: blank_series(fields) for step, fields in step_fields}
        )

        # store cluster depth information (biggest memory cost)
        self.depths = {}

    def __str__(self):
        """Return a short tag that identifies this sample by name."""
        template = "<ipyrad.Sample object {}>"
        return template.format(self.name)

    def _to_fulldict(self):
        """
        Return an OrderedDict describing this sample, data frames included.

        Each stats Series is converted with ``to_dict()``; ``files`` and
        ``depths`` are passed through as the same objects held on the
        instance. All sample dicts are combined in save() to dump JSON
        output.
        """
        fulldict = OrderedDict()
        fulldict["name"] = self.name
        fulldict["barcode"] = self.barcode
        fulldict["files"] = self.files

        # one plain dict per step table, s1 through s5
        step_names = ("s1", "s2", "s3", "s4", "s5")
        per_step = {}
        for step in step_names:
            per_step[step] = getattr(self.stats_dfs, step).to_dict()
        fulldict["stats_dfs"] = per_step

        fulldict["stats"] = self.stats.to_dict()
        fulldict["depths"] = self.depths
        return fulldict