-
Notifications
You must be signed in to change notification settings - Fork 0
/
excel.py
1483 lines (1261 loc) · 55.8 KB
/
excel.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
from collections import defaultdict
import operator
from pathlib import Path
import re
from string import ascii_uppercase as uppercase
from timeit import default_timer as timer
from typing import Union
from colour import Color
import Levenshtein as levenshtein
import numpy as np
from openpyxl.cell.rich_text import TextBlock, CellRichText
from openpyxl.cell.text import InlineFont
from openpyxl import drawing, load_workbook
from openpyxl.styles import Alignment, Border, DEFAULT_FONT, Font, Side
from openpyxl.styles.fills import PatternFill
from openpyxl.utils import get_column_letter
from openpyxl.worksheet.datavalidation import DataValidation
import pandas as pd
from .utils import is_numeric
# openpyxl style settings
THIN = Side(border_style="thin", color="000000")
MEDIUM = Side(border_style="medium", color="000001")
THIN_BORDER = Border(left=THIN, right=THIN, top=THIN, bottom=THIN)
DEFAULT_FONT.name = 'Calibri'
class excel():
"""
Functions for wrangling variant data into spreadsheet formatting and
writing output file
Attributes
----------
args : argparse.Namespace
arguments passed from command line
vcfs : list of pd.DataFrame
list of dataframes formatted to write to file from vcf() methods
additional_files : dict
(optional) if addition files have been passed, dict will be populated
with worksheet name : df of file data
refs : list
list of reference names parsed from vcf headers
writer : pandas.io.excel._openpyxl.OpenpyxlWriter
writer object for writing Excel data to file
workbook : openpyxl.workbook.workbook.Workbook
openpyxl workbook object for interacting with per-sheet writing and
formatting of output Excel file
Outputs
-------
{args.output}.xlsx : file
Excel file with variants written to, name passed from command line or
inferred from input vcf name if not specified
"""
def __init__(self, args, vcfs, additional_files, refs) -> None:
print(f"Writing to output file: {Path(args.output).absolute()}")
self.args = args
self.vcfs = vcfs
self.additional_files = additional_files
self.refs = refs
self.writer = pd.ExcelWriter(args.output, engine='openpyxl')
self.workbook = self.writer.book
self.summary = None
def generate(self) -> None:
"""
Calls all methods in excel() to generate output file
"""
self.write_summary()
if self.args.acmg:
self.write_reporting_template()
self.write_variants()
self.write_additional_files()
self.write_images()
self.workbook.save(self.args.output)
if self.args.summary == 'dias':
self.drop_down()
print('Done!')
def write_summary(self) -> None:
"""
Write summary sheet to excel file
"""
print('Writing summary sheet')
if self.args.summary:
self.summary = self.workbook.create_sheet('summary')
if self.args.summary == 'helios':
# add summary sheet for TSO500/Helios
self.helios_summary()
if self.args.summary == 'dias':
# generate summary sheet in format for RD/Dias
self.dias_summary()
def summary_sheet_cell_colour_key(self, row_count, to_bold) -> Union[int, list]:
"""
Write conditions and colours of colouring applied to cells to
the summary sheet if --colour specified
Parameters
----------
row_count : int
counter of current row to write to in sheet
to_bold : list
list of cells to set to bold
Returns
-------
int
counter of current row to write to in sheet
list
list of cells to set to bold
"""
# build a dict of each column and all its colour conditions
cols_to_colours = defaultdict(dict)
for i in self.args.colour:
column, condition, colour = i.split(':')
cols_to_colours[column][condition] = colour
self.summary.cell(row_count, 1).value = "Cell colouring applied:"
to_bold.append(f"A{row_count}")
self.summary[f"A{row_count}"].font = Font(
bold=True, name=DEFAULT_FONT.name
)
colour_col = 2
max_colour_rows_written = 0
# write colouring applied to each field as seperate column in summary
for column, conditions in cols_to_colours.items():
colour_row = row_count + 1
column_letter = get_column_letter(colour_col)
self.summary.cell(row_count, colour_col).value = column
to_bold.append(f"{column_letter}{row_count}")
for condition, colour in conditions.items():
condition = condition.replace('&', ' & ').replace('|', ' | ')
self.summary.cell(colour_row, colour_col).value = condition
self.summary.cell(colour_row, colour_col).data_type = 's'
colour = self.convert_colour(colour)
self.summary[f"{column_letter}{colour_row}"].fill = PatternFill(
patternType="solid",
start_color=colour
)
colour_row += 1
# set width to wider than max value in cell
width = max([len(x) for x in conditions.keys()])
width = 10 if width < 13 else width
self.summary.column_dimensions[column_letter].width = width + 3
colour_col += 2
if colour_row > max_colour_rows_written:
max_colour_rows_written = colour_row
return max_colour_rows_written, to_bold
def helios_summary(self) -> None:
"""
Writes summary sheet for helios pipeline with metrics such as
variant records per sheet, dx file IDs and parameters specified
"""
# track what cells to make bold
to_bold = []
# write titles for summary values
self.summary.cell(1, 1).value = "Sample ID:"
self.summary.cell(4, 1).value = "Name"
self.summary.cell(5, 1).value = "Clinical indication"
self.summary.cell(6, 1).value = "Additional comments"
self.summary.cell(9, 1).value = "Variant totals"
to_bold.extend(["A1", "A2", "A4", "A5", "A6", "A9"])
# get sample name from vcf, should only be one but handle everything
# list-wise just in case
sample = [
Path(x).name.replace('.vcf', '').replace('.gz', '')
for x in self.args.vcfs
]
sample = [x.split('_')[0] if '_' in x else x for x in sample]
sample = str(sample).strip('[]').strip("'")
self.summary.cell(1, 2).value = sample
# split sample name into constituent parts on '-' and write to
# separate cells for ease of them copying
for idx, part in enumerate(sample.split('-')):
self.summary.cell(2, idx+2).value = part
self.summary.column_dimensions['A'].width = 36
self.summary.column_dimensions['B'].width = 16
self.summary.column_dimensions['C'].width = 16
self.summary.column_dimensions['D'].width = 16
self.summary.merge_cells(
start_row=1, end_row=1, start_column=2, end_column=6)
row_count = 9
# write counts of variants
for sheet, vcf in zip(self.args.sheets, self.vcfs):
self.summary.cell(row_count, 2).value = sheet
self.summary.cell(row_count, 3).value = len(vcf.index)
to_bold.append(f"B{row_count}")
row_count += 1
row_count += 3
# Parsing of MetricsOutput metrics into summary sheet
for _, df in self.additional_files.items():
if df.empty:
continue
if df.iloc[0].iloc[0] == 'Metric (UOM)':
# its a metrics output file
if not len(df.columns.tolist()) == 4:
# not 4 cols => didn't parse out just sample values in
# utils.parse_metrics => skip
continue
# specific metrics lines we want to parse out
idxs = []
idxs.append(df[0].eq('Metric (UOM)').idxmax())
idxs.append(df[0].eq('CONTAMINATION_SCORE (NA)').idxmax())
idxs.append(df[0].eq('CONTAMINATION_P_VALUE (NA)').idxmax())
idxs.append(df[0].eq('PCT_EXON_50X (%)').idxmax())
idxs.append(df[0].eq('PCT_EXON_100X (%)').idxmax())
colouring = {"green": [], "amber": [], "red": []}
for file_row, idx in enumerate(idxs):
title = df.iloc[idx].iloc[0]
lsl = df.iloc[idx].iloc[1]
usl = df.iloc[idx].iloc[2]
sample = df.iloc[idx].iloc[3]
self.summary.cell(row_count, 1).value = title
self.summary.cell(row_count, 2).value = lsl
self.summary.cell(row_count, 3).value = usl
self.summary.cell(row_count, 4).value = sample
# perform colouring like in self.colour_metrics(), lazily
# catch anything in case of weird values to not break
try:
if title == 'CONTAMINATION_SCORE (NA)':
if float(sample) > float(usl):
colouring["amber"].append(row_count)
else:
colouring["green"].append(row_count)
elif title == 'CONTAMINATION_P_VALUE (NA)':
if float(sample) > float(usl):
colouring["red"].append(row_count)
else:
colouring["green"].append(row_count)
elif title == 'PCT_EXON_50X (%)':
if float(sample) >= 95:
colouring["green"].append(row_count)
else:
colouring["red"].append(row_count)
elif title == 'PCT_EXON_100X (%)':
if float(sample) >= 90:
colouring["green"].append(row_count)
else:
colouring["red"].append(row_count)
except Exception as err:
print(
"WARNING: error in colouring metrics values in "
f"summary sheet: {err}.\nContinuing without colouring"
)
to_bold.append(f"A{row_count}")
row_count += 1
# do the colouring
for colour, idxs in colouring.items():
for idx in idxs:
if colour == 'green':
self.summary[f"D{idx}"].fill = PatternFill(
patternType="solid",
start_color='008100'
)
elif colour == 'amber':
self.summary[f"D{idx}"].fill = PatternFill(
patternType="solid",
start_color='ff9f00'
)
elif colour == 'red':
self.summary[f"D{idx}"].fill = PatternFill(
patternType="solid",
start_color='b30000'
)
row_count += 2
# Parsing of TMB/MSI/Gene Amplifications into summary
for _, df in self.additional_files.items():
if df.empty:
continue
if df.iloc[0].iloc[0] == '[TMB]':
for _, row in df.iterrows():
self.summary.cell(row_count, 1).value = row[0]
self.summary.cell(row_count, 2).value = row[1]
if row[0] != 'NA':
to_bold.append(f"A{row_count}")
row_count += 1
row_count += 1
# write genome reference(s) parsed from vcf header
if self.refs:
self.summary.cell(row_count, 1).value = "Reference:"
self.summary[f"A{row_count}"].font = Font(
bold=True, name=DEFAULT_FONT.name
)
for ref in list(set(self.refs)):
self.summary.cell(row_count, 2).value = ref
row_count += 1
row_count += 2
if self.args.human_filter:
self.summary.cell(row_count, 1).value = "Filters applied:"
self.summary[f"A{row_count}"].font = Font(
bold=True, name=DEFAULT_FONT.name)
self.summary.cell(row_count, 2).value = self.args.human_filter
row_count += 2
# write args passed to script to generate report
self.summary.cell(row_count, 1).value = "Filter command:"
self.summary[f"A{row_count}"].font = Font(bold=True, name=DEFAULT_FONT.name)
if self.args.filter:
self.summary.cell(row_count, 2).value = self.args.filter
else:
self.summary.cell(row_count, 2).value = "None"
row_count += 2
# write in the colouring of any columns if done
if self.args.colour:
row_count, to_bold = self.summary_sheet_cell_colour_key(
row_count, to_bold)
# write more text with DNAnexus IDs etc
row_count += 2
self.summary.cell(row_count, 1).value = "Workflow:"
self.summary.cell(row_count + 1, 1).value = "Workflow ID:"
self.summary.cell(row_count + 2, 1).value = "Report Job ID:"
to_bold.extend([f"A{row_count + x}" for x in range(0, 3)])
self.summary.cell(row_count, 2).value = self.args.workflow[0]
self.summary.cell(row_count + 1, 2).value = self.args.workflow[1]
self.summary.cell(row_count + 2, 2).value = self.args.job_id
for cell in to_bold:
self.summary[cell].font = Font(bold=True, name=DEFAULT_FONT.name)
def dias_summary(self) -> None:
"""
Write summary sheet in format for RD group, adds the following info:
- sample ID, panel(s), run IDs etc.
- formatted tables for them to fill in reporting
"""
# write titles for summary values
self.summary.cell(1, 1).value = "Sample ID:"
self.summary.cell(1, 5).value = "Clinical Indication(s):"
self.summary.cell(2, 5).value = "Panel(s):"
self.summary.cell(34, 1).value = "Total records:"
# get sample name from vcf, should only be one but handle everything
# list-wise just in case
sample = [
Path(x).name.replace('.vcf', '').replace('.gz', '')
for x in self.args.vcfs
]
sample = [x.split('_')[0] if '_' in x else x for x in sample]
sample = str(sample).strip('[]').strip("'")
# write summary values
self.summary.cell(1, 2).value = sample
self.summary.cell(1, 6).value = self.args.clinical_indication
self.summary.cell(2, 6).value = self.args.panel
# write total rows in each sheet
count = 34
# cells to make bold
to_bold = []
for sheet, vcf in zip(self.args.sheets, self.vcfs):
self.summary.cell(count, 2).value = sheet
self.summary.cell(count, 3).value = len(vcf.index)
to_bold.append(f"A{count}")
count += 1
count += 5
# write genome reference(s) parsed from vcf header
if self.refs:
self.summary.cell(count, 1).value = "Reference:"
self.summary[f"A{count}"].font = Font(
bold=True, name=DEFAULT_FONT.name
)
for ref in list(set(self.refs)):
self.summary.cell(count, 2).value = ref
count += 1
count += 1
if self.args.human_filter:
self.summary.cell(count, 1).value = "Filters applied:"
self.summary.cell(count, 2).value = self.args.human_filter
to_bold.append(f"A{count}")
count += 2
# write args passed to script to generate report
self.summary.cell(count, 1).value = "Filter command:"
to_bold.append(f"A{count}")
if self.args.filter:
self.summary.cell(count, 2).value = self.args.filter
else:
self.summary.cell(count, 2).value = "None"
count += 2
if self.args.colour:
count, to_bold = self.summary_sheet_cell_colour_key(
count, to_bold)
count += 2
self.summary.cell(count, 1).value = "Workflow:"
self.summary.cell(count + 1, 1).value = "Workflow ID:"
self.summary.cell(count + 2, 1).value = "Report Job ID:"
to_bold.append(f"A{count}")
to_bold.append(f"A{count + 1}")
to_bold.append(f"A{count + 2}")
self.summary.cell(count, 2).value = self.args.workflow[0]
self.summary.cell(count + 1, 2).value = self.args.workflow[1]
self.summary.cell(count + 2, 2).value = self.args.job_id
# write center reporting section tables
self.summary.cell(9, 2).value = "Phenotype:"
self.summary.cell(16, 2).value = "Panels"
self.summary.cell(16, 3).value = "Excel file"
self.summary.cell(16, 4).value = "Comments"
self.summary.cell(16, 6).value = "Analysis by"
self.summary.cell(16, 7).value = "Date"
self.summary.cell(16, 8).value = "Checked by"
self.summary.cell(16, 9).value = "Date"
self.summary.cell(21, 2).value = "Sanger sequencing confirmation"
self.summary.cell(22, 2).value = "Gene"
self.summary.cell(22, 3).value = "NM_#"
self.summary.cell(22, 4).value = "Coordinate"
self.summary.cell(22, 5).value = "cDNA"
self.summary.cell(22, 6).value = "Protein change"
self.summary.cell(22, 7).value = "WS#"
self.summary.cell(22, 8).value = "Confirmed (Y/N)"
self.summary.cell(28, 2).value = "GEM comments summary"
self.summary.cell(28, 4).value = "Date"
# merge some title columns that have longer text
self.summary.merge_cells(
start_row=1, end_row=1, start_column=2, end_column=4)
self.summary.merge_cells(
start_row=1, end_row=1, start_column=6, end_column=20)
self.summary.merge_cells(
start_row=2, end_row=2, start_column=6, end_column=20)
self.summary.merge_cells(
start_row=9, end_row=9, start_column=2, end_column=5)
self.summary.merge_cells(
start_row=21, end_row=21, start_column=2, end_column=8)
self.summary.merge_cells(
start_row=16, end_row=16, start_column=4, end_column=5)
self.summary.merge_cells(
start_row=28, end_row=28, start_column=2, end_column=3)
self.summary.merge_cells(
start_row=28, end_row=28, start_column=4, end_column=6)
# titles to set to bold
to_bold += [
"A1", "A34", "A35", "A36","A38", "B1",
"B9", "B16", "B21", "B22", "B28", "B34", "B35", "B36", "B37",
"C16", "C22", "D16", "D22", "D28", "E1", "E2", "E22",
"F16", "F22", "G16", "G22", "H16", "H22", "I16"
]
for cell in to_bold:
self.summary[cell].font = Font(bold=True, name=DEFAULT_FONT.name)
# set column widths for readability
self.summary.column_dimensions['A'].width = 22 if self.args.colour else 18
self.summary.column_dimensions['B'].width = 13
self.summary.column_dimensions['C'].width = 13
self.summary.column_dimensions['D'].width = 13
self.summary.column_dimensions['E'].width = 22
self.summary.column_dimensions['F'].width = 16
self.summary.column_dimensions['G'].width = 16
self.summary.column_dimensions['H'].width = 16
# colour title cells
blueFill = PatternFill(patternType="solid", start_color="0CABA8")
colour_cells = [
"B9", "B16", "B21", "B22", "B28", "C16", "C22", "D16", "D22",
"D28", "E22", "F16", "F22", "G16", "G22", "H16", "H22", "I16"
]
for cell in colour_cells:
self.summary[cell].fill = blueFill
# set borders around table areas
row_ranges = [
'B9:E9', 'B10:E10', 'B11:E11', 'B12:E12', 'B13:E13',
'B16:I16', 'B17:I17', 'B18:I18',
'B21:H21', 'B22:H22', 'B23:H23', 'B24:H24', 'B25:H25',
'B28:F28', 'B29:F29', 'B30:F30', 'B31:F31', 'B32:F32'
]
for row in row_ranges:
for cells in self.summary[row]:
for cell in cells:
cell.border = THIN_BORDER
def write_reporting_template(self) -> None:
"""
Writes sheet to Excel file with formatting for reporting against
ACMG criteria
"""
report = self.workbook.create_sheet('report')
titles = {
"Gene": [2, 2],
"HGVSc": [2, 3],
"HGVSp": [2, 4],
"Evidence": [4, 3],
"Pathogenic": [4, 7],
"Yes / No": [4, 8],
"Benign": [4, 9],
"Yes / No": [4, 10],
"Associated disease": [5, 2],
"Known inheritance": [6, 2],
"Prevalence": [7, 2],
"Estimated allele frequency": [8, 2],
"Null variant where LOF function of disease": [9, 2],
"Same AA change as pathogenic change,regardless\nof nucleotide": [10, 2],
"De novo inheritance or inheritance confirmed / observed in\nhealthy adult": [11, 2],
"In vivo / in vitro functional studies": [12, 2],
"Prevalence in affected > controls": [13, 2],
"In mutational hotspot, without benign variation": [14, 2],
"Freq in controls eg ExAC, low/absent or >5%": [15, 2],
"Confirmation of in trans/in cis with pathogenic variant": [16, 2],
"In frame protein length change, non repeating vs. repeating": [17, 2],
"Same AA as a different pathogenic change": [18, 2],
"Assumed de novo (no confirmation)": [19, 2],
"Cosegregation with disease in family, not in unnaffected": [20, 2],
("Missense where low rate of benign missense and common\nmechanism"
"(Z score >3), or missense where LOF common\nmechanism"): [21, 2],
"Multiple lines of computational evidence (Cant use with PS3)": [22, 2],
("Phenotype/FH specific for disease of single etiology, or\n"
"alternative genetic cause of disease detected"): [23, 2],
"Reputable source reports but evidence not available": [24, 2],
"Synonymous change, no affect on splicing, not conserved": [25, 2],
"ACMG Classification": [26, 2],
}
for key, val in titles.items():
report.cell(val[0], val[1]).value = key
report.cell(val[0], val[1]).font = Font(
bold=True, name=DEFAULT_FONT.name
)
classifications = {
"Extra": [(5, 7), (6, 7), (7, 7), (8, 7), (5, 9), (6, 9), (7, 9)],
"PVS1": [(9, 7)],
"PS1": [(10, 7)],
"PS2": [(11, 7)],
"PS3": [(12, 7)],
"PS4": [(13, 7)],
"PM1": [(14, 7)],
"PM2": [(15, 7)],
"PM3": [(16, 7)],
"PM4": [(17, 7)],
"PM5": [(18, 7)],
"PM6": [(19, 7)],
"PP1": [(20, 7)],
"PP2": [(21, 7)],
"PP3": [(22, 7)],
"PP4": [(23, 7)],
"PP5": [(24, 7)],
"BS1": [(8, 9)],
"BS2": [(11, 9)],
"BS3": [(12, 9)],
"BA1": [(15, 9)],
"BP2": [(16, 9)],
"BP3": [(17, 9)],
"BS4": [(20, 9)],
"BP1": [(21, 9)],
"BP4": [(22, 9)],
"BP5": [(23, 9)],
"BP6": [(24, 9)],
"BP7": [(25, 9)]
}
for key, values in classifications.items():
for val in values:
report.cell(val[0], val[1]).value = key
# nice formatting of title text
for cell in report['B']:
breaks = str(cell.value).count("\n") + 1
report.row_dimensions[cell.row].height = 20 * breaks
report[f"B{cell.row}"].alignment = Alignment(
wrapText=True, vertical="center"
)
# merge evidence cells
for row in range(4, 26):
report.merge_cells(
start_row=row, end_row=row, start_column=3, end_column=6)
# set appropriate widths
report.column_dimensions['B'].width = 60
report.column_dimensions['C'].width = 35
report.column_dimensions['D'].width = 35
report.column_dimensions['E'].width = 5
report.column_dimensions['F'].width = 5
report.column_dimensions['G'].width = 12
# do some colouring
colour_cells = {
'FAC090': ['B2', 'B3', 'C2', 'C3', 'D2', 'D3'],
'8EB4E3': ['B4', 'B5', 'B6', 'B7', 'B8', 'C4', 'G4', 'H4', 'I4', 'J4'],
'FFFF99': ['B26', 'G5', 'G6', 'G7', 'G8', 'I5', 'I6', 'I7', 'I8'],
'E46C0A': ['G9', 'G10', 'G11', 'G12', 'G13'],
'FFC000': ['G14', 'G15', 'G16', 'G17', 'G18', 'G19'],
'FFFF00': ['G20', 'G21', 'G22', 'G23', 'G24'],
'00B0F0': ['I8', 'I11', 'I12', 'I20'],
'92D050': ['I16', 'I17', 'I21', 'I22', 'I23', 'I24', 'I25'],
'0070C0': ['I15'],
'FF0000': ['G9'],
'D9D9D9': [
'I9', 'I10', 'J9', 'J10', 'I13', 'I14', 'J13', 'J14',
'I18', 'I19', 'J18', 'J19', 'G25', 'H25'
]
}
for colour, cells in colour_cells.items():
for cell in cells:
report[cell].fill = PatternFill(
patternType="solid", start_color=colour
)
# add some borders
row_ranges = {
'horizontal': [
'B3:D3', 'B4:J4', 'B5:J5',
'B6:J6', 'B7:J7', 'B8:J8', 'B9:J9', 'B10:J10', 'B11:J11',
'B12:J12', 'B13:J13', 'B14:J14', 'B15:J15', 'B16:J16',
'B17:J17', 'B18:J18', 'B19:J19', 'B20:J20', 'B21:J21',
'B22:J22', 'B23:J23', 'B24:J24', 'B25:J25'
],
'horizontal_thick': [
'B2:D2', 'B4:J4', 'B26:J26', 'B27:J27'
],
'vertical': [
'E2:E3', 'G4:G26', 'H4:H26', 'I4:I26', 'J4:J26'
],
'vertical_thick': [
'B2:B26', 'C2:C26', 'K4:K26', 'E2:E3'
]
}
for side, values in row_ranges.items():
for row in values:
for cells in report[row]:
for cell in cells:
# border style is immutable => copy current and modify
cell_border = cell.border.copy()
if side == 'horizontal':
cell_border.top = THIN
if side == 'horizontal_thick':
cell_border.top = MEDIUM
if side == 'vertical':
cell_border.left = THIN
if side == 'vertical_thick':
cell_border.left = MEDIUM
cell.border = cell_border
def write_variants(self) -> None:
"""
Writes all variants from dataframe(s) to sheet(s) specified in
self.args.sheets.
If sheet names not specified, these will be set as "variant" where one
dataframe is being written, or the vcf filename prefix if there are
more than one dataframes to write
"""
total_rows = sum([len(x) for x in self.vcfs])
print(f"\nWriting total of {total_rows} rows to output xlsx file")
if total_rows > 5000:
print(
"Writing many rows to Excel is slow, "
"this may take a few minutes..."
)
with self.writer:
# add variants
for sheet, vcf in zip(self.args.sheets, self.vcfs):
sheet_no = self.args.sheets.index(sheet) + 1
print(
f"\nWriting {len(vcf)} rows to {sheet} sheet "
f"({sheet_no}/{len(self.args.sheets)})"
)
# timing how long it takes to write because its slow
start = timer()
vcf.to_excel(
self.writer, sheet_name=sheet,
index=False, float_format="%.3f"
)
curr_worksheet = self.writer.sheets[sheet]
self.set_widths(curr_worksheet, vcf.columns)
self.set_font(curr_worksheet)
self.colour_hyperlinks(curr_worksheet)
self.colour_cells(curr_worksheet)
self.set_dp(curr_worksheet)
# freeze header so scrolling keeps it in view
curr_worksheet.freeze_panes = self.args.freeze_column
self.workbook.save(self.args.output)
end = timer()
print(
f"Writing to Excel for sheet {sheet} took "
f"{round(end - start)}s"
)
# read back the written sheet to check its written correctly
self.check_written_sheets(vcf, sheet)
# set Excel types for numeric cells to suppress Excel warnings
self.set_types(curr_worksheet)
self.workbook.save(self.args.output)
def write_additional_files(self) -> None:
"""
Write each dataframe of additional files passed to separate sheets
"""
if not self.additional_files:
# empty dict => no files passed to write
return
print("Writing additional file(s) to workbook")
for file_name, file_df in self.additional_files.items():
file_df.to_excel(
self.writer, sheet_name=file_name, index=False, header=None
)
curr_worksheet = self.writer.sheets[file_name]
self.set_font(curr_worksheet)
self.set_types(curr_worksheet)
# set appropriate column widths based on cell contents
for idx, column in enumerate(curr_worksheet.columns, start=1):
# get max length of column contents, sensible max and min sizes
length = max(len(str(cell.value)) for cell in column)
length = 13 if length < 13 else length
length = 30 if length > 30 else length
col_letter = get_column_letter(idx)
curr_worksheet.column_dimensions[col_letter].width = length
# set widths of any columns we have specified below in set_width()
# to override the above defaults
# get column names from first row of sheet
sheet_columns = [
'' if x is None else x for x in file_df.iloc[0].tolist()]
self.set_widths(curr_worksheet, sheet_columns)
if file_df.iloc[0].iloc[0] == 'Metric (UOM)':
# additional file is MetricsOutput.tsv from TSO500 => attempt
# to colour metrics in output sheet
if len(file_df.columns.tolist()) == 4:
# only 4 columns => given sample metrics correctly
# parsed from full run metrics
try:
self.colour_metrics_output(file_df, curr_worksheet)
except Exception as err:
# catch any error raised to not break the app and
# just print a warning since its non-essential
print(
"Warning: error in colouring MetricsOutput sheet:"
f"\n\t{err}\nContinuing without colouring."
)
def write_images(self) -> None:
"""
Writes each of the passed images to a separate sheet
"""
if not self.args.images:
# no images to write
return
print("Writing image(s) to workbook")
for idx, image in enumerate(self.args.images):
if self.args.image_sheets:
# names for image sheets specified
sheet = self.workbook.create_sheet(
self.args.image_sheets[idx])
else:
sheet = self.workbook.create_sheet(f'image_{idx + 1}')
img = drawing.image.Image(image)
img.anchor = 'B2'
if self.args.image_sizes:
# set sizes if specified
try:
width, height = self.args.image_sizes[idx].split(':')
img.height = float(height)
img.width = float(width)
except Exception as error:
print(
"Failed to parse width/height from image_sizes "
f"argument for image no. {idx}, sizes specified: "
f"{self.args.image_sizes}.\n\nError: {error}\n\n"
f"Will use defaults from current image (width:"
f"{img.width}px, height:{img.height}px)"
)
else:
# set max size based off aspect ratio of original image
ratio = img.width / img.height
height = 972
width = 972 * ratio
img.height = height
img.width = width
sheet.add_image(img)
def check_written_sheets(self, vcf, sheet) -> None:
""""
Check that the written sheet is exactly the same as the dataframe
that was meant to be written
Parameters
----------
vcf : pd.DataFrame
dataframe of sheet that was written to file
sheet : str
sheet name to read from file
Raises
------
AssertionError
Raised when data written to sheet does not match the dataframe
"""
print(f"\nVerifying data written to file for {sheet} sheet...\n")
# read in written sheet using openpyxl to deal with Excel oddities
sheet_data = load_workbook(filename=self.args.output)[sheet]
written_sheet = pd.DataFrame(
sheet_data.values, columns=vcf.columns.tolist())
written_sheet = written_sheet.iloc[1:] # drop header on first row
# force nan values to be None strings for consistency on comparing
vcf = vcf.replace(r'^\s*$', np.nan, regex=True)
vcf.fillna('None', inplace=True)
written_sheet = written_sheet.replace(r'^\s*$', np.nan, regex=True)
written_sheet.fillna('None', inplace=True)
# set all columns of both dfs to strings
vcf = vcf.astype(str)
written_sheet = written_sheet.astype(str).reset_index(drop=True)
# floats with trailing zero seem inconsistent when writing to file,
# since we've cast everything to a string strip for ease of comparing
vcf = vcf.applymap(
lambda x: x.replace('.0', '') if x.endswith('.0') else x
)
written_sheet = written_sheet.applymap(
lambda x: x.replace('.0', '') if x.endswith('.0') else x
)
assert vcf.equals(written_sheet), (
f"Written data for sheet: {sheet} does not seem to match the "
"dataframe to be written"
)
def set_types(self, worksheet) -> None:
"""
Iterate over all worksheet cells and test if cell value can be numeric,
if so sets the type to numeric to suppress 'Number stored as text'
warning in Excel
Parameters
----------
worksheet : openpyxl.Writer
writer object for current sheet
"""
for cells in worksheet.rows:
for cell in cells:
if is_numeric(cell.value):
cell.data_type = 'n'
def set_font(self, worksheet) -> None:
"""
Set font to all cells in variant sheet to Calibri
Default is Times New Roman and it is ugly
Parameters
----------
worksheet : openpyxl.Writer
writer object for current sheet
"""
for cells in worksheet.rows:
for cell in cells:
cell.font = Font(name=DEFAULT_FONT.name)
def set_dp(self, worksheet) -> None:
"""
Set the dp to display values to (i.e. 0.14236 to dp -> 0.14).
Original value will remain unchanged in the formaula bar
on selecting the cell, just the view of data is adjusted
through Excel number formatting
Parameters
----------
worksheet : openpyxl.Writer
writer object for current sheet
"""
# mapping of column names to no. dp to display
col_to_dp = {
'VF': 2
}
for column, dp in col_to_dp.items():
for ws_column in worksheet.iter_cols(1, worksheet.max_column):
if ws_column[0].value.lower() == column.lower():
# column is a match, loop over every cell in the column
# to set formatting since there's no nicer way to apply
# the style in openpyxl
for row in ws_column:
row.number_format = '#,##0.00'
def convert_colour(self, colour) -> str:
"""
Converts string of colour to aRGB value that openpyxl will accept.
Valid colour strings reference here:
https://github.com/vaab/colour/blob/11f138eb7841d2045160b378a2eec0c2321144c0/colour.py#L52
Parameters
----------
colour : str
colour string, either hex value beginning with '#' (#82920c)
or human readable (ForestGreen)
Returns
-------
str
hex value without '#' prefix
"""