forked from htgt/Crisprs
/
process_paired_data.pl
466 lines (357 loc) · 18.1 KB
/
process_paired_data.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
#!/usr/bin/perl
#usage:
#windowBed -a ar_with_n_valid_21-ARA.bed -b ar_with_n_valid_21-ARB.bed -w 999999 | awk '!($2==$8 && $1==$7)' | perl ~/work/paired_crisprs/valid_pairs.pl ~/work/paired_crisprs/ar_crisprs.fq > ~/work/paired_crisprs/ar-a_ar-b.html
use strict;
use warnings;
use Getopt::Long;
use Pod::Usage;
use Bio::Perl;
use Try::Tiny;
use Data::Dumper;
use feature qw( say );
use YAML::Any qw( DumpFile LoadFile );
use List::MoreUtils;
use List::Util qw(sum);
my ( $crispr_fq_file, $crispr_yaml_file, @paired_files, $pair_yaml_file );
GetOptions(
"help" => sub { pod2usage( 1 ) },
"man" => sub { pod2usage( 2 ) },
"fq-file=s" => \$crispr_fq_file,
"pair-yaml-file=s" => \$pair_yaml_file,
"crispr-yaml-file=s" => \$crispr_yaml_file,
"paired-output=s{,}" => \@paired_files,
) or pod2usage( 2 );
die pod2usage( 2 ) unless $pair_yaml_file and $crispr_fq_file and $crispr_yaml_file and @paired_files;
my %CRISPRS = process_fq_file( $crispr_fq_file );
my %OFF_TARGET_DATA;
my $CRISPR_NAME_RE = qr/^(ENS.*)_.*$/;
#we need this data to make sure the off target we're adding isn't the same as the crispr
my $CRISPR_DATA = LoadFile( $crispr_yaml_file );
for my $filename ( @paired_files ) {
my ( %duplicates, %crispr_names );
my ( %num_valid_pairs, $shortest );
my @all;
open( my $fh, "<", $filename ) || die "Couldn't open $filename";
#say STDERR "Processing $filename";
while ( my $line = <$fh> ) {
chomp $line;
#split the line into two hashrefs of bed data
my ( $l_crispr, $r_crispr ) = process_windowbed_line( $line );
next unless $l_crispr and $r_crispr;
#have we already processed this pair but the other way?
#windowbed gives you each pair twice.
if ( exists $duplicates{ $r_crispr->{locus_str} }->{ $l_crispr->{locus_str} } ) {
next;
}
else {
#we've done this one now so add it to our duplicates hash
$duplicates{ $r_crispr->{locus_str} }->{ $l_crispr->{locus_str} }++;
}
my $pair_key = $l_crispr->{name}."-".$r_crispr->{name};
if ( ! defined $num_valid_pairs{$pair_key} ) {
$num_valid_pairs{$pair_key} = 0; #so we can accurately report how many for each
}
#for labelling the output
$crispr_names{ $l_crispr->{name} }++;
$crispr_names{ $r_crispr->{name} }++;
#check we have a valid orientation:
#see if the pam sites are end to end and if so is the sequence in the correct orientation.
#we need this for sites that are CC-GG or GG-CC as it would appear to be end to end,
#but the actual match
my ( $valid_pair, $type ) = ( 0, "" );
#we have already validated the seqs so we just need to check if its CC-GG or GG-CC
if ( $l_crispr->{direction} eq "rev" && $r_crispr->{direction} eq "fwd" ) {
$valid_pair = 1;
$type = "Tail to tail";
}
elsif ( $l_crispr->{direction} eq "fwd" && $r_crispr->{direction} eq "rev" ) {
$valid_pair = 1;
$type = "Head to head";
}
if ( $valid_pair ) {
#the entry wasn't a duplicate so lets process it
#find the entry from the crisprs.yaml to see if this is the original pair
my $l_crispr_yaml = $CRISPR_DATA->{ $l_crispr->{exon_id} }{ $l_crispr->{crispr_id} } || die;
my $r_crispr_yaml = $CRISPR_DATA->{ $r_crispr->{exon_id} }{ $r_crispr->{crispr_id} } || die;
my $left_identical = ( $l_crispr->{chr} eq $l_crispr_yaml->{locus}{chr_name} &&
$l_crispr->{start} eq $l_crispr_yaml->{locus}{chr_start} );
my $right_identical = ( $r_crispr->{chr} eq $r_crispr_yaml->{locus}{chr_name} &&
$r_crispr->{start} eq $r_crispr_yaml->{locus}{chr_start} );
#if its the original pair, skip it
if ( $left_identical and $right_identical ) {
say "Skipping original pair entry.";
next;
}
$num_valid_pairs{$pair_key}++;
my $distance;
if ( $r_crispr->{start} > $l_crispr->{start} ) {
$distance = $r_crispr->{start} - $l_crispr->{end};
}
else {
$distance = $l_crispr->{start} - $r_crispr->{end};
}
my $data = {
exon_id => $l_crispr->{exon_id}, #they will both have the same
first_locus => $l_crispr->{locus_str},
second_locus => $r_crispr->{locus_str},
first_name => $l_crispr->{name},
second_name => $r_crispr->{name},
first_seq => $l_crispr->{seq},
second_seq => $r_crispr->{seq},
distance => $distance,
type => $type,
};
#push @all, $data;
#if we already have a shortest and this one is longer we can ignore it
#we use abs so that -10 isnt "better" than 2
next if defined $shortest && $distance > abs( $shortest->{distance} );
#we'll set shortest if its not already set or distance is less.
$shortest = $data;
}
}
#uncomment this and the above push to display summary data.
# my @rows = sort { abs($a->{distance}) <=> abs($b->{distance}) } @all;
# my $total_in_150 = 0;
# say "l_name,l_locus,distance,r_name,r_locus";
# for ( @rows ) {
# last if $_->{distance} > 9000;
# ++$total_in_150;
# say join ",", $_->{first_name}, $_->{first_locus}, $_->{distance}, $_->{second_name}, $_->{second_locus};
# }
# say "$filename";
# say "\tTotal within 9000: " . scalar(@all);
# say "\tTotal within 150: " . $total_in_150;
# say "\tClosest: " . $rows[0]->{distance};
#make sure we didnt somehow get more than 2 crispr names
die "More than 2 crisprs being compared?? ($filename):" . join(", ", keys %crispr_names)
if scalar keys %crispr_names > 2;
#add the shortest one we found in this file to the global hash
if ( defined $shortest ) {
say STDERR "Closest crispr for " . join(" vs ", keys %crispr_names) . ": "
. $shortest->{distance};
say STDERR "Total valid: " . sum(values %num_valid_pairs) . " - " .
join(", ", map { $_ . ": " . $num_valid_pairs{$_} } keys %num_valid_pairs);
my ( $f_name ) = $shortest->{first_name} =~ /_(.*)$/;
my ( $s_name ) = $shortest->{second_name} =~ /_(.*)$/;
$OFF_TARGET_DATA{ $shortest->{exon_id} }->{$f_name}{$s_name} =
"{" .
'distance: "' . $shortest->{distance} . '", ' .
'left: "' . $shortest->{first_locus} . '", ' .
'right: "' . $shortest->{second_locus} . '", ' .
'type: "' . $shortest->{type} . '", ' .
'id: "' . "${f_name}_${s_name}" . '", ' .
'total_in_9k: "'.sum(values %num_valid_pairs). '", ' .
'algorithm: "bwa"' .
"}";
}
else {
say STDERR "Couldn't find a valid paired off target for crisprs:" . join ", ", keys %crispr_names;
}
}
#loop through pairs yaml and insert any off target data that we found
my $pair_data = LoadFile( $pair_yaml_file );
add_off_targets( $pair_data ); #edits the hash in place
DumpFile( $pair_yaml_file, $pair_data );
#
# METHODS
#
sub process_fq_file {
my ( $crispr_fq_file ) = @_;
my ( %crisprs, $current );
open( my $fq_fh, "<", $crispr_fq_file ) or die "Couldn't open $crispr_fq_file: $!";
while ( my $line = <$fq_fh> ) {
chomp $line;
#fq files have an @ line to signify a label, with the sequence on the next line
if ( $line =~ /^@(.*)/ ) {
$current = $1;
}
else {
die "Error: there must only be one sequence per entry" if defined $crisprs{ $current };
#pam must be at the end for bwa
die "Crisprs shouldn't be reverse complemented!" unless $line =~ /GG$/;
#add fwd and reverse so we can compute hamming distances later.
#we call it rev as we assume all crispr sequences have been reverse complemented.
#cant automatically determine which is which cause CC-GG is a valid crispr
$crisprs{$current} = { rev => revcom( $line )->seq, fwd => $line };
}
}
return %crisprs;
}
sub process_windowbed_line {
my ( $line ) = @_;
#chr, start, end, name, unknown, strand x2
my ( @data ) = split /\s+/, $line;
die "Chromosomes don't match: $line" if $data[0] ne $data[6];
my @bed_cols = qw( chr start end name score strand );
#not first and second crispr cause we need to check the order first
my ( %crispr_one, %crispr_two );
@crispr_one{@bed_cols} = @data[ 0 .. 5 ]; #half the line is crispr one,
@crispr_two{@bed_cols} = @data[ 6 .. 11 ]; #half is crispr two
#we need to do some additional processing on each crispr entry and get the direction.
#its the same for both crisprs so we do it in a loop
for my $crispr ( \%crispr_one, \%crispr_two ) {
$crispr->{start} += 1; #fix coords to match ensembl api, bed file number is different
$crispr->{chr} =~ s/Chr//; #strip Chr prefix that human genome has on chromosome names
$crispr->{locus_str} = $crispr->{chr} . ":" . $crispr->{start} . "-" . $crispr->{end}; #to detect dupes
#split the name field into name and seq (name looks like ARA-CCGCTAGCTAGCTGATG) with a hash slice
@{$crispr}{qw(name seq)} = delete($crispr->{name}) =~ /(.+)-([CGATcgat]+)/;
#now we have the name without the seq extract the exon_id and crispr_id
#the name is formatted like ENSMUSE000064698257_25A
@{$crispr}{qw(exon_id crispr_id)} = $crispr->{name} =~ /^(ENS.*)_(.*)$/;
die "Unknown name: " . $crispr->{name} unless exists $CRISPRS{ $crispr->{name} };
$crispr->{direction} = get_direction( $crispr->{name}, $crispr->{seq} );
}
#this should never happen
die "Pair is not in the same exon!" unless $crispr_one{exon_id} eq $crispr_two{exon_id};
#make sure we got a direction for both
return unless $crispr_one{direction} and $crispr_two{direction};
#determine which is the left crispr and which is the right
my ( $left_crispr, $right_crispr );
#which crispr is earlier in the chromosome? that is the first crispr
#if they're the regular way round, everything is good.
if ( $crispr_one{ start } < $crispr_two{ start } ) {
$left_crispr = \%crispr_one;
$right_crispr = \%crispr_two;
}
else {
#the latter crispr comes first genomically
#so we need them the other way as they're reversed
$left_crispr = \%crispr_two;
$right_crispr = \%crispr_one;
}
#do they overlap? if so we need to make sure that we have them the right way round:
#in the fwd direction the pam is at the very end, so the pam site
#will actually be further along than the rev pam site
if ( $right_crispr->{ start } - $left_crispr->{ end } < 0 ) {
if ( $left_crispr->{direction} eq "fwd" && $right_crispr->{direction} eq "rev" ) {
#they're the wrong way around so swap them
( $left_crispr, $right_crispr ) = ( $right_crispr, $left_crispr );
}
}
return $left_crispr, $right_crispr;
}
sub hamming_distance {
#use string xor to get the number of mismatches between the two strings.
#the xor returns a string with the binary digits of each char xor'd,
#which will be an ascii char between 001 and 255. tr returns the number of characters replaced.
die "Strings passed to hamming distance differ" if length($_[0]) != length($_[1]);
return (uc($_[0]) ^ uc($_[1])) =~ tr/\001-\255//;
}
sub get_pair_exon_id {
my ( $pair_data ) = @_;
my ( $l_exon_id ) = $pair_data->{first_name} =~ $CRISPR_NAME_RE;
my ( $r_exon_id ) = $pair_data->{second_name} =~ $CRISPR_NAME_RE;
die "Pair is not in the same exon!" unless $l_exon_id eq $r_exon_id;
return $l_exon_id;
}
sub get_direction {
my ( $name, $seq ) = @_;
my $fwd_distance = hamming_distance($seq, $CRISPRS{$name}->{fwd});
my $rev_distance = hamming_distance($seq, $CRISPRS{$name}->{rev});
#i hope this doesn't happen but its possible
#die "Identical hamming distances found for both orientations of $seq" if $fwd_distance == $rev_distance;
#make sure that we disregard ones we definitely cant identify
if ( $fwd_distance == $rev_distance && $seq =~ /^CC.*GG$/ ) {
say STDERR "Dual PAM and identical hamming distances for both orientations of $seq, skipping.";
return;
}
#try and get a direction
if ( $fwd_distance <= $rev_distance && $seq =~ /GG$/ ) {
#all seqs at this point should have a valid pam site, if not we complain.
#die "$name does not have a valid pam: $seq" unless $seq =~ /GG$/;
return if hamming_distance(substr($seq, 0, 20), substr($CRISPRS{$name}->{fwd},0,20)) > 5;
return "fwd";
}
elsif ( $fwd_distance >= $rev_distance && $seq =~ /^CC/ ) {
#die "$name does not have a valid pam: $seq" unless $seq =~ /^CC/;
return if hamming_distance(substr($seq, 3), substr($CRISPRS{$name}->{rev},3)) > 5;
return "rev";
}
else {
say STDERR "Crispr doesn't look valid, skipping ($fwd_distance $rev_distance $seq)";
return;
}
}
sub add_off_targets {
my ( $pair_data ) = @_; #should be hashref to yaml data
#if we didn't get an off target we notify the user by saying there were non within the search bounds
my $no_off_target = '{distance: "9000+"}';
while ( my ( $exon_id, $pairs ) = each %{ $pair_data } ) {
for my $pair ( @{ $pairs } ) {
#shorthands because the full name is too long
my ( $l_name, $r_name ) = ( $pair->{left_crispr}, $pair->{right_crispr} );
$pair->{off_target_summary} = ''; #clear any old off target summary data.
#see if there's any off target data for this pair
if ( defined $OFF_TARGET_DATA{$exon_id} ) {
#see if we have any off target data for this entry (we have to try both ways just in case)
#loop through all possibilities, e.g for 2A and 5B: 2A 5B, 5B 2A, 2A 2A, 5B 5B
my $found = 0;
for my $first ( $l_name, $r_name ) {
next unless defined $OFF_TARGET_DATA{$exon_id}->{$first};
for my $second ( $l_name, $r_name ) {
if ( defined $OFF_TARGET_DATA{$exon_id}->{$first}{$second} ) {
#if there's already an off target summary see if we have a shorter one
if ( $pair->{off_target_summary} && $pair->{off_target_summary} !~ /9000\+/ ) {
$found = 1;
my ( $distance1 ) = $pair->{off_target_summary} =~ /distance:\s+"(-?\d+)",/;
my ( $distance2 ) = $OFF_TARGET_DATA{$exon_id}->{$first}{$second} =~ /distance:\s+"(-?\d+)",/;
next if abs($distance1) < abs($distance2);
#we found a shorter one so add it
$pair->{off_target_summary} = $OFF_TARGET_DATA{$exon_id}->{$first}{$second};
say STDERR "Found shorter distance.";
}
else {
$pair->{off_target_summary} = $OFF_TARGET_DATA{$exon_id}->{$first}{$second};
$found = 1;
#say STDERR "found $first $second, set to " . $OFF_TARGET_DATA{$exon_id}->{$first}{$second};
}
}
}
}
next if $found;
unless ( $found ) {
say STDERR "Couldn't find $l_name vs $r_name pair in off target data.";
}
# if ( defined $OFF_TARGET_DATA{$exon_id}->{$l_name}{$r_name} ) {
# #we do have data so add it
# $pair->{off_target_summary} = $OFF_TARGET_DATA{$exon_id}->{$l_name}{$r_name};
# next;
# }
# elsif ( defined $OFF_TARGET_DATA{$exon_id}->{$r_name}{$l_name} ) {
# $pair->{off_target_summary} = $OFF_TARGET_DATA{$exon_id}->{$r_name}{$l_name};
# next;
# }
# else {
# say STDERR "Couldn't find $l_name vs $r_name pair in off target data.";
# say STDERR Dumper( \%OFF_TARGET_DATA );
# }
}
else {
say STDERR "$exon_id not defined in off target data hash";
}
#if we get here there's no data
$pair->{off_target_summary} = $no_off_target;
}
}
#say Dumper( \%OFF_TARGET_DATA );
}
1;
__END__
=head1 NAME
process_paired_data.pl - process windowbed output and merge the summary into the pairs file
=head1 SYNOPSIS
process_paired_data.pl [options]
--fq-file location of the crispr fq data created by find_paired_crisprs.pl
--pair-yaml-file location of the pair yaml data created by find_paired_crisprs.pl
--crispr-yaml-file location of crispr yaml data
--paired-output location of windowbed output files (can be multiple)
--help show this dialog
Example usage:
process_paired_data.pl --fq-file /nfs/users/nfs_a/ah19/work/crisprs/hprt_crisprs.fq --crispr-yaml-file /nfs/users/nfs_a/ah19/work/crisprs/hprt_crisprs.yaml --pair-yaml-file /nfs/users/nfs_a/ah19/work/crisprs/hprt_pairs.yaml --paired-output /nfs/users/nfs_a/ah19/work/crisprs/paired_output/*
=head1 DESCRIPTION
Iterate through all the paired output files to determine the closest off targets for the
given crispr pairings. The shortest for each pair is then written as a string into the original
pairs yaml file, ready to be persisted by persist_pairs.pl
=head AUTHOR
Alex Hodgkins
=cut