Permalink
Browse files

new corrected example file; fixed bug about collapsing

  • Loading branch information...
1 parent b55f802 commit bb700051adcba6eb3bd944b617cb2e0a8eeeb2f8 @nicolabertoldi nicolabertoldi committed Oct 8, 2012
Showing with 31 additions and 13 deletions.
  1. +19 −13 tags4moses/deannotate_words.pl
  2. +12 −0 tags4moses/examples/deannotate_test.txt_renumbered
@@ -46,7 +46,6 @@ ()
# print STDERR "passthough:|$passthrough|\n";
# print STDERR "trans:|$trans|\n";
-# print STDERR "trans:|$trans|\n";
#parsing translation
my @trgwords = split (/[ \t]+/, $trans);
@@ -77,20 +76,20 @@ ()
}
if ($type == 0){
$xml{$idx} .= "<$value>";
- $endxml{$idx} .= "</$mainvalue>";
+ $endxml{$idx} = "</$mainvalue>$endxml{$idx}";
$typexml{$idx} = $type;
}elsif ($type == 1){
$xml{$idx} .= "<$value/>";
$endxml{$idx} .= "";
$typexml{$idx} = $type;
}elsif ($type == 2){
- $xml{$idx} .= "<$value>";
- $endxml{$idx} = "</$mainvalue>$endxml{$idx}";
+ $xml{$idx} .= "<$value></$mainvalue>";
+# $endxml{$idx} .= "</$mainvalue>";
$typexml{$idx} = $type;
}else{
die "Third field should have one of the following values: 0, 1, 2\n";
}
-# print STDERR "INSIDE:$i idx:$idx xml{$idx}:|$xml{$idx}| endxml{$idx}:|$endxml{$idx}|\n";
+# print STDERR "INSIDE:$i idx:$idx type:$typexml{$idx} xml{$idx}:|$xml{$idx}| endxml{$idx}:|$endxml{$idx}|\n";
}
#reconstructing the tagged output
@@ -103,7 +102,7 @@ ()
}elsif ($typexml{$srcidx} == 1){
$out .= $xml{$srcidx}.$trgwords[$i]." ";
}elsif ($typexml{$srcidx} == 2){
- $out .= $xml{$srcidx}.$endxml{$srcidx}.$trgwords[$i]." ";
+ $out .= $xml{$srcidx}.$trgwords[$i]." ";
}else{
die "Third field should have one of the following values: 0, 1, 2\n";
}
@@ -118,27 +117,34 @@ ()
my $newout = "";
while ($contflag){
$contflag=0;
+ $newout = "";
+# print STDERR "START EXT WHILE contflag:|$contflag|\n";
# print STDERR "out:|$out|\n";
- while ($out =~ s/(.*?)(<\/[ ]*([^ >]+?)[ ]*>[ \t]*<(([^ \t>]+)([ \t][^>]*>|>)))//){
+ while ($out =~ s/(.*?)(<\/[ ]*([^ >]+?)[ ]*>[ \t]*<(([^ \t>\/]+?)([ \t][^>]*>|>)))//){
$newout .= " $1 ";
my $endtag = $3;
my $starttag = $5;
- # print STDERR "endtag:|$endtag| starttag:|$starttag|\n";
+# print STDERR "endtag:|$endtag| starttag:|$starttag|\n";
if ($endtag eq $starttag){
$contflag=1;
}
else
{
$newout .= " $2 ";
}
-# print STDERR "newout:|$newout|\n";
-# print STDERR "out:|$out|\n";
+# print STDERR "newout:|$newout|\n";
+# print STDERR "out:|$out|\n";
}
+ $newout .= " $out ";
+ $out = $newout;
+# print STDERR "newout:|$newout|\n";
+# print STDERR "out:|$out|\n";
+# print STDERR "END EXT WHILE contflag:|$contflag|\n";
}
- $newout .= " $out ";
- $out = $newout;
+ #$newout .= " $out ";
+ #$out = $newout;
}
# escaping (or not) some characters
@@ -152,7 +158,7 @@ ()
}
# removing index of tags
- $out =~ s/(<\/?)([^> ]+)_\d+/$1$2/g;
+# $out =~ s/(<\/?)([^> ]+)_\d+/$1$2/g;
# removing double spaces and spaces at the beginning and end of the line
$out =~ s/>([^ ])/> $1/g;
@@ -0,0 +1,12 @@
+<passthrough tag="1#&lt;b_1 id=&quot;4&quot;&gt;#0||2#&lt;b_1 id=&quot;4&quot;&gt;#0||2#&lt;i_2&gt;#0||4#&lt;dot_3&gt;#0" src="Also nested tags work ."/>Also |0| nested |1| tags |2| work |3| . |4|
+<passthrough tag="" src="Line with no tag ."/>Line |0| with |1| no |2| tag |3| . |4|
+<passthrough tag="3#&lt;b_1&gt;#0||4#&lt;b_1&gt;#0||5#&lt;b_1&gt;#0" src="Keep escaped markup &amp; lt ;"/>Keep |0| escaped |1| markup |2| &amp; |3| lt |4| ; |5|
+<passthrough tag="0#&lt;img_1 alt=&quot;a beautiful image&quot;&gt;#1" src=""/>
+<passthrough tag="0#&lt;BIG_1&gt;#2||0#&lt;BIG_1&gt;#0||0#&lt;BIG_2&gt;#0||1#&lt;BIG_1&gt;#0||1#&lt;BIG_2&gt;#0||2#&lt;BIG_1&gt;#0||2#&lt;BIG_2&gt;#0" src="double cased markup"/>double |0| cased |1| markup |2|
+<passthrough tag="" src="this is the hmtl symbol for &quot; higher than &quot; : &amp; gt ;"/>this |0| is |1| the |2| hmtl |3| symbol |4| for |5| &quot; |6| higher |7| than |8| &quot; |9| : |10| &amp; |11| gt |12| ; |13|
+<passthrough tag="" src="this is the character for &quot; higher than &quot; : &gt;"/>this |0| is |1| the |2| character |3| for |4| &quot; |5| higher |6| than |7| &quot; |8| : |9| &gt; |10|
+<passthrough tag="1#&lt;tag_1 id=&apos;double&quot;quote&quot;&apos; id2=&quot;single&apos;quote&quot;&gt;#0" src="Quotes work ."/>Quotes |0| work |1| . |2|
+<passthrough tag="3#&lt;text_1&gt;#2" src="Empty text : ."/>Empty |0| text |1| : |2| . |3|
+<passthrough tag="0#&lt;&lt;built-in function Comment&gt;_1&gt;#0" src="Comment"/>Comment |0|
+<passthrough tag="1#&lt;tag_1&gt;#1||2#&lt;treated_2&gt;#1||3#&lt;a_3&gt;#1||4#&lt;by_4&gt;#1||5#&lt;tokenizer_5&gt;#1" src="Each is as space the ."/>Each |0| is |1| as |2| space |3| the |4| . |5|
+<passthrough tag="0#&lt;a_1 help=&quot;empty element&quot;&gt;#1" src="3 &gt; 2 &amp; 4 &gt; 3 ."/>3 |0| &gt; |1| 2 |2| &amp; |3| 4 |4| &gt; |5| 3 |6| . |7|

0 comments on commit bb70005

Please sign in to comment.