added extracted data - should be ready for alignment

cocoxu · Apr 10, 2012 · 5e68664 · 5e68664
1 parent 7eaca7c
commit 5e68664
Show file tree

Hide file tree

Showing 4,452 changed files with 66,591 additions and 9 deletions.
diff --git a/bilingual-sentence-aligner/README.txt b/bilingual-sentence-aligner/README.txt
@@ -0,0 +1,88 @@
+BILINGUAL SENTENCE ALIGNER
+
+(c) Microsoft Corporation. All rights reserved.
+
+Your use of the Microsoft software ("Software") described herein is
+governed by the Microsoft Corporation Software License Agreement
+("License") in the accompanying file "license-agreement.txt".  Your
+use of the Software constitutes acceptance of this License.
+
+This directory contains Perl programs for finding bilingual sentence
+alignments.  The code implements the method described in the paper
+"Fast and Accurate Sentence Alignment of Bilingual Corpora," published
+in "Machine Translation: From Research to Real Users," the proceedings
+of the 5th conference of the Association for Machine Translation in
+the Americas.  The paper can also be downloaded from
+http://www.research.microsoft.com/pubs/.
+
+To adapt to different installations, the initial #! line beginning
+each file may need to be changed to point to the location of the Perl
+executable.
+
+There are two versions of the code.  Each version has a top-level
+script that invokes several other Perl program files.  These have to
+be in the current working directory, or they won't be found.
+
+The sentences to be aligned need to be in paired files with one
+sentence per line and spaces between words.  The sentence files do not
+have to be in the same directory as the code files.
+
+The sentence aligner assumes that the alignable sentences in each file
+are in the same order, but that not all sentences align 1-to-1.
+Sentences that might be aligned 1-to-1, but which are out of order
+with respect to the majority of other alignable sentences will not be
+identified as alignable.
+
+The simpler version of the code is invoked by
+
+    align-sents-all.pl <lang_1_file> <lang_2_file> <threshold>
+
+which outputs into files named
+
+    <lang_1_file>.aligned
+    <lang_2_file>.aligned
+
+all the sentences from <lang_1_file> and <lang_2_file> that align
+1-to-1 with probability greater than <threshold> according to a
+statistical model computed by the aligner.  <threshold> may be
+omitted, in which case a probability threshold of 0.5 is used.
+
+This version of the code requires a pair of sentence files to have
+enough data to reliably estimate a statistical word-translation model.
+It has been determined that 10,000 sentence pairs should be adequate
+for this purpose.  Fewer sentence pairs may be sufficient, but this
+has not been tested.
+
+The second version of the code,
+
+   align-sents-all-multi-file.pl <directory> <threshold>
+
+looks for any number of paired sentence files in the folder
+<directory>, which should be given by a pathname relative to the
+current working directory.  The code assumes that paired sentence
+files have names of the form
+
+   <prefix>_<language>.snt
+
+where <language> can be any string not containing "_".  The code
+checks that there are exactly two <language> strings used in the
+entire directory, and that there are the same number of files
+containing each of these strings.  For each different <prefix> it
+assumes that there are two files with names of the form
+
+   <prefix>_<language1>.snt
+   <prefix>_<language2>.snt
+
+It does not check this initially, but if the assumption is false then
+some later part of the process may die (with results that have not
+been investigated).
+
+Output files are generated in <directory> with the same naming
+convention as the two-file version of the code; that is, by appending
+".aligned" to the input file names.
+
+This version of the code also requires enough data to reliably
+estimate a statistical word-translation model, but it pools the data
+from all the files being aligned to build this model.  So, the
+individual sentence files can be small, but it is desirable to have at
+least 10,000 sentence pairs in total.
diff --git a/bilingual-sentence-aligner/align-sents-all-multi-file.pl b/bilingual-sentence-aligner/align-sents-all-multi-file.pl
@@ -0,0 +1,121 @@
+#!c:/Perl/bin/perl
+
+# (c) Microsoft Corporation. All rights reserved.
+
+# This version lets align-sents-length-plus-words-multi-file2.pl
+# handle the iteration over the sentence file pairs, so that the word
+# translation file only needs to be loaded once.
+#
+# Usage: align-sents-all-multi-file.pl <directory> [<threshold>]
+#
+# <directory> holds paired sentence files named <prefix>_<language>.snt.
+# <threshold> is passed on to filter-final-aligned-sents.pl and defaults
+# to 0.5.  The helper scripts are looked up in the directory that is
+# current when this script starts; this script then changes into
+# <directory>, so every file it creates itself is written there.
+
+use strict;
+use warnings;
+use Cwd;
+
+my ($dir,$threshold) = @ARGV;
+
+if (!defined($dir)) {
+    die "usage: $0 <directory> [<threshold>]\n";
+}
+
+if (!defined($threshold)) {
+    $threshold = .50;
+}
+
+opendir(my $dir_handle, $dir) || die("Could not open directory $dir\n");
+my @all_snt_files = grep { /\.snt$/ } readdir($dir_handle);
+closedir($dir_handle);
+
+# readdir order depends on the file system; sorting makes the order in
+# which file pairs are processed and concatenated repeatable.
+@all_snt_files = sort @all_snt_files;
+
+# Remember where the helper scripts live before leaving that directory.
+my $prog_dir = cwd();
+
+chdir($dir) || die("Could not change to directory $dir: $!\n");
+
+print "program directory: $prog_dir\n";
+print "data directory: $dir\n";
+
+# Runs one helper script from $prog_dir with the given arguments and
+# dies if it cannot be started or exits with a non-zero status, because
+# every later step reads files written by the earlier ones.  The list
+# form of system() bypasses the shell, so directory and file names that
+# contain spaces or shell metacharacters reach the helper unchanged.
+sub run_helper {
+    my ($script, @args) = @_;
+    system('perl', "$prog_dir/$script", @args) == 0
+        or die "$script failed (wait status $?)\n";
+    return;
+}
+
+# Appends the ".words" file of each listed sentence file, in list order,
+# to $out_name.  A missing input is fatal: silently skipping it would
+# leave the two concatenated files with different sentence pairs.
+sub concatenate_words_files {
+    my ($out_name, @sent_files) = @_;
+    open(my $out, '>', $out_name)
+        or die "Could not write $out_name: $!\n";
+    foreach my $sent_file (@sent_files) {
+        open(my $in, '<', "$sent_file.words")
+            or die "Could not read $sent_file.words: $!\n";
+        while (my $line = <$in>) {
+            print {$out} $line;
+        }
+        close($in);
+    }
+    close($out) or die "Could not finish writing $out_name: $!\n";
+    return;
+}
+
+# Count the files per language tag (the text between the last "_" and
+# ".snt").  The match is checked so that a file name of the wrong form
+# is reported by name instead of being counted under whatever $1 holds.
+my %language_tag;
+foreach my $file_name (@all_snt_files) {
+    $file_name =~ /.*_(.+?)\.snt$/
+        or die "file name is not of the form <prefix>_<language>.snt: $file_name\n";
+    $language_tag{$1}++;
+}
+
+# Sorted so that the choice of "language 1" does not depend on hash order.
+my @languages = sort keys(%language_tag);
+
+unless (@languages == 2) {
+    die "not exactly two languages in directory: @languages\n";
+}
+
+my ($lang_1,$lang_2) = @languages;
+
+if ($language_tag{$lang_1} != $language_tag{$lang_2}) {
+    die "$language_tag{$lang_1} $lang_1 files, but $language_tag{$lang_2} $lang_2 files\n";
+}
+
+# Build two parallel lists: for every $lang_1 file, the name of the
+# $lang_2 file with the same prefix.  Whether that partner file really
+# exists is not checked here.
+my (@sent_file_1_list, @sent_file_2_list);
+foreach my $file_name (@all_snt_files) {
+    # Every name was validated against this pattern shape above.
+    my ($prefix, $language) = $file_name =~ /(.*_)(.+?)\.snt$/;
+    if ($language eq $lang_1) {
+        push(@sent_file_1_list, $prefix . $lang_1 . '.snt');
+        push(@sent_file_2_list, $prefix . $lang_2 . '.snt');
+    }
+}
+my $file_index_limit = $#sent_file_1_list;
+
+print "\nFinding length-based alignments and filtering initial high-probability aligned sentences\n";
+
+foreach my $i (0..$file_index_limit) {
+    my $sent_file_1 = $sent_file_1_list[$i];
+    my $sent_file_2 = $sent_file_2_list[$i];
+    run_helper('align-sents-dp-beam7.pl', $sent_file_1, $sent_file_2);
+    print "\n========================================================\n\n";
+    run_helper('filter-initial-aligned-sents.pl', $sent_file_1, $sent_file_2);
+    print "\n========================================================\n";
+}
+
+print "\nConcatenating length-aligned sentence files\n";
+
+# (times)[0] is this process's user CPU time, not wall-clock time.
+my $start_time = (times)[0];
+
+concatenate_words_files("all_$lang_1.snt.words", @sent_file_1_list);
+concatenate_words_files("all_$lang_2.snt.words", @sent_file_2_list);
+
+my $end_time = (times)[0];
+my $concat_time = $end_time - $start_time;
+print "\n$concat_time seconds to concatenate files\n";
+
+print "\n========================================================\n";
+print "\nBuilding word association model\n";
+run_helper('build-model-one-multi-file.pl', "all_$lang_1.snt", "all_$lang_2.snt");
+print "\n========================================================\n";
+
+print "\nFinding alignment based on word associations and lengths and filtering final high-probability aligned sentences\n";
+
+open(my $pair_list, '>', 'sentence-file-pair-list')
+    or die "Could not write sentence-file-pair-list: $!\n";
+foreach my $i (0..$file_index_limit) {
+    print {$pair_list} "$sent_file_1_list[$i] $sent_file_2_list[$i]\n";
+}
+# The next helper is started without arguments, right after this list is
+# written, so the list must be flushed and closed before it starts.
+close($pair_list)
+    or die "Could not finish writing sentence-file-pair-list: $!\n";
+
+run_helper('align-sents-length-plus-words-multi-file2.pl');
+
+foreach my $i (0..$file_index_limit) {
+    run_helper('filter-final-aligned-sents.pl',
+               $sent_file_1_list[$i], $sent_file_2_list[$i], $threshold);
+    print "\n========================================================\n";
+}
+
+print "\n";
diff --git a/bilingual-sentence-aligner/align-sents-all.pl b/bilingual-sentence-aligner/align-sents-all.pl
@@ -0,0 +1,25 @@
+#!c:/Perl/bin/perl
+
+# (c) Microsoft Corporation. All rights reserved.
+
+# Usage: align-sents-all.pl <lang_1_file> <lang_2_file> [<threshold>]
+#
+# Runs the five aligner helper scripts in order on one pair of sentence
+# files.  The helpers are invoked by bare file name, so they must be in
+# the current working directory.  <threshold> is passed to the final
+# filtering step and defaults to 0.5.
+
+use strict;
+use warnings;
+
+my ($sent_file_1,$sent_file_2,$threshold) = @ARGV;
+
+if (!defined($sent_file_2)) {
+    die "usage: $0 <lang_1_file> <lang_2_file> [<threshold>]\n";
+}
+
+if (!defined($threshold)) {
+    $threshold = .50;
+}
+
+# One entry per stage: the banner to print, the helper script to run,
+# and any arguments that follow the two sentence file names.
+my @stages = (
+    ['Finding length-based alignment',
+     'align-sents-dp-beam7.pl'],
+    ['Filtering initial high-probability aligned sentences',
+     'filter-initial-aligned-sents.pl'],
+    ['Building word association model',
+     'build-model-one6.pl'],
+    ['Finding alignment based on word associations and lengths',
+     'align-sents-length-plus-words3.pl'],
+    ['Filtering final high-probability aligned sentences',
+     'filter-final-aligned-sents.pl', $threshold],
+);
+
+foreach my $i (0..$#stages) {
+    my ($banner, $script, @extra_args) = @{$stages[$i]};
+    # The separator goes between stages, not before the first one.
+    print "\n========================================================\n" if $i > 0;
+    print "\n$banner\n";
+    # The list form of system() bypasses the shell, so file names with
+    # spaces or shell metacharacters reach the helper unchanged.  Stop
+    # at the first failure rather than run later stages on whatever
+    # files a failed stage left behind.
+    system('perl', $script, $sent_file_1, $sent_file_2, @extra_args) == 0
+        or die "$script failed (wait status $?)\n";
+}
+
+print "\n";