Skip to content

cschen1205/cs-nlp-classical-machine-translation

Repository files navigation

cs-nlp-classical-machine-translation

Classical Machine Translation algorithms such as IBM models and EM methods in .NET

Install

Install-Package cs-nlp-classical-machine-translation

Usage

The sample codes below shows how to train and do language translation with IBM model 1:

using System;
using System.Collections.Generic;
using System.Linq;

namespace ClassicalMachineTranslation
{
    class IBMModel1Demo
    {
        public void RunSimpleTraining()
        {
            IBMModel1 model = new IBMModel1();

            List<SimpleTrainingRecord> training_corpus = new List<SimpleTrainingRecord>();

            EnglishTokenizer tokenizer_output = new EnglishTokenizer();
            FrenchTokenizer tokenizer_input = new FrenchTokenizer();


            SimpleTrainingMethod.Train(model, training_corpus);

            string sentence_input = "[Some French Sentence]";
            string sentence_output = "[Some English Sentence]";

            string[] input_lang = tokenizer_input.Tokenize(sentence_input);
            string[] output_lang = tokenizer_output.Tokenize(sentence_output);
            int[] alignment = model.GetAlignment(input_lang, output_lang);

            Dictionary<int, string> output_mapping = new Dictionary<int, string>();
            int m_input_len = input_lang.Length;
            for (int j = 0; j < m_input_len; ++j)
            {
                int a_j = alignment[j];
                string output_word = output_lang[a_j];
                output_mapping[a_j] = output_word;
            }
            List<int> output_sentence_index_list = output_mapping.Keys.ToList();
            output_sentence_index_list.Sort();

            string[] predicted_output_lang = new string[output_sentence_index_list.Count];
            for (int i = 0; i < predicted_output_lang.Length; ++i)
            {
                predicted_output_lang[i] = output_mapping[output_sentence_index_list[i]];
            }

            Console.WriteLine("Original French Sentence: {0}", sentence_input);
            Console.WriteLine("Predicted English Translation: {0}", string.Join(" ", predicted_output_lang));
        }
        
        public void RunEMTraining()
        {
            IBMModel1 model = new IBMModel1();

            List<EMTrainingRecord> training_corpus = new List<EMTrainingRecord>();

            EnglishTokenizer tokenizer_output = new EnglishTokenizer();
            FrenchTokenizer tokenizer_input = new FrenchTokenizer();

            EMTrainingMethod.Train(model, training_corpus, 20);

            string sentence_input = "[Some French Sentence]";
            string sentence_output = "[Some English Sentence]";

            string[] input_lang = tokenizer_input.Tokenize(sentence_input);
            string[] output_lang = tokenizer_output.Tokenize(sentence_output);
            int[] alignment = model.GetAlignment(input_lang, output_lang);

            Dictionary<int, string> output_mapping = new Dictionary<int, string>();
            int m_input_len = input_lang.Length;
            for (int j = 0; j < m_input_len; ++j)
            {
                int a_j = alignment[j];
                string output_word = output_lang[a_j];
                output_mapping[a_j] = output_word;
            }
            List<int> output_sentence_index_list = output_mapping.Keys.ToList();
            output_sentence_index_list.Sort();

            string[] predicted_output_lang = new string[output_sentence_index_list.Count];
            for (int i = 0; i < predicted_output_lang.Length; ++i)
            {
                predicted_output_lang[i] = output_mapping[output_sentence_index_list[i]];
            }

            Console.WriteLine("Original French Sentence: {0}", sentence_input);
            Console.WriteLine("Predicted English Translation: {0}", string.Join(" ", predicted_output_lang));
        }
    }
}

The sample codes below show how to train and do language translation using IBM model 2:

uusing System;
using System.Collections.Generic;
using System.Linq;

namespace ClassicalMachineTranslation
{
    class IBMModel2Demo
    {
        public void RunSimpleTraining()
        {
            IBMModel2 model = new IBMModel2();

            List<SimpleTrainingRecord> training_corpus = new List<SimpleTrainingRecord>();

            EnglishTokenizer tokenizer_output = new EnglishTokenizer();
            FrenchTokenizer tokenizer_input = new FrenchTokenizer();

            SimpleTrainingMethod.Train(model, training_corpus);

            string sentence_input = "[Some French Sentence]";
            string sentence_output = "[Some English Sentence]";

            string[] input_lang = tokenizer_input.Tokenize(sentence_input);
            string[] output_lang = tokenizer_output.Tokenize(sentence_output);
            int[] alignment = model.GetAlignment(input_lang, output_lang);

            Dictionary<int, string> output_mapping = new Dictionary<int, string>();
            int m_input_len = input_lang.Length;
            for (int j = 0; j < m_input_len; ++j)
            {
                int a_j = alignment[j];
                string output_word = output_lang[a_j];
                output_mapping[a_j] = output_word;
            }
            List<int> output_sentence_index_list = output_mapping.Keys.ToList();
            output_sentence_index_list.Sort();

            string[] predicted_output_lang = new string[output_sentence_index_list.Count];
            for (int i = 0; i < predicted_output_lang.Length; ++i)
            {
                predicted_output_lang[i] = output_mapping[output_sentence_index_list[i]];
            }

            Console.WriteLine("Original French Sentence: {0}", sentence_input);
            Console.WriteLine("Predicted English Translation: {0}", string.Join(" ", predicted_output_lang));
        }
        
        public void RunEMTraining()
        {
            IBMModel2 model = new IBMModel2();

            List<EMTrainingRecord> training_corpus = new List<EMTrainingRecord>();

            EnglishTokenizer tokenizer_output = new EnglishTokenizer();
            FrenchTokenizer tokenizer_input = new FrenchTokenizer();


            EMTrainingMethod.Train(model, training_corpus, 20);

            string sentence_input = "[Some French Sentence]";
            string sentence_output = "[Some English Sentence]";

            string[] input_lang = tokenizer_input.Tokenize(sentence_input);
            string[] output_lang = tokenizer_output.Tokenize(sentence_output);
            int[] alignment = model.GetAlignment(input_lang, output_lang);

            Dictionary<int, string> output_mapping = new Dictionary<int, string>();
            int m_input_len = input_lang.Length;
            for (int j = 0; j < m_input_len; ++j)
            {
                int a_j = alignment[j];
                string output_word = output_lang[a_j];
                output_mapping[a_j] = output_word;
            }
            List<int> output_sentence_index_list = output_mapping.Keys.ToList();
            output_sentence_index_list.Sort();

            string[] predicted_output_lang = new string[output_sentence_index_list.Count];
            for (int i = 0; i < predicted_output_lang.Length; ++i)
            {
                predicted_output_lang[i] = output_mapping[output_sentence_index_list[i]];
            }

            Console.WriteLine("Original French Sentence: {0}", sentence_input);
            Console.WriteLine("Predicted English Translation: {0}", string.Join(" ", predicted_output_lang));
        }
    }
}

About

Classical Machine Translation algorithms such as IBM models and EM methods in .NET

Topics

Resources

License

Stars

Watchers

Forks

Releases

No releases published

Packages

No packages published

Languages