# lib/language_detector.rb

require 'yaml'
require 'jcode' if RUBY_VERSION < '1.9'
$KCODE = 'u' if RUBY_VERSION < '1.9'

# Guesses the language of a text by ranking its character n-grams (lengths
# 2 to 5) and comparing that ranking with per-language profiles stored in
# model.yml next to this file.
#
#   LanguageDetector.train                  # rebuilds model.yml from training_data/
#   LanguageDetector.new.detect("some text") # => name of the closest profile
class LanguageDetector
  # Returns the name of the profile whose n-gram ranking is closest to the
  # ranking computed from +text+.
  #
  # The profiles are loaded lazily from model.yml on the first call. When
  # several profiles are equally close the first one wins. Returns nil when
  # there are no profiles to compare against.
  def detect(text)
    @profiles ||= load_model

    sample = LanguageDetector::Profile.new("")
    sample.init_with_string text

    # min_by keeps the first minimum, like the strict "<" comparison it replaces.
    closest = @profiles.min_by { |profile| profile.compute_distance(sample) }
    closest && closest.name
  end

  # Builds one Profile per entry of the training table and writes the whole
  # list to model.yml (next to this file) as YAML.
  def self.train

    # For a full list of ISO 639 language tags visit:

    # http://www.loc.gov/standards/iso639-2/englangn.html

    #LARGE profiles follow:

    #NOTE: These profiles taken from the "World War II" node on wikipedia
    #with the 'lang' and ?action=raw URI which results in a UTF8 encoded
    #file.  If we need to get more profile data for a language this is
    #always a good source of data.
    #
    # http://en.wikipedia.org/wiki/World_War_II

    # Columns: profile name, training file, file encoding, language label.
    # Only the first two columns are read below.
    # NOTE(review): the encoding column is never used, so the iso-8859-1
    # files are read with the default external encoding - confirm intended.
    training_data = [
      # af (afrikaans)
      [ "ar", "ar-utf8.txt", "utf8", "arabic" ],
      [ "bg", "bg-utf8.txt", "utf8", "bulgarian" ],
      # bs (bosnian)
      # ca (catalan)
      [ "cs", "cs-utf8.txt", "utf8", "czech" ],
      # cy (welsh)
      [ "da", "da-iso-8859-1.txt", "iso-8859-1", "danish" ],
      [ "de", "de-utf8.txt", "utf8", "german" ],
      [ "el", "el-utf8.txt", "utf8", "greek" ],
      [ "en", "en-iso-8859-1.txt", "iso-8859-1", "english" ],
      [ "et", "et-utf8.txt", "utf8", "estonian" ],
      [ "es", "es-utf8.txt", "utf8", "spanish" ],
      [ "fa", "fa-utf8.txt", "utf8", "farsi" ],
      [ "fi", "fi-utf8.txt", "utf8", "finnish" ],
      [ "fr", "fr-utf8.txt", "utf8", "french" ],
      [ "fy", "fy-utf8.txt", "utf8", "frisian" ],
      [ "ga", "ga-utf8.txt", "utf8", "irish" ],
      #gd (gaelic)
      #haw (hawaiian)
      [ "he", "he-utf8.txt", "utf8", "hebrew" ],
      [ "hi", "hi-utf8.txt", "utf8", "hindi" ],
      [ "hr", "hr-utf8.txt", "utf8", "croatian" ],
      #id (indonesian)
      [ "io", "io-utf8.txt", "utf8", "ido" ],
      [ "is", "is-utf8.txt", "utf8", "icelandic" ],
      [ "it", "it-utf8.txt", "utf8", "italian" ],
      [ "ja", "ja-utf8.txt", "utf8", "japanese" ],
      [ "ko", "ko-utf8.txt", "utf8", "korean" ],
      #ku (kurdish)
      #la ?
      #lb ?
      #lt (lithuanian)
      #lv (latvian)
      [ "hu", "hu-utf8.txt", "utf8", "hungarian" ],
      #mk (macedonian)
      #ms (malay)
      #my (burmese)
      [ "nl", "nl-iso-8859-1.txt", "iso-8859-1", "dutch" ],
      [ "no", "no-utf8.txt", "utf8", "norwegian" ],
      [ "pl", "pl-utf8.txt", "utf8", "polish" ],
      [ "pt", "pt-utf8.txt", "utf8", "portuguese" ],
      [ "ro", "ro-utf8.txt", "utf8", "romanian" ],
      [ "ru", "ru-utf8.txt", "utf8", "russian" ],
      [ "sl", "sl-utf8.txt", "utf8", "slovenian" ],
      #sr (serbian)
      [ "sv", "sv-iso-8859-1.txt", "iso-8859-1", "swedish" ],
      #[ "sv", "sv-utf8.txt", "utf8", "swedish" ],
      [ "th", "th-utf8.txt", "utf8", "thai" ],
      #tl (tagalog)
      #ty (tahitian)
      [ "uk", "uk-utf8.txt", "utf8", "ukraninan" ],
      [ "vi", "vi-utf8.txt", "utf8", "vietnamese" ],
      #wa (walloon)
      #yi (yiddish)
      [ "zh", "zh-utf8.txt", "utf8", "chinese" ]
    ]

    profiles = training_data.map do |data|
      profile = LanguageDetector::Profile.new data[0]
      profile.init_with_file data[1]
      profile
    end
    puts 'saving model...'
    filename = File.expand_path(File.join(File.dirname(__FILE__), "model.yml"))
    File.open(filename, 'w') { |f| YAML.dump(profiles, f) }
  end

  # Reads the serialized profiles from model.yml (next to this file), stores
  # them in @profiles and returns them.
  #
  # model.yml contains serialized Profile objects. Psych 4 (bundled with
  # Ruby 3.1+) turned YAML.load_file into a safe load that rejects such
  # objects, so the explicit unsafe loader is used where it exists.
  # SECURITY: this instantiates whatever objects the file describes; only
  # load a model.yml you generated yourself with LanguageDetector.train.
  def load_model
    filename = File.expand_path(File.join(File.dirname(__FILE__), "model.yml"))
    @profiles =
      if YAML.respond_to?(:unsafe_load_file)
        YAML.unsafe_load_file(filename)
      else
        YAML.load_file(filename)
      end
  end

  # N-gram ranking of one language (or of one text sample).
  #
  # +ngrams+ maps each n-gram string to its frequency rank, 1 being the most
  # frequent; at most LIMIT n-grams are kept.
  class Profile

    # Characters that separate tokens: ASCII whitespace, digits and symbols.
    # The ?x literals are Integers on Ruby 1.8 and one-character Strings on
    # Ruby >= 1.9; #tokenize walks the input accordingly.
    PUNCTUATIONS = [?\n, ?\r, ?\t, ?\s, ?!, ?", ?#, ?$, ?%, ?&, ?', ?(, ?), ?*, ?+, ?,, ?-, ?., ?/,
    ?0, ?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9,
    ?:, ?;, ?<, ?=, ?>, ??, ?@, ?[, ?\\, ?], ?^, ?_, ?`, ?{, ?|, ?}, ?~]

    # Maximum number of ranked n-grams per profile, and the distance penalty
    # for an n-gram that a profile does not contain.
    LIMIT = 2000

    attr_reader :ngrams, :name

    # name - identifier returned by LanguageDetector#detect for this profile.
    def initialize(name)
      @name = name
      # The misspelled variable name is kept on purpose: instance variables
      # are written into model.yml when profiles are dumped.
      @puctuations = {}
      PUNCTUATIONS.each { |mark| @puctuations[mark] = 1 }
      @ngrams = {}
    end

    # Returns the distance from this profile to +other_profile+: for every
    # n-gram of +other_profile+, the absolute rank difference when this
    # profile knows the n-gram, LIMIT otherwise. Smaller means more similar.
    def compute_distance(other_profile)
      other_profile.ngrams.inject(0) do |distance, (ngram, rank)|
        own_rank = @ngrams[ngram]
        distance + (own_rank ? (rank - own_rank).abs : LIMIT)
      end
    end

    # Counts the n-grams of every line of training_data/<filename> (relative
    # to this file) and ranks them into +ngrams+.
    def init_with_file(filename)
      ngram_count = {}

      path = File.expand_path(File.join(File.dirname(__FILE__), "training_data/" + filename))
      puts "training with " + path
      # Block form so the file is closed even when counting raises.
      File.open(path) do |file|
        file.each_line { |line| _init_with_string line, ngram_count }
      end

      rank_ngrams ngram_count
    end

    # Counts the n-grams of +str+ and ranks them into +ngrams+.
    def init_with_string(str)
      ngram_count = {}
      _init_with_string str, ngram_count
      rank_ngrams ngram_count
    end

    # Adds the 2-, 3-, 4- and 5-gram counts of every token of +str+ to the
    # +ngram_count+ hash (n-gram => occurrences).
    def _init_with_string(str, ngram_count)
      tokenize(str).each do |token|
        (2..5).each { |n| count_ngram token, n, ngram_count }
      end
    end

    # Splits +str+ into the non-empty runs of characters found between
    # PUNCTUATIONS characters and returns them as an array of strings.
    def tokenize(str)
      tokens = []
      current = ''.dup
      # Walk the same kind of unit the PUNCTUATIONS literals are made of:
      # bytes on Ruby 1.8, whole characters on Ruby >= 1.9. Walking bytes on
      # 1.9+ would never match a separator and would split multi-byte
      # characters into one code point per byte.
      walker = RUBY_VERSION < '1.9' ? :each_byte : :each_char
      str.send(walker) do |unit|
        if is_puctuation?(unit)
          tokens << current unless current.empty?
          current = ''.dup
        else
          current << unit
        end
      end
      tokens << current unless current.empty?
      tokens
    end

    # Truthy when +b+ (a byte on Ruby 1.8, a character on Ruby >= 1.9) is a
    # token separator.
    def is_puctuation?(b)
      @puctuations[b]
    end

    # Adds every n-gram of length +n+ found in +token+ to +counts+ and
    # returns +counts+.
    #
    # For n > 1 a token of at least n characters is first padded with one
    # leading "_" and n-1 trailing "_" so word boundaries are part of the
    # n-grams. Shorter tokens contribute nothing.
    def count_ngram(token, n, counts)
      char_length = RUBY_VERSION < '1.9' ? token.jlength : token.length
      token = "_#{token}#{'_' * (n-1)}" if n > 1 && char_length >= n

      # The range is empty when the token is shorter than n.
      (0..token.length - n).each do |start|
        gram = token[start, n]
        counts[gram] = (counts[gram] || 0) + 1
      end

      counts
    end

    private

    # Stores the LIMIT most frequent n-grams of +ngram_count+ in @ngrams,
    # mapped to their rank (1 = most frequent), and returns @ngrams.
    def rank_ngrams(ngram_count)
      by_frequency = ngram_count.sort { |x, y| y[1] <=> x[1] }
      by_frequency.first(LIMIT).each_with_index do |(ngram, _count), index|
        @ngrams[ngram] = index + 1
      end
      @ngrams
    end

  end

end

# Command-line entry point, active only when this file is executed directly.
# With the single argument "train" the model is rebuilt; any other
# invocation prints the language detected for a fixed sample sentence.
if __FILE__ == $PROGRAM_NAME
  if ARGV == ['train']
    LanguageDetector.train
  else
    detector = LanguageDetector.new
    p detector.detect("what language is it is?")
  end
end