Skip to content

Comparing changes

Choose two branches to see what’s changed or to start a new pull request. If you need to, you can also compare across forks.

Open a pull request

Create a new pull request by comparing changes across two branches. If you need to, you can also compare across forks.
...
  • 5 commits
  • 5 files changed
  • 0 commit comments
  • 1 contributor
Showing with 10,107 additions and 57 deletions.
  1. +21 −1 lib/linguist/blob_helper.rb
  2. +9 −11 lib/linguist/classifier.rb
  3. +10,072 −45 lib/linguist/classifier.yml
  4. +1 −0 lib/linguist/languages.yml
  5. +4 −0 test/test_classifier.rb
View
22 lib/linguist/blob_helper.rb
@@ -1,3 +1,4 @@
+require 'linguist/classifier'
require 'linguist/language'
require 'linguist/mime'
require 'linguist/pathname'
@@ -453,11 +454,30 @@ def lexer
# Returns a Language or nil.
#
# When the extension is ambiguous, every Language listing that extension is
# collected, the `guess_<ext>_language` heuristic is called if this object
# responds to it, and the classifier's first result is compared with that
# guess. Any disagreement is handed to report_classifier_incorrect_guess; the
# heuristic's answer is what gets returned. Non-ambiguous extensions fall
# through the `if` and yield nil.
def disambiguate_extension_language
if Language.ambiguous?(extname)
+ possible_languages = Language.all.select { |l| l.extensions.include?(extname) }
+
name = "guess_#{extname.sub(/^\./, '')}_language"
- send(name) if respond_to?(name)
+ language = send(name) if respond_to?(name) # stays nil when no guess_* method exists
+
+ if possible_languages.any?
+ results = Classifier.instance.classify(data, possible_languages)
+ guessed_language, score = results.first # presumably [language, score] pairs — TODO confirm ordering
+
+ if guessed_language != language # NOTE(review): also true for any non-nil guess when language is nil
+ report_classifier_incorrect_guess(language, guessed_language, score)
+ end
+ end
+
+ language
end
end
+ class LanguageClassifierError < StandardError; end # raised by report_classifier_incorrect_guess on a classifier/heuristic mismatch
+
+ def report_classifier_incorrect_guess(expected, actual, score) # always raises; never returns normally
+ raise LanguageClassifierError, "Expected #{expected}, but was #{actual} scoring #{score}.\n#{name}\n#{data}"
+ end # NOTE(review): the message embeds name and the full data — confirm that is acceptable for large inputs
+
# Internal: Guess language of .cls files
#
# Returns a Language.
View
20 lib/linguist/classifier.rb
@@ -68,21 +68,19 @@ def train(language, data)
nil
end
+ # Public: Verify internal counts are consistent. Sums the counts held in @languages, @language_tokens and the nested @tokens entries.
+ # The @languages sum must equal @languages_total; the @language_tokens sum and the nested @tokens sum must each equal @tokens_total.
+ # Returns Boolean: true only when all three comparisons hold.
+ def verify
+ @languages.inject(0) { |n, (l, c)| n += c } == @languages_total &&
+ @language_tokens.inject(0) { |n, (l, c)| n += c } == @tokens_total &&
+ @tokens.inject(0) { |n, (l, ts)| n += ts.inject(0) { |m, (t, c)| m += c } } == @tokens_total
+ end
+
# Public: Prune infrequent tokens.
#
# NOTE(review): every line of the pruning loop below is marked as removed, so
# after this change the method body is just `self` and nothing is pruned; the
# summary above describes only the removed lines.
#
# Returns receiver Classifier instance.
def gc
- @tokens.each do |language, tokens|
- if @language_tokens[language] > 20
- tokens.each do |name, count|
- if count == 1
- @tokens[language].delete(name)
- @language_tokens[language] -= 1
- @tokens_total -= 1
- end
- end
- end
- end
self
end
View
10,117 lib/linguist/classifier.yml
10,072 additions, 45 deletions not shown because the diff is too large. Please use a local Git client to view these changes.
View
1 lib/linguist/languages.yml
@@ -1287,6 +1287,7 @@ Visual Basic:
primary_extension: .vb
extensions:
- .bas
+ - .cls
- .frx
- .vb
- .vba
View
4 test/test_classifier.rb
@@ -55,6 +55,10 @@ def test_instance_classify_empty
assert results.first[1] < 0.5, results.first.inspect
end
+ def test_verify # the shared Classifier.instance must report consistent totals
+ assert Classifier.instance.verify
+ end
+
# Smoke test: calls gc on the shared Classifier instance and makes no
# assertion about the result, so it only fails if gc raises.
def test_gc
Classifier.instance.gc
end

No commit comments for this range

Something went wrong with that request. Please try again.