Skip to content

Commit

Permalink
Extract strategies for detecting the language
Browse files Browse the repository at this point in the history
  • Loading branch information
bkeepers committed Nov 3, 2014
1 parent 8d7b4f8 commit fd32938
Show file tree
Hide file tree
Showing 5 changed files with 49 additions and 33 deletions.
4 changes: 4 additions & 0 deletions lib/linguist/heuristics.rb
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,10 @@ module Linguist
class Heuristics
ACTIVE = true

# Public: Strategy entry point used during language detection.
#
# blob      - An object responding to #data.
# languages - Array of candidate languages, each responding to #name.
#
# Returns the result of find_by_heuristics for the blob's data and the
# candidates' names.
def self.call(blob, languages)
  # find_by_heuristics is documented as taking String language names, so
  # convert the candidate objects before delegating.
  find_by_heuristics(blob.data, languages.map(&:name))
end

# Public: Given an array of String language names,
# apply heuristics against the given data and return an array
# of matching languages, or nil.
Expand Down
53 changes: 22 additions & 31 deletions lib/linguist/language.rb
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,17 @@ def self.create(attributes = {})
end

require 'linguist/strategy/filename'
require 'linguist/strategy/shebang'
require 'linguist/strategy/classifier'
# Ordered detection strategies. Each entry responds to
# `call(blob, languages)`; they are tried from first to last.
STRATEGIES = [
  # Candidates based on the blob's filename
  Linguist::Strategy::Filename,
  # Don't bother with binary contents or an empty file: answer with an
  # empty candidate list, otherwise return nil
  lambda do |blob, _languages|
    contents = blob.data
    [] if contents.nil? || contents == ""
  end,
  # Check if there's a shebang line and use that as authoritative
  Linguist::Strategy::Shebang,
  # Content heuristics
  Linguist::Heuristics,
  # Classifier comes last
  Linguist::Strategy::Classifier
]

# Public: Detects the Language of the blob.
#
Expand All @@ -101,40 +112,20 @@ def self.create(attributes = {})
#
# Returns Language or nil.
def self.detect(blob)
  # Check if the blob is possibly binary and bail early.
  return nil if blob.likely_binary? || blob.binary?

  # Run the strategies in order, handing each one the candidates that
  # survived the previous step.
  STRATEGIES.reduce([]) do |languages, strategy|
    candidates = strategy.call(blob, languages)

    # A nil/false answer means the strategy has nothing to add; keep the
    # current candidates and move on to the next strategy.
    next languages unless candidates

    # Zero or one candidate is conclusive: stop here. `break` makes
    # `reduce` return this array, so `.first` yields the match or nil.
    break candidates unless candidates.size > 1

    # Still ambiguous; let the next strategy narrow it down.
    candidates
  end.first
end

# Public: Get all Languages
Expand Down
12 changes: 12 additions & 0 deletions lib/linguist/strategy/classifier.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
module Linguist
  module Strategy
    # Detection strategy backed by Linguist::Classifier.
    class Classifier
      # Public: Classify the blob's data against the candidate languages.
      #
      # blob      - An object responding to #data.
      # languages - Array of candidate languages, each responding to #name.
      #
      # Returns an Array of the values Language[] gives for each classified
      # name, in the order the classifier returned them.
      def self.call(blob, languages)
        # The classifier is given language names, not language objects.
        names = languages.map(&:name)

        Linguist::Classifier.classify(Samples.cache, blob.data, names).map do |name, _|
          # Return the actual Language object based off the string language
          # name (i.e., the first element of each `#classify` result)
          Language[name]
        end
      end
    end
  end
end
4 changes: 2 additions & 2 deletions lib/linguist/strategy/filename.rb
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
module Linguist
module Strategy
class Filename
def call(blob)
def self.call(blob, _)
name = blob.name.to_s

# A bit of an elegant hack. If the file is executable but extensionless,
Expand All @@ -13,7 +13,7 @@ def call(blob)
end

# First try to find languages that match based on filename.
possible_languages = Language.find_by_filename(name)
Language.find_by_filename(name)
end
end
end
Expand Down
9 changes: 9 additions & 0 deletions lib/linguist/strategy/shebang.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
module Linguist
  module Strategy
    # Detection strategy that delegates to Language.find_by_shebang.
    class Shebang
      class << self
        # Public: Look up languages from the blob's data.
        #
        # blob       - An object responding to #data.
        # _languages - Ignored.
        #
        # Returns whatever Language.find_by_shebang returns.
        def call(blob, _languages)
          contents = blob.data
          Language.find_by_shebang(contents)
        end
      end
    end
  end
end

0 comments on commit fd32938

Please sign in to comment.