forked from boyter/scc
/
detector.go
211 lines (171 loc) · 5.06 KB
/
detector.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
package processor
import (
"errors"
"fmt"
"sort"
"strings"
)
// DetectLanguage maps a file name to its candidate languages.
//
// It returns the candidate language names together with the extension that
// was used for the lookup:
//   - If the name contains no '.', or its only '.' is the leading character,
//     and no extension allow list is configured, the decision is delegated to
//     checkFullName.
//   - If the lower-cased full name is a key of ExtensionToLanguage, those
//     languages are returned with an empty extension.
//   - Otherwise the extension of the name is looked up and, failing that, the
//     extension of that extension (so that "d.ts" can resolve through "ts").
//
// When nothing matches, the returned slice is nil and the extension is the
// last one that was tried.
func DetectLanguage(name string) ([]string, string) {
	dots := strings.Count(name, ".")
	// dots == 1 guarantees name is non-empty, so indexing name[0] is safe.
	dotfile := dots == 1 && name[0] == '.'

	// No usable extension: the file is identified by its whole name or
	// treated as a possible #! script.
	if len(AllowListExtensions) == 0 && (dots == 0 || dotfile) {
		return checkFullName(name)
	}

	// Some files are registered under their complete name.
	if languages, found := ExtensionToLanguage[strings.ToLower(name)]; found {
		return languages, ""
	}

	ext := getExtension(name)
	if languages, found := ExtensionToLanguage[ext]; found {
		return languages, ext
	}

	// Multi-part extension: strip one more level and try again.
	ext = getExtension(ext)
	return ExtensionToLanguage[ext], ext
}
// checkFullName resolves a file that has no usable extension.
//
// The lower-cased name is looked up in FilenameToLanguage; on a hit that single
// language is returned. Otherwise the file is marked with the SheBang
// placeholder so that its content can be inspected for a #! line later, and a
// warning is printed when Verbose is set. The second return value is always
// the unmodified name.
func checkFullName(name string) ([]string, string) {
	if known, found := FilenameToLanguage[strings.ToLower(name)]; found {
		return []string{known}, name
	}

	if Verbose {
		printWarn(fmt.Sprintf("possible #! file: %s", name))
	}

	// Unknown name without an extension: flag it as a possible #! script.
	candidates := []string{SheBang}
	return candidates, name
}
// DetectSheBang inspects the first line of content for a #! and returns the
// language whose ShebangLookup entry lists the interpreter found there.
//
// An error is returned when content does not start with "#!", when no
// interpreter can be extracted from the first line, or when the interpreter is
// not listed in ShebangLookup.
//
// NOTE(review): ShebangLookup is ranged over as a map, so if the same command
// were listed under several languages the winner would not be deterministic;
// confirm the table has no such duplicates.
func DetectSheBang(content string) (string, error) {
	if !strings.HasPrefix(content, "#!") {
		return "", errors.New("Missing #!")
	}

	// Only the first line can carry the #!.
	firstLine := content
	if eol := strings.Index(content, "\n"); eol >= 0 {
		firstLine = content[:eol]
	}

	interpreter, err := scanForSheBang([]byte(firstLine))
	if err != nil {
		return "", err
	}

	// The scanner already reduced both "/usr/bin/python" and
	// "/usr/bin/env python" to the bare command, so one comparison covers both.
	for language, commands := range ShebangLookup {
		for i := range commands {
			if commands[i] == interpreter {
				return language, nil
			}
		}
	}

	return "", errors.New("Unknown #!")
}
// scanForSheBang extracts the interpreter command from a single #! line.
//
// content is expected to be the first line of a file without its newline. The
// scan ignores everything before the first '/', takes the text after the last
// '/' of the interpreter path as the first candidate, and takes the next
// whitespace-delimited token (if any) as the second candidate.
//
// If the first candidate is "env" the second candidate is returned, which is
// empty when nothing follows env. Otherwise the first candidate is returned.
// An error is returned when no first candidate could be found.
func scanForSheBang(content []byte) (string, error) {
	const (
		seekPath = iota // before the first '/', skipping "#!" and anything after it
		inPath          // inside the interpreter path
		seekArg         // skipping whitespace after the path
		inArg           // inside the first token after the path
		done            // both candidates collected
	)

	state := seekPath
	// While inPath, start is the index of the most recent '/'.
	// While inArg, start is the index of the token's first byte.
	start := 0
	candidate1 := ""
	candidate2 := ""

scan:
	for i, c := range content {
		switch state {
		case seekPath:
			if c == '/' {
				start = i
				state = inPath
			}
		case inPath:
			if c == '/' {
				start = i
			}
			// Whitespace ends the path; what follows the last slash is
			// either env or the interpreter itself (perl/php/python etc.).
			if isWhitespace(c) {
				candidate1 = string(content[start+1 : i])
				state = seekArg
			}
		case seekArg:
			if !isWhitespace(c) {
				start = i
				state = inArg
			}
		case inArg:
			if isWhitespace(c) {
				candidate2 = string(content[start:i])
				state = done
				// Nothing after the second candidate matters, stop scanning.
				break scan
			}
		}
	}

	// The line ended in the middle of a token: close it here. Doing this
	// after the loop also covers a token that is a single byte long at the
	// very end of the line, which never gets a loop iteration of its own.
	switch state {
	case inPath:
		candidate1 = string(content[start+1:])
	case inArg:
		candidate2 = string(content[start:])
	}

	switch {
	case candidate1 == "env":
		return candidate2, nil
	case candidate1 != "":
		return candidate1, nil
	}

	return "", errors.New("Unable to determine #! command")
}
// languageGuess pairs a candidate language with the score it received while
// DetermineLanguage was choosing between several possible languages.
type languageGuess struct {
// Name is the candidate language's name.
Name string
// Count is the number of that language's keywords found in the inspected content.
Count int
}
// DetermineLanguage picks the language of a file from its candidate languages.
//
// With no candidates (possible when called through an API, where the language
// is expected to be set already) fallbackLanguage is returned. With exactly one
// candidate that candidate is returned. With several, each candidate is scored
// by how many of its LanguageFeatures keywords occur in the first 20000 bytes
// of content, similar to how https://github.com/vmchale/polyglot does it. The
// highest score wins and ties are broken by the alphabetically smallest name.
//
// filename is only used for the Verbose and Trace messages.
func DetermineLanguage(filename string, fallbackLanguage string, possibleLanguages []string, content []byte) string {
	switch len(possibleLanguages) {
	case 0:
		return fallbackLanguage
	case 1:
		return possibleLanguages[0]
	}

	startTime := makeTimestampNano()

	// Truncate before converting so that a large file is not copied in full
	// just to look at its head.
	const maxInspect = 20000
	head := content
	if len(head) > maxInspect {
		head = head[:maxInspect]
	}
	toCheck := string(head)

	toSort := make([]languageGuess, 0, len(possibleLanguages))
	for _, lan := range possibleLanguages {
		LanguageFeaturesMutex.Lock()
		langFeatures := LanguageFeatures[lan]
		LanguageFeaturesMutex.Unlock()

		// Each keyword counts once, however often it appears.
		count := 0
		for _, key := range langFeatures.Keywords {
			if strings.Contains(toCheck, key) {
				count++
			}
		}

		toSort = append(toSort, languageGuess{Name: lan, Count: count})
	}

	// Highest count first; equal counts fall back to name order so the result
	// does not depend on the order of possibleLanguages.
	sort.Slice(toSort, func(i, j int) bool {
		if toSort[i].Count == toSort[j].Count {
			return toSort[i].Name < toSort[j].Name
		}
		return toSort[i].Count > toSort[j].Count
	})

	// There are at least two candidates at this point, so index 0 exists.
	best := toSort[0].Name

	if Verbose {
		printWarn(fmt.Sprintf("guessing language %s for file %s", best, filename))
	}

	if Trace {
		printTrace(fmt.Sprintf("nanoseconds to guess language: %s: %d", filename, makeTimestampNano()-startTime))
	}

	return best
}