-
Notifications
You must be signed in to change notification settings - Fork 4.1k
/
blob_helper.rb
634 lines (563 loc) · 15.5 KB
/
blob_helper.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
require 'linguist/language'
require 'linguist/mime'
require 'linguist/pathname'
require 'charlock_holmes'
require 'escape_utils'
require 'pygments'
require 'yaml'
module Linguist
# BlobHelper is a mixin for Blobish classes that respond to "name",
# "data" and "size" such as Grit::Blob.
module BlobHelper
# Internal: Get a Pathname wrapper for Blob#name
#
# Returns a Pathname.
def pathname
Pathname.new(name || "")
end
# Public: Get the extname of the path
#
# Examples
#
# blob(name='foo.rb').extname
# # => '.rb'
#
# Returns a String
def extname
pathname.extname
end
# Public: Get the actual blob mime type
#
# Examples
#
# # => 'text/plain'
# # => 'text/html'
#
# Returns a mime type String.
def mime_type
@mime_type ||= pathname.mime_type
end
# Public: Get the Content-Type header value
#
# This value is used when serving raw blobs.
#
# Examples
#
# # => 'text/plain; charset=utf-8'
# # => 'application/octet-stream'
#
# Returns a content type String.
def content_type
@content_type ||= binary? ? mime_type :
(encoding ? "text/plain; charset=#{encoding.downcase}" : "text/plain")
end
# Public: Get the Content-Disposition header value
#
# This value is used when serving raw blobs.
#
# # => "attachment; filename=file.tar"
# # => "inline"
#
# Returns a content disposition String.
def disposition
if text? || image?
'inline'
elsif name.nil?
"attachment"
else
"attachment; filename=#{EscapeUtils.escape_url(pathname.basename)}"
end
end
def encoding
if hash = detect_encoding
hash[:encoding]
end
end
# Try to guess the encoding
#
# Returns: a Hash, with :encoding, :confidence, :type
# this will return nil if an error occurred during detection or
# no valid encoding could be found
def detect_encoding
@detect_encoding ||= CharlockHolmes::EncodingDetector.new.detect(data) if data
end
# Public: Is the blob binary according to its mime type
#
# Return true or false
def binary_mime_type?
if mime_type = Mime.lookup_mime_type_for(pathname.extname)
mime_type.binary?
end
end
# Public: Is the blob binary?
#
# Return true or false
def binary?
# Large blobs aren't even loaded into memory
if data.nil?
true
# Treat blank files as text
elsif data == ""
false
# Charlock doesn't know what to think
elsif encoding.nil?
true
# If Charlock says its binary
else
detect_encoding[:type] == :binary
end
end
# Public: Is the blob text?
#
# Return true or false
def text?
!binary?
end
# Public: Is the blob a supported image format?
#
# Return true or false
def image?
['.png', '.jpg', '.jpeg', '.gif'].include?(extname)
end
# Public: Is the blob a possible drupal php file?
#
# Return true or false
def drupal_extname?
['.module', '.install', '.test', '.inc'].include?(extname)
end
# Public: Is the blob likely to have a shebang?
#
# Return true or false
def shebang_extname?
extname.empty? &&
mode &&
(mode.to_i(8) & 05) == 05
end
MEGABYTE = 1024 * 1024
# Public: Is the blob too big to load?
#
# Return true or false
def large?
size.to_i > MEGABYTE
end
# Public: Is the blob viewable?
#
# Non-viewable blobs will just show a "View Raw" link
#
# Return true or false
def viewable?
!large? && text?
end
vendored_paths = YAML.load_file(File.expand_path("../vendor.yml", __FILE__))
VendoredRegexp = Regexp.new(vendored_paths.join('|'))
# Public: Is the blob in a vendored directory?
#
# Vendored files are ignored by language statistics.
#
# See "vendor.yml" for a list of vendored conventions that match
# this pattern.
#
# Return true or false
def vendored?
name =~ VendoredRegexp ? true : false
end
# Public: Get each line of data
#
# Requires Blob#data
#
# Returns an Array of lines
def lines
@lines ||= (viewable? && data) ? data.split("\n", -1) : []
end
# Public: Get number of lines of code
#
# Requires Blob#data
#
# Returns Integer
def loc
lines.size
end
# Public: Get number of source lines of code
#
# Requires Blob#data
#
# Returns Integer
def sloc
lines.grep(/\S/).size
end
# Internal: Compute average line length.
#
# Returns Integer.
def average_line_length
if lines.any?
lines.inject(0) { |n, l| n += l.length } / lines.length
else
0
end
end
# Public: Is the blob a generated file?
#
# Generated source code is supressed in diffs and is ignored by
# language statistics.
#
# Requires Blob#data
#
# Includes:
# - XCode project XML files
# - Minified JavaScript
#
# Please add additional test coverage to
# `test/test_blob.rb#test_generated` if you make any changes.
#
# Return true or false
def generated?
if xcode_project_file?
true
elsif generated_coffeescript? || minified_javascript? || generated_net_docfile?
true
else
false
end
end
# Internal: Is the blob an XCode project file?
#
# Generated if the file extension is an XCode project
# file extension.
#
# Returns true of false.
def xcode_project_file?
['.xib', '.nib', '.pbxproj', '.xcworkspacedata', '.xcuserstate'].include?(extname)
end
# Internal: Is the blob minified JS?
#
# Consider JS minified if the average line length is
# greater then 100c.
#
# Returns true or false.
def minified_javascript?
return unless extname == '.js'
average_line_length > 100
end
# Internal: Is the blob JS generated by CoffeeScript?
#
# Requires Blob#data
#
# CoffeScript is meant to output JS that would be difficult to
# tell if it was generated or not. Look for a number of patterns
# outputed by the CS compiler.
#
# Return true or false
def generated_coffeescript?
return unless extname == '.js'
if lines[0] == '(function() {' && # First line is module closure opening
lines[-2] == '}).call(this);' && # Second to last line closes module closure
lines[-1] == '' # Last line is blank
score = 0
lines.each do |line|
if line =~ /var /
# Underscored temp vars are likely to be Coffee
score += 1 * line.gsub(/(_fn|_i|_len|_ref|_results)/).count
# bind and extend functions are very Coffee specific
score += 3 * line.gsub(/(__bind|__extends|__hasProp|__indexOf|__slice)/).count
end
end
# Require a score of 3. This is fairly arbitrary. Consider
# tweaking later.
score >= 3
else
false
end
end
# Internal: Is this a generated documentation file for a .NET assembly?
#
# Requires Blob#data
#
# .NET developers often check in the XML Intellisense file along with an
# assembly - however, these don't have a special extension, so we have to
# dig into the contents to determine if it's a docfile. Luckily, these files
# are extremely structured, so recognizing them is easy.
#
# Returns true or false
def generated_net_docfile?
return false unless extname.downcase == ".xml"
return false unless lines.count > 3
# .NET Docfiles always open with <doc> and their first tag is an
# <assembly> tag
return lines[1].include?("<doc>") &&
lines[2].include?("<assembly>") &&
lines[-2].include?("</doc>")
end
# Public: Should the blob be indexed for searching?
#
# Excluded:
# - Files over 0.1MB
# - Non-text files
# - Langauges marked as not searchable
# - Generated source files
#
# Please add additional test coverage to
# `test/test_blob.rb#test_indexable` if you make any changes.
#
# Return true or false
def indexable?
if binary?
false
elsif language.nil?
false
elsif !language.searchable?
false
elsif generated?
false
elsif size > 100 * 1024
false
else
true
end
end
# Public: Detects the Language of the blob.
#
# May load Blob#data
#
# Returns a Language or nil if none is detected
def language
if defined? @language
@language
else
@language = guess_language
end
end
# Internal: Guess language
#
# Please add additional test coverage to
# `test/test_blob.rb#test_language` if you make any changes.
#
# Returns a Language or nil
def guess_language
return if binary_mime_type?
# Disambiguate between multiple language extensions
disambiguate_extension_language ||
# See if there is a Language for the extension
pathname.language ||
# Look for idioms in first line
first_line_language ||
# Try to detect Language from shebang line
shebang_language
end
# Internal: Get the lexer of the blob.
#
# Returns a Lexer.
def lexer
language ? language.lexer : Pygments::Lexer.find_by_name('Text only')
end
# Internal: Disambiguates between multiple language extensions.
#
# Delegates to "guess_EXTENSION_language".
#
# Please add additional test coverage to
# `test/test_blob.rb#test_language` if you add another method.
#
# Returns a Language or nil.
def disambiguate_extension_language
if Language.ambiguous?(extname)
name = "guess_#{extname.sub(/^\./, '')}_language"
send(name) if respond_to?(name)
end
end
# Internal: Guess language of header files (.h).
#
# Returns a Language.
def guess_h_language
if lines.grep(/^@(interface|property|private|public|end)/).any?
Language['Objective-C']
elsif lines.grep(/^class |^\s+(public|protected|private):/).any?
Language['C++']
else
Language['C']
end
end
# Internal: Guess language of .m files.
#
# Objective-C heuristics:
# * Keywords
#
# Matlab heuristics:
# * Leading function keyword
# * "%" comments
#
# Returns a Language.
def guess_m_language
# Objective-C keywords
if lines.grep(/^#import|@(interface|implementation|property|synthesize|end)/).any?
Language['Objective-C']
# File function
elsif lines.first.to_s =~ /^function /
Language['Matlab']
# Matlab comment
elsif lines.grep(/^%/).any?
Language['Matlab']
# Fallback to Objective-C, don't want any Matlab false positives
else
Language['Objective-C']
end
end
# Internal: Guess language of .pl files
#
# The rules for disambiguation are:
#
# 1. Many perl files begin with a shebang
# 2. Most Prolog source files have a rule somewhere (marked by the :- operator)
# 3. Default to Perl, because it is more popular
#
# Returns a Language.
def guess_pl_language
if shebang_script == 'perl'
Language['Perl']
elsif lines.grep(/:-/).any?
Language['Prolog']
else
Language['Perl']
end
end
# Internal: Guess language of .r files.
#
# Returns a Language.
def guess_r_language
if lines.grep(/(rebol|(:\s+func|make\s+object!|^\s*context)\s*\[)/i).any?
Language['Rebol']
else
Language['R']
end
end
# Internal: Guess language of .t files.
#
# Returns a Language.
def guess_t_language
score = 0
score += 1 if lines.grep(/^% /).any?
score += data.gsub(/ := /).count
score += data.gsub(/proc |procedure |fcn |function /).count
score += data.gsub(/var \w+: \w+/).count
# Tell-tale signs its gotta be Perl
if lines.grep(/^(my )?(sub |\$|@|%)\w+/).any?
score = 0
end
if score >= 3
Language['Turing']
else
Language['Perl']
end
end
# Internal: Guess language of .gsp files.
#
# Returns a Language.
def guess_gsp_language
if lines.grep(/<%|<%@|\$\{|<%|<g:|<meta name="layout"|<r:/).any?
Language['Groovy Server Pages']
else
Language['Gosu']
end
end
# Internal: Guess language from the first line.
#
# Look for leading "<?php" in Drupal files
#
# Returns a Language.
def first_line_language
# Only check files with drupal php extensions
return unless drupal_extname?
# Fail fast if blob isn't viewable?
return unless viewable?
if lines.first.to_s =~ /^<\?php/
Language['PHP']
end
end
# Internal: Extract the script name from the shebang line
#
# Requires Blob#data
#
# Examples
#
# '#!/usr/bin/ruby'
# # => 'ruby'
#
# '#!/usr/bin/env ruby'
# # => 'ruby'
#
# '#!/usr/bash/python2.4'
# # => 'python'
#
# Please add additional test coverage to
# `test/test_blob.rb#test_shebang_script` if you make any changes.
#
# Returns a script name String or nil
def shebang_script
# Fail fast if blob isn't viewable?
return unless viewable?
if lines.any? && (match = lines[0].match(/(.+)\n?/)) && (bang = match[0]) =~ /^#!/
bang.sub!(/^#! /, '#!')
tokens = bang.split(' ')
pieces = tokens.first.split('/')
if pieces.size > 1
script = pieces.last
else
script = pieces.first.sub('#!', '')
end
script = script == 'env' ? tokens[1] : script
# python2.4 => python
if script =~ /((?:\d+\.?)+)/
script.sub! $1, ''
end
# Check for multiline shebang hacks that exec themselves
#
# #!/bin/sh
# exec foo "$0" "$@"
#
if script == 'sh' &&
lines[0...5].any? { |l| l.match(/exec (\w+).+\$0.+\$@/) }
script = $1
end
script
end
end
# Internal: Get Language for shebang script
#
# Returns the Language or nil
def shebang_language
# Skip file extensions unlikely to have shebangs
return unless shebang_extname?
if script = shebang_script
Language[script]
end
end
# Public: Highlight syntax of blob
#
# options - A Hash of options (defaults to {})
#
# Returns html String
def colorize(options = {})
return if !text? || large? || generated?
options[:options] ||= {}
options[:options][:encoding] ||= encoding
lexer.highlight(data, options)
end
# Public: Highlight syntax of blob without the outer highlight div
# wrapper.
#
# options - A Hash of options (defaults to {})
#
# Returns html String
def colorize_without_wrapper(options = {})
if text = colorize(options)
text[%r{<div class="highlight"><pre>(.*?)</pre>\s*</div>}m, 1]
else
''
end
end
Language.overridden_extensions.each do |extension|
name = "guess_#{extension.sub(/^\./, '')}_language".to_sym
unless instance_methods.map(&:to_sym).include?(name)
warn "Language##{name} was not defined"
end
end
end
end