-
Notifications
You must be signed in to change notification settings - Fork 965
/
checkers.py
155 lines (133 loc) · 3.91 KB
/
checkers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
import bz2
import gzip
import imghdr
import io
import re
import zipfile

from six import StringIO

from galaxy import util
# Line budget for check_html: scanning stops once more than this many lines
# have been read without a match.  A falsy value disables the limit.
HTML_CHECK_LINES = 100
# Locate an imaging library for check_image.  Try the legacy top-level
# ``Image`` module first, then the ``PIL`` package; ``PIL`` is left as None
# when neither can be imported so callers can test for availability.
try:
    import Image as PIL
except ImportError:
    try:
        from PIL import Image as PIL
    except ImportError:
        # Only a missing package means "no imaging support"; any other
        # failure is a real problem and should not be silently hidden.
        PIL = None
def check_image( file_path ):
    """Check whether ``file_path`` refers to an image.

    :param file_path: path of the file to inspect.
    :returns: when an imaging library was imported as ``PIL``, the opened
        image object (truthy) if the file could be opened and False
        otherwise; without it, True or False depending on whether
        ``imghdr.what`` recognises the file.
    """
    if PIL is None:
        # No imaging library available: fall back to header sniffing.
        return imghdr.what( file_path ) is not None
    try:
        im = PIL.open( file_path )
    except Exception:
        # Anything the imaging library cannot open is treated as "not an
        # image".  KeyboardInterrupt / SystemExit are deliberately not caught.
        return False
    if im:
        return im
    return False
def check_html( file_path, chunk=None ):
    """Report whether a file, or an already-read chunk of it, contains HTML.

    A line counts as HTML when it contains an ``<a ... href ...>``,
    ``<iframe>``, ``<frameset>``, ``<meta>`` or ``<script>`` tag
    (case-insensitive).

    :param file_path: path of the file to scan; only opened when ``chunk``
        is None.
    :param chunk: optional content to scan instead of the file.  May be text
        or bytes (split into lines before scanning) or any iterable of lines.
    :returns: True as soon as one line matches, False otherwise.  Scanning
        stops once more than ``HTML_CHECK_LINES`` lines have been read.
    """
    patterns = (
        re.compile( r"<A\s+[^>]*HREF[^>]+>", re.I ),
        re.compile( r"<IFRAME[^>]*>", re.I ),
        re.compile( r"<FRAMESET[^>]*>", re.I ),
        re.compile( r"<META[\W][^>]*>", re.I ),
        re.compile( r"<SCRIPT[^>]*>", re.I ),
    )
    handle = None
    if chunk is None:
        # Text mode via io.open gives universal newlines on every Python
        # version (the old "U" flag is gone in 3.11); undecodable bytes are
        # replaced rather than aborting the scan, since only ASCII tags matter.
        handle = io.open( file_path, "r", encoding="utf-8", errors="replace" )
        lines = handle
    elif hasattr( chunk, "splitlines" ):
        # Iterating a string or bytes object directly yields single
        # characters (or ints), which can never match a tag.
        lines = chunk.splitlines()
    else:
        lines = chunk
    # TODO: Potentially reading huge lines into string here, this should be
    # reworked.
    try:
        for lineno, line in enumerate( lines, 1 ):
            if isinstance( line, bytes ):
                line = line.decode( "utf-8", "replace" )
            if any( pattern.search( line ) for pattern in patterns ):
                return True
            if HTML_CHECK_LINES and lineno > HTML_CHECK_LINES:
                break
    finally:
        # Close the file even when decoding or matching raises.
        if handle is not None:
            handle.close()
    return False
def check_binary( name, file_path=True ):
    """Report whether the first 100 characters of a file or string look binary.

    :param name: path of the file to inspect when ``file_path`` is true,
        otherwise the text itself.
    :param file_path: true to treat ``name`` as a path, false to treat it as
        in-memory text.
    :returns: True if ``util.is_binary`` accepts any of the first 100
        characters, False otherwise.
    """
    if file_path:
        # Plain "r" instead of "U": the "U" flag was removed in Python 3.11
        # and text mode already applies universal newlines on Python 3.
        # NOTE(review): content that cannot be decoded as text will still
        # raise here on Python 3 -- confirm what util.is_binary expects
        # before switching this to binary mode.
        temp = open( name, "r" )
    else:
        temp = StringIO( name )
    try:
        return any( util.is_binary( char ) for char in temp.read( 100 ) )
    finally:
        temp.close()
def check_gzip( file_path ):
    """Return a tuple of booleans ``( is_gzipped, is_valid )`` for ``file_path``.

    ``is_gzipped`` is True when the file starts with the gzip magic number
    and its first bytes can be decompressed.  ``is_valid`` is False when the
    decompressed content looks like HTML according to ``check_html``.

    :param file_path: path of the file to inspect.
    :returns: ``( False, False )`` if the file is not gzip or cannot be read,
        ``( True, False )`` for gzipped HTML, ``( True, True )`` otherwise.
    """
    # Make sure we have a gzipped file.  The magic number is raw bytes, so
    # the file is read in binary mode (text mode would try to decode it).
    try:
        with open( file_path, "rb" ) as temp:
            magic_check = temp.read( 2 )
        gzip_magic = util.gzip_magic
        # NOTE(review): util.gzip_magic is defined outside this file and may
        # be text or bytes -- normalise so the comparison works either way.
        if not isinstance( gzip_magic, bytes ):
            gzip_magic = gzip_magic.encode( "latin-1" )
        if magic_check != gzip_magic:
            return ( False, False )
    except Exception:
        return ( False, False )
    # We support some binary data types, so check if the compressed binary file is valid
    # If the file is Bam, it should already have been detected as such, so we'll just check
    # for sff format.
    try:
        gzipped_file = gzip.open( file_path )
        try:
            header = gzipped_file.read( 4 )
        finally:
            gzipped_file.close()
    except Exception:
        return ( False, False )
    if header == b'.sff':
        return ( True, True )
    CHUNK_SIZE = 2 ** 15  # 32Kb
    gzipped_file = gzip.GzipFile( file_path, mode='rb' )
    try:
        chunk = gzipped_file.read( CHUNK_SIZE )
    finally:
        gzipped_file.close()
    # See if we have a compressed HTML file
    if check_html( file_path, chunk=chunk ):
        return ( True, False )
    return ( True, True )
def check_bz2( file_path ):
    """Return a tuple of booleans ``( is_bz2, is_valid )`` for ``file_path``.

    ``is_bz2`` is True when the file starts with the bzip2 magic number.
    ``is_valid`` is False when the decompressed content looks like HTML
    according to ``check_html``.

    :param file_path: path of the file to inspect.
    :returns: ``( False, False )`` if the file is not bzip2 or cannot be
        read, ``( True, False )`` for bzipped HTML, ``( True, True )``
        otherwise.
    """
    # The magic number is raw bytes, so the file is read in binary mode
    # (text mode would try to decode it).
    try:
        with open( file_path, "rb" ) as temp:
            magic_check = temp.read( 3 )
        bz2_magic = util.bz2_magic
        # NOTE(review): util.bz2_magic is defined outside this file and may
        # be text or bytes -- normalise so the comparison works either way.
        if not isinstance( bz2_magic, bytes ):
            bz2_magic = bz2_magic.encode( "latin-1" )
        if magic_check != bz2_magic:
            return ( False, False )
    except Exception:
        return ( False, False )
    CHUNK_SIZE = 2 ** 15  # 32Kb
    bzipped_file = bz2.BZ2File( file_path, mode='rb' )
    try:
        chunk = bzipped_file.read( CHUNK_SIZE )
    finally:
        bzipped_file.close()
    # See if we have a compressed HTML file
    if check_html( file_path, chunk=chunk ):
        return ( True, False )
    return ( True, True )
def check_zip( file_path ):
    """Return True when ``zipfile.is_zipfile`` recognises ``file_path``, else False."""
    return True if zipfile.is_zipfile( file_path ) else False
def is_bz2( file_path ):
    """Return only the ``is_bz2`` half of the pair produced by ``check_bz2``."""
    # The validity flag is not needed here and is discarded.
    compressed, _valid = check_bz2( file_path )
    return compressed
def is_gzip( file_path ):
    """Return only the ``is_gzipped`` half of the pair produced by ``check_gzip``."""
    # The validity flag is not needed here and is discarded.
    compressed, _valid = check_gzip( file_path )
    return compressed
# Names exported by a star-import of this module.
__all__ = [
'check_binary',
'check_bz2',
'check_gzip',
'check_html',
'check_image',
'check_zip',
'is_gzip',
'is_bz2',
]