-
Notifications
You must be signed in to change notification settings - Fork 0
/
metadupclean
executable file
·246 lines (221 loc) · 9.35 KB
/
metadupclean
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
#!/usr/bin/env python
# vim: ts=8 sw=4 sta sts=4 ai
#from pysqlite2 import dbapi2 as sqlite
import sqlite3 as sqlite
#from optparse import OptionParser
import sys
import os
import hashlib
import re
# xattr is optional: when it cannot be imported, FnHash neither reads nor
# writes the user.* extended attributes and always hashes file contents.
try:
    import xattr
    use_xattr = True
except ImportError:
    xattr = None
    use_xattr = False
class FnHash(object):
    """Stat data and content hashes for a single file.

    The constructor stats the file and obtains its sha256, preferring a
    value cached in the ``user.sha256`` extended attribute (when the
    xattr module is available) over hashing the contents.  ``user.sha1``
    and ``user.md5`` are read as well but never computed.

    A file that cannot be opened yields an instance whose stat fields and
    hashes are all None; only ``name`` and ``path`` are filled in.
    """

    # Stat fields (st_size, st_mtime, st_ino, st_dev, st_nlink).
    size = None
    mtime = None
    ino = None
    dev = None
    nlink = None
    # Base name and directory part of the path given to the constructor.
    name = None
    path = None
    # Per algorithm: <algo>attr is the value read from the user.<algo>
    # extended attribute, <algo>real the digest computed from the file
    # contents, and <algo> whichever of the two is in effect.
    sha256 = None
    sha256attr = None
    sha256real = None
    sha1 = None
    sha1attr = None
    sha1real = None
    md5 = None
    md5attr = None
    md5real = None

    # XXX recheck when advising removal
    def __init__(self, name=""):
        """Stat *name* and load or compute its sha256."""
        self.path, self.name = os.path.split(name)
        try:
            # Binary mode: the digest must cover the raw bytes.
            with open(name, 'rb') as f:
                si = os.fstat(f.fileno())
                self.size = si.st_size
                self.mtime = si.st_mtime
                self.ino = si.st_ino
                self.dev = si.st_dev
                self.nlink = si.st_nlink
                if use_xattr:
                    # Each attribute is read on its own so that a missing
                    # one does not hide the others.
                    self.sha256 = self.sha256attr = self._read_attr(f, 'user.sha256')
                    self.sha1 = self.sha1attr = self._read_attr(f, 'user.sha1')
                    self.md5 = self.md5attr = self._read_attr(f, 'user.md5')
                if not self.sha256:
                    self.sha256real = self._sha256_of(f)
                    self.sha256 = self.sha256real
        except (IOError, OSError):
            ## XXX missing or unreadable file: stat fields and hashes stay None
            pass

    @staticmethod
    def _read_attr(f, attr):
        """Return extended attribute *attr* of open file *f* as text, or None."""
        try:
            value = xattr._fgetxattr(f.fileno(), attr)
            if not isinstance(value, str):
                value = value.decode('ascii')
        except Exception:
            # Best effort: a missing attribute, a filesystem without xattr
            # support or an undecodable value all mean "no cached hash".
            return None
        return value or None

    @staticmethod
    def _sha256_of(stream, chunk_size=1 << 20):
        """Return the sha256 hex digest of the rest of binary *stream*.

        The stream is read in blocks of *chunk_size* bytes so that memory
        use does not depend on the file size or on where newlines fall.
        """
        digest = hashlib.sha256()
        for chunk in iter(lambda: stream.read(chunk_size), b''):
            digest.update(chunk)
        return digest.hexdigest()

    # XXX reload - refresh sha256 if not loaded
    def reload(self):
        """Placeholder; does nothing yet."""
        pass

    # update - update hashes on xattrs if not present
    def update(self, sha256=None, sha1=None, md5=None):
        """Store the given hashes in the file's extended attributes.

        Only attributes that were not found when the file was loaded are
        written.  sha256 is written only when it equals the digest this
        object computed from the file contents; sha1 and md5 are written
        as given.

        Returns True when nothing needed writing or every write succeeded,
        False when the file could not be opened or any write failed.
        """
        if not (sha256 or sha1 or md5) or not use_xattr:
            return True

        pending = []
        if sha256 and self.sha256real == sha256 and not self.sha256attr:
            pending.append(('sha256', sha256))
        if sha1 and not self.sha1attr:
            pending.append(('sha1', sha1))
        if md5 and not self.md5attr:
            pending.append(('md5', md5))
        if not pending:
            return True

        try:
            target = open(os.path.join(self.path, self.name), 'rb')
        except (IOError, OSError):
            return False

        all_ok = True
        with target:
            fd = target.fileno()
            for algo, value in pending:
                try:
                    raw = value if isinstance(value, bytes) else value.encode('ascii')
                    xattr._fsetxattr(fd, 'user.' + algo, raw)
                except Exception:
                    # Best effort: keep going, but report the failure.
                    all_ok = False
                    continue
                # Remember what is now on disk so it is not written again.
                setattr(self, algo + 'attr', value)
                setattr(self, algo, value)
        return all_ok

    def __repr__(self):
        # %s throughout so that an instance for an unopened file (all
        # fields None) can still be printed.
        return "h:%s @:%s s:%s d:%s i:%s n:%s" % (
            (self.sha256 or '')[:16], self.mtime, self.size,
            self.dev, self.ino, (self.name or '')[:16])
def maybe_int(maybeq):
    """Convert *maybeq* to an int where possible.

    Returns int(maybeq) when the conversion succeeds, 0 for '', '?' and
    None, and maybeq itself for any other value that cannot be converted.
    """
    try:
        return int(maybeq)
    except (ValueError, TypeError, OverflowError):
        # OverflowError covers int(float('inf')); it is handled like any
        # other unconvertible value.
        pass
    if maybeq is None or maybeq == '' or maybeq == '?':
        return 0
    return maybeq
# metadupclean
# - XXX does not remove last entry in database
# - XXX does not remove last existing file copy
# - currently, does not remove files or modify databases
# - writes new xattrs
#
# Reasons for removing a duplicate metadata/fs entry:
# Invalid/temporary name of an already downloaded/stored entry, with *existing* original file, but NO record for the new name
# - ensure original file exists, remove this file as invalid
# Invalid/temporary name of an already downloaded/stored entry, with *existing* original file, WITH an entry for this name too
# - ensure this record is newer, remove duplicate (newer) record
# - ensure original file exists, only then remove duplicate file with this name
def _suggest_row_delete(rowid, name, why, orig_name, sha256, deleted):
    """Print a suggested ``delete`` statement for meta row *rowid*, once.

    *deleted* is the set of rowids already suggested; it is updated here.
    """
    if rowid in deleted:
        print('* sql %d already deleted' % (rowid,))
        return
    print('* sql %d deletable (%s %s for %s)' % (rowid, name, why, orig_name))
    print("delete from meta where rowid=%s and sha256='%s';" % (rowid, sha256))
    deleted.add(rowid)


def metadupclean(db_conn, cwd, fn, vault_path=None):
    """Report whether *fn* and the ``meta`` rows sharing its sha256 are redundant.

    Nothing is removed here: findings are printed as ``* ...`` lines and
    redundant rows are printed as ``delete from meta ...`` statements.
    As a side effect, hashes recorded in the database may be stored in
    the extended attributes of *fn* (see FnHash.update).

    db_conn    -- open database connection whose rows can be indexed by
                  column name
    cwd        -- unused
    fn         -- path of the file to examine
    vault_path -- unused
    """
    # node (from fn) comes from the command line
    # testnode (later) is built from each db row with the same sha256
    node = FnHash(fn)
    if node.sha256 == 'e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855' and node.size == 0:
        # sha256 of the empty string: a zero-length file
        print('removable %s (null)' % (node.name,))
        return

    cur = db_conn.cursor()
    try:
        # we're going by stored/asc - so the first found is the earliest copy
        cur.execute("select rowid,sha256,sha1,md5,strftime('%s',stamp)+0,size,path,fn"
                    " from meta where sha256=?"
                    " order by (strftime('%s',stored)+0) asc", (node.sha256,))
        orig_name = None
        orig_path = None
        in_db = False        # at least one row carries this sha256
        attrs_stored = None  # result of the last node.update() attempt
        missing_rows = []    # rows seen before any confirmed copy, file not found
        row_fn = {}
        deleted = set()

        for r in cur.fetchall():
            in_db = True
            rowid = r['rowid']
            if not attrs_stored:
                attrs_stored = node.update(r['sha256'], r['sha1'], r['md5'])
            row_fn[rowid] = r['fn']
            newfn = os.path.join(r['path'], r['fn'])
            testnode = FnHash(newfn)

            if testnode.sha256 != node.sha256 or testnode.size != node.size:
                # the recorded file is missing or no longer has this content
                if orig_name:
                    _suggest_row_delete(rowid, r['fn'], 'gone', orig_name,
                                        node.sha256, deleted)
                else:
                    # first item can't be found - remember its rec
                    # XXX hoping if a file was removed, it's not the first recorded
                    print('* row %s cant find %s (for %s)' % (rowid, newfn, fn,))
                    missing_rows.append(rowid)
                continue

            if orig_name:
                # an earlier row already supplied the original
                _suggest_row_delete(rowid, testnode.name, 'dup', orig_name,
                                    node.sha256, deleted)
                continue

            # going by stored/asc, the first confirmed copy (testnode) is
            # the most authentic
            orig_name = testnode.name
            orig_path = testnode.path
            if testnode.name != node.name:
                # we have two different names with confirmed content
                # and the current isn't the original - safe to remove
                print('* removable %s (matched %s in %s)' % (fn, orig_name, orig_path))
            elif node.dev != testnode.dev or node.ino != testnode.ino:
                # same name but a different inode/device: fn is a second
                # copy of the recorded file
                # if nlink > 1, we have a copy somewhere else
                # but unless we can find it in the metadb, don't bother
                print('* removable %s (matches %s in %s)' % (fn, orig_name, orig_path))
            # if we have a record of a missing file
            # BUT also we have confirmed copies, those can be removed from sql
            for old_rowid in missing_rows:
                _suggest_row_delete(old_rowid, row_fn[old_rowid], 'missed',
                                    orig_name, node.sha256, deleted)
            missing_rows = []

        if not in_db:
            print('* %s not in db' % (fn,))
            # XXX dup name check here?
            # NOTE(review): running this lookup only in the "not in db"
            # case is an assumption - confirm.
            # NOTE(review): fn is compared as given (it may include
            # directories) - confirm the fn column holds matching values.
            cur.execute("select rowid,fn,path,md5 from meta where fn like ?", (fn,))
            warned = {}
            for r in cur.fetchall():
                if fn != r[1]:
                    warned.setdefault(r[1], []).append(r[2])
            for n in warned:
                for p in warned[n]:
                    print('* %s name similar to %s/%s' % (fn, p, n))
    finally:
        cur.close()
    return
def main(argv=None):
    """Run metadupclean() on every path in *argv* (default: sys.argv[1:]).

    The database file name is read from the FLDB environment variable.
    Paths that are not regular files are reported with a ``* check`` line.

    Returns 0, or 2 when FLDB is not set.
    """
    argv = argv or sys.argv[1:]
    db_file = os.environ.get('FLDB')
    if not db_file:
        sys.stderr.write('* FLDB is not set\n')
        return 2
    cwd = os.getcwd()
    # XXX customize vault_path if needed
    vault_path = None
    db_conn = sqlite.connect(db_file)
    try:
        # "with" on a connection only commits or rolls back; the
        # connection itself is closed in the finally clause.
        with db_conn:
            db_conn.row_factory = sqlite.Row
            for fn in argv:
                if isinstance(fn, bytes):
                    fn = fn.decode('utf-8')
                if os.path.isfile(fn):
                    metadupclean(db_conn, cwd, fn, vault_path)
                else:
                    print('* check %s' % (fn,))
    finally:
        db_conn.close()
    return 0
if __name__ == "__main__":
    try:
        sys.exit(main())
    except KeyboardInterrupt:
        # Acknowledge the interrupt and exit with a failure status.
        print("^C")
        sys.exit(1)