Skip to content

Commit

Permalink
option to restrict imported records based on data_quality values
Browse files Browse the repository at this point in the history
  • Loading branch information
Philip Mateescu committed Jan 5, 2012
1 parent eeec984 commit c96cdfb
Show file tree
Hide file tree
Showing 6 changed files with 63 additions and 15 deletions.
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@ MySQL or other databases are not supported at the moment, but you are welcome to
* `-o couch -p "couch URI"`: exports to a CouchDB server running on localhost on port 5984 into a database named `discogs`;
* `-o mongo -p "mongodb://localhost/discogs"`: connects, with `user` and `pass`, to a MongoDB server running on localhost, and into a database named `discogs`. See [Standard Connection String Format](http://www.mongodb.org/display/DOCS/Connections) in the MongoDB docs.
* `-o mongo -p "file:///path/to/dir/"`: outputs each of the Artists, Labels, Masters, Releases into a separate JSON file into the specified directory, `/path/to/dir/` in this case, one line for each. Pass `--ignoreblanks` to `mongoimport` in case extra new-lines are added; you probably also want `--upsert --upsertFields id`.
* **Output**: `-q`/`--quality` - imports only items with the specified data_quality. Takes in a comma-separated list of values for multiple entries. Valid values: 'Needs Vote', 'Complete And Correct', 'Correct', 'Needs Minor Changes', 'Needs Major Changes', 'Entirely Incorrect', 'Entirely Incorrect Edit'.
* `discogsparser.py -q 'Complete And Correct,Correct,Needs Minor Changes'`


# Examples:
Expand Down
10 changes: 9 additions & 1 deletion couchdbexporter.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,8 @@

class CouchDbExporter(object):

def __init__(self, server_url):
def __init__(self, server_url, data_quality=None):
    """Create a CouchDB exporter and connect immediately.

    server_url: CouchDB URI, including the target database name.
    data_quality: optional iterable of lower-cased data_quality values to
        accept; empty/None means every record is accepted.
    """
    # Copy into a fresh list: fixes the shared mutable-default-argument
    # pitfall (`data_quality=[]`) and avoids aliasing the caller's list.
    self.min_data_quality = list(data_quality) if data_quality else []
    self.server = server_url
    self.connect(server_url)

Expand All @@ -18,7 +19,14 @@ def connect(self, server_url):
couch = couchdb.Server(server)
self.db = couch[db_name]

def good_quality(self, what):
    """Return True if `what` passes the configured data_quality filter.

    `what.data_quality` is compared case-insensitively against
    self.min_data_quality; an empty filter list accepts everything.
    """
    # Idiomatic truthiness instead of len() (PEP 8).
    if not self.min_data_quality:
        return True
    return what.data_quality.lower() in self.min_data_quality

def execute(self, what):
if not self.good_quality(what):
return
# have to convert it to json and back because
# on simple objects couchdb-python throws:
# TypeError: argument of type 'instance' is not iterable
Expand Down
16 changes: 14 additions & 2 deletions discogsparser.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,16 @@
'mongo' : 'mongodbexporter.MongoDbExporter',
}

# Valid data_quality values, per the Discogs voting guidelines:
# http://www.discogs.com/help/voting-guidelines.html
data_quality_values = (
    'Needs Vote',
    'Complete And Correct',
    'Correct',
    'Needs Minor Changes',
    'Needs Major Changes',
    'Entirely Incorrect',
    'Entirely Incorrect Edit',
)


def first_file_match(file_pattern):
global options
Expand Down Expand Up @@ -164,7 +174,8 @@ def make_exporter(options):
for i in xrange(1, len(parts)):
m = getattr(m, parts[i])

return m(options.params)
data_quality = list(x.strip().lower() for x in (options.data_quality or '').split(',') if x)
return m(options.params, data_quality=data_quality)



Expand All @@ -188,10 +199,11 @@ def main(argv):
opt_parser.add_argument('-o', '--output', choices=exporters.keys(), default='json', help='What to output to')
opt_parser.add_argument('-p', '--params', help='Parameters for output, e.g. connection string')
opt_parser.add_argument('-i', '--ignore-unknown-tags', action='store_true', dest='ignore_unknown_tags', help='Do not error out when encountering unknown tags')
opt_parser.add_argument('-q', '--quality', dest='data_quality', help='Comma-separated list of permissable data_quality values.')
opt_parser.add_argument('file', nargs='*', help='Specific file(s) to import. Default is to parse artists, labels, releases matching -d')
global options
options = opt_parser.parse_args(argv)
# print(options)
print(options)

if options.date is None and len(options.file) == 0:
opt_parser.print_help()
Expand Down
27 changes: 17 additions & 10 deletions jsonexporter.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,27 +14,34 @@ def jsonizer(obj, specify_object_type = True):


class JsonConsoleExporter:
    """Exporter that writes each record to stdout as one JSON line.

    data_quality: optional iterable of lower-cased data_quality values to
    accept; empty/None means every record is printed.
    """

    def __init__(self, params, data_quality=None):
        # `params` is unused for console output; kept for interface
        # compatibility with the other exporters.
        # Copy into a fresh list: fixes the shared mutable-default-argument
        # pitfall (`data_quality=[]`) and avoids aliasing the caller's list.
        self.min_data_quality = list(data_quality) if data_quality else []

    def good_quality(self, what):
        """Return True if `what` passes the configured data_quality filter.

        An empty filter list accepts everything.
        """
        if not self.min_data_quality:
            return True
        return what.data_quality.lower() in self.min_data_quality

    def dump(self, what):
        """Serialize `what` and print it, unless filtered out by quality."""
        if not self.good_quality(what):
            return
        # print(...) with a single argument behaves identically on
        # Python 2 (this codebase) and Python 3.
        print(self._store(what))

    def finish(self, completely_done=False):
        # Nothing to flush or close for console output.
        pass

    def _store(self, what):
        # jsonizer (module-level helper) handles the project's record types.
        return json.dumps(what, default=jsonizer)

    def storeArtist(self, artist):
        self.dump(artist)

    def storeLabel(self, label):
        self.dump(label)

    def storeRelease(self, release):
        self.dump(release)

    def storeMaster(self, master):
        self.dump(master)
11 changes: 10 additions & 1 deletion mongodbexporter.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,9 +138,10 @@ def finish(self):


class MongoDbExporter(object):
def __init__(self, mongo_uri):
def __init__(self, mongo_uri, data_quality=None):
    '''mongo_uri: mongodb://[username:password@]host1[:port1],...[,hostN[:portN]][/[database][?options]]'''
    # TODO: if uri is file://path/ - create a json dump for using with mongo import
    # Copy into a fresh list: fixes the shared mutable-default-argument
    # pitfall (`data_quality=[]`) and avoids aliasing the caller's list.
    self.min_data_quality = list(data_quality) if data_quality else []
    self._options = {}
    self._quick_uniq = None
    self.connect(mongo_uri)
Expand Down Expand Up @@ -184,7 +185,15 @@ def _store_processed(self, collection, id, md5_digest):
if self._quick_uniq is not None:
self._quick_uniq.process(collection, id, md5_digest)

def good_quality(self, what):
    """Return True if `what` passes the configured data_quality filter.

    `what.data_quality` is compared case-insensitively against
    self.min_data_quality; an empty filter list accepts everything.
    """
    # Idiomatic truthiness instead of len() (PEP 8).
    if not self.min_data_quality:
        return True
    return what.data_quality.lower() in self.min_data_quality

def execute(self, collection, what):
if not self.good_quality(what):
# print "Bad quality: %s for %s" % (what.data_quality, what.id)
return
# have to convert it to json and back because
# on simple objects couchdb-python throws:
# TypeError: argument of type 'instance' is not iterable
Expand Down
12 changes: 11 additions & 1 deletion postgresexporter.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,10 +35,11 @@ class ExecuteError(Exception):
def __init__(self, args):
    # Expose the failing call's arguments on the standard Exception
    # `args` attribute so str()/repr() of the exception shows them.
    self.args = args

def __init__(self, connection_string):
def __init__(self, connection_string, data_quality=None):
    """Create a Postgres exporter and connect immediately.

    connection_string: psycopg2 connection string.
    data_quality: optional iterable of lower-cased data_quality values to
        accept; empty/None accepts everything. Defaulted so the signature
        stays consistent with the other exporters and backward-compatible
        with one-argument callers.
    """
    self.formatNames = {}
    self.imgUris = {}
    # Set the filter before connect(): connect() may sys.exit() on failure,
    # and the instance should be fully initialized up to that point.
    self.min_data_quality = list(data_quality) if data_quality else []
    self.connect(connection_string)

def connect(self, connection_string):
import psycopg2
Expand All @@ -50,6 +51,11 @@ def connect(self, connection_string):
print "%s" % (e.args)
sys.exit()

def good_quality(self, what):
    """Return True if `what` passes the configured data_quality filter.

    `what.data_quality` is compared case-insensitively against
    self.min_data_quality; an empty filter list accepts everything.
    """
    # Idiomatic truthiness instead of len() (PEP 8).
    if not self.min_data_quality:
        return True
    return what.data_quality.lower() in self.min_data_quality

def execute(self, query, values):
import psycopg2
try:
Expand All @@ -67,6 +73,7 @@ def finish(self, completely_done=False):
self.cur.close()

def storeLabel(self, label):
if not self.good_quality(label) return
values = []
values.append(label.id)
values.append(label.name)
Expand Down Expand Up @@ -116,6 +123,7 @@ def storeLabel(self, label):
self.execute("INSERT INTO labels_images(image_uri, label_id) VALUES(%s,%s);", (img.uri, label.id))

def storeArtist(self, artist):
if not self.good_quality(artist) return
values = []
values.append(artist.id)
values.append(artist.name)
Expand Down Expand Up @@ -172,6 +180,7 @@ def storeArtist(self, artist):
self.execute("INSERT INTO artists_images(image_uri, artist_id) VALUES(%s,%s);", (img.uri, artist.id))

def storeRelease(self, release):
if not self.good_quality(release) return
values = []
values.append(release.id)
values.append(release.title)
Expand Down Expand Up @@ -304,6 +313,7 @@ def storeRelease(self, release):
(trackid, extr.name, role))

def storeMaster(self, master):
if not self.good_quality(master) return

values = []
values.append(master.id)
Expand Down

0 comments on commit c96cdfb

Please sign in to comment.