Permalink
Browse files

Fixes

  • Loading branch information...
1 parent 66482b2; commit f43ebefff087108bd8983706cadac1f686a3e0d0. Donovan Hide committed May 18, 2010.
Showing with 33 additions and 14 deletions.
  1. +1 −1 ArchiveDiff/indexer/models.py
  2. +23 −11 ArchiveDiff/indexer/tasks.py
  3. +1 −0 fabfile.py
  4. +8 −2 requirements.txt
@@ -30,7 +30,7 @@ class Response(models.Model):
content_type = models.ForeignKey(ContentType,editable=False)
warc = models.ForeignKey(Warc,editable=False)
code = models.IntegerField(db_index=True)
- time = models.DateTimeField(db_index=True)
+ time = models.DateTimeField(db_index=True, null=True,blank=True)
etag = models.CharField(max_length=200,null=True,blank=True)
hash = models.CharField(max_length=200)
last_modified = models.DateTimeField(null=True,blank=True)
@@ -9,29 +9,39 @@
class WarcIndexer(Task):
name = "warc.index"
+ def truncate_url(self,url):
+ if len(url)>2000:
+ print url
+ return url[0:2000]
+ return url
+
def run(self,warc_path,**kwargs):
- indexerCommand = os.path.join(settings.INDEXER_DIR,"run.sh")
- indexer = subprocess.Popen("%s %s" % (indexerCommand,warc_path),stdout=subprocess.PIPE,shell=True)
+ logger = self.get_logger(**kwargs)
+
+ indexerScript = os.path.join(settings.INDEXER_DIR,"run.sh")
+ indexerCommand = "%s %s" % (indexerScript,warc_path)
+ logger.info("Running command: %s" % indexerCommand)
+ indexer = subprocess.Popen(indexerCommand,stdout=subprocess.PIPE,shell=True)
reader = csv.reader(indexer.stdout, delimiter=',', quotechar='"', quoting=csv.QUOTE_ALL)
rows=[]
for row in reader:
rows.append(row)
- print "Adding WARC"
+ logger.info("Adding WARC")
for row in rows:
sys.stdout.write('.')
warc = row[0]
break
newWarc, created = Warc.objects.get_or_create(filename=warc, defaults={'filename':warc})
- print "Adding Requests and Content Types"
+ logger.info("Adding Requests and Content Types")
requests={}
contentTypes = {}
for row in rows:
sys.stdout.write('.')
- url = row[1]
+ url = self.truncate_url(row[1])
if not requests.has_key(url):#Needed?
newRequest, created = Request.objects.get_or_create(url=url, defaults = {'url':url})
requests[url] = newRequest.id
@@ -40,14 +50,14 @@ def run(self,warc_path,**kwargs):
newContentType, created = ContentType.objects.get_or_create(name=contentType,defaults={'name': contentType})
contentTypes[contentType] = newContentType.id
- print "Deleting Responses associated with this Warc"
+ logger.info("Deleting Responses associated with this Warc")
Response.objects.filter(warc=newWarc).delete()
- print "Adding Responses"
+ logger.info("Adding Responses")
for row in rows:
sys.stdout.write('.')
- url = row[1]
- date = row[2]
+ url = self.truncate_url(row[1])
+ time = row[2]
code = row[3]
etag = row[4]
last_modified = row[5]
@@ -56,7 +66,6 @@ def run(self,warc_path,**kwargs):
offset = row[8]
hash = row[9]
newResponse = Response(
- time=date,
warc=newWarc,
code=code,
etag=etag,
@@ -69,9 +78,12 @@ def run(self,warc_path,**kwargs):
newResponse.request_id = requests[url]
if last_modified:
newResponse.last_modified=last_modified
+ if time:
+ newResponse.time=time
newResponse.save()
except Exception, e:
print e
print row
- sys.stdout.flush()
+ logger.info("Finished index")
+
View
@@ -31,3 +31,4 @@ def deploy():
sudo('git pull')
restart_webserver()
+
View
@@ -5,5 +5,11 @@ fabric
http://initd.org/pub/software/psycopg/psycopg2-2.0.14.tar.gz
-e svn+http://code.djangoproject.com/svn/django/trunk#egg=Django
--e git://github.com/ask/celery.git@v1.0.2#egg=celery
--e git://github.com/robhudson/django-debug-toolbar#egg=django-debug-toolbar
+-e git://github.com/ask/billiard.git#egg=billiard
+-e git://github.com/ask/carrot.git#egg=carrot
+-e git://github.com/ask/celery.git#egg=celery
+-e git://github.com/robhudson/django-debug-toolbar#egg=django-debug-toolbar
+
+
+
+

0 comments on commit f43ebef

Please sign in to comment.