
Renamed queue.py to filequeue.py, used write_batch instead of write, and added some documentation.
AKSHAYUBHAT committed May 9, 2014
1 parent 33e7f7f commit 77e9af03a4316de3a52005edeea1a51d5eabf9ef
Showing with 42 additions and 29 deletions.
  1. +4 −4 config.py
  2. +4 −3 fabfile.py
  3. +7 −1 queue.py → filequeue.py
  4. +26 −20 readme.md
  5. +1 −1 worker.py
config.py
@@ -50,11 +50,11 @@
 # Job Configuration
 #
 #########
-EC2_Tag = "simple_wat_stats_2"
-JOB_QUEUE = 'simple_wat_stats_2' # SQS queue name
-OUTPUT_S3_BUCKET = 'simple_wat_stats_2' # S3 bucket
+EC2_Tag = "cc_wat_13_2"
+JOB_QUEUE = 'wat_stats_2013_2' # SQS queue name
+OUTPUT_S3_BUCKET = 'wat_stats_2013_2' # S3 bucket
 CODE_BUCKET = "akshay_code" # bucket used to store code & configuration make sure this is different from output bucket
-CODE_KEY = "simple_wat_stats_2" # key for storing code which will be downloaded by user-data script
+CODE_KEY = "wat_stats_2013_2" # key for storing code which will be downloaded by user-data script
 FILE_TYPE = "wat" # Type of files you wish to process choose from {"wat","wet","text","warc"}
 CRAWL_ID = "2013_2" # 2nd crawl in 2013
fabfile.py
@@ -2,6 +2,7 @@
 from fabric.api import env,local,run,sudo,put,cd,lcd
 from config import *
 from spotinstance import *
+import filequeue
 import logging
 logging.basicConfig(filename='fab.log',level=logging.DEBUG,format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
@@ -36,17 +37,17 @@ def setup_job():
     from boto.s3.connection import S3Connection
     from cclib.commoncrawl import CommonCrawl
     logging.getLogger('boto').setLevel(logging.CRITICAL)
-    from queue import FileQueue
+    import filequeue
     S3 = S3Connection()
     logging.info("Creating bucket "+OUTPUT_S3_BUCKET)
     S3.create_bucket(OUTPUT_S3_BUCKET)
     logging.info("bucket created")
     # SQS
     crawl = CommonCrawl(CRAWL_ID)
     file_list = crawl.get_file_list(FILE_TYPE) # Text files
-    queue = FileQueue(JOB_QUEUE,file_list)
+    file_queue = filequeue.FileQueue(JOB_QUEUE,VISIBILITY_TIMEOUT,file_list)
     logging.debug("Adding "+str(len(file_list))+" "+FILE_TYPE+" files to queue "+JOB_QUEUE)
-    queue.add_files()
+    file_queue.add_files()
     logging.debug("Finished adding files")
     print "Finished adding files"
queue.py → filequeue.py
@@ -1,6 +1,7 @@
 __author__ = 'aub3'
 from boto.sqs.connection import SQSConnection
 from boto.sqs.message import Message
+import base64
 class FileQueue(object):
     """
@@ -30,7 +31,12 @@ def add_files(self,count=None):
             count = len(self.files)
         while count:
             count -= 1
-            self.queue.write(Message(body=self.files.pop()))
+            message_buffer.append((count,base64.b64encode(self.files.pop()),0)) # required to maintain compatibility with
+            if len(message_buffer) > 9:
+                self.queue.write_batch(message_buffer)
+                message_buffer = []
+        self.queue.write_batch(message_buffer)
     def clear(self):
         """
readme.md
@@ -1,13 +1,13 @@
 Common Crawl Dev
-------------
+=================
 A simple app for mining common crawl data
 Author
--------
+============
 Akshay Uday Bhat (www.akshaybhat.com)
 Description:
----------
+==============
 This repo contains code for accessing Common Crawl crawls (2013 & later) & code for launching spot instances for analyzing the crawl data.
 The code follows most of the best practices to ensure :
@@ -26,32 +26,19 @@ The current worker.py implements a simple function which stores count of urls an
 The function and configuration can be easily modified to support more complex analysis.
 Dependancies
---------------
+===============
 - Boto (latest)
 - Fabric (1.8.1)
-Documentation
-------------
-* libs/setup.py
-* libs/cclib/commoncrawl13.py
-* libs/cclib/data/*.gz pickle files containing list of keys/files in each crawl
-* config.py Contains configuration for launching job, identifiers for bucket, queue etc.
-* worker.py Code executed on each file in the crawl
-* fabfile.py Contains tasks for setting up, running, monitoring and terminating jobs.
-* spotinstance.py A small class to keep track of spot instance requests.
+Configuration
+==============
-* queue.py A small class to keep track of files in SQS queue.
-* example.json Example of output stored in the bucket from one file, using current worker.py
 Instructions / Tasks
-----------------
+=============
 1. AWS credentials should be stored in /etc/boto.cfg, the credentials are not transferred
 2. To install library locally run "fab update_lib"
 3. To set up job run "fab setup_job", this will create IAM roles, S3 output bucket and SQS queue.
@@ -62,7 +49,26 @@ Instructions / Tasks
 8. To terminate all instances run "fab terminate_instances" (NOTE its important that you manually terminate all instances.)
 Optionally
---------------
 * Use "fab ls_buckets" to check status of the output bucket and to download one randomly selected key to temp.json.
+Files
+==================
+* libs/setup.py
+* libs/cclib/commoncrawl13.py
+* libs/cclib/data/*.gz pickle files containing list of keys/files in each crawl
+* config.py Contains configuration for launching job, identifiers for bucket, queue etc.
+* worker.py Code executed on each file in the crawl
+* fabfile.py Contains tasks for setting up, running, monitoring and terminating jobs.
+* spotinstance.py A small class to keep track of spot instance requests.
+* queue.py A small class to keep track of files in SQS queue.
+* example.json Example of output stored in the bucket from one file, using current worker.py
worker.py
@@ -2,7 +2,7 @@
 import logging
 from collections import defaultdict
 from cclib import commoncrawl
-from queue import FileQueue
+from filequeue import FileQueue
 from boto.s3.connection import S3Connection
 from boto.s3.key import Key
 import json
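worker.py only changes its import here; the consuming side of the queue is not part of this commit. As a rough illustration only, assuming boto 2 and the JOB_QUEUE name from config.py above, a worker could drain the queue like this (the real worker presumably goes through FileQueue rather than talking to SQS directly):

```python
# Hypothetical consumer sketch -- not the repo's worker loop.
import base64
from boto.sqs.connection import SQSConnection
from boto.sqs.message import RawMessage

conn = SQSConnection()                      # credentials from /etc/boto.cfg
queue = conn.get_queue('wat_stats_2013_2')  # JOB_QUEUE in config.py
queue.set_message_class(RawMessage)         # keep bodies exactly as stored

while True:
    messages = queue.get_messages(num_messages=10)
    if not messages:
        break
    for m in messages:
        file_name = base64.b64decode(m.get_body())  # mirrors the b64encode in add_files
        # ... fetch and process the crawl file here ...
        queue.delete_message(m)  # delete only after successful processing
```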
