In [317]:
import boto
import boto.s3.connection

import warnings
import numpy as np

from collections import defaultdict
import regex as re
import zlib, codecs
import time
from datetime import datetime

In [2]:
conn = boto.connect_s3(anon=True, host='datasets.iccluster.epfl.ch', 
                       calling_format=boto.s3.connection.OrdinaryCallingFormat())

In [3]:
bucket = conn.get_bucket("google-ngrams")

In [4]:
all_keys = list(bucket.list())

In [5]:
k = all_keys[1]

In [6]:
km = bucket.get_key(k.name)

In [27]:
km.get_metadata('Date')

In [9]:
k.get_metadata('Content-Length')

In [47]:
all_keys

[<Key: google-ngrams,P12-3029.pdf>,
 <Key: google-ngrams,eng/googlebooks-eng-all-0gram-20120701-0.gz>,
 <Key: google-ngrams,eng/googlebooks-eng-all-0gram-20120701-1.gz>,
 <Key: google-ngrams,eng/googlebooks-eng-all-0gram-20120701-2.gz>,
 <Key: google-ngrams,eng/googlebooks-eng-all-0gram-20120701-3.gz>,
 <Key: google-ngrams,eng/googlebooks-eng-all-0gram-20120701-4.gz>,
 <Key: google-ngrams,eng/googlebooks-eng-all-0gram-20120701-5.gz>,
 <Key: google-ngrams,eng/googlebooks-eng-all-0gram-20120701-6.gz>,
 <Key: google-ngrams,eng/googlebooks-eng-all-0gram-20120701-7.gz>,
 <Key: google-ngrams,eng/googlebooks-eng-all-0gram-20120701-8.gz>,
 <Key: google-ngrams,eng/googlebooks-eng-all-0gram-20120701-9.gz>,
 <Key: google-ngrams,eng/googlebooks-eng-all-0gram-20120701-_ADJ_.gz>,
 <Key: google-ngrams,eng/googlebooks-eng-all-0gram-20120701-_ADP_.gz>,
 <Key: google-ngrams,eng/googlebooks-eng-all-0gram-20120701-_ADV_.gz>,
 <Key: google-ngrams,eng/googlebooks-eng-all-0gram-20120701-_CONJ_.gz>,
 <Key: go

In [74]:
for p in list(bucket.list(prefix='eng/googlebooks-eng-all-2gram-20120701-', delimiter='-')):
    print (p.name)

eng/googlebooks-eng-all-2gram-20120701-0.gz
eng/googlebooks-eng-all-2gram-20120701-1.gz
eng/googlebooks-eng-all-2gram-20120701-2.gz
eng/googlebooks-eng-all-2gram-20120701-3.gz
eng/googlebooks-eng-all-2gram-20120701-4.gz
eng/googlebooks-eng-all-2gram-20120701-5.gz
eng/googlebooks-eng-all-2gram-20120701-6.gz
eng/googlebooks-eng-all-2gram-20120701-7.gz
eng/googlebooks-eng-all-2gram-20120701-8.gz
eng/googlebooks-eng-all-2gram-20120701-9.gz
eng/googlebooks-eng-all-2gram-20120701-_ADJ_.gz
eng/googlebooks-eng-all-2gram-20120701-_ADP_.gz
eng/googlebooks-eng-all-2gram-20120701-_ADV_.gz
eng/googlebooks-eng-all-2gram-20120701-_CONJ_.gz
eng/googlebooks-eng-all-2gram-20120701-_DET_.gz
eng/googlebooks-eng-all-2gram-20120701-_NOUN_.gz
eng/googlebooks-eng-all-2gram-20120701-_NUM_.gz
eng/googlebooks-eng-all-2gram-20120701-_PRON_.gz
eng/googlebooks-eng-all-2gram-20120701-_PRT_.gz
eng/googlebooks-eng-all-2gram-20120701-_VERB_.gz
eng/googlebooks-eng-all-2gram-20120701-a_.gz
eng/googlebooks-eng-all-2gram-2

In [14]:
def build_prefix(lang, n, version='20120701'):
    """ Builds a prefix for listing keys from the GoogleBooks bucket.
    This is mostly hardcoded and doesn't check if the prefix is valid (returns existing keys)
    
    Parameters:
    -----------
        lang : 3-letter string for language. It doesn't check if the language actually exists
        n    : the n-gram. 
        version: which data version, e.g. '20120701'
    """
    # TODO: you could validate this agains the bucket by checking each prefix
    return '{lang}/googlebooks-{lang}-all-{n}gram-{version}-'.format(lang=lang, n=n, version=version)

In [15]:
def get_s3_bucket():
    """ Returns the bucket object for downloading the files. Currently uses the boto2 interface"""
    conn = boto.connect_s3(anon=True, host='datasets.iccluster.epfl.ch', 
                           calling_format=boto.s3.connection.OrdinaryCallingFormat())
    return conn.get_bucket('google-ngrams')

In [16]:
def key_basename(key, extension=False):
    """ Customised basename for google keys, where the separator is '-'.
    By default, returns the name without extension. Set extension=True to include it """
    return key.name.split('-')[-1].split('.')[0]

In [34]:
# save a model file where the 2nd char is punctuation
fk = list(filter(lambda k: key_basename(k) == "s_", all_keys))[4]

with open(path.basename(fk.name), 'wb') as f:
    fk.get_file(f)

In [151]:
# save a model file with some large-ish voc
da = list(filter(lambda k: key_basename(k) == "da", all_keys))[4]
da.get_contents_to_filename(path.basename(da.name))

In [286]:
def decompress_stream(stream):
    """ Given an iterable stream of gzipped data (such as a `key` in S3 storage), this function returns an iterator
    over the uncompressed and utf-8 decoded data """
    extracter = zlib.decompressobj(16 + zlib.MAX_WBITS)
    decoder   = codecs.getincrementaldecoder('utf-8')()  # note the second () which instantiates the object

    for chunk in stream:
        yield decoder.decode( extracter.decompress(chunk) )

    yield decoder.decode( extracter.flush() , final = True)

In [117]:
for data in fk:
    pass

In [118]:
num_chunks = 0
num_bytes = 0
for chunk in decompress_stream(fk):
    num_chunks += 1
    num_bytes += len(chunk)

In [143]:
def prepare_word_freqs(start_year=1840, end_year=2001, min_count=20):
    """ Initialises a `word_freq` dictionary for the given parameters
    This will consider only words in year range [start_year, end_year)
    
    This dictionary holds the frequencies for each word, along with some custom data.
    Because our custom keys have '_' character in their name we are SURE they will
    NOT COLLIDE with actual words from the vocabulary.
    """
    
    range_size = end_year - start_year - 1 # open interval
    def get_default_freqs():
        return np.zeros(range_size)
    
    counts_years = np.zeros(range_size, dtype=np.uint64) # set as ulong since it can get big
    word_freqs = defaultdict(get_default_freqs, start_year=start_year, end_year=end_year, 
                             min_count=min_count, counts_years=counts_years)
    
    return word_freqs

In [199]:
re.sub(r'_[A-Z]*(\b|_)', '', "Daisme _ADJ_")

'dal 26'

In [202]:
re.search(r"[^\p{Lu}\p{Ll}_' ]", "d'_ADJ actionnaire")

In [291]:
def add_to_freqs(word_freqs, entry):
    """ If the entry is valid, updates it into `word_freqs`
        `entry` should be list of 4 fields, a split line from an n-gram file 
    """
    # skip uninteresting entries (small count, year out of range)
    
    count, year = int(entry[2]), int(entry[1])
        
    if count < word_freqs['min_count'] or \
       not word_freqs['start_year'] <= year < word_freqs['end_year']:
        return
    
    # skip if it's not letters only (and space _ apostrophe)
    if re.search(r"[^\p{Lu}\p{Ll}_' ]", entry[0]):
        return
    
    # remove POS tags
    word = re.sub(r'_[A-Z]*\b', '', entry[0])
    
    idx = year-word_freqs['start_year'] - 1 # zero based
    word_freqs[word][idx] += count
    word_freqs['counts_years'][idx] += count

In [298]:
def process_S3_key(key, word_freqs):
    """ given an S3 key, merge its data into `word_freqs` """
    prev_chunk_end = u""
    num_chunks = 0
    for chunk in decompress_stream(key):
        chunk = prev_chunk_end + chunk
        lines = chunk.split('\n')

        for idx, line in enumerate(lines[:-1]):  # last line may be incomplete, leave it for the next chunk
            fields = line.split('\t')
            add_to_freqs(word_freqs, fields)

        # if, by chance, the chunk ends with '\n' then the last line is empty, so it's OK
        prev_chunk_end = lines[-1]
        num_chunks += 1
    return word_freqs

In [364]:
def google_word_freqs(lang, n, word_freqs):
    """ Build the word frequency dictionary from the S3 data 
    `word_freqs` should be a dictionary initialised with `prepare_word_freqs`
    
    Note: 'word' in this context means an n-gram phrase
    """
    bucket = get_s3_bucket()
    prefix = build_prefix(lang, n)
    log_file = open("log.txt", "a")
        
    total_time, num_keys, num_kbytes = 0, 0, 0
    for key in bucket.list(prefix, '-'):
        # skip uninteresting files; Now we skip "a_" as well, but we could/should include them
        name = key_basename(key)
        if not name.isalpha() or name in ['other', 'punctuation']:
            continue
        
        start = time.time()
        process_S3_key(key, word_freqs)
        end = time.time()
        
        total_time += end - start
        num_keys += 1
        num_kbytes += key.size >> 10
        
        print("{info} Processed {k} keys, in {t:.1f}s ({avg:.1f} s/key, {b} kbytes/s ). Total words: {w}. Key: {kn}".format(
              info=datetime.now(), k=num_keys, t=total_time, avg=total_time / num_keys, 
              b = num_kbytes//total_time, w=len(word_freqs.keys()), kn=name), file=log_file, flush=True)
    
    log_file.close()

    return word_freqs

In [365]:
word_freqs = prepare_word_freqs()

In [366]:
word_freqs

defaultdict(<function __main__.prepare_word_freqs.<locals>.get_default_freqs>,
            {'counts_years': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=uint64),
             'end_year': 2001,
             'min_count': 20,
             'start_year': 1840})

In [None]:
google_word_freqs('eng', 2, word_freqs)

In [369]:
len(word_freqs.keys())

31748932

In [370]:
import pickle

In [374]:
factory = word_freqs.default_factory

In [376]:
word_freqs.default_factory = None

In [None]:
%time
with open('/mnt/cluster-nas/ciprian/temp_data/eng_2.pk', 'wb') as f:
    pickle.dump(word_freqs, f)

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 13.4 µs
