This is a guide on how to utilize Google Cloud Storage as a Data Lake.

1. Go to https://console.cloud.google.com/apis/credentials/serviceaccountkey and create a new service account, set as the Project Owner. 
2. Save the JSON file that downloads automatically to a location of your choice.
3. Add the JSON filepath to your GOOGLE_APPLICATION_CREDENTIALS environment variable.
4. Refer to Jupyter Notebook cloudy-broth in jpstat for examples on API calls.
5. To more easily manage files from the command line, use the gsutil tool: https://cloud.google.com/storage/docs/gsutil

In [3]:
from google.cloud import storage
import os
import pandas as pd

In [4]:
pd.set_option('display.max_colwidth', -1)

In [5]:
filepath = os.path.join('extracts', 'monthly_reports')

In [6]:
bucket_name = "i-agility-212104.appspot.com"

In [7]:
file200601_1dir = os.path.join(filepath, '2006')
file200601_1name = os.path.join('01Jan', '2.xls')
file200601_1path = os.path.join(file200601_1dir, file200601_1name)

In [8]:
testdir = os.path.join('extracts', 'monthly_reports', '2018')
filepathlist = []
for dirpath, subdir, files in os.walk(testdir):
    for filename in files:
    if filename.endswith('.xls'):
        filepathlist.append(os.path.join(dirpath, filename))

print(filepathlist)

['extracts\\monthly_reports\\2018\\01Jan\\1.xls', 'extracts\\monthly_reports\\2018\\01Jan\\2.xls', 'extracts\\monthly_reports\\2018\\01Jan\\3-1.xls', 'extracts\\monthly_reports\\2018\\01Jan\\3-2.xls', 'extracts\\monthly_reports\\2018\\01Jan\\3-3.xls', 'extracts\\monthly_reports\\2018\\02Feb\\1.xls', 'extracts\\monthly_reports\\2018\\02Feb\\2.xls', 'extracts\\monthly_reports\\2018\\02Feb\\3-1.xls', 'extracts\\monthly_reports\\2018\\02Feb\\3-2.xls', 'extracts\\monthly_reports\\2018\\02Feb\\3-3.xls', 'extracts\\monthly_reports\\2018\\03Mar\\1.xls', 'extracts\\monthly_reports\\2018\\03Mar\\2.xls', 'extracts\\monthly_reports\\2018\\03Mar\\3-1.xls', 'extracts\\monthly_reports\\2018\\03Mar\\3-2.xls', 'extracts\\monthly_reports\\2018\\03Mar\\3-3.xls', 'extracts\\monthly_reports\\2018\\04Apr\\1.xls', 'extracts\\monthly_reports\\2018\\04Apr\\2.xls', 'extracts\\monthly_reports\\2018\\04Apr\\3-1.xls', 'extracts\\monthly_reports\\2018\\04Apr\\3-2.xls', 'extracts\\monthly_reports\\2018\\04Apr\\3-3.x

In [15]:
%%time
# Change path delimiters to unix format if necessary
[upload_blob(bucket_name, source_file, source_file.replace('\\', '/')) for source_file in filepathlist]

File extracts\monthly_reports\2018\01Jan\1.xls uploaded to extracts/monthly_reports/2018/01Jan/1.xls.
File extracts\monthly_reports\2018\01Jan\2.xls uploaded to extracts/monthly_reports/2018/01Jan/2.xls.
File extracts\monthly_reports\2018\01Jan\3-1.xls uploaded to extracts/monthly_reports/2018/01Jan/3-1.xls.
File extracts\monthly_reports\2018\01Jan\3-2.xls uploaded to extracts/monthly_reports/2018/01Jan/3-2.xls.
File extracts\monthly_reports\2018\01Jan\3-3.xls uploaded to extracts/monthly_reports/2018/01Jan/3-3.xls.
File extracts\monthly_reports\2018\02Feb\1.xls uploaded to extracts/monthly_reports/2018/02Feb/1.xls.
File extracts\monthly_reports\2018\02Feb\2.xls uploaded to extracts/monthly_reports/2018/02Feb/2.xls.
File extracts\monthly_reports\2018\02Feb\3-1.xls uploaded to extracts/monthly_reports/2018/02Feb/3-1.xls.
File extracts\monthly_reports\2018\02Feb\3-2.xls uploaded to extracts/monthly_reports/2018/02Feb/3-2.xls.
File extracts\monthly_reports\2018\02Feb\3-3.xls uploaded to e

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]

In [9]:
def retrieve_month_excels(murl, path):
    mcols = ['url', 'excel_num', 'excel_description', 'excel_url']
    mdf = pd.DataFrame(columns=mcols)
    mr = re.get(murl)
    mdata = mr.text
    msoup = BeautifulSoup(mdata, "lxml")
    mtable = msoup.find('div', {'class': 'stat-dataset_list-body'})
    for row in mtable.find_all('article', {'class': 'stat-dataset_list-item'}):
        excel_num = row.find('li', {'class': 'stat-dataset_list-detail-item stat-dataset_list-border-top'}).contents[0].replace('\n','')
        excel_description = row.find('a').contents[0]
        excel_url = ''
        excel_a = row.find_all('a')[1]
        
        if(excel_a['data-file_type'] == 'EXCEL'):
            excel_url = path + excel_a['href']
        mdfrow = pd.DataFrame([[murl, excel_num, excel_description, excel_url]], columns=mcols)
        if(len(mdf)==0):
            mdf = mdfrow
        else:
            mdf = mdf.append(mdfrow, ignore_index=True) #why the hell doesn't df.append work inplace?? Didn't it always use to?
    logging.info("Retrieved excel URLs from month-URL: " + murl + '.')
    return(mdf)

In [10]:
def update_file_links():
    col = ['year', 'month', 'url']
    top_df = pd.DataFrame(columns=col)
    url = "http://www.e-stat.go.jp/SG1/estat/OtherListE.do?bid=000001006005&cycode=1"
    path='https://www.e-stat.go.jp'

    r = re.get(url)
    data = r.text
    year='None'
    soup = BeautifulSoup(data, "lxml")
    table = soup.find('div', {'class': 'stat-cycle_sheet'})
    for year_section in table.find_all('ul', {'class': 'stat-cycle_ul_other'}):
        header = year_section.find('li', {'class': 'stat-cycle_header'})
        year = header.find('span').get_text(' ', strip=True)

        months_section = year_section.find('li', {'class': 'stat-cycle_item'})
        for month_row in months_section.find_all('div'):
            month_a = month_row.find('a')
            month = month_a.get_text().rstrip('.\n')
            monthurl = path + month_a.get('href')

            row = pd.DataFrame([[year, month, monthurl]], columns=col)
            if(len(top_df)==0):
                top_df = row
            else:
                top_df = top_df.append(row, ignore_index=True) #why the hell doesn't df.append work inplace?? Didn't it always use to?

    logging.info('DataFrame of months generated: ' + str(top_df.size) + ' months available, from ' 
        + top_df.loc[len(top_df)-1,'year'] + ' ' + top_df.loc[len(top_df)-1,'month'] + ' to ' 
        + top_df.loc[0,'year'] + ' ' + top_df.loc[0,'month'])

    retrieve_month_excels_starttime = datetime.utcnow()
    logging.info('Starting to retrieve urls of every excel sheet at ' + str(retrieve_month_excels_starttime) + ' UTC.')

    excels_df = pd.concat([retrieve_month_excels(murl, path) for murl in top_df['url']])
    excelref_df = top_df.merge(excels_df, how='left', on='url')

    logging.info('Excel URLs retrieved in ' + str(datetime.utcnow() - retrieve_month_excels_starttime) + '.')
    
    return excelref_df

In [None]:
from bs4 import BeautifulSoup
import requests as re
import pandas as pd
import numpy as np
import os
import io

import logging
logging.getLogger().setLevel(logging.INFO)
from datetime import datetime

# https://cloud.google.com/sql/docs/postgres/connect-external-app#languages
import psycopg2
conn = psycopg2.connect(user='postgres', password='',
                        dbname='postgres',
                        host='localhost')

print(conn.closed) #0 when open

In [None]:
%%time
excelref_df = update_file_links()

INFO:root:DataFrame of months generated: 453 months available, from 2006 Dec to 2018 Jan
INFO:root:Starting to retrieve urls of every excel sheet at 2018-09-13 03:12:58.912802 UTC.
INFO:root:Retrieved excel URLs from month-URL: https://www.e-stat.go.jp/en/stat-search/files?cycle=1&layout=datalist&toukei=00200523&tstat=000000070001&tclass1=000001008739&year=20180&month=11010301&result_back=1.
INFO:root:Retrieved excel URLs from month-URL: https://www.e-stat.go.jp/en/stat-search/files?cycle=1&layout=datalist&toukei=00200523&tstat=000000070001&tclass1=000001008739&year=20180&month=11010302&result_back=1.
INFO:root:Retrieved excel URLs from month-URL: https://www.e-stat.go.jp/en/stat-search/files?cycle=1&layout=datalist&toukei=00200523&tstat=000000070001&tclass1=000001008739&year=20180&month=11010303&result_back=1.
INFO:root:Retrieved excel URLs from month-URL: https://www.e-stat.go.jp/en/stat-search/files?cycle=1&layout=datalist&toukei=00200523&tstat=000000070001&tclass1=000001008739&year

INFO:root:Retrieved excel URLs from month-URL: https://www.e-stat.go.jp/en/stat-search/files?cycle=1&layout=datalist&toukei=00200523&tstat=000000070001&tclass1=000001008739&year=20150&month=23070908&result_back=1.
INFO:root:Retrieved excel URLs from month-URL: https://www.e-stat.go.jp/en/stat-search/files?cycle=1&layout=datalist&toukei=00200523&tstat=000000070001&tclass1=000001008739&year=20150&month=23070909&result_back=1.
INFO:root:Retrieved excel URLs from month-URL: https://www.e-stat.go.jp/en/stat-search/files?cycle=1&layout=datalist&toukei=00200523&tstat=000000070001&tclass1=000001008739&year=20150&month=24101210&result_back=1.
INFO:root:Retrieved excel URLs from month-URL: https://www.e-stat.go.jp/en/stat-search/files?cycle=1&layout=datalist&toukei=00200523&tstat=000000070001&tclass1=000001008739&year=20150&month=24101211&result_back=1.
INFO:root:Retrieved excel URLs from month-URL: https://www.e-stat.go.jp/en/stat-search/files?cycle=1&layout=datalist&toukei=00200523&tstat=00000

In [None]:
excelref_df.head(1)

In [None]:
conn.rollback() #After an erronious call, the connection needs to be rolled back before use again.

In [None]:
%%time
cur = conn.cursor()
excelurl_textstream = io.StringIO()

upload_df = excelref_df.copy()
upload_df['excel_description'].replace(['\n', '\t'], '', regex=True, inplace=True)

# psycopg2's copy_from() doesn't allow quoted strings as values. Ensure that the file you're copying doesn't have quoted strings

import csv
noquotes = csv.QUOTE_NONE

upload_df.to_csv(excelurl_textstream, sep='\t', header=False, index=False, quoting=noquotes)
excelurl_textstream.seek(0) 

print(excelurl_textstream.getvalue()[0:510])

In [154]:
cur.copy_from(excelurl_textstream, 'jpstat_excel_urls', null="", sep='\t') # null values become ''
conn.commit()
logging.info('Excel URL database table updated.')

INFO:root:Excel URL database table updated.


After testing on a local database, let's upload it to Google Cloud SQL.

In [168]:
%%time
gc_conn = psycopg2.connect(user='postgres', password='Xypherium-0',
                        dbname='jpstat',
                        host='35.224.240.50')

print(gc_conn.closed) #0 when open

0
Wall time: 1.14 s


In [170]:
gc_cur = gc_conn.cursor()
excelurl_textstream.seek(0) 
gc_cur.copy_from(excelurl_textstream, 'jpstat_excel_urls', null="", sep='\t') # null values become ''
gc_conn.commit()
logging.info('Goole Cloud SQL Excel URL database table updated.')

INFO:root:Goole Cloud SQL Excel URL database table updated.


Cells below this point are functions lifted directly from https://cloud.google.com/storage/docs/uploading-objects and associated pages.

In [4]:
"""
Deleting buckets is supported by the Python API, but buckets must be empty. 
To be safer, perform bucket operations using gsutil or from the Google Cloud Platform UI.
"""
def create_bucket(bucket_name):
    """Creates a new bucket."""
    storage_client = storage.Client()
    bucket = storage_client.create_bucket(bucket_name)
    print('Bucket {} created'.format(bucket.name))

In [14]:
"""
https://cloud.google.com/storage/docs/uploading-objects#storage-upload-object-python
"""
def upload_blob(bucket_name, source_file_name, destination_blob_name):
    """Uploads a file to the bucket."""
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(bucket_name)
    blob = bucket.blob(destination_blob_name)

    blob.upload_from_filename(source_file_name)

    print('File {} uploaded to {}.'.format(
        source_file_name,
        destination_blob_name))

In [6]:
"""
https://cloud.google.com/storage/docs/listing-objects
"""
def list_blobs(bucket_name):
    """Lists all the blobs in the bucket."""
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(bucket_name)

    blobs = bucket.list_blobs()

    for blob in blobs:
        print(blob.name)

In [7]:
def list_blobs_with_prefix(bucket_name, prefix, delimiter=None):
    """Lists all the blobs in the bucket that begin with the prefix.

    This can be used to list all blobs in a "folder", e.g. "public/".

    The delimiter argument can be used to restrict the results to only the
    "files" in the given "folder". Without the delimiter, the entire tree under
    the prefix is returned. For example, given these blobs:

        /a/1.txt
        /a/b/2.txt

    If you just specify prefix = '/a', you'll get back:

        /a/1.txt
        /a/b/2.txt

    However, if you specify prefix='/a' and delimiter='/', you'll get back:

        /a/1.txt

    """
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(bucket_name)

    blobs = bucket.list_blobs(prefix=prefix, delimiter=delimiter)

    print('Blobs:')
    for blob in blobs:
        print(blob.name)

    if delimiter:
        print('Prefixes:')
        for prefix in blobs.prefixes:
            print(prefix)

In [8]:
"""
https://cloud.google.com/storage/docs/downloading-objects
"""
def download_blob(bucket_name, source_blob_name, destination_file_name):
    """Downloads a blob from the bucket."""
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(bucket_name)
    blob = bucket.blob(source_blob_name)

    blob.download_to_filename(destination_file_name)

    print('Blob {} downloaded to {}.'.format(
        source_blob_name,
        destination_file_name))

In [9]:
"""
https://cloud.google.com/storage/docs/renaming-copying-moving-objects

Deleting blobs is not supported in the Python API. To move, i.e. copy then delete, use 'gsutil mv'.
"""
def rename_blob(bucket_name, blob_name, new_name):
    """Renames a blob."""
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(bucket_name)
    blob = bucket.blob(blob_name)

    new_blob = bucket.rename_blob(blob, new_name)

    print('Blob {} has been renamed to {}'.format(
        blob.name, new_blob.name))


def copy_blob(bucket_name, blob_name, new_bucket_name, new_blob_name):
    """Copies a blob from one bucket to another with a new name."""
    storage_client = storage.Client()
    source_bucket = storage_client.get_bucket(bucket_name)
    source_blob = source_bucket.blob(blob_name)
    destination_bucket = storage_client.get_bucket(new_bucket_name)

    new_blob = source_bucket.copy_blob(
        source_blob, destination_bucket, new_blob_name)

    print('Blob {} in bucket {} copied to blob {} in bucket {}.'.format(
        source_blob.name, source_bucket.name, new_blob.name,
        destination_bucket.name))

In [10]:
"""
Setting or adding metadata is not properly supported through the python API
So, use gsutil: 'gsutil setmeta -h "[METADATA_KEY]:[METADATA_VALUE]" gs://[BUCKET_NAME]/[OBJECT_NAME]'
or, use the Google Cloud Platform UI.
"""
def blob_metadata(bucket_name, blob_name):
    """Prints out a blob's metadata."""
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(bucket_name)
    blob = bucket.get_blob(blob_name)

    print('Blob: {}'.format(blob.name))
    print('Bucket: {}'.format(blob.bucket.name))
    print('Storage class: {}'.format(blob.storage_class))
    print('ID: {}'.format(blob.id))
    print('Size: {} bytes'.format(blob.size))
    print('Updated: {}'.format(blob.updated))
    print('Generation: {}'.format(blob.generation))
    print('Metageneration: {}'.format(blob.metageneration))
    print('Etag: {}'.format(blob.etag))
    print('Owner: {}'.format(blob.owner))
    print('Component count: {}'.format(blob.component_count))
    print('Crc32c: {}'.format(blob.crc32c))
    print('md5_hash: {}'.format(blob.md5_hash))
    print('Cache-control: {}'.format(blob.cache_control))
    print('Content-type: {}'.format(blob.content_type))
    print('Content-disposition: {}'.format(blob.content_disposition))
    print('Content-encoding: {}'.format(blob.content_encoding))
    print('Content-language: {}'.format(blob.content_language))
    print('Metadata: {}'.format(blob.metadata))

In [None]:
"""
To handle streaming uploads and downloads, use the boto client library plugin: https://cloud.google.com/storage/docs/boto-plugin
"""

#Uploads:
"""
dst_uri = boto.storage_uri(<bucket> + '/' + <object>, 'gs')
dst_uri.new_key().set_contents_from_stream(<stream object>)

E.g.
filename = 'data_file'
MY_BUCKET = 'my_app_bucket'
my_stream = open(filename, 'rb')
dst_uri = boto.storage_uri(MY_BUCKET + '/' + filename, 'gs')
dst_uri.new_key().set_contents_from_stream(my_stream)
"""

#Downloads:
"""
import sys

src_uri = boto.storage_uri(<bucket> + '/' + <object>, 'gs')
src_uri.get_key().get_file(sys.stdout)

E.g.
downloaded_file = 'saved_data_file'
MY_BUCKET = 'my_app_bucket'
object_name = 'data_file'
src_uri = boto.storage_uri(MY_BUCKET + '/' + object_name, 'gs')
src_uri.get_key().get_file(sys.stdout)
"""

In [11]:
def delete_blob(bucket_name, blob_name):
    """Deletes a blob from the bucket."""
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(bucket_name)
    blob = bucket.blob(blob_name)

    blob.delete()

    print('Blob {} deleted.'.format(blob_name))

In [12]:
"""
Bucket Labels are key:value pairs. You can add multiple labels to a single bucket.
"""
def add_bucket_label(bucket_name):
    """Add a label to a bucket."""
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(bucket_name)

    labels = bucket.labels
    labels['example'] = 'label'
    bucket.labels = labels
    bucket.patch()

    print('Updated labels on {}.'.format(bucket.name))
    pprint.pprint(bucket.labels)
    
def get_bucket_labels(bucket_name):
    """Prints out a bucket's labels."""
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(bucket_name)
    labels = bucket.labels
    pprint.pprint(labels)
    
def remove_bucket_label(bucket_name):
    """Remove a label from a bucket."""
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(bucket_name)

    labels = bucket.labels

    if 'example' in labels:
        del labels['example']

    bucket.labels = labels
    bucket.patch()

    print('Updated labels on {}.'.format(bucket.name))
    pprint.pprint(bucket.labels)

Object versioning helps maintain live and archived versions of blobs. However, to accommodate inexperienced users, I recommend
managing files using appropriate filenames, metadata, and bucket labels, when using GCS as a data lake. 

Object versioning may be more appropriate for use in maintaining live app resources.

To learn how to enable, read:

https://cloud.google.com/storage/docs/using-object-versioning

For an example of how versioning works, read:

https://cloud.google.com/storage/docs/object-versioning#example

In [16]:
"""You can make individual blobs(files) public. 
I recommend you handle this in gsutil, because you seldom will need to do this programatically.

To apply to a file:
gsutil acl ch -u AllUsers:R gs://[BUCKET_NAME]/[OBJECT_NAME]

To apply to a bucket:
gsutil iam ch allUsers:objectViewer gs://[BUCKET_NAME]"""
        
def make_blob_public(bucket_name, blob_name):
    """Makes a blob publicly accessible."""
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(bucket_name)
    blob = bucket.blob(blob_name)

    blob.make_public()

    print('Blob {} is publicly accessible at {}'.format(
        blob.name, blob.public_url))

I recommend all other access control operations be performed using gsutil or the Google Cloud Platform UI.
https://cloud.google.com/storage/docs/access-control/using-iam-permissions