In [1]:
# This notebook ingests product features from feature files stored in GCS into Feast.
# Once the migration of feature storage within product matching is complete, it will be
# possible to migrate this notebook to obtain the features directly from BigQuery.
# Currently this needs msgpack and msgpack_numpy installed; these dependencies will be
# removed once the data is obtained from BigQuery.

In [None]:
import datetime as dt
import io
import json
import os
import posixpath
import subprocess
import tarfile
import tempfile
from contextlib import contextmanager
from random import randrange, randint

import msgpack
import msgpack_numpy
import numpy as np
import pandas as pd
from feast import Client, FeatureSet, Entity, ValueType, Feature
from feast.types.Value_pb2 import DoubleList as FeatureDoubleList
from google.cloud import storage as gcs
from google.protobuf.duration_pb2 import Duration
from pytz import utc

In [None]:
# Feast identifiers: the project and the feature set the features are ingested into.
project_name = 'product_matching_team'
feature_set_name = 'product_features'

# Selection of the pre-processed feature files in GCS.
bucket_name = 'mount-pm-matching-production-features'
preprocessing_system_version = '0.5.0'
sites = ['farfetch']
markets = ['US']
scrape_date_from = '2019-01-16'
scrape_date_to = '2019-01-31'

# Schema of the feature set: every feature is stored as a list of doubles.
FEATURE_NAMES = ['texture_embedding', 'spoc_embedding', 'local_histogram', 'ccv']
FEATURE_TYPES = [ValueType.DOUBLE_LIST] * len(FEATURE_NAMES)

In [None]:
# Feast endpoints are read from the environment; each value is None when the variable is unset.
_environment = os.environ
CORE_URL = _environment.get('FEAST_CORE_URL')
BATCH_SERVING_URL = _environment.get('FEAST_BATCH_SERVING_URL')
ONLINE_SERVING_URL = _environment.get('FEAST_ONLINE_SERVING_URL')
print(CORE_URL, BATCH_SERVING_URL, ONLINE_SERVING_URL)

In [None]:
def construct_project(project_name):
    """
    Build a Feast client bound to the given project, creating the project if it is not listed yet.
    :param project_name: Name of the Feast project.
    :return: The Feast client, configured with CORE_URL and BATCH_SERVING_URL.
    """
    feast_client = Client(core_url=CORE_URL, serving_url=BATCH_SERVING_URL, project=project_name)

    if project_name in feast_client.list_projects():
        print('project already exists: {}'.format(project_name))
        return feast_client

    print('constructing project: {}'.format(project_name))
    feast_client.create_project(project_name)
    return feast_client

In [None]:
def construct_feature_set(client, project_name, feature_set_name, feature_names, feature_types, max_age=365*86400):
    """
    Return the named feature set, registering a new one when none is listed for the project.
    :param client: Feast client used to list and apply feature sets.
    :param project_name: Project in which to look the feature set up.
    :param feature_set_name: Name of the feature set.
    :param feature_names: Names of the features; paired positionally with feature_types.
    :param feature_types: Value types of the features.
    :param max_age: Maximum age of the feature values, in seconds.
    :return: The first listed feature set if one exists, otherwise the newly applied one.
    """
    existing = client.list_feature_sets(project=project_name, name=feature_set_name)
    if existing:
        print('feature set already exists: {}'.format(feature_set_name))
        return existing[0]

    # Nothing registered yet: describe the schema (one INT64 entity plus the features) and apply it.
    max_age_duration = Duration(seconds=max_age)
    product_entity = Entity(name='product_id', dtype=ValueType.INT64)
    schema_features = []
    for name, dtype in zip(feature_names, feature_types):
        schema_features.append(Feature(name=name, dtype=dtype))

    new_feature_set = FeatureSet(
        feature_set_name,
        max_age=max_age_duration,
        entities=[product_entity],
        features=schema_features,
    )
    client.apply(new_feature_set)
    return new_feature_set

In [None]:
# Feast client for the project; construct_project creates the project when it is not listed yet.
client = construct_project(project_name=project_name)

In [None]:
# Fetch the feature set, or register it on first use. max_age is 365 days expressed in seconds.
feature_set = construct_feature_set(client, project_name, feature_set_name, FEATURE_NAMES, FEATURE_TYPES, max_age=365*86400)

In [None]:
# Sanity check: names of the entities of the feature set in use.
print([x.name for x in feature_set.entities])

In [None]:
# Sanity check: names of the features of the feature set in use.
print([x.name for x in feature_set.features])

In [None]:
# strftime/strptime pattern used for every scrape-date string in this notebook (e.g. 2019-01-16).
DATE_FORMAT = '%Y-%m-%d'

# Known site identifiers. Each one is used as a path component of the feature blob names.
# The tuple order determines the order in which sites are iterated, so it is left as is.
SITES = (
    'aizel',
    'alexandermcqueen',
    'balenciaga',
    'balmain',
    'bergdorfgoodman',
    'bloomingdales',
    'brownsfashion',
    'burberry',
    'chloe',
    'dolceandgabbana',
    'dsquared2',
    'farfetch',
    'fendi',
    'fwrd',
    'gucci',
    'harrods',
    'jimmychoo',
    'kenzo',
    'luisaviaroma',
    'marni',
    'matchesfashion',
    'miumiu',
    'moncler',
    'mrporter',
    'msgm',
    'mytheresa',
    'neimanmarcus',
    'netaporter',
    'nordstrom',
    'offwhite',
    'ounass',
    'prada',
    'redone',
    'rickowens',
    'saintlaurent',
    'saksfifthavenue',
    'secoo',
    'selfportrait',
    'selfridges',
    'shopbop',
    'ssense',
    'stylebop',
    'stellamccartney',
    'thombrowne',
    'tmall',
    'toryburch',
    'tsum',
    'twentyfoursevres',
    'valentino',
)

In [None]:
class Storage(BaseGoogleClient):
    """Google Cloud Storage client.

    Convenience wrapper around ``google.cloud.storage`` blobs plus two helpers that shell out to
    ``gsutil``.

    NOTE(review): ``BaseGoogleClient`` and ``errors`` are not defined or imported in the visible
    notebook cells, and ``self.client`` is presumably supplied by ``BaseGoogleClient`` from
    ``client_factory`` -- confirm where they come from before running on a fresh kernel.
    """
    client_factory = gcs.Client

    def blob_exists(self, bucket_name, blob_name):
        """
        Simple wrapper function to determine whether a blob_name exists on the given GCS bucket.
        :param bucket_name: GCS bucket name.
        :param blob_name: GCS blob name.
        :return: Boolean
        """
        return self.blob(bucket_name, blob_name).exists()

    def save_to_json(self, obj, bucket_name, blob_name):
        """JSON Serializes a dict and stores it in GCS in the bucket/name provided.

        :param dict obj: A JSON serializable dict.
        :param str bucket_name: GCS bucket name.
        :param str blob_name: GCS blob name.
        """
        blob = self.blob(bucket_name, blob_name)

        with io.BytesIO() as f:
            f.write(json.dumps(obj).encode())
            # rewind=True makes the upload start from the beginning of the buffer.
            blob.upload_from_file(f, rewind=True)

    def load_json(self, bucket_name, blob_name):
        """Downloads and deserializes a JSON file from the bucket/name provided.

        :param str bucket_name: GCS bucket name.
        :param str blob_name: GCS blob name.
        :returns: A dict of the deserialized JSON file.
        """
        with self.bytesio_blob(bucket_name, blob_name) as f:
            obj = json.loads(f.getvalue().decode())

        return obj

    def bucket(self, bucket_name):
        """
        Look up a bucket through the underlying client.
        :param str bucket_name: GCS bucket name.
        :return: The bucket object returned by ``self.client.get_bucket``.
        """
        return self.client.get_bucket(bucket_name)

    def blob(self, bucket_name, blob_name):
        """
        Build a blob handle for the given bucket/name.
        :param str bucket_name: GCS bucket name.
        :param str blob_name: GCS blob name.
        :return: The blob object for ``blob_name`` in ``bucket_name``.
        """
        return self.bucket(bucket_name).blob(blob_name)

    @contextmanager
    def bytesio_blob(self, bucket_name, blob_name):
        """
        Returns the contents of the blob as BytesIO in the target of a context manager
        :param str bucket_name:
        :param str blob_name:
        """
        blob = self.blob(bucket_name, blob_name)
        with io.BytesIO() as f:
            blob.download_to_file(f)
            # Reposition at the start so the caller reads the downloaded bytes from the beginning.
            f.seek(0)
            yield f

    @contextmanager
    def tmpfile_blob(self, bucket_name, blob_name):
        """
        Returns the contents of the blob as temporary file descriptor in the target of a context manager
        :param str bucket_name:
        :param str blob_name:
        """
        blob = self.blob(bucket_name, blob_name)
        with tempfile.NamedTemporaryFile() as f:
            # The download goes through the file *name*, so the handle ``f`` itself is never
            # written to and is still positioned at the start when it is handed to the caller.
            blob.download_to_filename(f.name)
            yield f

    def explode_tgz(self, bucket_name, blob_name, target_dir):
        """
        Downloads .tgz file from GCS and extracts contents into target directory
        :param bucket_name:
        :param blob_name:
        :param target_dir: file system path into which to extract contents of archive
        """
        with tempfile.TemporaryDirectory() as d:
            tgz_filename = os.path.join(d, 'temp.tgz')

            with open(tgz_filename, 'wb') as f:
                self.blob(bucket_name, blob_name).download_to_file(f)

            with tarfile.open(tgz_filename, 'r:gz') as tgz:
                # NOTE(review): extractall trusts the member paths stored in the archive; only use
                # this with archives from a trusted bucket.
                tgz.extractall(path=target_dir)

    def download_uris_from_file(self, uris_filename, target_dir):
        """
        The contents for the file of uris should not contain duplicates to avoid problems with GCS downloads blocking.

        gsutil is expected to be available where this method is executing.

        gsutil can and does fail to download files from GCS on occasion, which is why callers are expected to retry
        (this method runs gsutil exactly once). To prevent repeatedly trying to download the entire list of uris,
        the list is filtered down to the uris whose target file does not exist yet. See:

        https://cloud.google.com/storage/docs/gsutil/addlhelp/ScriptingProductionTransfers

        :param uris_filename: path to file containing one uri per line
        :param target_dir: directory into which files corresponding to the uris will be copied
        :raises: GCSDownloadException
        """
        print('Downloading blobs from uris listed in file {} to {}'.format(uris_filename, target_dir))
        # -I makes gsutil read the list of uris to copy from stdin.
        cmd_args = ['gsutil', '-mq', 'cp', '-I', target_dir]

        # Write the uris that still need downloading to a sibling "<uris_filename>.downloading" file.
        download_filename = '{}.downloading'.format(uris_filename)
        with open(download_filename, 'w') as f_out:
            with open(uris_filename, 'r') as f_in:
                for line in f_in:
                    target_filename = os.path.join(target_dir, line.strip().split('/')[-1])
                    if not os.path.exists(target_filename):
                        f_out.write(line)
        try:
            with open(download_filename, 'r') as stdin:
                subprocess.run(cmd_args, stdin=stdin, shell=False, check=True)
        except subprocess.CalledProcessError as ex:
            msg = 'Unable to download images from file {} to {} after retries'.format(uris_filename, target_dir)
            raise errors.GCSDownloadException(msg) from ex
        finally:
            # Always remove the intermediate uri list, whether or not the download succeeded.
            if os.path.exists(download_filename):
                os.remove(download_filename)

    def copy(self, source, target, is_recursive=False):
        """
        Uses gsutils to copy from source to target
        :param source: local file system path or gs uri
        :param target: local file system path or gs uri
        :param is_recursive: if copy is to be recursive
        :raises errors.GCSDownloadException:
        """
        print('Copying files from {} to {}'.format(source, target))

        cmd_args = ['gsutil', '-mq', 'cp']
        if is_recursive:
            cmd_args.append('-r')

        cmd_args.extend([source, target])

        try:
            subprocess.run(cmd_args, shell=False, check=True)
        except subprocess.CalledProcessError as ex:
            msg = 'Unable to upload from {} to {}'.format(source, target)
            raise errors.GCSDownloadException(msg) from ex


In [None]:
def date_iterator(date_from, date_to):
    """
    Walk day by day through a date window, both ends included.
    :param date_from: The first date of the date window.
    :type date_from: datetime.datetime
    :param date_to: The last date of the date window.
    :type date_to: datetime.datetime
    :return: yields each date of the window formatted with DATE_FORMAT; nothing when
        date_from is later than date_to.
    """
    one_day = dt.timedelta(days=1)
    current = date_from
    while current <= date_to:
        yield current.strftime(DATE_FORMAT)
        current = current + one_day

In [None]:
def construct_date_range(date_string=None, date_from_string=None, date_to_string=None, n_days=None):
    """
    Generate the date strings of a date range.
    Note: all date strings are expected to follow DATE_FORMAT.

    Two modes are supported:
    * date_string given: yields just that date when n_days is None; otherwise a window of
      abs(n_days) days that starts at the date (n_days >= 0) or ends at it (n_days < 0).
    * date_string not given: yields every date from date_from_string to date_to_string
      (today when date_to_string is None); yields nothing when date_from_string is empty or None.

    :param date_string: A string, denoting a date, or None.
    :param date_from_string: A string, denoting a date, or None.
    :param date_to_string: A string, denoting a date, or None.
    :param n_days: An integer or None. If given this is the size of the date range.
    :return: A generator of strings, each denoting a date.
    """
    if date_string is None:
        # Explicit from/to window.
        if not date_from_string:
            return
        window_start = dt.datetime.strptime(date_from_string, DATE_FORMAT)
        window_end = (
            dt.datetime.today()
            if date_to_string is None
            else dt.datetime.strptime(date_to_string, DATE_FORMAT)
        )
        yield from date_iterator(window_start, window_end)
        return

    if n_days is None:
        # Single date: passed through untouched (it is not parsed).
        yield date_string
        return

    # Window anchored on date_string: forward for n_days >= 0, backward otherwise.
    is_forward = n_days >= 0
    anchor = dt.datetime.strptime(date_string, DATE_FORMAT)
    if is_forward:
        window_start = anchor
        window_end = anchor + dt.timedelta(days=n_days - 1)
    else:
        window_end = anchor
        window_start = anchor - dt.timedelta(days=abs(n_days) - 1)
    yield from date_iterator(window_start, window_end)


In [None]:
def construct_features_blob_name(preprocessing_system_version, site, market, scrape_date):
    """
    Construct the GCS blob name for the pre-processed product features.

    The components are joined with posixpath so that the separator is always '/', which is what
    GCS object names use; os.path.join would produce backslashes on Windows.

    :param preprocessing_system_version: The version of the preprocessing system
    :param site: The site of the products
    :param market: The market of the products
    :param scrape_date: The scrape date of the products
    :return: The blob name '<version>/<site>/<market>/<scrape_date>' of the pre-processed product image features.
    """
    return posixpath.join(preprocessing_system_version, site, market, scrape_date)


In [None]:
def construct_features_blob_names(
        preprocessing_system_version, markets, scrape_date_from, scrape_date_to=None, sites=None):
    """
    Construct a list of blob names for the pre-processed product features.
    :param preprocessing_system_version: The version of the preprocessing system
    :param markets: The market(s) of the products; a single str is treated as a one-element list
    :param scrape_date_from: The lower bound on the scrape date of the products
    :param scrape_date_to: The upper bound on the scrape date of the products; today when None
    :param sites: The sites of the products; defaults to the module-level SITES when None
    :return: The blob names of the pre-processed product image features, ordered by scrape date,
        then site, then market.
    """
    markets = [markets] if isinstance(markets, str) else markets
    if sites is None:
        # Use the SITES tuple defined in this notebook. There is no `constants` module in scope
        # here, so `constants.SITES` would raise a NameError.
        sites = SITES
    scrape_dates = construct_date_range(date_from_string=scrape_date_from, date_to_string=scrape_date_to)

    return [construct_features_blob_name(preprocessing_system_version, s, market, sd)
            for sd in scrape_dates
            for s in sites
            for market in markets]

In [None]:
def load_features_from_gcs(
    preprocessing_system_version,
    storage,
    bucket_name,
    markets,
    scrape_date_from,
    scrape_date_to=None,
    sites=None,
):
    """
    Stream the product features of every existing blob that matches the given criteria.
    :param preprocessing_system_version: version of the preprocessing system, first component of the blob names
    :param storage: storage client providing blob_exists and tmpfile_blob
    :param bucket_name: GCS bucket name
    :param markets: List of markets or single str of market code.
    :param scrape_date_from: str in format %Y-%m-%d e.g. 2018-01-01
    :param scrape_date_to: (optional) str in format %Y-%m-%d e.g. 2018-01-01, today if not specified
    :param sites: list of str, passed through to construct_features_blob_names
    :yields: the feature records unpacked from each blob, market by market.
    """
    market_codes = [markets] if isinstance(markets, str) else markets

    for market_code in market_codes:
        candidate_blob_names = construct_features_blob_names(
            preprocessing_system_version,
            market_code,
            scrape_date_from,
            scrape_date_to=scrape_date_to,
            sites=sites,
        )
        for candidate in candidate_blob_names:
            # Not every site/market/date combination has a blob; missing ones are skipped.
            if not storage.blob_exists(bucket_name, candidate):
                continue
            yield from load_features_from_blob(storage, bucket_name, candidate)



In [None]:
def load_features_from_blob(storage, bucket_name, blob_name):
    """
    Fetch one blob from Google Cloud Storage into a temporary file and stream the msgpack
    objects it contains.
    :param storage: storage client providing the tmpfile_blob context manager
    :param bucket_name: name of GCS bucket
    :param blob_name: name of GCS blob
    :yields: each object unpacked from the blob, decoded with the msgpack_numpy object hook
    """
    with storage.tmpfile_blob(bucket_name, blob_name) as local_copy:
        unpacker = msgpack.Unpacker(local_copy, object_hook=msgpack_numpy.decode, raw=False)
        # The temporary file only lives inside this `with`, so records are yielded from within it.
        for record in unpacker:
            yield record


In [None]:
def construct_product_id(feature_dict):
    """
    Derive the product id used as the entity key from a feature record.
    :param feature_dict: mapping holding a 'site_product_id' entry
    :return: the 'site_product_id' value converted to int
    """
    raw_product_id = feature_dict['site_product_id']
    return int(raw_product_id)
    

In [None]:
# Storage client instance handed to the feature loading functions below.
storage = Storage()

In [None]:
def ingest_features_to_store(feature_set, storage, bucket_name, markets, scrape_date_from, scrape_date_to, sites, preprocessing_system_version, feast_client=None):
    """
    Load the product features matching the given criteria from GCS and ingest them into Feast.

    :param feature_set: Feast feature set the rows are ingested into
    :param storage: storage client used to read the feature blobs
    :param bucket_name: GCS bucket name
    :param markets: list of markets or single str of market code
    :param scrape_date_from: lower bound of the scrape date, str in DATE_FORMAT
    :param scrape_date_to: upper bound of the scrape date, str in DATE_FORMAT
    :param sites: list of site names
    :param preprocessing_system_version: version of the preprocessing system
    :param feast_client: (optional) Feast client used for the ingestion; the notebook-level
        `client` is used when not given
    :return: list of the product ids of the loaded features (empty when nothing was found)
    """
    features = list(load_features_from_gcs(
        preprocessing_system_version,
        storage,
        bucket_name,
        markets,
        scrape_date_from,
        scrape_date_to,
        sites=sites))

    print('Number of features for date {0}: {1}'.format(scrape_date_from, len(features)))

    if not features:
        # Nothing to ingest: skip building the frame and do not touch the Feast client.
        return []

    if feast_client is None:
        feast_client = client

    # Every row gets the same timestamp: midnight (UTC) of the day the ingestion runs. It is
    # computed once so that all rows agree even if the run crosses midnight.
    ingestion_timestamp = dt.datetime.now(utc).replace(hour=0, minute=0, second=0, microsecond=0)

    product_ids = [construct_product_id(x) for x in features]
    # presumably each feature value is a numpy array decoded by msgpack_numpy -- TODO confirm
    product_features = pd.DataFrame(
        {
            "datetime": [ingestion_timestamp] * len(features),
            "product_id": product_ids,
            "texture_embedding": [x['texture_embedding'].squeeze() for x in features],
            # The records name this feature 'spoc'; the feature set calls it 'spoc_embedding'.
            "spoc_embedding": [x['spoc'].squeeze() for x in features],
            "local_histogram": [np.hstack(x['local_histogram']) for x in features],
            "ccv": [np.hstack(x['ccv']) for x in features],
        })

    feast_client.ingest(feature_set, product_features, chunk_size=10)

    return product_ids

In [None]:
def construct_store_keys(storage, bucket_name, markets, scrape_date_from, scrape_date_to, sites, preprocessing_system_version):
    """
    Collect the product ids of the features matching the given criteria, without ingesting anything.
    :param storage: storage client used to read the feature blobs
    :param bucket_name: GCS bucket name
    :param markets: list of markets or single str of market code
    :param scrape_date_from: lower bound of the scrape date, str in DATE_FORMAT
    :param scrape_date_to: upper bound of the scrape date, str in DATE_FORMAT
    :param sites: list of site names
    :param preprocessing_system_version: version of the preprocessing system
    :return: list of product ids, one per loaded feature record
    """
    feature_stream = load_features_from_gcs(
        preprocessing_system_version,
        storage,
        bucket_name,
        markets,
        scrape_date_from,
        scrape_date_to,
        sites=sites,
    )
    loaded_features = list(feature_stream)

    print('Number of features for date {0}: {1}'.format(scrape_date_from, len(loaded_features)))

    product_ids = []
    for feature_dict in loaded_features:
        product_ids.append(construct_product_id(feature_dict))
    return product_ids

In [None]:
# Ingest one scrape date at a time and keep track of every product id that was stored.
stored_product_keys = []

ingestion_dates = construct_date_range(date_from_string=scrape_date_from, date_to_string=scrape_date_to)
for scrape_date in ingestion_dates:
    # Using the same date as both bounds restricts each call to a single scrape date.
    daily_product_ids = ingest_features_to_store(
        feature_set,
        storage,
        bucket_name,
        markets,
        scrape_date_from=scrape_date,
        scrape_date_to=scrape_date,
        sites=sites,
        preprocessing_system_version=preprocessing_system_version,
    )
    stored_product_keys += daily_product_ids