In [1]:
import pandas as pd
import urllib
import os
import math
import simplejson as json
from logger import logger

In [5]:
pd.set_option('display.max_colwidth', 500)

### Import metadata

In [61]:
df = pd.read_csv('../data/metadata.csv')

In [65]:
# Save image and category data
# df = df[['asin', 'imUrl', 'categories']]
# df.to_csv('../data/image_category.csv', index=False)

### Prepare data

In [5]:
df = df[['asin', 'imUrl', 'categories']]

In [7]:
def get_category_path(category_path_list):
    """
    (Str of list of list(s)) -> str

    Returns the category path given a string of list of lists of 
    categories. If there are more than one list of categories provided, 
    returns the category path from the first list.

    >>> get_category_path("[['A', 'B', 'C'], ['D', 'E', 'F', 'G']]")
    'A -> B -> C'
    >>> get_category_path("[['P1', 'P2', 'P3', 'P4']]")
    'P1 -> P2 -> P3 -> P4'

    :type category_path_list: str
    :param category_path_list: A string containing a list of at least 
    one list of categories
    :return: A string showing the full category path of the FIRST 
    category in the list (assumed to be primary category)
    """
    try:
        return ' -> '.join(eval(category_path_list)[0])
    except IndexError: # Error if the outer list is empty
        return 'no_category'
    except TypeError: # Error if the outer list is missing
        return 'no_category'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [9]:
# Create column for category path
df['category_path'] = df['categories'].apply(get_category_path)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [11]:
df.drop(labels='categories', axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


### Download images

In [59]:
def download_images(df, output_dir):
    """
    (DataFrame) -> Images separated by categories into directories
    
    Downloads images from imUrl provided and saves them into directories 
    based on the product category.
    
    >>> download_images(df, output_dir):
    INFO: 1,000 images downloaded
    INFO: 2,000 images downloaded
    ...
    ...
    ...
    INFO: 20,000 images downloaded
    INFO: Image downloads complete!
    
    :param df: Dataframe containing product ID (asin), image url (imUrl), and category (category_path)
    :param output_dir: Directory path to where to store images (../data/images)
    """
    for i, row in df.iterrows():
        product_id = row['asin']
        url = row['imUrl']
        category_path = row['category_path']
        logger.info('Category: {}, URL: {}'.format(category_path, url))
        
        dir_path = '{}/{}'.format(output_dir, category_path)
 
        try: 
            urllib.urlretrieve(url, '{}/{}.jpg'.format(dir_path, product_id))
        except IOError:  # If category_path directory has not been created yet
            if not os.path.exists(dir_path):
                os.makedirs(dir_path)
            urllib.urlretrieve(url, '{}/{}.jpg'.format(dir_path, product_id))
        except AttributeError:  # If url cannot be processed
            continue
            
        if i % 10 == 0:
            logger.info('{:,} images downloaded'.format(i))
    
    logger.info('Image downloads complete!')

In [60]:
download_images(df, '../data/images')

2016-10-14 15:21:27,176 - Category: Books, URL: http://ecx.images-amazon.com/images/I/51MKP0T4DBL.jpg
INFO:__log__:Category: Books, URL: http://ecx.images-amazon.com/images/I/51MKP0T4DBL.jpg
2016-10-14 15:21:27,447 - 0 images downloaded
INFO:__log__:0 images downloaded
2016-10-14 15:21:27,448 - Category: Movies & TV -> Movies, URL: http://g-ecx.images-amazon.com/images/G/01/x-site/icons/no-img-sm._CB192198896_.gif
INFO:__log__:Category: Movies & TV -> Movies, URL: http://g-ecx.images-amazon.com/images/G/01/x-site/icons/no-img-sm._CB192198896_.gif
2016-10-14 15:21:27,879 - Category: Clothing, Shoes & Jewelry -> Girls, URL: http://ecx.images-amazon.com/images/I/31mCncNuAZL.jpg
INFO:__log__:Category: Clothing, Shoes & Jewelry -> Girls, URL: http://ecx.images-amazon.com/images/I/31mCncNuAZL.jpg
2016-10-14 15:21:27,895 - Category: Sports & Outdoors -> Other Sports -> Dance -> Clothing -> Girls -> Skirts, URL: http://ecx.images-amazon.com/images/I/51EzU6quNML._SX342_.jpg
INFO:__log__:Categor

KeyboardInterrupt: 

In [16]:
def download_image(url, product_id, save_path):
    """

    Downloads image from url and saves to save path, naming the image product ID

    :param url:
    :param product_id:
    :param save_path:
    :return:
    """
    try:
        urllib.urlretrieve(url, '{}/{}.jpg'.format(save_path, product_id))
        return True
    except IOError as io:
        logger.error('Download error: {} | Reason: {}'.format(url, io))
        return False

In [17]:
download_image('http://ecx.images-amazon.om/images/I/31iqJpBWiFL._SY300_.jpg', 
               'test_id', '../data')

2016-10-15 09:28:37,273 - Download error: http://ecx.images-amazon.om/images/I/31iqJpBWiFL._SY300_.jpg | Reason: [Errno socket error] [Errno 8] nodename nor servname provided, or not known
ERROR:__log__:Download error: http://ecx.images-amazon.om/images/I/31iqJpBWiFL._SY300_.jpg | Reason: [Errno socket error] [Errno 8] nodename nor servname provided, or not known


False

In [2]:
with open('../data/image_download_logs/base_log.json') as json_log:
    i = 0
    for line in json_log:
        try:
            entry = json.loads(line.strip())
        except ValueError as e:
            logger.error('Json error: {} on line {}'.format(e, i))
        i += 1