In [1]:
import ast

import numpy as np
import pandas as pd

from eutils.utils.logger import logger

In [2]:
# Allow up to 500 characters for both the overall display width and the
# contents of a single column when pandas renders a frame.
for option_name in ('display.width', 'display.max_colwidth'):
    pd.set_option(option_name, 500)

In [3]:
# Load the category metadata and log how many rows were read
df = pd.read_csv('../data/metadata_categories_only.csv')
logger.info('No. of rows in data: {}'.format(len(df)))

2016-07-17 20:44:32,477 - No. of rows in data: 9430088
INFO:__log__:No. of rows in data: 9430088


In [4]:
# Drop every row that has a missing value in any column (how='any').
# Note: the log text says "columns", but it is rows that are removed.
df = df.dropna(how='any')
logger.info('No. of rows after dropping columns with missing values: {}'.format(len(df)))

2016-07-17 20:44:37,239 - No. of rows after dropping columns with missing values: 7975697
INFO:__log__:No. of rows after dropping columns with missing values: 7975697


In [5]:
def get_category_lvl1(category_path_list):
    """
    (String of list of list(s)) -> str

    Returns the top level category given a string of a list of lists of categories.
    If there are more than one list of categories provided, returns the top level category from the first list.

    The string is parsed with ``ast.literal_eval`` so that only Python literals are
    accepted; arbitrary expressions in the input are never executed.

    >>> get_category_lvl1("[['A', 'B', 'C'], ['D', 'E', 'F', 'G']]")
    'A'
    >>> get_category_lvl1("[['P1', 'P2', 'P3', 'P4']]")
    'P1'
    >>> get_category_lvl1("[['']]")
    ''
    >>> get_category_lvl1("[]")
    'no_category'

    :type category_path_list: str
    :param category_path_list: A string containing a list of at least one list of categories
    :return: The first element of the FIRST category list (assumed to be primary category),
        or 'no_category' when the input is empty, is not a string, or cannot be parsed
        as a Python literal
    """
    try:
        return ast.literal_eval(category_path_list)[0][0]
    except (IndexError, TypeError, ValueError, SyntaxError):
        # IndexError: empty outer or inner list.
        # TypeError: parsed value is not indexable.
        # ValueError: non-string input (e.g. NaN) or a non-literal expression.
        # SyntaxError: text that is not valid Python.
        return 'no_category'
    
# Test cases for get_category_lvl1: (raw categories string, expected top-level category)
lvl1_cases = [
    ("[['Clothing, Shoes & Jewelry', 'Girls'], ['Clothing, Shoes & Jewelry', 'Novelty, Costumes & More', 'Costumes & Accessories', 'More Accessories', 'Kids & Baby']]",
     'Clothing, Shoes & Jewelry'),
    ("[['Patio, Lawn & Garden', 'Patio Furniture & Accessories', 'Patio Seating', 'Sofas']]",
     'Patio, Lawn & Garden'),
    ("[['']]", ''),
]
for raw_categories, expected_lvl1 in lvl1_cases:
    assert get_category_lvl1(raw_categories) == expected_lvl1

In [6]:
def get_category_path(category_path_list):
    """
    (String of list of list(s)) -> str

    Returns the category path given a string of list of lists of categories.
    If there are more than one list of categories provided, returns the category path from the first list.

    The string is parsed with ``ast.literal_eval`` so that only Python literals are
    accepted; arbitrary expressions in the input are never executed.

    >>> get_category_path("[['A', 'B', 'C'], ['D', 'E', 'F', 'G']]")
    'A -> B -> C'
    >>> get_category_path("[['P1', 'P2', 'P3', 'P4']]")
    'P1 -> P2 -> P3 -> P4'
    >>> get_category_path("[]")
    'no_category'

    :type category_path_list: str
    :param category_path_list: A string containing a list of at least one list of categories
    :return: The levels of the FIRST category list (assumed to be primary category) joined
        with ' -> ', or 'no_category' when the input is empty, is not a string, or cannot
        be parsed as a Python literal
    """
    try:
        return ' -> '.join(ast.literal_eval(category_path_list)[0])
    except (IndexError, TypeError, ValueError, SyntaxError):
        # IndexError: empty outer list.
        # TypeError: parsed value is not indexable, or the levels are not all strings.
        # ValueError: non-string input (e.g. NaN) or a non-literal expression.
        # SyntaxError: text that is not valid Python.
        return 'no_category'

# Test cases for get_category_path
# Expected values use the ' -> ' separator that get_category_path emits.
assert get_category_path("[['Clothing, Shoes & Jewelry', 'Girls'], ['Clothing, Shoes & Jewelry', 'Novelty, Costumes & More', 'Costumes & Accessories', 'More Accessories', 'Kids & Baby']]") == 'Clothing, Shoes & Jewelry -> Girls'
assert get_category_path("[['Patio, Lawn & Garden', 'Patio Furniture & Accessories', 'Patio Seating', 'Sofas']]") == 'Patio, Lawn & Garden -> Patio Furniture & Accessories -> Patio Seating -> Sofas'
# An empty inner list joins to the empty string
assert get_category_path("[[]]") == ''

In [7]:
# Derive each row's top-level category from its raw `categories` string
df['category_lvl1'] = df['categories'].map(get_category_lvl1)

In [8]:
# Drop rows whose top-level category is the empty string.
# (The log text says "columns", but it is rows that are filtered out.)
has_lvl1 = df['category_lvl1'] != ''
df = df[has_lvl1]
logger.info('No. of rows after dropping columns with no category data: {}'.format(len(df)))

2016-07-17 20:46:12,483 - No. of rows after dropping columns with no category data: 7974462
INFO:__log__:No. of rows after dropping columns with no category data: 7974462


In [9]:
# Count titles per top-level category, largest category first
lvl1_title_counts = df.groupby('category_lvl1').agg({'title': 'count'})
category_df = lvl1_title_counts.sort_values(by='title', ascending=False).reset_index()

In [10]:
# Exclude categories where titles are not indicative of category
non_indicative = ['Books', 'CDs & Vinyl', 'Movies & TV', 'Musical Instruments']
category_df = category_df[~category_df['category_lvl1'].isin(non_indicative)]

In [11]:
# Exclude some other categories.
# 'Musical Instruments' is also filtered by the preceding cell, so listing it
# here has no further effect when the cells run in order.
other_excluded = [
    'Musical Instruments',
    'Amazon Fashion',
    'All Electronics',
    'All Beauty',
    'Collectibles & Fine Art',
    'Grocery & Gourmet Food',
    'Pet Supplies',
]
category_df = category_df[~category_df['category_lvl1'].isin(other_excluded)]

In [12]:
# Keep categories that have more than 1500 titles
enough_titles = category_df['title'] > 1500
category_df = category_df[enough_titles]

In [13]:
# Display the remaining top-level categories with their title counts
category_df

Unnamed: 0,category_lvl1,title
1,"Clothing, Shoes & Jewelry",1435416
2,Sports & Outdoors,528615
3,Electronics,488592
4,Home & Kitchen,435238
6,Cell Phones & Accessories,344535
7,Toys & Games,334659
8,Automotive,326615
9,Tools & Home Improvement,268130
10,Health & Personal Care,261943
11,Beauty,258726


In [None]:
# Keep only the rows whose top-level category is still present in category_df
kept_lvl1 = category_df['category_lvl1']
df = df[df['category_lvl1'].isin(kept_lvl1)]
logger.info('No. of rows after dropping categories where count < 1500: {}'.format(len(df)))

In [None]:
# Derive each row's full category path (levels joined by ' -> ') from `categories`
df['category_path'] = df['categories'].map(get_category_path)

In [None]:
# Count titles per category path, most populated path first
path_title_counts = df.groupby('category_path').agg({'title': 'count'})
category_path_df = path_title_counts.sort_values(by='title', ascending=False).reset_index()

In [None]:
# Keep category_paths that have at least 20 titles.
# The threshold lives in one variable so the filter and the log message
# cannot drift apart.
min_titles_per_path = 20
category_path_df = category_path_df[category_path_df['title'] >= min_titles_per_path]
logger.info('No. of category_paths after excluding those with < {} products: {}'.format(
    min_titles_per_path, category_path_df.shape[0]))

In [None]:
# Drop paths that consist of a top-level category only: a path without the
# '->' separator has a single level.
is_nested = category_path_df['category_path'].str.contains('->')
category_path_df = category_path_df[is_nested]
logger.info('No. of category_paths after excluding top level categories: {}'.format(len(category_path_df)))

In [None]:
# Preparation for the deepest-category check: sort the paths alphabetically and
# place each row's successor path beside it. Missing values (such as the last
# row's successor) are filled with 'no_comparison'.
category_path_df = category_path_df.sort_values(by='category_path')
category_path_df = category_path_df.assign(
    category_path_next=category_path_df['category_path'].shift(-1)
).fillna('no_comparison')

In [None]:
# Create list of category_paths which are deepest category.
#
# A path is "deepest" when no other remaining path extends it by a further
# ' -> ' level. Checking `path in next_path` on sorted neighbours is a plain
# substring test, so a leaf such as 'X -> Computers' would be discarded when the
# next row is the sibling 'X -> Computers & Accessories'. Instead, collect every
# proper ancestor of every path and keep the paths that are nobody's ancestor.
path_separator = ' -> '
all_category_paths = category_path_df['category_path'].tolist()

ancestor_paths = set()
for category_path in all_category_paths:
    levels = category_path.split(path_separator)
    # Every proper prefix (1 .. len-1 levels) is an ancestor of this path
    for depth in range(1, len(levels)):
        ancestor_paths.add(path_separator.join(levels[:depth]))

# Preserve the frame's row order in the resulting list
category_path_list = [
    category_path for category_path in all_category_paths
    if category_path not in ancestor_paths
]

In [None]:
# Wrap the deepest category paths in a single-column DataFrame
category_path_df = pd.DataFrame(data=category_path_list, columns=['category_path'])
logger.info('No. of category_paths at deepest category: {}'.format(len(category_path_df)))

In [None]:
# Keep only the rows whose category_path is listed in category_path_df
deepest_paths = category_path_df['category_path']
df = df[df['category_path'].isin(deepest_paths)]
logger.info('No. of rows in deepest category: {}'.format(len(df)))