In [20]:
import cPickle as pickle
import os
import datetime
from collections import defaultdict
import heapq
from eutils.utils.logger import logger
from clean_titles import encode_string, tokenize_title_string, remove_words_list, remove_numeric_list, remove_chars, STOP_WORDS, HTML_PARSER

In [2]:
def merge_dicts(dicts, defaultdict=defaultdict, int=int):
    """ (list(dict), type, type) -> dict

    Returns a single dictionary given a list of dictionaries.
    Values with the same keys are summed and assigned to the key.

    :param dicts:
    :param defaultdict:
    :param int:
    :return:

    >>> merge_dicts([{'A': 1}, {'B': 2}])
    defaultdict(<type 'int'>, {'A': 1, 'B': 2})
    >>> merge_dicts([{'A': 1}, {'B': 2}, {'C': 3}, {'A': 10}])
    defaultdict(<type 'int'>, {'A': 11, 'C': 3, 'B': 2})
    """

    merged = defaultdict(int)
    for d in dicts:
        for k in d:
            merged[k] += d[k]

    return merged


def get_score(tokens, ngram_dict, int_to_category_dict, top_n):
    dict_list = []

    # get list of dictionaries based on tokens
    for token in tokens:
        try:
            dict_list.append(ngram_dict[token])
        except KeyError:
            pass

    # Merge list of dicts together and add values
    score = merge_dicts(dict_list)

    # Get top n regional ids based on score
    top_n_cats = heapq.nlargest(top_n, score, key=score.get)
    
    # Convert integers back to categories
    top_n_cats = [int_to_category_dict[idx] for idx in top_n_cats]

    return top_n_cats

In [3]:
x = ['bookshelf', 'wood', 'clock']

In [8]:
get_score(x, tfidf_dict, int_to_category_dict, 3)

['Toys & Games -> Hobbies -> Trains & Accessories -> Train Cars -> Freight Cars',
 'Home & Kitchen -> Furniture -> Home Office Furniture -> Home Office Desks',
 'Electronics -> Computers & Accessories -> Laptop & Netbook Computer Accessories']

### Load dictionary

In [5]:
def load_dict(dict_dir='categorize', dict_name='tfidf_dict'):
    """ (str, str) -> defaultdict

    Loads a dictionary for categorization into memory

    :param tfidf_dict:
    :param dict_dir:
    :param dict_name:
    :return:
    """
    output_dir_path = os.path.join(dict_dir, dict_name + '.pickle')

    with open(output_dir_path, 'rb') as handle:
        logger.info('Dictionary loading from: {}/{}.pickle'.format(dict_dir, dict_name))
        return pickle.load(handle)

In [6]:
tfidf_dict, int_to_category_dict = load_dict('../data/model', 'categorization_dicts_small')

2016-12-21 16:37:06,246 - Dictionary loading from: ../data/model/categorization_dicts_small.pickle


### Class for single categorization

In [22]:
class Title:

    def __init__(self, title):
        self.title = title
        self.processed_title = None

    def prepare(self, excluded='-.'):
        """ (str) -> list(str)

        Returns the title after it has been prepared by the process from clean titles

        :return:
        >>> CategorizeSingle('Crème brûlée &quot; &amp; &nbsp;').prepare()
        ['creme', 'brulee']
        >>> CategorizeSingle('test hyphen-word 0.9 20% green/blue').prepare()
        ['test', 'hyphen-word', '0.9']
        >>> CategorizeSingle('grapes come in purple and green').prepare()
        ['grapes', 'come']
        >>> CategorizeSingle('what remains of a word ! if wordlen is 2').prepare()
        ['remains', 'word', 'wordlen']
        """

        self.title = encode_string(self.title, HTML_PARSER)
        self.title = self.title.lower()
        self.title = tokenize_title_string(self.title, excluded)
        self.title = remove_words_list(self.title, STOP_WORDS)
        self.title = remove_numeric_list(self.title)
        self.title = remove_chars(self.title, 1)
        logger.info(self.title)
        return self

    def categorize(self):
        """ (CategorizeSingle(str)) -> dict

        Categorizes prepared title and returns a dictionary of form {1: 'Cat1', 2: 'Cat2', 3: 'Cat3}

        :return:
        """
        
        result_list = get_score(self.title, tfidf_dict, int_to_category_dict, 3)
        result_dict = dict()
        for i, category in enumerate(result_list):
            result_dict[i+1] = category

        return result_dict


In [14]:
x = CategorizeSingle('This is a bookshelf with wood and a clock').prepare()
logger.info(x)

2016-12-21 16:40:01,781 - ['bookshelf', 'wood', 'clock']
2016-12-21 16:40:01,782 - <__main__.CategorizeSingle instance at 0x11914bc20>


In [15]:
# get_score(x, tfidf_dict, 3)

In [16]:
CategorizeSingle('This is a bookshelf made with wood and a clock').prepare()

2016-12-21 16:40:02,579 - ['bookshelf', 'made', 'wood', 'clock']


<__main__.CategorizeSingle instance at 0x11914be60>

In [17]:
CategorizeSingle('This is a bookshelf with wood and a clock').prepare().categorize()

2016-12-21 16:40:03,275 - ['bookshelf', 'wood', 'clock']


{1: 'Toys & Games -> Hobbies -> Trains & Accessories -> Train Cars -> Freight Cars',
 2: 'Home & Kitchen -> Furniture -> Home Office Furniture -> Home Office Desks',
 3: 'Electronics -> Computers & Accessories -> Laptop & Netbook Computer Accessories'}

In [18]:
def categorize_single(title):
    """ (str) -> dict

    Initializes given title as CategorizeSingle class and returns a dictionary of top 3 options

    :param title:
    :return:
    """
    start_time = datetime.datetime.now()

    result = Title(title).prepare().categorize()

    end_time = datetime.datetime.now()
    elapsed_time = end_time - start_time
    elapsed_time = elapsed_time.total_seconds() * 1000
    logger.debug('Time taken: {} ms'.format(elapsed_time))

    return result, elapsed_time

In [71]:
@timer
def categorize_single_raw(title):
    """ (str) -> dict

    Initializes given title as CategorizeSingle class and returns a dictionary of top 3 options

    :param title:
    :return:
    """
    result = Title(title).prepare().categorize()

    return result

In [75]:
categorize_single('This is a bookshelf with wood and a clock')

2016-12-21 17:06:45,034 - ['bookshelf', 'wood', 'clock']


({1: 'Toys & Games -> Hobbies -> Trains & Accessories -> Train Cars -> Freight Cars',
  2: 'Home & Kitchen -> Furniture -> Home Office Furniture -> Home Office Desks',
  3: 'Electronics -> Computers & Accessories -> Laptop & Netbook Computer Accessories'},
 1.391)

In [76]:
categorize_single_raw('This is a bookshelf with wood and a clock')

2016-12-21 17:06:45,535 - ['bookshelf', 'wood', 'clock']


({1: 'Toys & Games -> Hobbies -> Trains & Accessories -> Train Cars -> Freight Cars',
  2: 'Home & Kitchen -> Furniture -> Home Office Furniture -> Home Office Desks',
  3: 'Electronics -> Computers & Accessories -> Laptop & Netbook Computer Accessories'},
 1.125)

In [74]:
def timer(function_to_time):
    
    def wrapper(*args, **kwargs):
        start_time = datetime.datetime.now()
        
        result = function_to_time(*args)
        
        end_time = datetime.datetime.now()
        elapsed_time = end_time - start_time
        elapsed_time = elapsed_time.total_seconds() * 1000
        logger.debug('Time taken: {} ms'.format(elapsed_time))
        
        return result, elapsed_time
        
    return wrapper