In [1]:
import pandas as pd
import numpy as np
import regex as re
from nltk.corpus import stopwords
import unicodedata
import string
import sys
import os
import matplotlib
import math
import heapq
from collections import defaultdict
from HTMLParser import HTMLParser
from sklearn.cross_validation import train_test_split
from eutils.utils.logger import logger

In [2]:
html_parser = HTMLParser()

In [3]:
pd.set_option('display.max_colwidth', 500)

### Load Data

In [4]:
df = pd.read_csv('../data/output/title_category_keep_samp.csv')

In [5]:
# Exclude categories where count < 3
cat_count = df[['title', 'category_path']].groupby('category_path').count().reset_index().sort_values(by='title')

# Filter those with > 10 categories
cat_count = cat_count[cat_count['title'] > 5]
cat_count.rename(columns={'title': 'cat_count'}, inplace=True)

# Filter df to only include those with cat_count > 5
df = df.merge(cat_count, on='category_path', how='inner')

# Keep only necessary columns
df = df[['asin', 'title', 'category_path']]

logger.info('Data loaded of size: {}'.format(df.shape))

2016-06-12 12:14:30,877 - Data loaded of size: (991078, 3)
INFO:__log__:Data loaded of size: (991078, 3)


### Process data

In [6]:
STOP_WORDS = set(stopwords.words('english'))
SPAM_WORDS = {'import', 'export', 'day', 'week', 'month', 'year', 'new', 'free', 'international', 'intl', 'oem', ''}
COLOURS = set(matplotlib.colors.cnames.keys())
STOP_WORDS = STOP_WORDS.union(SPAM_WORDS).union(COLOURS)

In [12]:
# Remove records with no category form df
def remove_no_category(df, category='category_path'):
    """ (DataFrame, str) -> DataFrame

    Returns a dataframe where the missing categories have been dropped.

    :param df:
    :param category:
    :return:
    """

    df = df[df[category] != np.nan]
    return df


# Function to encode string
def encode_string(title, parser):
    """ (str) -> str

    Returns a string that is encoded as ascii
    Note: While unicode(title, 'utf-8', 'ignore') seems to work correctly in doctest, it has led to errors in the past.
    If so, use iso-8859-1.

    :param title:
    :return:

    >>> encode_string('Crème brûlée')
    'Creme brulee'
    >>> encode_string('åöûëî')
    'aouei'
    """

    try:
        encoded_title = unicodedata.normalize('NFKD', unicode(title, 'utf-8', 'ignore')).encode('ascii', 'ignore')
        encoded_title = parser.unescape(encoded_title).encode('ascii', 'ignore')
    except TypeError:  # if title is missing and a float
        encoded_title = 'NA'

    return encoded_title


# Encode titles in df
def encode_title(df, title='title_processed', parser=html_parser):
    """ (DataFrame, str) -> DataFrame

    Returns a dataframe where the title has been encoded.

    :param df:
    :param title:
    :return:
    """

    df[title] = df[title].apply(encode_string, args=(parser, ))
    logger.info('{} encoded'.format(title))
    return df


# Lowercase titles in df
def lowercase_title(df, title='title_processed'):
    """ (DataFrame, str) -> DataFrame

    Returns a dataframe where the title has been lowercased.

    :param df:
    :param title:
    :return:
    """

    df[title] = df[title].apply(string.lower)
    logger.info('{} lowercased'.format(title))
    return df


# Tokenize strings
def tokenize_title_string(title, excluded):
    """ (str) -> list(str)

    Returns a list of string tokens given a string.
    It will exclude the following characters from the tokenization: - / . %

    :param title:
    :return:

    >>> tokenize_title_string('hello world')
    ['hello', 'world']
    >>> tokenize_title_string('test hyphen-word 0.9 20% green/blue')
    ['test', 'hyphen-word', '0.9', '20%', 'green/blue']
    """

    return re.split("[^" + excluded + "\w]+", title)


# Tokenize titles in df
def tokenize_title(df, title='title', excluded='-/.%'):
    """ (DataFrame, str) -> DataFrame

    Returns a dataframe where the title has been tokenized based on function tokenize_title_string

    :param df:
    :param title:
    :return:
    """

    df[title] = df[title].apply(tokenize_title_string, args=(excluded, ))
    logger.info('{} tokenized'.format(title))
    return df


# Remove stopwords from string
def remove_words(title, words_to_remove):
    """ (list(str), set) -> list(str)

    Returns a list of tokens where the stopwords/spam words/colours have been removed

    :param title:
    :param words_to_remove:
    :return:
    >>> remove_words(['python', 'is', 'the', 'best'], STOP_WORDS)
    ['python', 'best']
    >>> remove_words(['grapes', 'come', 'in', 'purple', 'and', 'green'], STOP_WORDS)
    ['grapes', 'come']
    >>> remove_words(['spammy', 'title', 'intl', 'buyincoins', 'export'], STOP_WORDS)
    ['spammy', 'title']
    """

    return [token for token in title if token not in words_to_remove]


# Remove stopwords from df
def remove_stopwords(df, stopwords, title='title_processed'):
    """ (DataFrame, set, str) -> DataFrame

    Returns a DataFrame where the stopwords have been removed from the titles

    :param df:
    :param stopwords:
    :param title:
    :return:
    """
    df[title] = df[title].apply(remove_words, args=(stopwords, ))
    logger.info('{} stopwords removed'.format(title))
    return df


# Remove words with character count below threshold from string
def remove_chars(title, word_len=1):
    """ (list(str), int) -> list(str)

    Returns a list of str (tokenized titles) where tokens of character length =< word_len is removed.

    :param title:
    :param word_len:
    :return:

    >>> remove_chars(['what', 'remains', 'of', 'a', 'word', '!', ''], 1)
    ['what', 'remains', 'of', 'word']
    >>> remove_chars(['what', 'remains', 'of', 'a', 'word', '!', '', 'if', 'word_len', 'is', '2'], 2)
    ['what', 'remains', 'word', 'word_len']
    """

    return [token for token in title if len(token) > word_len]


# Remove words that are fully numeric
def remove_numeric(title):
    """ (list(str)) -> list(str)

    Remove words which are fully numeric

    :param title:
    :return:

    >>> remove_numeric(['A', 'B', '1', '123', 'C'])
    ['A', 'B', 'C']
    """

    return [token for token in title if not token.isdigit()]


# Remove words that are solely numeric from df
def remove_numeric_from_df(df, title='title_processed'):
    df[title] = df[title].apply(remove_numeric)
    logger.info('{} solely numeric words removed'.format(title))
    return df


# Remove words that have words == 1 char from title
def remove_one_char_words(df, word_len=1, title='title_processed'):
    """ (DataFrame, int, str) -> DataFrame

    Returns a DataFrame where tokens of character length <= word_len is removed

    :param df:
    :param word_len:
    :param title:
    :return:
    """

    df[title] = df[title].apply(remove_chars, args=(word_len, ))
    logger.info('{} tokens with char length equals {} removed'.format(title, word_len))
    return df

In [13]:
def find_ngrams(input_list, n):
    """ list, int -> list(tuples)

    Return a list of ngram tuples, where each tuple contains n unigrams

    :param input_list:
    :param n:
    :return:
    >>> find_ngrams(['A', 'B', 'C', 'D'], 2)
    [('A', 'B'), ('B', 'C'), ('C', 'D')]
    >>> find_ngrams(['A', 'B', 'C', 'D'], 3)
    [('A', 'B', 'C'), ('B', 'C', 'D')]
    """
    return zip(*[input_list[i:] for i in range(n)])


def create_ngram_from_tokens(tokens):
    """ list(str) -> list(str)

    Returns a list of ngram strings from a list of ngram tokens

    :param tokens:
    :return:
    >>> create_ngram_from_tokens(['A', 'B', 'C', 'D'])
    ['A', 'B', 'C', 'D', 'A_B', 'B_C', 'C_D', 'A_B_C', 'B_C_D']
    """

    bigram_list = find_ngrams(tokens, 2)
    trigram_list = find_ngrams(tokens, 3)

    bigrams = [tuple[0] + '_' + tuple[1] for tuple in bigram_list]
    trigrams = [tuple[0] + '_' + tuple[1] + '_' + tuple[2] for tuple in trigram_list]

    ngram_list = tokens + bigrams + trigrams
    return ngram_list


def create_ngram(df, title='title'):
    """ (DataFrame) -> DataFrame

    Returns a DataFrame where the title is converted from str to ngrams

    :param df:
    :param title:
    :return:
    """

    df[title] = df[title].apply(create_ngram_from_tokens)
    logger.info('{} ngrams created'.format(title))
    return df


def create_tfidf_dict(train, title='title', category='regional_key'):
    """ (DataFrame, str, str) -> defaultdict

    Returns a tf-idf dict given a dataframe containing title and regional_key

    :param train:
    :param title:
    :param category:
    :return:
    """

    # Create tf dictionary (though the name is tfidf, it's only tf for now)
    ngram_dict_tfidf = defaultdict()

    # For each token in the titles, create a dict as its value
    for i, row in train.iterrows():
        tokens = row[title]
        for token in tokens:
            ngram_dict_tfidf[token] = defaultdict()

    logger.info('TF dict phase 1 done')

    # For each token in the titles, add the token frequency to the value of the token key
    # Token frequency = token count / total number of tokens in title
    for i, row in train.iterrows():
        tokens = row[title]
        regional_id = row[category]
        for token in tokens:
            token_tf = tokens.count(token) / float(len(tokens))
            try:
                ngram_dict_tfidf[token][regional_id] += token_tf
            except KeyError:
                ngram_dict_tfidf[token][regional_id] = token_tf

    logger.info('TF dict phase 2 done')

    # create idf dictionary and count the number of titles in train
    ngram_dict_idf = defaultdict()
    no_of_skus = len(train)

    # For each token in the title, add one to the value of the token key
    for i, row in train.iterrows():
        tokens = set(row[title])
        for token in tokens:
            try:
                ngram_dict_idf[token] += 1
            except KeyError:
                ngram_dict_idf[token] = 1

    # For each token in idf dict, divide the total number of skus (logged) by the count of token value
    # Add 1 to the numerator to prevent zero divison error
    for term, count in ngram_dict_idf.iteritems():
        ngram_dict_idf[term] = math.log(no_of_skus) / float(1 + count)

    logger.info('IDF dict done')

    # Multiple values in tf dictionary with idf dictionary to get tf-idf dictionary
    for ngram, cat_dict in ngram_dict_tfidf.iteritems():
        # print ngram
        ngram_idf = ngram_dict_idf[ngram]
        # print ngram_idf
        for regional_key, count in cat_dict.iteritems():
            ngram_dict_tfidf[ngram][regional_key] = count * ngram_idf

    logger.info('TF-IDF dict done')
    return ngram_dict_tfidf

In [14]:
def merge_dicts(dicts, defaultdict=defaultdict, int=int):
    """ (list(dict), type, type) -> dict

    Returns a single dictionary given a list of dictionaries.
    Values with the same keys are summed and assigned to the key.

    :param dicts:
    :param defaultdict:
    :param int:
    :return:

    >>> merge_dicts([{'A': 1}, {'B': 2}])
    defaultdict(<type 'int'>, {'A': 1, 'B': 2})
    >>> merge_dicts([{'A': 1}, {'B': 2}, {'C': 3}, {'A': 10}])
    defaultdict(<type 'int'>, {'A': 11, 'C': 3, 'B': 2})
    """

    merged = defaultdict(int)
    for d in dicts:
        for k in d:
            merged[k] += d[k]

    return merged


def get_score(tokens, ngram_dict, top_n):
    dict_list = []

    # get list of dictionaries based on tokens
    for token in tokens:
        try:
            dict_list.append(ngram_dict[token])
        except KeyError:
            pass

    # Merge list of dicts together and add values
    score = merge_dicts(dict_list)

    # Get top n regional ids based on score
    top_n_cats = heapq.nlargest(top_n, score, key=score.get)

    return top_n_cats


def get_top_n_score(scores, n):
    try:
        n_score = scores[n - 1]
    except IndexError:
        n_score = -1

    return n_score


def create_options(df, title, tfidf_dict):
    df['options'] = df.loc[:, title].apply(get_score, args=(tfidf_dict, 3, ))
    logger.info('Test set scored')
    
    df['option1'] = df.loc[:, 'options'].apply(get_top_n_score, args=(1, ))
    df['option2'] = df.loc[:, 'options'].apply(get_top_n_score, args=(2, ))
    df['option3'] = df.loc[:, 'options'].apply(get_top_n_score, args=(3, ))
    logger.info('Top 3 options created')
    return df

In [15]:
def validate_accuracy(df):
    
    df['option1_match'] = df['category_path'] == df['option1']
    df['option2_match'] = df['category_path'] == df['option2']
    df['option3_match'] = df['category_path'] == df['option3']
    
    score1 = df['option1_match'].sum() / float(len(df))
    score2 = df['option2_match'].sum() / float(len(df))
    score3 = df['option3_match'].sum() / float(len(df))
    score123 = score1 + score2 + score3
    
    print "Scores: {}, {}, {} ({})".format(score1, score2, score3, score123)
    return df

In [16]:
# # Process titles
df = encode_title(df, title='title') 
df = lowercase_title(df, title='title')
df = tokenize_title(df, title='title', excluded='-.')
df = remove_stopwords(df, stopwords=STOP_WORDS, title='title')
df = remove_numeric_from_df(df, title='title')
df = remove_one_char_words(df, word_len=1, title='title')
df = create_ngram(df, title='title')
logger.info('Data prepped')

2016-06-12 12:15:00,509 - title encoded
INFO:__log__:title encoded
2016-06-12 12:15:02,082 - title lowercased
INFO:__log__:title lowercased
2016-06-12 12:15:10,503 - title tokenized
INFO:__log__:title tokenized
2016-06-12 12:15:13,853 - title stopwords removed
INFO:__log__:title stopwords removed
2016-06-12 12:15:16,143 - title solely numeric words removed
INFO:__log__:title solely numeric words removed
2016-06-12 12:15:18,992 - title tokens with char length equals 1 removed
INFO:__log__:title tokens with char length equals 1 removed
2016-06-12 12:15:30,655 - title ngrams created
INFO:__log__:title ngrams created
2016-06-12 12:15:30,656 - Data prepped
INFO:__log__:Data prepped


### Split into train and test

In [18]:
train, test = train_test_split(df, test_size=0.1, stratify=df['category_path'], random_state=1368)
logger.info('Train test split done: Train({}), Test({})'.format(train.shape, test.shape))

2016-06-12 12:20:57,635 - Train test split done: Train((891216, 3)), Test((99862, 3))
INFO:__log__:Train test split done: Train((891216, 3)), Test((99862, 3))


### Create dictionary

In [19]:
tfidf_dict = create_tfidf_dict(train=train, title='title', category='category_path')

2016-06-12 12:23:01,456 - TF dict phase 1 done
INFO:__log__:TF dict phase 1 done
2016-06-12 12:24:45,645 - TF dict phase 2 done
INFO:__log__:TF dict phase 2 done
2016-06-12 12:26:05,244 - IDF dict done
INFO:__log__:IDF dict done
2016-06-12 12:26:13,736 - TF-IDF dict done
INFO:__log__:TF-IDF dict done


### Validation

In [20]:
test = create_options(test, 'title', tfidf_dict)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
2016-06-12 12:30:39,083 - Test set scored
INFO:__log__:Test set scored
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pand

In [21]:
test = validate_accuracy(test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Scores: 0.606226592698, 0.110662714546, 0.0472051430975 (0.764094450341)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


### Validation results (1 million samples)

In [None]:
# tokenizer: -.; remove numerics
# Scores: 0.605795998478, 0.110532534898, 0.0475456129459 (0.763874146322)

In [None]:
# tokenizer: none; remove numerics
# Scores: 0.605505597725, 0.111393723338, 0.0476257234984 (0.764525044561)

In [None]:
# tokenizer: none; remove numerics, no trigrams
# Scores: 0.585818429433, 0.120696561255, 0.0509102561535 (0.757425246841)

In [None]:
# tokenizer: -.; remove numerics; remove html
# Scores: 0.606226592698, 0.110662714546, 0.0472051430975 (0.764094450341)

### Validation results (full data set)

In [None]:
# tokenizer: none; remove numerics
# Scores: 0.680834112035, 0.109327693897, 0.0413770717553 (0.831538877687)

### Examine errors

In [None]:
test.loc[:, 'match'] = test.loc[:, 'option1_match'] + test.loc[:, 'option2_match'] + test.loc[:, 'option3_match']

In [None]:
check = test[test['match'] == False]

In [None]:
df = df[['asin', 'title']]

In [None]:
check = check.merge(df, how='left', on='asin')

In [None]:
check = check[['title_y', 'title_x', 'category_path', 'option1', 'option2', 'option3']]

In [None]:
check