In [1]:
import pandas as pd

In [2]:
# Load the raw ingredient phrase tagger training data from a CSV path
# that is relative to the notebook's working directory.
df = pd.read_csv("../data/raw/ingredient_phrase_tagger_training_data.csv")
# Preview the first rows as this cell's output.
df.head()

Unnamed: 0,text,text_index,text_length,capital_letter,parenthesis_flag,label,ID
0,1$1/4,I1,L20,NoCAP,NoPAREN,B-QTY,1
1,cups,I2,L20,NoCAP,NoPAREN,B-UNIT,1
2,cooked,I3,L20,NoCAP,NoPAREN,B-COMMENT,1
3,and,I4,L20,NoCAP,NoPAREN,I-COMMENT,1
4,pureed,I5,L20,NoCAP,NoPAREN,I-COMMENT,1


In [4]:
# Helper functions for cleaning the phrase tagger training data

def remove_label_prefix(label):
    """Strip the two-character tag prefix (such as 'B-' or 'I-') from a label.

    A label that contains a hyphen loses its first two characters;
    a label without a hyphen is returned unchanged.
    """
    # a membership test is cheaper than calling split() on every label
    return label[2:] if "-" in label else label

def replace_INDEX_with_QTY(label):
    """Collapse any label containing 'INDEX' into 'QTY'.

    Labels that do not contain 'INDEX' are passed through untouched.
    """
    # a substring check avoids a str.replace call on every label
    if "INDEX" not in label:
        return label
    return "QTY"

def remove_QTY_symbol(text):
    """Replace each '$' in an ingredient quantity with a single space.

    For example '1$1/4' becomes '1 1/4'. Text without a '$' is
    returned as-is.
    """
    # the guard skips the replace call in the common case of no "$"
    return text.replace("$", ' ') if "$" in text else text

def remove_parenthesis(input_df):
    """Drop every row whose ``parenthesis_flag`` equals "YesPAREN".

    The input frame is copied before filtering, so the caller's frame
    is never modified. Surviving rows keep their original index labels.
    """
    working = input_df.copy()
    keep_rows = working['parenthesis_flag'] != "YesPAREN"
    return working.loc[keep_rows, :]

def ingredient_has_name(df):
    """
    Report whether at least one row of the ingredient carries the
    label NAME.
    """
    name_matches = df["label"].str.match('^NAME$')
    return any(name_matches)

def find_hyphen_ingredients(d):
    """Detect QTY or UNIT tokens that contain a hyphen.

    These show up in examples like '1-pound', where a combined
    quantity-and-unit token has been labelled as just a qty or
    just a unit. A token that is only '-' is not counted.

    Returns True if the ingredient contains such a token,
    False otherwise.
    """
    tokens = d["text"]
    labels = d["label"]

    has_hyphen = tokens.str.contains("-")
    is_bare_hyphen = tokens.str.match('^-$')
    is_qty_or_unit = labels.str.match('^QTY$') | labels.str.match('^UNIT$')

    flagged = ~is_bare_hyphen & (has_hyphen & is_qty_or_unit)
    return any(flagged)

def or_to_comment(df):
    """
    Relabel the first "or" that follows the first NAME, and every
    row after it, as COMMENT.

    When the first NAME comes before an "or", the text following
    that "or" is typically an alternative to the first NAME, so it
    is turned into a COMMENT. When every "or" comes before the first
    NAME, it typically means there are two comments to the NAME,
    e.g. chicken or beef stock - where "stock" is the NAME and
    "chicken or beef" is the COMMENT. That scenario is fine for the
    parser (it may be worth it to `split` it after the parser is
    applied), so the labels are left as they are.

    If there are two or more "or"s, only those after the first NAME
    are considered, and relabelling starts at the earliest of them.

    Rows are compared by position, not by index label, so the result
    does not depend on the index being sorted or unique.

    Parameters
    ----------
    df : pandas.DataFrame
        One ingredient's rows, with 'text' and 'label' columns.

    Returns
    -------
    pandas.DataFrame
        A relabelled copy of ``df``; the input is never modified.
        If no row is labelled NAME the copy is returned unchanged.
    """
    or_ingr = df.copy()

    name_positions = (or_ingr['label'] == "NAME").to_numpy().nonzero()[0]
    if name_positions.size == 0:
        # Without a NAME there is nothing for an "or" to be an alternative to.
        return or_ingr

    or_positions = (or_ingr['text'] == "or").to_numpy().nonzero()[0]
    later_or_positions = or_positions[or_positions > name_positions[0]]
    if later_or_positions.size > 0:
        label_column = or_ingr.columns.get_loc("label")
        or_ingr.iloc[int(later_or_positions[0]):, label_column] = "COMMENT"
    return or_ingr

def run_data_cleaning(df):
    """
    Cleanses training data of text that increases complexity
    or that will cause issues while creating the model.

    Steps, in order:
      1. drop rows flagged as parenthesis,
      2. strip label prefixes and fold INDEX labels into QTY,
      3. replace '$' in the text with a space,
      4. per ingredient ID, discard the ingredient if a QTY/UNIT
         token contains a hyphen or if no token is labelled NAME,
      5. per kept ingredient containing an "or" token, apply
         `or_to_comment`.

    Parameters
    ----------
    df : pandas.DataFrame
        Training data with at least the columns 'text', 'label',
        'ID' and 'parenthesis_flag'. It is not modified.

    Returns
    -------
    pandas.DataFrame
        The kept ingredients concatenated, with only the columns
        'text', 'label' and 'ID'. Empty (same three columns) when no
        ingredient survives the filters.
    """
    print("Cleaning phrase tagger data...")
    keep_columns = ['text', 'label', 'ID']

    df = remove_parenthesis(df)
    # assign() builds a new frame, so nothing is written into a filtered slice
    df = df.assign(
        label=df['label'].apply(remove_label_prefix).apply(replace_INDEX_with_QTY),
        text=df['text'].apply(remove_QTY_symbol),
    )

    cleaned_df = []
    for ingredient_id, group in df.groupby("ID"):
        # carriage return keeps the progress counter on a single output line
        print(ingredient_id, end="\r")
        ingredient_df = group.loc[:, keep_columns]
        if find_hyphen_ingredients(ingredient_df) or not ingredient_has_name(ingredient_df):
            continue
        if any(ingredient_df.text.str.match('^or$')):
            ingredient_df = or_to_comment(ingredient_df)
        cleaned_df.append(ingredient_df)

    if not cleaned_df:
        # pd.concat raises ValueError on an empty list; return an empty frame instead
        return df.loc[:, keep_columns].iloc[0:0]
    return pd.concat(cleaned_df)

In [5]:
# Run the cleaning pipeline on the loaded training data; the cleaned
# frame is kept in `d`.
d = run_data_cleaning(df)

Cleaning phrase tagger data...
173415