In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("../data/raw/ingredient_phrase_tagger_training_data.csv")
df.head()

Unnamed: 0,text,text_index,text_length,capital_letter,parenthesis_flag,label,ID
0,1$1/4,I1,L20,NoCAP,NoPAREN,B-QTY,1
1,cups,I2,L20,NoCAP,NoPAREN,B-UNIT,1
2,cooked,I3,L20,NoCAP,NoPAREN,B-COMMENT,1
3,and,I4,L20,NoCAP,NoPAREN,I-COMMENT,1
4,pureed,I5,L20,NoCAP,NoPAREN,I-COMMENT,1


In [86]:
# fix training data

def remove_label_prefix(label):
    """Remove the prefix in labels such as 'B-'
    and 'I-'"""
    if "-" in label: # if statement faster than split()
        return label[2:]
    return label

def replace_INDEX_with_QTY(label):
    """Remove and replace 'INDEX' label with 'QTY'"""
    if "INDEX" in label: # if statement faster than str.replace
        return "QTY"
    return label

def remove_QTY_symbol(text):
    """Remove the '$' symbol from ingredient quantities"""
    if "$" in text: # if faster when there is not a "$", slower when there is
        return text.replace("$", ' ') 
    return text

def remove_parenthesis(df):
    """Remove parenthesis from training data"""
    df = df[df['parenthesis_flag'] != "YesPAREN"]
    return df


def find_hyphen_texts(text, label, ID):
    """Remove units and qtys with hyphens in them.
    These instances happen in examples like '1-pound'
    and is incorrectly labelled just a qty or just a unit.
    
    Returns the ID of the entry
    """
    not_hyphen_only = (text != '-')
    hyphen_in = ('-' in text)
    qty = (label == 'QTY')
    unit = (label == 'UNIT')
    qty_or_unit = (qty | unit)
    if not_hyphen_only & (hyphen_in & qty_or_unit):
        return ID

def run_data_cleaning(df):
    df = remove_parenthesis(df)
    cleaned_data = []
    remove_IDs = []
    for i, row in df.iterrows():
        print(i, end = '\r')
        label = remove_label_prefix(row['label'])
        label = replace_INDEX_with_QTY(label)
        text = remove_QTY_symbol(row['text'])
        ID = row['ID']
        remove_IDs.append(find_hyphen_texts(text, label, ID))
        cleaned_data.append([text,label,ID])
    cleaned_df = pd.DataFrame(
        cleaned_data, columns = ['text', 'labels', 'ID']
    )
    return cleaned_df[~cleaned_df['ID'].isin(remove_IDs)]


In [87]:
new_df = run_data_cleaning(df)

1061068

In [92]:
new_df.head()

Unnamed: 0,text,labels,ID
0,11/4,QTY,1
1,cups,UNIT,1
2,cooked,COMMENT,1
3,and,COMMENT,1
4,pureed,COMMENT,1
