## Imports
nltk - use if its your first time using it in this env. 

In [40]:
import pandas as pd
import numpy as np
import re
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import nltk



In [41]:
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\pipki\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\pipki\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\pipki\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\pipki\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

## Loading data 
Pulling raw data, this is based on the assumption the raw data is in columns. 

In [42]:
df = pd.read_csv("../data/raw/groceries_raw.csv")

print("Raw data preview:")
display(df.head())

Raw data preview:


Unnamed: 0,item,category
0,apples,Fruit & Vegetables
1,bananas,Fruit & Vegetables
2,oranges,Fruit & Vegetables
3,spinach,Fruit & Vegetables
4,carrots,Fruit & Vegetables


## Pre cleaning 
Homoganising the dataset for processing. 

In [43]:
lemmatizer = WordNetLemmatizer()

def clean_item(text):
    # lowercase
    text = text.lower()
    # remove punctuation / numbers
    text = re.sub(r"[^a-z\s]", "", text)
    # tokenize
    tokens = word_tokenize(text)
    # lemmatize
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    # rejoin tokens
    return " ".join(tokens)


## Application 
Putting it to work! Table should appear with the clean data added. 


In [44]:
df['item_clean'] = df['item'].apply(clean_item)

print("After cleaning:")
display(df.head())


After cleaning:


Unnamed: 0,item,category,item_clean
0,apples,Fruit & Vegetables,apple
1,bananas,Fruit & Vegetables,banana
2,oranges,Fruit & Vegetables,orange
3,spinach,Fruit & Vegetables,spinach
4,carrots,Fruit & Vegetables,carrot


Encode the catagories to numbers!

In [45]:
label_encoder = LabelEncoder()
df['category_id'] = label_encoder.fit_transform(df['category'])

# Save mapping for later use
category_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
print("Category mapping:")
print(category_mapping)

Category mapping:
{'Bakery': np.int64(0), 'Drinks': np.int64(1), 'Eggs & Dairy': np.int64(2), 'Frozen Foods': np.int64(3), 'Fruit & Vegetables': np.int64(4), 'Household': np.int64(5), 'Meat & Fish': np.int64(6), 'Pantry / Dry Goods': np.int64(7), 'Snacks': np.int64(8)}


## Keep the data in x y, confirm the model will be trained on all data! 

In [46]:
# Use the entire dataset
X = df['item_clean'].values
y = df['category_id'].values

print(f"Total examples: {len(X)}")


Total examples: 50


## Save the processed data for tensorflow in the next step!


In [47]:
df.to_csv("../data/processed/groceries_processed.csv", index=False)