# Word embedding
This notebook creates the word-embedding model and the bigram model based on the USDA food database.



In [None]:
import os
import urllib.request

Download the dataset from the USDA.

In [None]:
if os.path.exists("zip.zip"):
  assert False
urllib.request.urlretrieve("https://fdc.nal.usda.gov/fdc-datasets/FoodData_Central_branded_food_csv_2021-10-28.zip", "zip.zip")
!unzip zip.zip -d data
!mkdir data/models
#urllib.request.urlretrieve("https://github.com/andreamorgar/recipe-adaptation/raw/main/models/v3/modelo3","data/models/modelo3")

Import packages and download the necessary NLTK data packages

In [None]:
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import gensim
from gensim.parsing.preprocessing import preprocess_string, remove_stopwords, stem_text
from gensim.models import Phrases
from gensim.models.phrases import Phraser
from nltk.tokenize import RegexpTokenizer
import pandas as pd
import re
import nltk
# Download the NLTK stopword corpus; it is read via nltk.corpus.stopwords
# when the stopword list is built later in this notebook.
nltk.download('stopwords')
# Download the WordNet data; presumably required by the WordNetLemmatizer
# that the preprocessing pipeline instantiates later in this notebook.
nltk.download('wordnet')

Load the USDA data into pandas

In [None]:
# Read both USDA tables and join them on their shared food id.
branded_raw = pd.read_csv("data/branded_food.csv")
food_raw = pd.read_csv("data/food.csv")
merged = branded_raw.merge(food_raw, on='fdc_id')

# Keep only the three columns the rest of the notebook works with:
# the description, the category (as a categorical) and the brand name
# (missing values become empty strings).
food = pd.DataFrame({
    "description": merged["description"],
    "branded_food_category": merged["branded_food_category"].astype('category'),
    "brand_name": merged["brand_name"].fillna("").astype(str),
})

# Drop the large intermediate frames so only `food` stays in memory.
del branded_raw, food_raw, merged

Define the textual preprocessing pipeline


*   Tokenization
*   Stopword removal
*   Lemmatization



In [None]:
# Measurement units and abbreviations that are filtered out in addition to
# the regular English stopwords.
custom = ["gal", "oz", "t", "tsp", "teaspoon",
          "tablespoon", "tbl", "tbs", "tbsp",
          "fl", "oz", "gil", "ounce", "ml", "l",
          "dl", "lb", "pund", "mg", "g", "kg", "gram", "cup"]

# NLTK's English stopwords followed by the custom unit terms.
lst_stopwords = [*nltk.corpus.stopwords.words("english"), *custom]

# Tokenizer that extracts runs of word characters.
tokenizer = RegexpTokenizer(r'\w+')

# WordNet-based lemmatizer applied to every token that survives filtering.
lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()

def preprocess2(name):
    """Turn a raw name into a list of cleaned, lemmatized tokens.

    The input is converted to a string, lower-cased and stripped, and every
    character that is neither a word character nor whitespace is removed.
    The text is then tokenized; a token is dropped when it is a stopword,
    is at most two characters long, or contains a digit. The remaining
    tokens are lemmatized.

    Args:
        name: Value to preprocess; anything accepted by ``str()``.

    Returns:
        List of lemmatized tokens in their original order (duplicates kept).
    """
    text = str(name).lower().strip()
    text = re.sub(r'[^\w\s]', '', text)

    tokens = []
    for token in tokenizer.tokenize(text):
        # Skip stopwords and very short tokens.
        if len(token) <= 2 or token in lst_stopwords:
            continue
        # Skip anything that carries a digit (quantities, sizes, codes).
        if any(ch.isdigit() for ch in token):
            continue
        tokens.append(lemmatizer.lemmatize(token))
    return tokens

def preprocess(name):
    """Return the unique preprocessed tokens of ``name`` in first-seen order.

    Runs ``preprocess2`` on the input and removes duplicate tokens.

    Args:
        name: Value to preprocess; passed unchanged to ``preprocess2``.

    Returns:
        List of distinct tokens, ordered by their first occurrence.
    """
    name = preprocess2(name)
    # dict.fromkeys de-duplicates while keeping insertion order. The earlier
    # list(set(...)) returned the tokens in an arbitrary order that could
    # differ between runs, which made results non-reproducible.
    name = list(dict.fromkeys(name))
    #todo bigrams
    return name

Convert food item names into token lists

And train bigram model on the token lists

In [None]:
# Build one token list per food item from its brand name and description.
lst_corpus = []
for index, brand, description in zip(food.index, food["brand_name"], food["description"]):
  lst_brand = preprocess2(brand)
  lst_words = preprocess2(str(description))

  # Prepend the brand tokens unless at least one of them already occurs
  # among the description tokens.
  brand_in_description = any(token in lst_words for token in lst_brand)
  if not brand_in_description:
    lst_words = lst_brand + lst_words

  # Progress indicator: print the row label every 100000 rows.
  if index % 100000 == 0:
    print(index)
  lst_corpus.append(lst_words)

# Train the phrase (bigram) model on the token lists; detected phrases are
# joined with a single space.
bigram_mdl = Phrases(lst_corpus, min_count=5, threshold=2, delimiter=b' ')
# Apply the trained phrase model to the whole corpus.
bigrams = bigram_mdl[lst_corpus]

Save bigram model to file

In [None]:
# Persist the trained phrase model so it can be loaded outside this notebook.
bigram_mdl.save("data/models/bigram_model.pkl")
# Iterate the bigram-transformed corpus once and keep the result as a list,
# which is what the word2vec training step consumes.
all_sentences = [sentence for sentence in bigrams]

In [None]:
# Sanity check: run the phrase model on a sample pair of tokens and display
# the result.
sample_tokens = "red", "bull"
bigram_mdl[sample_tokens]

Train word2vec model on word corpus and save to file

In [None]:
# Train 300-dimensional word vectors on the bigram-transformed corpus:
# context window of 5, words seen fewer than 3 times are ignored,
# 30 passes over the data using 4 worker threads.
# NOTE(review): `size` and `iter` look like the pre-4.0 gensim keyword names;
# confirm the installed gensim version before upgrading.
model = Word2Vec(
    sentences=all_sentences,
    size=300,
    window=5,
    min_count=3,
    workers=4,
    iter=30,
)
# Store the full model in the models directory.
model.save("data/models/mymodel")


Now one just has to download the models to use them in the FoodClassification notebook or in classifer.py
