In [12]:
import pandas as pd
import numpy as np
import json
from gensim.models import Word2Vec, FastText
from gensim.models.callbacks import CallbackAny2Vec
from gensim.models import FastText
from epochlogger import EpochLogger
from sentencegenerator import SentenceGenerator
import nltk
import re
from numpy import float32

### Import the original data

In [25]:
# With `chunksize` set, read_csv returns an iterator of DataFrames (40,000 rows
# each) instead of one DataFrame. The iterator is exhausted after a single full
# pass, so re-run this cell before looping over `data` again.
data = pd.read_csv(
    '../data/mfp-diaries.tsv',
    sep='\t',
    header=None,
    chunksize=40000,
)

### Load the W2V and FastText models


In [14]:
# Load the previously trained embedding models from disk.
# (File names suggest window=6 and min_count=10 — TODO confirm against the training notebook.)
# NOTE(review): `w2v` is not referenced by any cell visible in this notebook; only `fasttext` is used below.
w2v = Word2Vec.load('../word2vecmodels/foods_w2v_window6_mc10.model')
fasttext = FastText.load('../fasttextmodels/foods_fasttext_window6_mc10.model')

### Create a unique row ID based on user and date
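We originally planned to combine user and date into a single row ID, but later realized that we need to aggregate by user anyway. `create_id` therefore returns the user and the date as two separate values, which are written as separate columns when we construct the table.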

In [15]:
def create_id(row):
    """Return the user and date of one diary row as ``[user, date]``.

    Parameters
    ----------
    row : tuple
        An ``(index, values)`` pair such as the ones yielded by
        ``DataFrame.iterrows()``. ``values[0]`` is read as the user and
        ``values[1]`` as the date.

    Returns
    -------
    list
        ``[user, date]``, unchanged from the input. The two fields are kept
        separate (not joined into one string ID) so rows can later be
        aggregated by user.
    """
    values = row[1]
    return [values[0], values[1]]

### Consolidation functions used to group vectors by sum or mean
If the list of vectors to be grouped is empty, we fall back to a 100-dimensional vector of zeros.

In [16]:
def consolidate_sum(list_of_vectors, dim=100):
    """Collapse a list of equal-length vectors into one by element-wise sum.

    Parameters
    ----------
    list_of_vectors : sequence of array-like
        Vectors to combine; may be empty.
    dim : int, default 100
        Length of the zero vector returned when ``list_of_vectors`` is empty.

    Returns
    -------
    numpy.ndarray
        The element-wise sum, the single vector itself when only one is
        given, or a flat zero vector of length ``dim`` for empty input.
    """
    if len(list_of_vectors) == 0:
        # Return a flat vector (not a list wrapping one) so that every branch
        # yields the same shape and callers need no special-casing.
        return np.zeros(dim)
    if len(list_of_vectors) > 1:
        return np.sum(list_of_vectors, axis=0)
    return np.asarray(list_of_vectors[0])
    
def consolidate_mean(list_of_vectors, dim=100):
    """Collapse a list of equal-length vectors into one by element-wise mean.

    Parameters
    ----------
    list_of_vectors : sequence of array-like
        Vectors to combine; may be empty.
    dim : int, default 100
        Length of the zero vector returned when ``list_of_vectors`` is empty.

    Returns
    -------
    numpy.ndarray
        The element-wise mean, the single vector itself when only one is
        given, or a flat zero vector of length ``dim`` for empty input.
    """
    if len(list_of_vectors) == 0:
        # Return a flat vector (not a list wrapping one) so that every branch
        # yields the same shape and callers need no special-casing.
        return np.zeros(dim)
    if len(list_of_vectors) > 1:
        return np.mean(list_of_vectors, axis=0)
    return np.asarray(list_of_vectors[0])

In [17]:
def get_diary_vector(row,model):
    """Build the vector for one diary row.

    The JSON text at position 2 of the row's values is decoded into a list of
    meals, each meal is embedded with ``get_meal_vector``, and the resulting
    meal vectors are combined with ``consolidate_sum``.
    """
    meals = json.loads(row[1][2])
    per_meal = [get_meal_vector(entry, model) for entry in meals]
    return consolidate_sum(per_meal)

### Get the vector representing a meal

In [18]:
def get_meal_vector(meal,model):
    """Build the vector for one meal from the vectors of its dishes.

    Every entry of ``meal['dishes']`` is embedded with ``get_dish_vector`` and
    the dish vectors are combined with ``consolidate_sum``. A meal without
    dishes yields a zero vector. The result must have shape ``(100,)``.
    """
    per_dish = [get_dish_vector(item, model) for item in meal['dishes']]
    # No dishes recorded: fall back to zeros rather than consolidating nothing.
    meal_vector = consolidate_sum(per_dish) if per_dish else np.zeros(100)
    assert meal_vector.shape == (100,)
    return meal_vector

### Get the vector representing one dish (food item) within a meal

In [19]:
def get_dish_vector(dish,model):
    """Embed one dish (food item) as the mean of its in-vocabulary word vectors.

    Parameters
    ----------
    dish : dict
        Must contain a ``'name'`` string. Only the text before the first comma
        is used (the remainder is presumably the quantity — confirm against
        the data). Names starting with 'Quick Added Calories' are reduced to
        exactly that phrase.
    model : object
        A trained embedding model exposing ``model.wv`` (word lookup by
        ``model.wv[word]`` and a vocabulary mapping).

    Returns
    -------
    numpy.ndarray
        Vector of shape ``(100,)``; all zeros when no token of the name is in
        the model's vocabulary.
    """
    raw_name = dish['name']
    # Handle 'Quick Added Calories' entries: keep only the fixed prefix
    if raw_name.startswith('Quick Added Calories'):
        name = 'Quick Added Calories'
    else:
        # Keep the text before the first comma. Indexing (rather than unpacking
        # into two names) also accepts names that contain no comma at all.
        name = raw_name.split(',', maxsplit=1)[0]
    # split based on delimiting characters (raw string avoids the invalid "\-" escape)
    tokens = re.split(r'[, \-!?*+()012345678~9&%=/"#.>^<:]+', name)
    # change to lowercase, dropping tokens of one or two characters
    tokens_lower = [token.lower() for token in tokens if len(token) > 2]

    wv = model.wv
    # gensim >= 4 exposes the vocabulary as `key_to_index`; older releases use `vocab`.
    vocab = getattr(wv, 'key_to_index', None)
    if vocab is None:
        vocab = wv.vocab

    # get dish vector
    word_vectors = [wv[word] for word in tokens_lower if word in vocab]
    if len(word_vectors) == 0:
        dish_vector = np.zeros(100)
    else:
        dish_vector = consolidate_mean(word_vectors)
    assert dish_vector.shape == (100,)
    return dish_vector

### Label the user based on their goal vs actual calories
We follow the labeling convention in the original paper: users whose total is more than 20% below their goal are labeled "below", users who are at or under their goal by no more than 20% are labeled "on_target", and users who exceed their goal are labeled "above".

In [20]:
def create_label(row):
    """Label one diary row by comparing its total against its goal.

    The JSON text at position 3 of the row's values is decoded; the ``'value'``
    of the first entry under ``'goal'`` and under ``'total'`` are compared
    (presumably calories — confirm against the data).

    Returns ``[goal, total, label]`` where ``label`` is ``'above'`` when the
    total exceeds the goal, ``'below'`` when it falls short by more than 20% of
    the goal, and ``'on_target'`` otherwise. Returns ``None`` when either list
    is empty or the goal is 0.
    """
    summary = json.loads(row[1][3])
    if len(summary['goal']) == 0 or len(summary['total']) == 0:
        return None

    goal_cal = summary['goal'][0]['value']
    total_cal = summary['total'][0]['value']
    # A zero goal would make the relative shortfall undefined.
    if goal_cal == 0:
        return None

    if total_cal > goal_cal:
        success = 'above'
    elif (goal_cal-total_cal)/goal_cal > 0.2:
        success = 'below'
    else:
        success = 'on_target'
    return [goal_cal,total_cal,success]

### Loop through the original data to construct the classifier input using the Word2Vec vectors

- Output rows are in the format `[user][date][.......word2vec vector........][goal][actual][label]`

- "mixed_1" in the file name indicates the mix of aggregation functions. We found that averaging words to get foods, summing foods to get meals, and summing meals to get days works best. We called this mix 1, as opposed to other mixes of sum and mean aggregation.

### Loop through the original data to construct the classifier input using the FastText vectors

- Output rows are in the format `[user][date][.......fasttext vector........][goal][actual][label]`

- "mixed_1" in the file name indicates the mix of aggregation functions. We found that averaging words to get foods, summing foods to get meals, and summing meals to get days works best. We called this mix 1, as opposed to other mixes of sum and mean aggregation.

In [11]:
# Write one CSV row per labelled diary day:
#   user, date, <diary vector components>, goal, actual, label
#
# The TSV is re-opened here (same arguments as the `data` reader defined
# earlier) because a chunked reader can only be iterated once: looping over an
# already-consumed `data` would silently overwrite the output with an empty file.
diary_chunks = pd.read_csv('../data/mfp-diaries.tsv',sep='\t',header=None,chunksize=40000)

i = 0
with open("../data/food_vectors_fasttext.csv", "w",encoding='utf-8') as sentences:
    for chunk in diary_chunks:
        for row in chunk.iterrows():
            label = create_label(row)
            # Rows without a usable goal/total are skipped, so their vector is
            # not computed at all.
            if label:
                ID = create_id(row)
                # np.ravel flattens both a plain vector and a vector wrapped in
                # a one-element list, so no try/except shape probing is needed.
                diary_vector = np.ravel(get_diary_vector(row,fasttext))
                final_vec = [str(ID[0]),str(ID[1])]
                final_vec.extend(str(elem) for elem in diary_vector)
                final_vec.extend(str(elem) for elem in label)
                sentences.write(','.join(final_vec)+'\n')
            i += 1
            # Progress counter, overwritten in place.
            print(i,end='\r')
# The `with` block has already closed the file; no explicit close() is required.

587187

In [11]:
# `sentences` was opened in a `with` block, which closes the file on exit;
# calling close() again on an already-closed file has no further effect.
sentences.close()

In [None]:
# NOTE(review): this cell is a copy of the previous FastText export cell (same
# model, same output path). The Word2Vec section above has no export cell, so
# this may have been intended to use `w2v` with its own output file — confirm.
#
# Write one CSV row per labelled diary day:
#   user, date, <diary vector components>, goal, actual, label
#
# The TSV is re-opened here (same arguments as the `data` reader defined
# earlier) because a chunked reader can only be iterated once: looping over an
# already-consumed `data` would silently overwrite the output with an empty file.
diary_chunks = pd.read_csv('../data/mfp-diaries.tsv',sep='\t',header=None,chunksize=40000)

i = 0
with open("../data/food_vectors_fasttext.csv", "w",encoding='utf-8') as sentences:
    for chunk in diary_chunks:
        for row in chunk.iterrows():
            label = create_label(row)
            # Rows without a usable goal/total are skipped, so their vector is
            # not computed at all.
            if label:
                ID = create_id(row)
                # np.ravel flattens both a plain vector and a vector wrapped in
                # a one-element list, so no try/except shape probing is needed.
                diary_vector = np.ravel(get_diary_vector(row,fasttext))
                final_vec = [str(ID[0]),str(ID[1])]
                final_vec.extend(str(elem) for elem in diary_vector)
                final_vec.extend(str(elem) for elem in label)
                sentences.write(','.join(final_vec)+'\n')
            i += 1
            # Progress counter, overwritten in place.
            print(i,end='\r')
# The `with` block has already closed the file; no explicit close() is required.

### Get vector for each individual food name for feature analysis

In [26]:
# Write one CSV row per dish:
#   user, date, cleaned food name, <dish vector components>
#
# The TSV is re-opened here (same arguments as the `data` reader defined
# earlier) because a chunked reader can only be iterated once: looping over an
# already-consumed `data` would silently produce an empty output file.
diary_chunks = pd.read_csv('../data/mfp-diaries.tsv',sep='\t',header=None,chunksize=40000)

# Delimiters used to split a food name into tokens; compiled once, outside the loops.
# (Raw string avoids the invalid "\-" escape of the non-raw form.)
token_splitter = re.compile(r'[, \-!?*+()012345678~9&%=/"#.>^<:]+')

i = 0
with open("../data/food_vectors_individual_fasttext.csv", "w",encoding='utf-8') as sentences:
    for chunk in diary_chunks:
        for row in chunk.iterrows():
            ID = create_id(row)
            diary = json.loads(row[1][2])
            for meal in diary:
                for dish in meal['dishes']:
                    if dish['name'].startswith('Quick Added Calories'):
                        name = 'Quick Added Calories'
                    else:
                        # Text before the first comma; indexing instead of tuple
                        # unpacking also accepts names without any comma.
                        name = dish['name'].split(',',maxsplit=1)[0]
                    # lowercase tokens, dropping those of one or two characters
                    tokens_lower = [token.lower() for token in token_splitter.split(name) if len(token) > 2]
                    final_vec = [str(ID[0]),str(ID[1]),' '.join(tokens_lower)]
                    final_vec.extend(str(elem) for elem in get_dish_vector(dish,fasttext))
                    sentences.write(','.join(final_vec)+'\n')
            i += 1
            # Progress counter, overwritten in place.
            print(i,end='\r')
# The `with` block has already closed the file; no explicit close() is required.

587187