In [12]:
import pandas as pd
import numpy as np
import json
from gensim.models import Word2Vec, FastText
from gensim.models.callbacks import CallbackAny2Vec
from gensim.models import FastText
from epochlogger import EpochLogger
from sentencegenerator import SentenceGenerator
import nltk
import re
from numpy import float32

### Import the original data

In [25]:
data = pd.read_csv('../data/mfp-diaries.tsv',sep='\t',header=None,chunksize=40000)

### Load the W2V and FastText models


In [14]:
w2v = Word2Vec.load('../word2vecmodels/foods_w2v_window6_mc10.model')
fasttext = FastText.load('../fasttextmodels/foods_fasttext_window6_mc10.model')

### Create a unique row ID based on user and date
We originally planned to do this, but realized we later need to aggregate by user anyway. This data is separated back out when we construct the table

In [15]:
def create_id(row):
    date = row[1][1]
    user = row[1][0]
    ID = str(user) + '-' + str(date)
    return [user, date]

### Consolidaton functions used to group vectors by sum or mean
If the item to be grouped is empty, we return an empty vector

In [16]:
def consolidate_sum(list_of_vectors):
    if len(list_of_vectors) == 0:
        return [np.zeros(100)]
    if len(list_of_vectors) > 1:
        return np.sum(list_of_vectors,axis=0)
    else:
        return np.asarray(list_of_vectors[0])
    
def consolidate_mean(list_of_vectors):
    if len(list_of_vectors) == 0:
        return [np.zeros(100)]
    if len(list_of_vectors) > 1:
        return np.mean(list_of_vectors,axis=0)
    else:
        return np.asarray(list_of_vectors[0])

In [17]:
def get_diary_vector(row,model):
    meal_vectors = []
    diary = json.loads(row[1][2])
    for meal in diary:
        meal_vectors.append(get_meal_vector(meal,model))
    diary_vector = consolidate_sum(meal_vectors)
    return diary_vector

### Get the vector representing a meal

In [18]:
def get_meal_vector(meal,model):
    dish_vectors = []
    for dish in meal['dishes']:
        dish_vectors.append(get_dish_vector(dish,model))
    if len(dish_vectors) == 0:
        meal_vector =  np.zeros(100).reshape(100,)
    else:
        meal_vector = consolidate_sum(dish_vectors)
    assert meal_vector.shape == (100,)
    return meal_vector

### Get the vector representing one dish (food item) within a meal

In [19]:
def get_dish_vector(dish,model):
    # Handle 'Quick Added Calories
    if dish['name'][0:20] == 'Quick Added Calories':
        name = 'Quick Added Calories'
    else:
        name, quant = dish['name'].split(',',maxsplit=1)
    # split based on delimiting characters
    tokens = re.split("[, \-!?*+()012345678~9&%=/\"#.>^<:]+",name) 
    #change to lowercase
    tokens_lower =  [token.lower() for token in tokens if len(token) > 2]
    
    # get dish vector
    word_vectors = []
    for word in tokens_lower:
        if word in model.wv.vocab:
            word_vectors.append(model.wv[word])
    if len(word_vectors) == 0:
        dish_vector = np.zeros(100).reshape(100,)
    else:
        dish_vector = consolidate_mean(word_vectors)
    assert dish_vector.shape == (100,)
    return dish_vector

### Label the user based on their goal vs actual calories
We follow the labeling convention in the original paper, labeling users who are below their goal by a >=20% margin to be "below", under their goal within the 20% margin as "on_target", and users who are above their goal as "above"

In [20]:
def create_label(row):
    summary = json.loads(row[1][3])
    if len(summary['goal']) > 0 and len(summary['total']) > 0:
        goal_cal = summary['goal'][0]['value']
        total_cal = summary['total'][0]['value']
        if goal_cal == 0:
            return None
        if total_cal > goal_cal:
            success = 'above'
        else:
            if (goal_cal-total_cal)/goal_cal > 0.2:
                success = 'below'
            else:
                success = 'on_target'
    else:
        return None
    return [goal_cal,total_cal,success]

### Loop through the original data to contruct the classifier input using the Word2Vec vectors

- Output rows are in the format `[user][date][.......fasttext vector........][goal][actual][label]`

- "mixed_1" in the file name indicates the mix of aggregation functions. We found that averaging words to get foods, summing foods to get meals, and summing meals to get days works best. We called this mix 1, as opposed to other mixes of sum and mean aggregation.

### Loop through the original data to contruct the classifier input using the FastText vectors

- Output rows are in the format `[user][date][.......fasttext vector........][goal][actual][label]`

- "mixed_1" in the file name indicates the mix of aggregation functions. We found that averaging words to get foods, summing foods to get meals, and summing meals to get days works best. We called this mix 1, as opposed to other mixes of sum and mean aggregation.

In [11]:
i = 0
with open("../data/food_vectors_fasttext.csv", "w") as sentences:
    for chunk in data:
        for row in chunk.iterrows():            
            ID = create_id(row)
            diary_vector = get_diary_vector(row,fasttext)
            label = create_label(row)
            if label:
                final_vec = [str(ID[0]),str(ID[1])]
                try:
                    len(diary_vector[0])
                    diary_vector = diary_vector[0]
                except:
                    pass
                for elem in diary_vector:
                    final_vec.append(str(elem))
                for elem in label:
                    final_vec.append(str(elem))
                sentences.write(','.join(final_vec)+'\n')
            i += 1
            print(i,end='\r')
            
sentences.close()

587187

In [11]:
sentences.close()

In [None]:
i = 0
with open("../data/food_vectors_fasttext.csv", "w") as sentences:
    for chunk in data:
        for row in chunk.iterrows():            
            ID = create_id(row)
            diary_vector = get_diary_vector(row,fasttext)
            label = create_label(row)
            if label:
                final_vec = [str(ID[0]),str(ID[1])]
                try:
                    len(diary_vector[0])
                    diary_vector = diary_vector[0]
                except:
                    pass
                for elem in diary_vector:
                    final_vec.append(str(elem))
                for elem in label:
                    final_vec.append(str(elem))
                sentences.write(','.join(final_vec)+'\n')
            i += 1
            print(i,end='\r')
            
sentences.close()

### Get vector for each individual food name for feature analysis

In [26]:
i = 0
with open("../data/food_vectors_individual_fasttext.csv", "w",encoding='utf-8') as sentences:
    for chunk in data:
        for row in chunk.iterrows():            
            ID = create_id(row)
            diary = json.loads(row[1][2])
            for meal in diary:
                for dish in meal['dishes']:
                    final_vec = [str(ID[0]),str(ID[1])]
                    if dish['name'][0:20] == 'Quick Added Calories':
                        name = 'Quick Added Calories'
                    else:
                        name, quant = dish['name'].split(',',maxsplit=1)
                    # split based on delimiting characters
                    tokens = re.split("[, \-!?*+()012345678~9&%=/\"#.>^<:]+",name) 
                    #change to lowercase
                    tokens_lower =  [token.lower() for token in tokens if len(token) > 2]
                    final_vec.append(' '.join(tokens_lower))
                    dish_vector = get_dish_vector(dish,fasttext)
                    for elem in dish_vector:
                        final_vec.append(str(elem))
                    sentences.write(','.join(final_vec)+'\n')
            i += 1
            print(i,end='\r')
            
sentences.close()

587187

In [55]:
food_vectors_individual = pd.read_csv('../data/food_vectors_individual_fasttext.csv',encoding='utf-8',sep=',',header=None,chunksize=200000)

In [56]:
for chunk in food_vectors_individual:
    chunk = chunk.iloc[:,2:].drop_duplicates()
    chunk.to_csv('../data/food_vectors_individual_dr.csv',mode='a')

In [57]:
food_vectors_individual_dr = pd.read_csv('../data/food_vectors_individual_dr.csv',encoding='utf-8',sep=',',header=None,chunksize=200000)

In [72]:
first_200k = food_vectors_individual_dr.get_chunk()

first_200k = first_200k.rename(columns={0:'user',1:'food name'})
first_200k = first_200k.rename(columns=lambda x: str(int(x) - 1) if x in first_200k.columns[2:103] else x)

In [62]:
first_200k.head()

Unnamed: 0,user,food name,2,2.1,3,4,5,6,7,8,...,91,92,93,94,95,96,97,98,99,100
200000,842734.0,mike and ike original fruit,-0.909312,1.140406,0.116137,-0.032984,-0.031175,-1.295006,1.626059,0.437072,...,-0.554569,0.100655,-0.778651,0.375614,-0.320792,-1.333206,-1.033525,2.083867,0.484071,-0.533212
200001,842744.0,golden corral fried chicken leg,-0.154206,0.959297,1.208461,-0.233636,0.6949,0.206451,0.162395,-0.885888,...,0.192346,-0.206138,-0.528747,-1.19102,0.303544,1.47125,0.56981,0.288745,-1.383345,1.455299
200002,842751.0,pringles the original,-0.432562,0.670915,0.024695,-1.123932,-0.177828,0.976913,1.569923,-1.223803,...,-0.777765,-1.442282,0.573384,1.802094,2.510322,1.328245,0.895291,-0.637386,2.307601,-1.043916
200003,842754.0,giant milk,-0.21463,-0.82708,1.662962,1.804257,0.169105,1.628011,-1.039294,-1.965879,...,-1.12447,-2.150162,0.737115,0.266021,0.657365,-0.41171,-1.862235,-1.080738,0.984438,0.779022
200004,842758.0,spring valley all natural regular strength fis...,0.178859,-1.063153,-0.016401,0.447001,-0.667813,1.592318,0.501254,2.508908,...,-1.107123,2.343293,1.335611,2.266904,-0.348053,2.042654,-1.136993,-0.72514,-0.015,-0.074826


In [76]:
first_200k.sort_values(axis=0,by='positive_sum')

Unnamed: 0,user,food name,1,2,3,4,5,6,7,8,...,93,94,95,96,97,98,99,100,negative_sum,positive_sum
596983,2530718.0,paraibuna bananinha com acúcar,-5.432514,-0.967945,-0.052075,-6.514199,0.183858,4.190347,-0.172158,3.706911,...,2.541309,-0.071750,1.406863,-0.141958,-4.557740,1.326809,4.431367,-6.683545,2.006192,-60.885292
430068,1818912.0,juicyfruit gum,0.068392,-5.687073,-3.347557,-0.382224,-0.252405,-3.455936,4.148216,1.189397,...,0.469623,6.116527,-3.461977,-1.987651,2.880161,1.165119,-1.760766,-1.363866,10.049620,-56.924225
460891,1941428.0,spong cake,-1.743882,-1.874052,-0.268702,3.877843,-0.979897,-4.409330,-0.697004,-4.826160,...,-2.200099,-1.444067,0.651166,1.559504,-0.592841,0.162593,-3.101581,2.549046,-0.879106,-51.987491
487606,2058873.0,hotess twinkes cake,-1.743882,-1.874052,-0.268702,3.877843,-0.979897,-4.409330,-0.697004,-4.826160,...,-2.200099,-1.444067,0.651166,1.559504,-0.592841,0.162593,-3.101581,2.549046,-0.879106,-51.987491
597474,2533043.0,yoghuu cake,-1.743882,-1.874052,-0.268702,3.877843,-0.979897,-4.409330,-0.697004,-4.826160,...,-2.200099,-1.444067,0.651166,1.559504,-0.592841,0.162593,-3.101581,2.549046,-0.879106,-51.987491
548988,2321402.0,cake,-1.743882,-1.874052,-0.268702,3.877843,-0.979897,-4.409330,-0.697004,-4.826160,...,-2.200099,-1.444067,0.651166,1.559504,-0.592841,0.162593,-3.101581,2.549046,-0.879106,-51.987491
439927,1852539.0,cake,-1.743882,-1.874052,-0.268702,3.877843,-0.979897,-4.409330,-0.697004,-4.826160,...,-2.200099,-1.444067,0.651166,1.559504,-0.592841,0.162593,-3.101581,2.549046,-0.879106,-51.987491
500624,2112455.0,cake cake,-1.743882,-1.874052,-0.268702,3.877843,-0.979897,-4.409330,-0.697004,-4.826160,...,-2.200099,-1.444067,0.651166,1.559504,-0.592841,0.162593,-3.101581,2.549046,-0.879106,-51.987491
483866,2041474.0,cake,-1.743882,-1.874052,-0.268702,3.877843,-0.979897,-4.409330,-0.697004,-4.826160,...,-2.200099,-1.444067,0.651166,1.559504,-0.592841,0.162593,-3.101581,2.549046,-0.879106,-51.987491
560947,2379351.0,kfc kubełek frytek,-3.678723,-0.929177,5.495633,-2.225295,-2.317735,-3.356167,1.450994,1.808906,...,-6.275899,1.877828,4.078472,1.891192,4.486222,-0.463514,3.950010,3.746642,-5.381137,-50.474861


In [66]:
import pickle
with open('./negative.pkl','rb') as f:
    negative = pickle.load(f)
    
with open('./positive.pkl','rb') as f:
    positive = pickle.load(f)

In [73]:
first_200k['negative_sum'] = first_200k[list(negative)].sum(axis=1)

In [74]:
first_200k['positive_sum'] = first_200k[list(positive)].sum(axis=1)