In [2]:
import pandas as pd
import glob, os
import numpy as np
import re
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

In [2]:
entire_dataset = pd.DataFrame()
for i,file in enumerate(glob.glob('./data/*.csv')):
    df_temp = pd.read_csv(file)
    df_temp['subject'] = np.ones(df_temp.shape[0]) * (i+1)
    entire_dataset = pd.concat([entire_dataset, df_temp])

demographic = pd.read_csv('./data/demo/demographics.csv')
entire_dataset = entire_dataset.merge(demographic, left_on='subject', right_on='ID',how='inner')

Checking for missing values

In [3]:
entire_dataset.isna().any()

Unnamed: 0          True
date               False
time                True
time_begin          True
time_end            True
logged_food        False
amount              True
unit                True
searched_food       True
calorie            False
total_carb         False
dietary_fiber       True
sugar              False
protein             True
total_fat           True
subject            False
time_of_day         True
ID_x                True
Gender_x            True
HbA1c_x             True
datetime            True
hour                True
class               True
simplified_food     True
ID_y               False
Gender_y           False
HbA1c_y            False
dtype: bool

In [4]:
def fix_time(df):
    if 'time_begin' in df.columns:
        df['time_begin'].fillna(df['date'] + ' ' + df['time'], inplace=True)
    else:
        df['time_begin'] = df['date'] + ' ' + df['time']
    return df
entire_dataset.transform(fix_time)

Unnamed: 0.1,Unnamed: 0,date,time,time_begin,time_end,logged_food,amount,unit,searched_food,calorie,...,ID_x,Gender_x,HbA1c_x,datetime,hour,class,simplified_food,ID_y,Gender_y,HbA1c_y
0,0.0,2020-02-13,2025-02-13 18:00:00,2020-02-13 18:00:00,,Berry Smoothie,20.0,fluid ounce,Strawberry Smoothie,456.0,...,1.0,FEMALE,5.5,2020-02-13 18:00:00,18.0,beverages,,1,FEMALE,5.5
1,1.0,2020-02-13,2025-02-13 20:30:00,2020-02-13 20:30:00,,Chicken Leg,1.0,,chicken leg,475.0,...,1.0,FEMALE,5.5,2020-02-13 20:30:00,20.0,meals,,1,FEMALE,5.5
2,2.0,2020-02-13,2025-02-13 20:30:00,2020-02-13 20:30:00,,Asparagus,4.0,,Asparagus,13.0,...,1.0,FEMALE,5.5,2020-02-13 20:30:00,20.0,fruits/veggies,,1,FEMALE,5.5
3,3.0,2020-02-14,2025-02-13 07:10:00,2020-02-14 07:10:00,,Natrel Lactose Free 2 Percent,8.0,fluid ounce,(Natrel) Lactose Free 2% Partly Skimmed Milk,120.0,...,1.0,FEMALE,5.5,2020-02-14 07:10:00,7.0,beverages,,1,FEMALE,5.5
4,4.0,2020-02-14,2025-02-13 07:10:00,2020-02-14 07:10:00,,Standard Breakfast,0.75,cup,"(Kellogg's) Frosted Flakes, Cereal",110.0,...,1.0,FEMALE,5.5,2020-02-14 07:10:00,7.0,meals,,1,FEMALE,5.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3806,,2020-06-12,12:45:00,2020-06-12 12:45:00,,Uncle Al's Strawberry Cremes Cookies,6.0,cookie,,140.0,...,,,,,,,,16,MALE,5.5
3807,,2020-06-12,17:15:00,2020-06-12 17:15:00,17:30:00,Sloppy Joe with 1 Bun,1.0,sandwich,,399.0,...,,,,,,,,16,MALE,5.5
3808,,2020-06-12,17:15:00,2020-06-12 17:15:00,,Lay's Wavy Potato Chips,1.25,,,298.0,...,,,,,,,,16,MALE,5.5
3809,,2020-06-12,17:15:00,2020-06-12 17:15:00,,Lut Petit Ecolier Milk Chocolate Biscuit Cookie,2.0,biscuits,,123.0,...,,,,,,,,16,MALE,5.5


In [5]:
def convert_date_time(df):
    df[['date', 'time']] = df['time_begin'].str.split(' ', expand=True)
    return df

entire_dataset = entire_dataset.transform(convert_date_time)

In [6]:
def convert_to_datetime(df):
    df['datetime'] = pd.to_datetime(df['date'] + ' ' + df['time'], errors='coerce')
    df['date'] = pd.to_datetime(df['date'])
    df['time'] = pd.to_datetime(df['time'])
    return df

entire_dataset = convert_to_datetime(entire_dataset)

In [7]:
entire_dataset['hour'] = entire_dataset['datetime'].dt.hour

## Food Classification:  
## 0 = meat, 1 = fruit/veggies, 2 = snack-other, 3 = beverage, 4 = supplement, 5 = full meal, 6 = sides/auxillary food items, 7 = desserts

In [8]:
import json
with open('./data/food_classes.json', 'r') as file:
    food_classes = json.load(file)

food_classes = {k:int(v) for k, v in food_classes.items()}

In [9]:
food_class_dict = {0:'meal', 1: 'fruit/veggies', 2:'snacks', 3:'beverages', 4:'supplements', 5:'meals', 6:'meals', 7:'meals'}

In [10]:
def add_classes(df):
    df['class'] = food_class_dict[food_classes[df['logged_food']]]
    return df
entire_dataset = entire_dataset.apply(add_classes, axis=1)

In [11]:
filtered = entire_dataset[(entire_dataset['class']=='beverages')|(entire_dataset['class']=='meals')|(entire_dataset['class']=='snacks')]

Further generalization of foods

In [12]:
def generalize_food(row):
    food = row['logged_food'].lower()  # Ensure case-insensitivity
    if 'smoothie' in food or 'essential' in food or 'shake' in food:
        return 'smoothie'
    if 'burrito' in food or 'taco' in food or 'tortilla' in food or 'chipotle' in food or 'salsa' in food:
        return 'mexican food'
    elif 'kashi' in food or 'flake' in food or 'special k' in food or 'cheerio' in food:
        return 'cereal'
    elif 'bean' in food or 'peas' in food:
        return 'bean'
    elif 'salad' in food or 'slaw' in food:
        return 'salad'
    elif 'pizza' in food:
        return 'pizza'
    elif 'tea' in food:
        return 'tea'
    elif 'biscuit' in food or 'sub' in food or 'sandw' in food or 'sloppy joe' in food:
        return 'sandwich'
    elif 'juice' in food or 'v8' in food or 'kombucha' in food:
        return 'juice'
    elif any(alcohol in food for alcohol in ['beer', 'corona', 'wine', 'bourbon', 'vodka', 'martini', 'moscato', 'mojito']):
        return 'alcohol'
    elif 'milk' in food or 'percent' in food:
        return 'milk'
    elif any(soda in food for soda in ['coca cola', 'coke', 'pepsi', 'sunkist', 'mello yello', 'soda', 'lemonade','mountain dew','moutain dew']):
        return 'soda'
    elif any(sports in food for sports in ['gatorade', 'powerade']):
        return 'sports drink'
    elif any(pasta in food for pasta in ['lasagna', 'spaghetti','ziti','mac','ravioli','risotto']):
        return 'pasta'
    elif any(pasta in food for pasta in ['omelet']):
        return 'omelet'
    elif 'burger' in food or 'hot dog' in food:
        return 'burger'
    elif 'cheese' in food or 'moz' in food:
        return 'cheese'
    elif 'soup' in food or 'stew' in food or 'chowder' in food:
        return 'soup'
    elif 'chicken' in food or 'turkey' in food or 'wing' in food:
        return 'poultry'
    elif 'beef' in food:
        return 'beef'
    elif 'lamb' in food:
        return 'lamb'
    elif any(pork in food for pork in ['pork','sausage','pepperoni','salami','bacon']):
        return 'pork'
    elif any(fish in food for fish in ['fish', 'salmon','tuna','sardine','anchovies','crab','tilapia','shrimp']):
        return 'seafood'
    elif any(bread in food for bread in ['bread', 'toast','bagel','croissant','scone','muffin','loaf','twirl','donut','biscot']):
        return 'bread/pastry'
    elif any(candy in food for candy in ['candy', 'bark', 'square','reese','m & m', 'm&m', 'hershey','gum','chocolate','tootsie roll','caramel','baby ruth','toffee']):
        return 'candy'
    elif ('bar' in food and 'sauce' not in food) or ('fig newton' in food):
        return 'snack bar'
    elif 'potat' in food or 'fries' in food or 'tater' in food:
        return 'potato'
    elif 'rice' in food:
        return 'rice'
    elif any(sweet in food for sweet in ['cookie', 'cake','ice cream','frozen','brownie','cinnamon roll','pie','cobbler','waffle','cinammon roll']):
        return 'desserts'
    elif 'pop' in food:
        return 'popcorn'
    elif any(coffee in food for coffee in ['coffee','latte','cream','sweetener','sugar','sweetner','stevia','half and half']):
        return 'coffee'
    elif any(brekky in food for brekky in ['bfast', 'breakfast']):
        return 'standard breakfast'
    elif 'egg' in food:
        return 'egg'
    elif 'yogurt' in food:
        return 'yogurt'
    elif 'oat' in food or 'flax' in food or 'grit' in food or 'faro' in food:
        return 'oat/grains'
    elif 'cracker' in food or 'wafer' in food or 'pretzel' in food:
        return 'crackers'
    elif 'chip' in food or 'chex' in food or 'cheeto' in food or 'frito' in food:
        return 'chips'
    elif any(nut in food for nut in ['peanut','walnut','pistachio','almond','pecan','nut']):
        return 'nuts'
    elif 'california roll' in food or 'sushi' in food:
        return 'sushi'
    elif 'mint' in food:
        return 'candy'
    else:
        return food  # Preserve original if no match


filtered['simplified_food'] = filtered.apply(generalize_food, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered['simplified_food'] = filtered.apply(generalize_food, axis=1)


# more filtering

In [3]:
data = pd.read_csv('./data/filtered.csv')
data['time'] = pd.to_datetime(data['time']).dt.strftime('%H:%M')
data['date'] = pd.to_datetime(data['time']).dt.strftime('%Y-%m-%d')

def get_time_of_day(time):
    hour = int(time.split(':')[0])
    if 5 <= hour < 9:
        return 'Early Morning'
    elif 9 <= hour < 12:
        return 'Morning'
    elif 12 <= hour < 14:
        return 'Noon'
    elif 14 <= hour < 18:
        return 'Afternoon'
    elif 18 <= hour < 22:
        return 'Night'
    else:
        return 'Late Night'

data['time_of_day'] = data['time'].apply(get_time_of_day)
data = data.drop(['time_end', 'ID', 'dietary_fiber', 'total_fat', 'HbA1c','searched_food','time_begin'], axis=1)

data.rename(columns={'Gender': 'gender'}, inplace=True)
data['gender'] = data['gender'].replace({'FEMALE': 'Female', 'MALE': 'Male'})
data['logged_food'] = data['logged_food'].str.title()

data['class'] = data['class'].str.rstrip('s').str.capitalize()
data['date'] = pd.to_datetime(data['datetime']).dt.date
data['day'] = data.groupby('subject')['date'].rank(method='dense').astype(int)

poo = data.drop('Unnamed: 0', axis=1)
poo = poo.reindex(columns=['subject', 'date', 'day'] + list(poo.columns.drop(['subject', 'day', 'date'])))

poo.head()

Unnamed: 0,subject,date,day,time,logged_food,amount,unit,calorie,total_carb,sugar,protein,time_of_day,gender,datetime,hour,class,simplified_food
0,1.0,2020-02-13,1,18:00,Berry Smoothie,20.0,fluid ounce,456.0,85.0,83.0,16.0,Night,Female,2020-02-13 18:00:00,18,Beverage,smoothie
1,1.0,2020-02-14,2,07:10,Natrel Lactose Free 2 Percent,8.0,fluid ounce,120.0,9.0,8.0,12.0,Early Morning,Female,2020-02-14 07:10:00,7,Beverage,milk
2,1.0,2020-02-14,2,07:10,Standard Breakfast,0.75,cup,110.0,26.0,10.0,1.0,Early Morning,Female,2020-02-14 07:10:00,7,Meal,standard breakfast
3,1.0,2020-02-14,2,09:38,Breakfast Trail Mix,0.5,cup,280.0,30.0,22.0,4.0,Morning,Female,2020-02-14 09:38:00,9,Snack,standard breakfast
4,1.0,2020-02-14,2,12:38,Spinach Salad W/ Strawberries And Cheese,200.0,grams,286.0,14.0,8.5,7.6,Noon,Female,2020-02-14 12:38:00,12,Meal,salad
