In [25]:
import pandas as pd
import glob, os
import numpy as np
import re
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

In [26]:
entire_dataset = pd.DataFrame()
for i,file in enumerate(glob.glob('./data/*.csv')):
    df_temp = pd.read_csv(file)
    df_temp['subject'] = np.ones(df_temp.shape[0]) * (i+1)
    entire_dataset = pd.concat([entire_dataset, df_temp])

In [27]:
demographic = pd.read_csv('./data/demo/demographics.csv')
entire_dataset = entire_dataset.merge(demographic, left_on='subject', right_on='ID',how='inner')

In [28]:
entire_dataset.isna().any()

date             False
time              True
time_begin        True
time_end          True
logged_food      False
amount            True
unit              True
searched_food     True
calorie          False
total_carb       False
dietary_fiber     True
sugar            False
protein           True
total_fat         True
subject          False
time_of_day       True
ID               False
Gender           False
HbA1c            False
dtype: bool

In [30]:
def fix_time(df):
    if 'time_begin' in df.columns:
        df['time_begin'].fillna(df['date'] + ' ' + df['time'], inplace=True)
    else:
        df['time_begin'] = df['date'] + ' ' + df['time']
    return df
entire_dataset.transform(fix_time)

Unnamed: 0,date,time,time_begin,time_end,logged_food,amount,unit,searched_food,calorie,total_carb,dietary_fiber,sugar,protein,total_fat,subject,time_of_day,ID,Gender,HbA1c
0,2020-02-13,18:00:00,2020-02-13 18:00:00,,Berry Smoothie,20.0,fluid ounce,Strawberry Smoothie,456.0,85.0,1.7,83.0,16.0,3.3,1.0,,1,FEMALE,5.5
1,2020-02-13,20:30:00,2020-02-13 20:30:00,,Chicken Leg,1.0,,chicken leg,475.0,0.0,0.0,0.0,62.0,23.0,1.0,,1,FEMALE,5.5
2,2020-02-13,20:30:00,2020-02-13 20:30:00,,Asparagus,4.0,,Asparagus,13.0,2.5,1.2,0.8,1.4,0.1,1.0,,1,FEMALE,5.5
3,2020-02-14,07:10:00,2020-02-14 07:10:00,,Natrel Lactose Free 2 Percent,8.0,fluid ounce,(Natrel) Lactose Free 2% Partly Skimmed Milk,120.0,9.0,,8.0,12.0,,1.0,,1,FEMALE,5.5
4,2020-02-14,07:10:00,2020-02-14 07:10:00,,Standard Breakfast,0.75,cup,"(Kellogg's) Frosted Flakes, Cereal",110.0,26.0,,10.0,1.0,,1.0,,1,FEMALE,5.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1417,2/26/2020,,2020-02-26 18:30:00,,Lemonade,32,oz,,99.0,26.0,0.0,25.0,0.2,0.1,16.0,18:30,16,MALE,5.5
1418,2/27/2020,,2020-02-27 10:30:00,,Standard breakfast,,,,280.0,56.5,1.0,24.0,8.0,2.5,16.0,10:30,16,MALE,5.5
1419,2/27/2020,,2020-02-27 11:30:00,,Plain cheese pizza,1,slices,,452.0,57.0,3.9,6.1,19.0,16.0,16.0,11:30,16,MALE,5.5
1420,2/27/2020,,2020-02-27 11:30:00,,cooked black eyed peas,1,cup,,198.0,35.0,11.0,5.6,13.0,0.9,16.0,11:30,16,MALE,5.5


In [31]:
entire_dataset['time_begin'].iloc[0].split(' ')

['2020-02-13', '18:00:00']

entire_dataset[entire_dataset['time'] != entire_dataset['time_begin'].str.split(' ')[-1]]

In [32]:
def convert_date_time(df):
    df[['date', 'time']] = df['time_begin'].str.split(' ', expand=True)
    return df

entire_dataset = entire_dataset.transform(convert_date_time)


In [33]:
entire_dataset.isna().any()

date             False
time             False
time_begin       False
time_end          True
logged_food      False
amount            True
unit              True
searched_food     True
calorie          False
total_carb       False
dietary_fiber     True
sugar            False
protein           True
total_fat         True
subject          False
time_of_day       True
ID               False
Gender           False
HbA1c            False
dtype: bool

In [34]:
def convert_to_datetime(df):
    df['datetime'] = pd.to_datetime(df['date'] + ' ' + df['time'], errors='coerce')
    df['date'] = pd.to_datetime(df['date'])
    df['time'] = pd.to_datetime(df['time'])
    return df

entire_dataset = convert_to_datetime(entire_dataset)

In [35]:
entire_dataset.dtypes

date             datetime64[ns]
time             datetime64[ns]
time_begin               object
time_end                 object
logged_food              object
amount                   object
unit                     object
searched_food            object
calorie                 float64
total_carb              float64
dietary_fiber           float64
sugar                   float64
protein                 float64
total_fat               float64
subject                 float64
time_of_day              object
ID                        int64
Gender                   object
HbA1c                   float64
datetime         datetime64[ns]
dtype: object

In [36]:
entire_dataset['hour'] = entire_dataset['time'].dt.hour
display(entire_dataset)

Unnamed: 0,date,time,time_begin,time_end,logged_food,amount,unit,searched_food,calorie,total_carb,...,sugar,protein,total_fat,subject,time_of_day,ID,Gender,HbA1c,datetime,hour
0,2020-02-13,2025-02-10 18:00:00,2020-02-13 18:00:00,,Berry Smoothie,20.0,fluid ounce,Strawberry Smoothie,456.0,85.0,...,83.0,16.0,3.3,1.0,,1,FEMALE,5.5,2020-02-13 18:00:00,18
1,2020-02-13,2025-02-10 20:30:00,2020-02-13 20:30:00,,Chicken Leg,1.0,,chicken leg,475.0,0.0,...,0.0,62.0,23.0,1.0,,1,FEMALE,5.5,2020-02-13 20:30:00,20
2,2020-02-13,2025-02-10 20:30:00,2020-02-13 20:30:00,,Asparagus,4.0,,Asparagus,13.0,2.5,...,0.8,1.4,0.1,1.0,,1,FEMALE,5.5,2020-02-13 20:30:00,20
3,2020-02-14,2025-02-10 07:10:00,2020-02-14 07:10:00,,Natrel Lactose Free 2 Percent,8.0,fluid ounce,(Natrel) Lactose Free 2% Partly Skimmed Milk,120.0,9.0,...,8.0,12.0,,1.0,,1,FEMALE,5.5,2020-02-14 07:10:00,7
4,2020-02-14,2025-02-10 07:10:00,2020-02-14 07:10:00,,Standard Breakfast,0.75,cup,"(Kellogg's) Frosted Flakes, Cereal",110.0,26.0,...,10.0,1.0,,1.0,,1,FEMALE,5.5,2020-02-14 07:10:00,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1417,2020-02-26,2025-02-10 18:30:00,2020-02-26 18:30:00,,Lemonade,32,oz,,99.0,26.0,...,25.0,0.2,0.1,16.0,18:30,16,MALE,5.5,2020-02-26 18:30:00,18
1418,2020-02-27,2025-02-10 10:30:00,2020-02-27 10:30:00,,Standard breakfast,,,,280.0,56.5,...,24.0,8.0,2.5,16.0,10:30,16,MALE,5.5,2020-02-27 10:30:00,10
1419,2020-02-27,2025-02-10 11:30:00,2020-02-27 11:30:00,,Plain cheese pizza,1,slices,,452.0,57.0,...,6.1,19.0,16.0,16.0,11:30,16,MALE,5.5,2020-02-27 11:30:00,11
1420,2020-02-27,2025-02-10 11:30:00,2020-02-27 11:30:00,,cooked black eyed peas,1,cup,,198.0,35.0,...,5.6,13.0,0.9,16.0,11:30,16,MALE,5.5,2020-02-27 11:30:00,11


In [37]:
agged = entire_dataset.groupby(['hour','subject']).agg({'logged_food':list, 'calorie':'mean'}).reset_index()
display(agged)

Unnamed: 0,hour,subject,logged_food,calorie
0,0,2.0,[(Powerade) Grape],65.000000
1,0,6.0,"[Blue Bunny fudge bar, M&Ms]",180.000000
2,0,13.0,[Pretzel Rod],115.000000
3,1,1.0,[Kale and Fruit Smoothie],307.500000
4,1,6.0,"[Fruit smoothie, Baked cheetos]",171.000000
...,...,...,...,...
236,22,12.0,"[(Babybel) Cheese Bite, (Mich) Ultra Beer]",83.000000
237,22,13.0,[ice cream sandwich],284.000000
238,22,15.0,[Outback cheesecake with chocolate sauce],480.000000
239,23,6.0,"[Baked cheetos, Oreo shake, Fruit smoothie, PB...",387.866667


In [38]:
px.scatter(data_frame=agged, x='hour', y='calorie', color='subject',labels='logged_food', title='Average Caloric Intake Throughout the Day')

In [14]:
px.scatter(data_frame=entire_dataset, x='hour', y='calorie', color='subject')

In [15]:
data = entire_dataset[['date', 'time', 'logged_food', 'calorie', 'subject']]
display(data.head())

Unnamed: 0,date,time,logged_food,calorie,subject
0,2020-02-13,2025-02-10 18:00:00,Berry Smoothie,456.0,1.0
1,2020-02-13,2025-02-10 20:30:00,Chicken Leg,475.0,1.0
2,2020-02-13,2025-02-10 20:30:00,Asparagus,13.0,1.0
3,2020-02-14,2025-02-10 07:10:00,Natrel Lactose Free 2 Percent,120.0,1.0
4,2020-02-14,2025-02-10 07:10:00,Standard Breakfast,110.0,1.0


Food Classification:  
0 = meat, 1 = fruit/veggies, 2 = snack-other, 3 = beverage, 4 = supplement, 5 = full meal

food_classes = []
for i in range(entire_dataset.shape[0]):
    print(entire_dataset['logged_food'].iloc[i])
    label = input('enter here: ')
    food_classes.append(label)

In [39]:
foods = entire_dataset['logged_food'].unique()

foods.shape

(675,)