In [45]:
import pandas as pd
import glob, os
import numpy as np
import re
# import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

In [46]:
# ! pip --version

In [47]:
entire_dataset = pd.DataFrame()
for i,file in enumerate(glob.glob('./data/*.csv')):
    df_temp = pd.read_csv(file)
    df_temp['subject'] = np.ones(df_temp.shape[0]) * (i+1)
    entire_dataset = pd.concat([entire_dataset, df_temp])

In [48]:
demographic = pd.read_csv('./data/demo/demographics.csv')
entire_dataset = entire_dataset.merge(demographic, left_on='subject', right_on='ID',how='inner')

In [49]:
entire_dataset.isna().any()

date             False
time              True
time_begin        True
time_end          True
logged_food      False
amount            True
unit              True
searched_food     True
calorie          False
total_carb       False
dietary_fiber     True
sugar            False
protein           True
total_fat         True
subject          False
time_of_day       True
ID               False
Gender           False
HbA1c            False
dtype: bool

In [50]:
def fix_time(df):
    if 'time_begin' in df.columns:
        df['time_begin'].fillna(df['date'] + ' ' + df['time'], inplace=True)
    else:
        df['time_begin'] = df['date'] + ' ' + df['time']
    return df
entire_dataset.transform(fix_time)

Unnamed: 0,date,time,time_begin,time_end,logged_food,amount,unit,searched_food,calorie,total_carb,dietary_fiber,sugar,protein,total_fat,subject,time_of_day,ID,Gender,HbA1c
0,2020-02-13,18:00:00,2020-02-13 18:00:00,,Berry Smoothie,20.0,fluid ounce,Strawberry Smoothie,456.0,85.0,1.7,83.0,16.0,3.3,1.0,,1,FEMALE,5.5
1,2020-02-13,20:30:00,2020-02-13 20:30:00,,Chicken Leg,1.0,,chicken leg,475.0,0.0,0.0,0.0,62.0,23.0,1.0,,1,FEMALE,5.5
2,2020-02-13,20:30:00,2020-02-13 20:30:00,,Asparagus,4.0,,Asparagus,13.0,2.5,1.2,0.8,1.4,0.1,1.0,,1,FEMALE,5.5
3,2020-02-14,07:10:00,2020-02-14 07:10:00,,Natrel Lactose Free 2 Percent,8.0,fluid ounce,(Natrel) Lactose Free 2% Partly Skimmed Milk,120.0,9.0,,8.0,12.0,,1.0,,1,FEMALE,5.5
4,2020-02-14,07:10:00,2020-02-14 07:10:00,,Standard Breakfast,0.75,cup,"(Kellogg's) Frosted Flakes, Cereal",110.0,26.0,,10.0,1.0,,1.0,,1,FEMALE,5.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1417,2/26/2020,,2020-02-26 18:30:00,,Lemonade,32,oz,,99.0,26.0,0.0,25.0,0.2,0.1,16.0,18:30,16,MALE,5.5
1418,2/27/2020,,2020-02-27 10:30:00,,Standard breakfast,,,,280.0,56.5,1.0,24.0,8.0,2.5,16.0,10:30,16,MALE,5.5
1419,2/27/2020,,2020-02-27 11:30:00,,Plain cheese pizza,1,slices,,452.0,57.0,3.9,6.1,19.0,16.0,16.0,11:30,16,MALE,5.5
1420,2/27/2020,,2020-02-27 11:30:00,,cooked black eyed peas,1,cup,,198.0,35.0,11.0,5.6,13.0,0.9,16.0,11:30,16,MALE,5.5


In [51]:
entire_dataset['time_begin'].iloc[0].split(' ')

['2020-02-13', '18:00:00']

entire_dataset[entire_dataset['time'] != entire_dataset['time_begin'].str.split(' ')[-1]]

In [52]:
def convert_date_time(df):
    df[['date', 'time']] = df['time_begin'].str.split(' ', expand=True)
    return df

entire_dataset = entire_dataset.transform(convert_date_time)


In [53]:
entire_dataset.isna().any()

date             False
time             False
time_begin       False
time_end          True
logged_food      False
amount            True
unit              True
searched_food     True
calorie          False
total_carb       False
dietary_fiber     True
sugar            False
protein           True
total_fat         True
subject          False
time_of_day       True
ID               False
Gender           False
HbA1c            False
dtype: bool

In [54]:
def convert_to_datetime(df):
    df['datetime'] = pd.to_datetime(df['date'] + ' ' + df['time'], errors='coerce')
    df['date'] = pd.to_datetime(df['date'])
    df['time'] = pd.to_datetime(df['time'])
    return df

entire_dataset = convert_to_datetime(entire_dataset)

In [55]:
entire_dataset.dtypes

date             datetime64[ns]
time             datetime64[ns]
time_begin               object
time_end                 object
logged_food              object
amount                   object
unit                     object
searched_food            object
calorie                 float64
total_carb              float64
dietary_fiber           float64
sugar                   float64
protein                 float64
total_fat               float64
subject                 float64
time_of_day              object
ID                        int64
Gender                   object
HbA1c                   float64
datetime         datetime64[ns]
dtype: object

In [56]:
entire_dataset['hour'] = entire_dataset['datetime'].dt.hour
display(entire_dataset)

Unnamed: 0,date,time,time_begin,time_end,logged_food,amount,unit,searched_food,calorie,total_carb,...,sugar,protein,total_fat,subject,time_of_day,ID,Gender,HbA1c,datetime,hour
0,2020-02-13,2025-02-11 18:00:00,2020-02-13 18:00:00,,Berry Smoothie,20.0,fluid ounce,Strawberry Smoothie,456.0,85.0,...,83.0,16.0,3.3,1.0,,1,FEMALE,5.5,2020-02-13 18:00:00,18
1,2020-02-13,2025-02-11 20:30:00,2020-02-13 20:30:00,,Chicken Leg,1.0,,chicken leg,475.0,0.0,...,0.0,62.0,23.0,1.0,,1,FEMALE,5.5,2020-02-13 20:30:00,20
2,2020-02-13,2025-02-11 20:30:00,2020-02-13 20:30:00,,Asparagus,4.0,,Asparagus,13.0,2.5,...,0.8,1.4,0.1,1.0,,1,FEMALE,5.5,2020-02-13 20:30:00,20
3,2020-02-14,2025-02-11 07:10:00,2020-02-14 07:10:00,,Natrel Lactose Free 2 Percent,8.0,fluid ounce,(Natrel) Lactose Free 2% Partly Skimmed Milk,120.0,9.0,...,8.0,12.0,,1.0,,1,FEMALE,5.5,2020-02-14 07:10:00,7
4,2020-02-14,2025-02-11 07:10:00,2020-02-14 07:10:00,,Standard Breakfast,0.75,cup,"(Kellogg's) Frosted Flakes, Cereal",110.0,26.0,...,10.0,1.0,,1.0,,1,FEMALE,5.5,2020-02-14 07:10:00,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1417,2020-02-26,2025-02-11 18:30:00,2020-02-26 18:30:00,,Lemonade,32,oz,,99.0,26.0,...,25.0,0.2,0.1,16.0,18:30,16,MALE,5.5,2020-02-26 18:30:00,18
1418,2020-02-27,2025-02-11 10:30:00,2020-02-27 10:30:00,,Standard breakfast,,,,280.0,56.5,...,24.0,8.0,2.5,16.0,10:30,16,MALE,5.5,2020-02-27 10:30:00,10
1419,2020-02-27,2025-02-11 11:30:00,2020-02-27 11:30:00,,Plain cheese pizza,1,slices,,452.0,57.0,...,6.1,19.0,16.0,16.0,11:30,16,MALE,5.5,2020-02-27 11:30:00,11
1420,2020-02-27,2025-02-11 11:30:00,2020-02-27 11:30:00,,cooked black eyed peas,1,cup,,198.0,35.0,...,5.6,13.0,0.9,16.0,11:30,16,MALE,5.5,2020-02-27 11:30:00,11


In [57]:
agged = entire_dataset.groupby(['hour','subject']).agg({'logged_food':list, 'calorie':'mean'}).reset_index()
display(agged)

Unnamed: 0,hour,subject,logged_food,calorie
0,0,2.0,[(Powerade) Grape],65.000000
1,0,6.0,"[Blue Bunny fudge bar, M&Ms]",180.000000
2,0,13.0,[Pretzel Rod],115.000000
3,1,1.0,[Kale and Fruit Smoothie],307.500000
4,1,6.0,"[Fruit smoothie, Baked cheetos]",171.000000
...,...,...,...,...
236,22,12.0,"[(Babybel) Cheese Bite, (Mich) Ultra Beer]",83.000000
237,22,13.0,[ice cream sandwich],284.000000
238,22,15.0,[Outback cheesecake with chocolate sauce],480.000000
239,23,6.0,"[Baked cheetos, Oreo shake, Fruit smoothie, PB...",387.866667


In [58]:
px.scatter(data_frame=agged, x='hour', y='calorie', color='subject',labels='logged_food', title='Average Caloric Intake Throughout the Day')

In [59]:
px.scatter(data_frame=entire_dataset, x='hour', y='calorie', color='subject')

In [60]:
data = entire_dataset[['date', 'time', 'logged_food', 'calorie', 'subject']]
display(data.head())

Unnamed: 0,date,time,logged_food,calorie,subject
0,2020-02-13,2025-02-11 18:00:00,Berry Smoothie,456.0,1.0
1,2020-02-13,2025-02-11 20:30:00,Chicken Leg,475.0,1.0
2,2020-02-13,2025-02-11 20:30:00,Asparagus,13.0,1.0
3,2020-02-14,2025-02-11 07:10:00,Natrel Lactose Free 2 Percent,120.0,1.0
4,2020-02-14,2025-02-11 07:10:00,Standard Breakfast,110.0,1.0


## Food Classification:  
## 0 = meat, 1 = fruit/veggies, 2 = snack-other, 3 = beverage, 4 = supplement, 5 = full meal, 6 = sides/auxillary food items, 7 = desserts

In [61]:
import json
with open('food_classes.json', 'r') as file:
    food_classes = json.load(file)

food_classes = {k:int(v) for k, v in food_classes.items()}
food_classes

{'Berry Smoothie': 3,
 'Chicken Leg': 0,
 'Asparagus': 1,
 'Natrel Lactose Free 2 Percent': 3,
 'Standard Breakfast': 5,
 'Breakfast Trail Mix': 2,
 'Spinach Salad w/ strawberries and cheese': 5,
 'Egg': 0,
 'Acai Smoothie': 3,
 "(Trader Joe's) Mac and Cheese": 5,
 'Coconut Shrimp': 5,
 'Spinach Smoothie': 3,
 'Spinach Salad w/ blueberries, egg, and cheese': 5,
 'Babel Cheese': 2,
 'Bourbon Chicken': 0,
 'Rice': 6,
 'Shrimp': 0,
 'Cabbage': 1,
 'Hot Chocolate': 3,
 'Salty Sweet Popcorn': 2,
 'Chai Tea': 3,
 'Maple Brown Sugar Oatmeal': 5,
 'Salad with Cranberries': 5,
 'Chicken Nuggets': 5,
 'Kale Salad': 5,
 'Pizza': 5,
 'Oreo Cookies': 7,
 'Muffin': 2,
 'Grilled Chicken Wrap': 5,
 'Kale and Fruit Smoothie': 3,
 'Ranch Wings': 5,
 'Lemon Loaf': 7,
 'Turkey Slider': 5,
 'Chicken and Rice': 5,
 'Green Smoothie': 3,
 'Bagel': 5,
 'Salad': 5,
 'Babel bell cheese': 2,
 'Tangerine Orange': 1,
 'Babybel Cheese': 2,
 'Chicken Salad': 5,
 'Pita Bread': 6,
 'Cheese Pita': 6,
 'Boost': 4,
 'Mell

In [62]:
food_class_dict = {0:'meals', 1: 'fruits/veggies', 2:'snacks', 3:'beverages', 4:'supplements', 5:'meals', 6:'meals', 7:'meals'}

In [63]:
def add_classes(df):
    df['class'] = food_class_dict[food_classes[df['logged_food']]]
    return df
entire_dataset = entire_dataset.apply(add_classes, axis=1)

In [64]:
entire_dataset

Unnamed: 0,date,time,time_begin,time_end,logged_food,amount,unit,searched_food,calorie,total_carb,...,protein,total_fat,subject,time_of_day,ID,Gender,HbA1c,datetime,hour,class
0,2020-02-13,2025-02-11 18:00:00,2020-02-13 18:00:00,,Berry Smoothie,20.0,fluid ounce,Strawberry Smoothie,456.0,85.0,...,16.0,3.3,1.0,,1,FEMALE,5.5,2020-02-13 18:00:00,18,beverages
1,2020-02-13,2025-02-11 20:30:00,2020-02-13 20:30:00,,Chicken Leg,1.0,,chicken leg,475.0,0.0,...,62.0,23.0,1.0,,1,FEMALE,5.5,2020-02-13 20:30:00,20,meals
2,2020-02-13,2025-02-11 20:30:00,2020-02-13 20:30:00,,Asparagus,4.0,,Asparagus,13.0,2.5,...,1.4,0.1,1.0,,1,FEMALE,5.5,2020-02-13 20:30:00,20,fruits/veggies
3,2020-02-14,2025-02-11 07:10:00,2020-02-14 07:10:00,,Natrel Lactose Free 2 Percent,8.0,fluid ounce,(Natrel) Lactose Free 2% Partly Skimmed Milk,120.0,9.0,...,12.0,,1.0,,1,FEMALE,5.5,2020-02-14 07:10:00,7,beverages
4,2020-02-14,2025-02-11 07:10:00,2020-02-14 07:10:00,,Standard Breakfast,0.75,cup,"(Kellogg's) Frosted Flakes, Cereal",110.0,26.0,...,1.0,,1.0,,1,FEMALE,5.5,2020-02-14 07:10:00,7,meals
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1417,2020-02-26,2025-02-11 18:30:00,2020-02-26 18:30:00,,Lemonade,32,oz,,99.0,26.0,...,0.2,0.1,16.0,18:30,16,MALE,5.5,2020-02-26 18:30:00,18,beverages
1418,2020-02-27,2025-02-11 10:30:00,2020-02-27 10:30:00,,Standard breakfast,,,,280.0,56.5,...,8.0,2.5,16.0,10:30,16,MALE,5.5,2020-02-27 10:30:00,10,meals
1419,2020-02-27,2025-02-11 11:30:00,2020-02-27 11:30:00,,Plain cheese pizza,1,slices,,452.0,57.0,...,19.0,16.0,16.0,11:30,16,MALE,5.5,2020-02-27 11:30:00,11,meals
1420,2020-02-27,2025-02-11 11:30:00,2020-02-27 11:30:00,,cooked black eyed peas,1,cup,,198.0,35.0,...,13.0,0.9,16.0,11:30,16,MALE,5.5,2020-02-27 11:30:00,11,meals


In [65]:
graphed = entire_dataset.groupby(['hour','class']).count().reset_index()

In [66]:
px.bar(data_frame=graphed[graphed['class'] != 'meals'], y='calorie', x='hour', color='class')

In [67]:
px.bar(data_frame=graphed[(graphed['class']!='supplements') & (graphed['class']!='fruits/veggies')], y='calorie', x='hour', color='class', title="Types of Food Eaten Over a Day", labels={'hour': 'Time of Day (Hour)', 'calorie': 'Count of Foods Logged'})

In [68]:
graph_this=graphed[(graphed['class']!='supplements') & (graphed['class']!='fruits/veggies')]

In [69]:
# Find the most popular item for a given class (e.g., 'meals' or 'beverages')
def most_popular_item_by_hour(df, food_class):
    # Filter data by class (e.g., meals or beverages)
    filtered_df = df[df['class'] == food_class]
    
    # Count occurrences of each food item per hour
    grouped = filtered_df.groupby(['hour', 'logged_food']).size().reset_index(name='count')
    
    # Find the most popular item per hour
    most_popular = grouped.loc[grouped.groupby('hour')['count'].idxmax()]
    
    return most_popular

# Get most popular meals and beverages by hour
most_popular_meals = most_popular_item_by_hour(entire_dataset, 'meals')
most_popular_beverages = most_popular_item_by_hour(entire_dataset, 'beverages')

print("Most Popular Meals by Hour:")
print(most_popular_meals)

print("\nMost Popular Beverages by Hour:")
print(most_popular_beverages)


Most Popular Meals by Hour:
     hour                                 logged_food  count
0       0                        Blue Bunny fudge bar      1
4       4                              Frosted Flakes      3
10      5                               Frosted Flake      3
26      6                          Standard Breakfast      5
37      7                          Standard Breakfast      3
41      8           (Fage) Greek Yogurt, plain nonfat      6
74      9                              Frosted Flakes      6
92     10                 (Quaker) Old Fashioned Oats      2
119    11                                       Bacon      2
173    12             Boneless Skinless Chicken Thigh      3
236    13                                       Bacon      2
289    14                      Granola Oats and Honey      2
301    15                           2% Cottage Cheese      4
323    16                           2% Cottage Cheese      2
353    17             Boneless Skinless Chicken Thigh    

In [70]:
import plotly.express as px
import pandas as pd

# Aggregate by hour and class, summing calorie values
sorted_data = graph_this.groupby(['hour', 'class'], as_index=False)['calorie'].sum()

# Sort by total calories across all hours to determine order
class_order = sorted_data.groupby('class')['calorie'].sum().sort_values(ascending=True).index

# Plot the bar chart with sorted class categories
fig = px.bar(
    data_frame=sorted_data, 
    x='hour', 
    y='calorie', 
    color='class', 
    title="Types of Food Eaten Over a Day", 
    labels={'hour': 'Time of Day (Hour)', 'calorie': 'Count of Foods Logged'},
    category_orders={"class": class_order},
    color_discrete_sequence=px.colors.qualitative.Set2
)

fig.show()


In [71]:
# Step 1: Sum total intake per day per gender
daily_intake = entire_dataset.groupby(['Gender', 'date']).sum().reset_index()

# Step 2: Compute the average daily intake per gender
avg_daily_intake = daily_intake.groupby('Gender').mean().reset_index()

# Step 3: Plot the results
px.bar(avg_daily_intake, x='Gender', y='calorie', title='Average Daily Calorie Intake by Gender',
       labels={'calorie': 'Average Daily Calories'})


In [72]:
# Reshape data: Convert columns into long format for grouped bar plot
melted_data = avg_daily_intake.melt(id_vars=['Gender'], 
                                    value_vars=['protein', 'dietary_fiber', 'sugar'], 
                                    var_name='Nutrient', 
                                    value_name='Average Intake')

# Create grouped bar chart
fig = px.bar(melted_data, x='Gender', y='Average Intake', color='Nutrient',
             barmode='group', title='Average Daily Nutrient Intake by Gender')

fig.show()


In [73]:
fig = px.bar(melted_data, x='Gender', y='Average Intake', color='Nutrient',
             barmode='stack', title='Average Daily Nutrient Intake by Gender')

fig.show()


In [74]:
entire_dataset.to_csv('data.csv')

In [75]:
filtered = entire_dataset[(entire_dataset['class']=='beverages')|(entire_dataset['class']=='meals')|(entire_dataset['class']=='snacks')]
filtered

Unnamed: 0,date,time,time_begin,time_end,logged_food,amount,unit,searched_food,calorie,total_carb,...,protein,total_fat,subject,time_of_day,ID,Gender,HbA1c,datetime,hour,class
0,2020-02-13,2025-02-11 18:00:00,2020-02-13 18:00:00,,Berry Smoothie,20.0,fluid ounce,Strawberry Smoothie,456.0,85.0,...,16.0,3.3,1.0,,1,FEMALE,5.5,2020-02-13 18:00:00,18,beverages
1,2020-02-13,2025-02-11 20:30:00,2020-02-13 20:30:00,,Chicken Leg,1.0,,chicken leg,475.0,0.0,...,62.0,23.0,1.0,,1,FEMALE,5.5,2020-02-13 20:30:00,20,meals
3,2020-02-14,2025-02-11 07:10:00,2020-02-14 07:10:00,,Natrel Lactose Free 2 Percent,8.0,fluid ounce,(Natrel) Lactose Free 2% Partly Skimmed Milk,120.0,9.0,...,12.0,,1.0,,1,FEMALE,5.5,2020-02-14 07:10:00,7,beverages
4,2020-02-14,2025-02-11 07:10:00,2020-02-14 07:10:00,,Standard Breakfast,0.75,cup,"(Kellogg's) Frosted Flakes, Cereal",110.0,26.0,...,1.0,,1.0,,1,FEMALE,5.5,2020-02-14 07:10:00,7,meals
5,2020-02-14,2025-02-11 09:38:00,2020-02-14 09:38:00,,Breakfast Trail Mix,0.5,cup,"(Giant) Breakfast Blend, Trail Mix",280.0,30.0,...,4.0,,1.0,,1,FEMALE,5.5,2020-02-14 09:38:00,9,snacks
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1416,2020-02-26,2025-02-11 18:30:00,2020-02-26 18:30:00,,Blue Cheese dressing,2,Tbsp,,146.0,1.4,...,0.4,15.4,16.0,18:30,16,MALE,5.5,2020-02-26 18:30:00,18,meals
1417,2020-02-26,2025-02-11 18:30:00,2020-02-26 18:30:00,,Lemonade,32,oz,,99.0,26.0,...,0.2,0.1,16.0,18:30,16,MALE,5.5,2020-02-26 18:30:00,18,beverages
1418,2020-02-27,2025-02-11 10:30:00,2020-02-27 10:30:00,,Standard breakfast,,,,280.0,56.5,...,8.0,2.5,16.0,10:30,16,MALE,5.5,2020-02-27 10:30:00,10,meals
1419,2020-02-27,2025-02-11 11:30:00,2020-02-27 11:30:00,,Plain cheese pizza,1,slices,,452.0,57.0,...,19.0,16.0,16.0,11:30,16,MALE,5.5,2020-02-27 11:30:00,11,meals


In [76]:
filtered['logged_food'].unique()

array(['Berry Smoothie', 'Chicken Leg', 'Natrel Lactose Free 2 Percent',
       'Standard Breakfast', 'Breakfast Trail Mix',
       'Spinach Salad w/ strawberries and cheese', 'Egg', 'Acai Smoothie',
       "(Trader Joe's) Mac and Cheese", 'Coconut Shrimp',
       'Spinach Smoothie',
       'Spinach Salad w/ blueberries, egg, and cheese', 'Babel Cheese',
       'Bourbon Chicken', 'Rice', 'Shrimp', 'Hot Chocolate',
       'Salty Sweet Popcorn', 'Chai Tea', 'Maple Brown Sugar Oatmeal',
       'Salad with Cranberries', 'Chicken Nuggets', 'Kale Salad', 'Pizza',
       'Oreo Cookies', 'Muffin', 'Grilled Chicken Wrap',
       'Kale and Fruit Smoothie', 'Ranch Wings', 'Lemon Loaf',
       'Turkey Slider', 'Chicken and Rice', 'Green Smoothie', 'Bagel',
       'Salad', 'Babel bell cheese', 'Babybel Cheese', 'Chicken Salad',
       'Pita Bread', 'Cheese Pita', 'Mello Yello',
       '(Jimmy Dean) Chicken Biscuit', 'Beef Jerky',
       '(Gatorade) Fierce Grape', 'Banquet Chicken Pot Pie',
       '

In [97]:
def generalize_food(row):
    food = row['logged_food'].lower()  # Ensure case-insensitivity
    if 'smoothie' in food or 'essential' in food:
        return 'smoothie'
    elif 'kashi' in food or 'flake' in food:
        return 'cereal'
    elif 'salad' in food:
        return 'salad'
    elif any(coffee in food for coffee in ['coffee','latte','cream','sweetener','sugar','sweetner','stevia','half and half']):
        return 'coffee'
    elif 'tea' in food:
        return 'tea'
    elif 'biscuit' in food or 'sub' in food or 'sandw' in food:
        return 'sandwich'
    elif 'juice' in food or 'v8' in food or 'kombucha' in food:
        return 'juice'
    elif any(alcohol in food for alcohol in ['beer', 'corona', 'wine', 'bourbon', 'vodka', 'martini', 'moscato', 'mojito']):
        return 'alcohol'
    elif 'milk' in food or 'percent' in food:
        return 'milk'
    elif any(soda in food for soda in ['coca cola', 'coke', 'pepsi', 'sunkist', 'mello yello', 'soda', 'lemonade','mountain dew','moutain dew']):
        return 'soda'
    elif any(sports in food for sports in ['gatorade', 'powerade']):
        return 'sports drink'
    else:
        return food  # Preserve original if no match

# Apply function to each row
filtered['simplified_food'] = filtered.apply(generalize_food, axis=1)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [98]:
filtered

Unnamed: 0,date,time,time_begin,time_end,logged_food,amount,unit,searched_food,calorie,total_carb,...,total_fat,subject,time_of_day,ID,Gender,HbA1c,datetime,hour,class,simplified_food
0,2020-02-13,2025-02-11 18:00:00,2020-02-13 18:00:00,,Berry Smoothie,20.0,fluid ounce,Strawberry Smoothie,456.0,85.0,...,3.3,1.0,,1,FEMALE,5.5,2020-02-13 18:00:00,18,beverages,smoothie
1,2020-02-13,2025-02-11 20:30:00,2020-02-13 20:30:00,,Chicken Leg,1.0,,chicken leg,475.0,0.0,...,23.0,1.0,,1,FEMALE,5.5,2020-02-13 20:30:00,20,meals,chicken leg
3,2020-02-14,2025-02-11 07:10:00,2020-02-14 07:10:00,,Natrel Lactose Free 2 Percent,8.0,fluid ounce,(Natrel) Lactose Free 2% Partly Skimmed Milk,120.0,9.0,...,,1.0,,1,FEMALE,5.5,2020-02-14 07:10:00,7,beverages,milk
4,2020-02-14,2025-02-11 07:10:00,2020-02-14 07:10:00,,Standard Breakfast,0.75,cup,"(Kellogg's) Frosted Flakes, Cereal",110.0,26.0,...,,1.0,,1,FEMALE,5.5,2020-02-14 07:10:00,7,meals,standard breakfast
5,2020-02-14,2025-02-11 09:38:00,2020-02-14 09:38:00,,Breakfast Trail Mix,0.5,cup,"(Giant) Breakfast Blend, Trail Mix",280.0,30.0,...,,1.0,,1,FEMALE,5.5,2020-02-14 09:38:00,9,snacks,breakfast trail mix
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1416,2020-02-26,2025-02-11 18:30:00,2020-02-26 18:30:00,,Blue Cheese dressing,2,Tbsp,,146.0,1.4,...,15.4,16.0,18:30,16,MALE,5.5,2020-02-26 18:30:00,18,meals,blue cheese dressing
1417,2020-02-26,2025-02-11 18:30:00,2020-02-26 18:30:00,,Lemonade,32,oz,,99.0,26.0,...,0.1,16.0,18:30,16,MALE,5.5,2020-02-26 18:30:00,18,beverages,soda
1418,2020-02-27,2025-02-11 10:30:00,2020-02-27 10:30:00,,Standard breakfast,,,,280.0,56.5,...,2.5,16.0,10:30,16,MALE,5.5,2020-02-27 10:30:00,10,meals,standard breakfast
1419,2020-02-27,2025-02-11 11:30:00,2020-02-27 11:30:00,,Plain cheese pizza,1,slices,,452.0,57.0,...,16.0,16.0,11:30,16,MALE,5.5,2020-02-27 11:30:00,11,meals,plain cheese pizza


In [103]:
filtered['simplified_food'].unique()

array(['smoothie', 'chicken leg', 'milk', 'standard breakfast',
       'breakfast trail mix', 'salad', 'egg',
       "(trader joe's) mac and cheese", 'coconut shrimp', 'babel cheese',
       'alcohol', 'rice', 'shrimp', 'hot chocolate',
       'salty sweet popcorn', 'tea', 'coffee', 'chicken nuggets', 'pizza',
       'oreo cookies', 'muffin', 'grilled chicken wrap', 'ranch wings',
       'lemon loaf', 'turkey slider', 'chicken and rice', 'bagel',
       'babel bell cheese', 'babybel cheese', 'pita bread', 'cheese pita',
       'soda', 'sandwich', 'beef jerky', 'sports drink',
       'banquet chicken pot pie',
       '(red baron) brick oven pepperoni pizza', 'm & m ', 'cereal',
       'mashed potato', 'frozen pop',
       'omelet (3 egg, bacon 3 strip, cheese 2 tsp)',
       "(arby's) classic roast beef ff", 'chip', 'salsa',
       'cheeseburger (mayo , mustard, chili)', 'tater tots',
       'onion rings', 'fig newton', 'water ', 'vienna sausage',
       'lance toast chee', 'tootsie rol

In [99]:
bevs = filtered[filtered['class'] == 'beverages']
bevs

Unnamed: 0,date,time,time_begin,time_end,logged_food,amount,unit,searched_food,calorie,total_carb,...,total_fat,subject,time_of_day,ID,Gender,HbA1c,datetime,hour,class,simplified_food
0,2020-02-13,2025-02-11 18:00:00,2020-02-13 18:00:00,,Berry Smoothie,20.0,fluid ounce,Strawberry Smoothie,456.0,85.0,...,3.3,1.0,,1,FEMALE,5.5,2020-02-13 18:00:00,18,beverages,smoothie
3,2020-02-14,2025-02-11 07:10:00,2020-02-14 07:10:00,,Natrel Lactose Free 2 Percent,8.0,fluid ounce,(Natrel) Lactose Free 2% Partly Skimmed Milk,120.0,9.0,...,,1.0,,1,FEMALE,5.5,2020-02-14 07:10:00,7,beverages,milk
8,2020-02-14,2025-02-11 19:30:00,2020-02-14 19:30:00,,Acai Smoothie,20.0,fluid ounce,(Smoothie King) Acai Adventure Smoothie,440.0,92.0,...,,1.0,,1,FEMALE,5.5,2020-02-14 19:30:00,19,beverages,smoothie
11,2020-02-15,2025-02-11 07:30:00,2020-02-15 07:30:00,,Spinach Smoothie,20.0,fluid ounce,Spinach Smoothie,308.0,69.0,...,,1.0,,1,FEMALE,5.5,2020-02-15 07:30:00,7,beverages,smoothie
21,2020-02-16,2025-02-11 07:00:00,2020-02-16 07:00:00,,Natrel Lactose Free 2 Percent,8.0,fluid ounce,(Natrel) Lactose Free 2% Partly Skimmed Milk,120.0,9.0,...,,1.0,,1,FEMALE,5.5,2020-02-16 07:00:00,7,beverages,milk
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1390,2020-02-22,2025-02-11 12:45:00,2020-02-22 12:45:00,,1 cup coffee,,,,84.0,2.6,...,7.0,16.0,12:45,16,MALE,5.5,2020-02-22 12:45:00,12,beverages,coffee
1402,2020-02-24,2025-02-11 19:30:00,2020-02-24 19:30:00,,Beer,1,pint,,204.0,17.0,...,0.0,16.0,19:30,16,MALE,5.5,2020-02-24 19:30:00,19,beverages,alcohol
1405,2020-02-25,2025-02-11 08:00:00,2020-02-25 08:00:00,,1 coffee with cream,,,,84.0,2.6,...,7.0,16.0,8:00,16,MALE,5.5,2020-02-25 08:00:00,8,beverages,coffee
1409,2020-02-25,2025-02-11 20:00:00,2020-02-25 20:00:00,,Sweat tea ice,32,fluid oz,,320.0,84.0,...,0.0,16.0,20:00,16,MALE,5.5,2020-02-25 20:00:00,20,beverages,tea


In [100]:
bevs['simplified_food'].unique()

array(['smoothie', 'milk', 'hot chocolate', 'tea', 'soda', 'sports drink',
       'water ', 'alcohol', 'coffee', 'juice'], dtype=object)

In [101]:
grouped_bevs = bevs.groupby(['hour','simplified_food']).agg({'class':'count','calorie':'mean','sugar':'mean'}).reset_index()
px.bar(grouped_bevs, x='hour', y='class', color='simplified_food')

In [36]:
filtered[filtered['class']=='beverages']['logged_food'].unique()

array(['Berry Smoothie', 'Natrel Lactose Free 2 Percent', 'Acai Smoothie',
       'Spinach Smoothie', 'Hot Chocolate', 'Chai Tea',
       'Kale and Fruit Smoothie', 'Green Smoothie', 'Mello Yello',
       '(Gatorade) Fierce Grape', '(Powerade) Grape', 'Chocolate Milk',
       'Moutain Dew', 'Gatorade', 'Sweet Tea', 'Water ', 'Powerade',
       'Power Smoothie', 'Milk', 'Greek Yogurt Power Smoothie',
       'Merlot Wine', 'Lemon Drop Martini', 'Coffee', 'Creamers',
       'Tea with Lemon', 'Tea', 'Soda', 'Creamer', 'Sugar',
       '(Outback Steakhouse) Mixed Drink', 'Fruit smoothie',
       "Virgil's Zero Sugar Black Cherry Soda",
       '(Harris Teeter) Fat Free Organic Milk', "(Alex's) Lemonade",
       'Pineapple and Ginger Juice', 'Moscato', 'Corona', 'Mojito',
       'Splenda/Sugar Blend', 'Low Sugar Apple Juice',
       '(Dunkin Coffee) Coffee', 'Cream', '(Equal) Sweetner Bag',
       "(Florida's Natural) Orange Juice",
       'Smoothie (spinach, celery, banana, and strawberry)', 