In [42]:
import pandas as pd
import glob, os
import numpy as np
import re
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

In [43]:
def load_food_logs(pattern='./data/*.csv'):
    """Read every CSV matching ``pattern`` and stack them into one DataFrame.

    Each file is tagged with a ``subject`` column holding its 1-based position
    in the *sorted* list of matching paths (stored as float, as before).

    Parameters
    ----------
    pattern : str
        Glob pattern selecting the per-subject CSV files.

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of all files. Each file keeps its own row
        index, so index labels repeat across subjects. Empty if nothing
        matches ``pattern``.
    """
    frames = []
    # glob returns paths in filesystem order, which can differ between machines
    # and runs; sorting keeps the file -> subject id mapping reproducible.
    for subject_id, path in enumerate(sorted(glob.glob(pattern)), start=1):
        frame = pd.read_csv(path)
        frame['subject'] = np.ones(frame.shape[0]) * subject_id
        frames.append(frame)
    if not frames:
        # pd.concat raises on an empty list; mirror the old "no files" result.
        return pd.DataFrame()
    # One concat at the end instead of re-copying the accumulated frame per file.
    return pd.concat(frames)


entire_dataset = load_food_logs()
entire_dataset

Unnamed: 0,date,time,time_begin,time_end,logged_food,amount,unit,searched_food,calorie,total_carb,dietary_fiber,sugar,protein,total_fat,subject,time_of_day
0,2020-02-13,18:00:00,2020-02-13 18:00:00,,Berry Smoothie,20.0,fluid ounce,Strawberry Smoothie,456.0,85.0,1.7,83.0,16.0,3.3,1.0,
1,2020-02-13,20:30:00,2020-02-13 20:30:00,,Chicken Leg,1.0,,chicken leg,475.0,0.0,0.0,0.0,62.0,23.0,1.0,
2,2020-02-13,20:30:00,2020-02-13 20:30:00,,Asparagus,4.0,,Asparagus,13.0,2.5,1.2,0.8,1.4,0.1,1.0,
3,2020-02-14,07:10:00,2020-02-14 07:10:00,,Natrel Lactose Free 2 Percent,8.0,fluid ounce,(Natrel) Lactose Free 2% Partly Skimmed Milk,120.0,9.0,,8.0,12.0,,1.0,
4,2020-02-14,07:10:00,2020-02-14 07:10:00,,Standard Breakfast,0.75,cup,"(Kellogg's) Frosted Flakes, Cereal",110.0,26.0,,10.0,1.0,,1.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40,2/26/2020,,2020-02-26 18:30:00,,Lemonade,32,oz,,99.0,26.0,0.0,25.0,0.2,0.1,16.0,18:30
41,2/27/2020,,2020-02-27 10:30:00,,Standard breakfast,,,,280.0,56.5,1.0,24.0,8.0,2.5,16.0,10:30
42,2/27/2020,,2020-02-27 11:30:00,,Plain cheese pizza,1,slices,,452.0,57.0,3.9,6.1,19.0,16.0,16.0,11:30
43,2/27/2020,,2020-02-27 11:30:00,,cooked black eyed peas,1,cup,,198.0,35.0,11.0,5.6,13.0,0.9,16.0,11:30


In [44]:
# Per column: is at least one value missing in the combined dataset?
entire_dataset.isna().any(axis=0)

date             False
time              True
time_begin        True
time_end          True
logged_food      False
amount            True
unit              True
searched_food     True
calorie          False
total_carb       False
dietary_fiber     True
sugar            False
protein           True
total_fat         True
subject          False
time_of_day       True
dtype: bool

In [55]:
def fix_time(df):
    """Fill in ``time_begin`` from the ``date`` and ``time`` columns.

    Where ``time_begin`` is missing it is set to ``"<date> <time>"``; if the
    column does not exist at all it is created from those two columns.
    Rows where ``date`` or ``time`` is itself missing stay missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain string columns ``date`` and ``time``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, modified in place.
    """
    date_and_time = df['date'] + ' ' + df['time']
    if 'time_begin' in df.columns:
        # Assign the filled column back instead of calling
        # fillna(inplace=True) on df['time_begin']: that form updates a
        # temporary selection and leaves df untouched under pandas
        # Copy-on-Write.
        df['time_begin'] = df['time_begin'].fillna(date_and_time)
    else:
        df['time_begin'] = date_and_time
    return df
# pipe() hands the whole frame to fix_time exactly once. transform() first
# tries the function on every column and only reaches the frame through its
# error fallback. Rebind the result so later cells do not depend on fix_time
# having mutated the frame as a side effect.
entire_dataset = entire_dataset.pipe(fix_time)
entire_dataset

Unnamed: 0,date,time,time_begin,time_end,logged_food,amount,unit,searched_food,calorie,total_carb,dietary_fiber,sugar,protein,total_fat,subject,time_of_day
0,2020-02-13,18:00:00,2020-02-13 18:00:00,,Berry Smoothie,20.0,fluid ounce,Strawberry Smoothie,456.0,85.0,1.7,83.0,16.0,3.3,1.0,
1,2020-02-13,20:30:00,2020-02-13 20:30:00,,Chicken Leg,1.0,,chicken leg,475.0,0.0,0.0,0.0,62.0,23.0,1.0,
2,2020-02-13,20:30:00,2020-02-13 20:30:00,,Asparagus,4.0,,Asparagus,13.0,2.5,1.2,0.8,1.4,0.1,1.0,
3,2020-02-14,07:10:00,2020-02-14 07:10:00,,Natrel Lactose Free 2 Percent,8.0,fluid ounce,(Natrel) Lactose Free 2% Partly Skimmed Milk,120.0,9.0,,8.0,12.0,,1.0,
4,2020-02-14,07:10:00,2020-02-14 07:10:00,,Standard Breakfast,0.75,cup,"(Kellogg's) Frosted Flakes, Cereal",110.0,26.0,,10.0,1.0,,1.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40,2/26/2020,,2020-02-26 18:30:00,,Lemonade,32,oz,,99.0,26.0,0.0,25.0,0.2,0.1,16.0,18:30
41,2/27/2020,,2020-02-27 10:30:00,,Standard breakfast,,,,280.0,56.5,1.0,24.0,8.0,2.5,16.0,10:30
42,2/27/2020,,2020-02-27 11:30:00,,Plain cheese pizza,1,slices,,452.0,57.0,3.9,6.1,19.0,16.0,16.0,11:30
43,2/27/2020,,2020-02-27 11:30:00,,cooked black eyed peas,1,cup,,198.0,35.0,11.0,5.6,13.0,0.9,16.0,11:30


In [58]:
# Peek at the first time_begin value, split on the space between its parts.
entire_dataset['time_begin'].iat[0].split(sep=' ')

['2020-02-13', '18:00:00']

In [61]:
def convert_date_time(df):
    """Overwrite ``date`` and ``time`` with the two halves of ``time_begin``.

    ``time_begin`` is split at its first space: the text before it becomes
    ``date``, everything after it becomes ``time``. A value without a space
    yields a missing ``time``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a string column ``time_begin``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, modified in place.
    """
    # n=1 caps the result at two pieces; an unbounded split yields three or
    # more columns as soon as any value has a second space, which made the
    # two-column assignment raise.
    parts = df['time_begin'].str.split(' ', n=1, expand=True)
    # Guarantee both columns exist even if no value contained a space
    # (or the frame is empty).
    parts = parts.reindex(columns=[0, 1])
    df['date'] = parts[0]
    df['time'] = parts[1]
    return df

# pipe() calls convert_date_time once with the whole frame. transform() would
# first try it on each column and only reach the frame through its error
# fallback.
entire_dataset = entire_dataset.pipe(convert_date_time)


In [63]:
# Per-column missing-value check, repeated after date/time were rebuilt
# from time_begin.
entire_dataset.isna().any(axis=0)

date             False
time             False
time_begin       False
time_end          True
logged_food      False
amount            True
unit              True
searched_food     True
calorie          False
total_carb       False
dietary_fiber     True
sugar            False
protein           True
total_fat         True
subject          False
time_of_day       True
dtype: bool

In [64]:
# Keep only the columns used from here on: when, what, calories, and who.
data = entire_dataset[
    ['date', 'time', 'logged_food', 'calorie', 'subject']
]
display(data.head(n=5))

Unnamed: 0,date,time,logged_food,calorie,subject
0,2020-02-13,18:00:00,Berry Smoothie,456.0,1.0
1,2020-02-13,20:30:00,Chicken Leg,475.0,1.0
2,2020-02-13,20:30:00,Asparagus,13.0,1.0
3,2020-02-14,07:10:00,Natrel Lactose Free 2 Percent,120.0,1.0
4,2020-02-14,07:10:00,Standard Breakfast,110.0,1.0


In [65]:
# Distinct food names as logged, in order of first appearance.
data.loc[:, 'logged_food'].unique()

array(['Berry Smoothie', 'Chicken Leg', 'Asparagus',
       'Natrel Lactose Free 2 Percent', 'Standard Breakfast',
       'Breakfast Trail Mix', 'Spinach Salad w/ strawberries and cheese',
       'Egg', 'Acai Smoothie', "(Trader Joe's) Mac and Cheese",
       'Coconut Shrimp', 'Spinach Smoothie',
       'Spinach Salad w/ blueberries, egg, and cheese', 'Babel Cheese',
       'Bourbon Chicken', 'Rice', 'Shrimp', 'Cabbage', 'Hot Chocolate',
       'Salty Sweet Popcorn', 'Chai Tea', 'Maple Brown Sugar Oatmeal',
       'Salad with Cranberries', 'Chicken Nuggets', 'Kale Salad', 'Pizza',
       'Oreo Cookies', 'Muffin', 'Grilled Chicken Wrap',
       'Kale and Fruit Smoothie', 'Ranch Wings', 'Lemon Loaf',
       'Turkey Slider', 'Chicken and Rice', 'Green Smoothie', 'Bagel',
       'Salad', 'Babel bell cheese', 'Tangerine Orange', 'Babybel Cheese',
       'Chicken Salad', 'Pita Bread', 'Cheese Pita', 'Boost',
       'Mello Yello', '(Jimmy Dean) Chicken Biscuit', 'Beef Jerky',
       '(Gatorad