In [1]:
import pandas as pd
import psycopg2
from sqlalchemy import create_engine

# Load the main files
food_df = pd.read_csv('food_data/food.csv')
food_nutrient_df = pd.read_csv('food_data/food_nutrient.csv', low_memory = False)
nutrient_df = pd.read_csv('food_data/nutrient.csv')

# Take a look at what we have
print("Food data shape:", food_df.shape)
print("\nFirst few foods:")
print(food_df.head())

Food data shape: (2064912, 5)

First few foods:
    fdc_id          data_type  \
0  1105904       branded_food   
1  1105905       branded_food   
2  1105906       branded_food   
3  1105907       branded_food   
4  1105898  experimental_food   

                                         description       food_category_id  \
0                         WESSON Vegetable Oil 1 GAL            Oils Edible   
1                                 SWANSON BROTH BEEF  Herbs/Spices/Extracts   
2           CAMPBELL'S SLOW KETTLE SOUP CLAM CHOWDER         Prepared Soups   
3        CAMPBELL'S SLOW KETTLE SOUP CHEESE BROCCOLI         Prepared Soups   
4  Discrepancy between the Atwater factor predict...                    NaN   

  publication_date  
0       2020-11-13  
1       2020-11-13  
2       2020-11-13  
3       2020-11-13  
4       2020-10-30  


In [2]:
# Look at the nutrient definitions
print("Nutrients available:")
print(nutrient_df[['id', 'name', 'unit_name']].head(20))

print("\n" + "="*50 + "\n")

# Look at how food_nutrient links them
print("Food-Nutrient relationships:")
print(food_nutrient_df.head())


Nutrients available:
      id                                  name unit_name
0   2047      Energy (Atwater General Factors)      KCAL
1   2048     Energy (Atwater Specific Factors)      KCAL
2   1001                                Solids         G
3   1002                              Nitrogen         G
4   1003                               Protein         G
5   1004                     Total lipid (fat)         G
6   1005           Carbohydrate, by difference         G
7   1006  Fiber, crude (DO NOT USE - Archived)         G
8   1007                                   Ash         G
9   1008                                Energy      KCAL
10  1009                                Starch         G
11  1010                               Sucrose         G
12  1011                               Glucose         G
13  1012                              Fructose         G
14  1013                               Lactose         G
15  1014                               Maltose         G
16  1015  

In [3]:
# Find all the nutrients we need
nutrients_we_want = [
    'Energy',
    'Protein', 
    'Total lipid',
    'Carbohydrate',
    'Fatty acids, total saturated',
    'Fiber',
    'Sugars, total',
    'Sodium',
    'Cholesterol',
    'Vitamin A',
    'Vitamin C',
    'Vitamin D',
    'Calcium',
    'Iron',
    'Potassium'
]

# Search for these nutrients
for nutrient_name in nutrients_we_want:
    matches = nutrient_df[nutrient_df['name'].str.contains(nutrient_name, case=False, na=False)]
    if not matches.empty:
        print(f"\n{nutrient_name}:")
        print(matches[['id', 'name', 'unit_name']])


Energy:
      id                               name unit_name
0   2047   Energy (Atwater General Factors)      KCAL
1   2048  Energy (Atwater Specific Factors)      KCAL
9   1008                             Energy      KCAL
63  1062                             Energy        kJ

Protein:
      id              name unit_name
4   1003           Protein         G
54  1053  Adjusted Protein         G

Total lipid:
     id               name unit_name
5  1004  Total lipid (fat)         G

Carbohydrate:
       id                         name unit_name
6    1005  Carbohydrate, by difference         G
51   1050   Carbohydrate, by summation         G
73   1072          Carbohydrate, other         G
450  2039                Carbohydrates         G

Fatty acids, total saturated:
       id                          name unit_name
259  1258  Fatty acids, total saturated         G

Fiber:
       id                                              name unit_name
7    1006              Fiber, crude (DO NOT

In [4]:
# Define the nutrients we want with their IDs
nutrient_mapping = {
    1008: 'calories_per_100g',
    1003: 'protein_per_100g',
    1005: 'carbs_per_100g',
    1004: 'fat_per_100g',
    1258: 'saturated_fat_per_100g',
    1079: 'fiber_per_100g',
    1063: 'sugar_per_100g',
    1093: 'sodium_per_100g',
    1253: 'cholesterol_per_100g',
    1106: 'vitamin_a_per_100g',
    1162: 'vitamin_c_per_100g',
    1114: 'vitamin_d_per_100g',
    1087: 'calcium_per_100g',
    1089: 'iron_per_100g',
    1092: 'potassium_per_100g'
}

# Filter for only the nutrients we want
filtered_nutrients = food_nutrient_df[food_nutrient_df['nutrient_id'].isin(nutrient_mapping.keys())]

print(f"Total food-nutrient relationships: {len(food_nutrient_df)}")
print(f"Filtered to our nutrients: {len(filtered_nutrients)}")
print(f"\nSample of filtered data:")
print(filtered_nutrients.head())

Total food-nutrient relationships: 26805037
Filtered to our nutrients: 19132407

Sample of filtered data:
         id   fdc_id  nutrient_id  amount  data_points  derivation_id  min  \
2  13706926  1105904         1253     0.0          NaN           75.0  NaN   
3  13706921  1105904         1092     0.0          NaN           75.0  NaN   
4  13706916  1105904         1008   867.0          NaN           71.0  NaN   
5  13706920  1105904         1089     0.0          NaN           75.0  NaN   
6  13706925  1105904         1162     0.0          NaN           75.0  NaN   

   max  median  loq footnote  min_year_acquired  percent_daily_value  
2  NaN     NaN  NaN      NaN                NaN                  0.0  
3  NaN     NaN  NaN      NaN                NaN                  0.0  
4  NaN     NaN  NaN      NaN                NaN                  NaN  
5  NaN     NaN  NaN      NaN                NaN                  0.0  
6  NaN     NaN  NaN      NaN                NaN                  0.0  

In [5]:
# Load the portion data
food_portion_df = pd.read_csv('food_data/food_portion.csv')

print("Food portion data:")
print(food_portion_df.head(20))
print("\nColumns:", food_portion_df.columns.tolist())


Food portion data:
       id  fdc_id  seq_num  amount  measure_unit_id portion_description  \
0   81549  167512      1.0    1.00             9999                 NaN   
1   81550  167513      1.0    1.00             9999                 NaN   
2   81551  167514      1.0    1.00             9999                 NaN   
3   81552  167515      1.0    1.00             9999                 NaN   
4   81553  167516      1.0    1.00             9999                 NaN   
5   81554  167516      2.0    1.00             9999                 NaN   
6   81555  167517      1.0    1.00             9999                 NaN   
7   81556  167517      2.0    1.00             9999                 NaN   
8   81557  167518      1.0    1.00             9999                 NaN   
9   81558  167519      1.0    1.00             9999                 NaN   
10  81559  167520      1.0    1.00             9999                 NaN   
11  81560  167520      2.0    1.00             9999                 NaN   
12  81

In [6]:
# Pivot the data - each food gets one row with all nutrients as columns
pivoted = filtered_nutrients.pivot_table(
    index='fdc_id',
    columns='nutrient_id', 
    values='amount',
    aggfunc='first'  # In case of duplicates, take first value
)

# Rename columns to match our database
pivoted.columns = [nutrient_mapping[col] for col in pivoted.columns]

# Reset index to make fdc_id a column
pivoted = pivoted.reset_index()

print("Pivoted data shape:", pivoted.shape)
print("\nFirst few rows:")
print(pivoted.head())
print("\nColumn names:")
print(pivoted.columns.tolist())


Pivoted data shape: (1900486, 16)

First few rows:
   fdc_id  protein_per_100g  fat_per_100g  carbs_per_100g  calories_per_100g  \
0  167512              5.88         13.24           41.18              307.0   
1  167513              4.34         11.27           53.42              330.0   
2  167514              6.10          3.70           79.80              377.0   
3  167515              8.00          1.80           46.00              232.0   
4  167516              6.58          9.22           41.05              273.0   

   sugar_per_100g  fiber_per_100g  calcium_per_100g  iron_per_100g  \
0             NaN             1.2               NaN           2.12   
1             NaN             1.4              28.0           1.93   
2             NaN             NaN               NaN            NaN   
3             NaN             NaN             180.0           1.40   
4             NaN             2.2             279.0           6.04   

   potassium_per_100g  sodium_per_100g  vitamin

In [7]:
# Step 1: Get food names
foods = food_df[['fdc_id', 'description']].copy()

# Step 2: Join with pivoted nutrients
foods_with_nutrients = foods.merge(pivoted, on='fdc_id', how='left')

# Step 3: Get serving sizes (take the first/most common serving per food)
servings = food_portion_df.groupby('fdc_id').first()[['gram_weight', 'modifier']].reset_index()
servings.columns = ['fdc_id', 'serving_size', 'serving_unit']

# Step 4: Join servings
final_foods = foods_with_nutrients.merge(servings, on='fdc_id', how='left')

print("Final dataset shape:", final_foods.shape)
print("\nFirst few rows:")
print(final_foods.head())
print("\nColumns:", final_foods.columns.tolist())

Final dataset shape: (2064912, 19)

First few rows:
    fdc_id                                        description  \
0  1105904                         WESSON Vegetable Oil 1 GAL   
1  1105905                                 SWANSON BROTH BEEF   
2  1105906           CAMPBELL'S SLOW KETTLE SOUP CLAM CHOWDER   
3  1105907        CAMPBELL'S SLOW KETTLE SOUP CHEESE BROCCOLI   
4  1105898  Discrepancy between the Atwater factor predict...   

   protein_per_100g  fat_per_100g  carbs_per_100g  calories_per_100g  \
0              0.00         93.33            0.00              867.0   
1              0.83          0.00            0.42                4.0   
2              2.45          5.31            6.12               82.0   
3              1.22          6.12            5.31               82.0   
4               NaN           NaN             NaN                NaN   

   sugar_per_100g  fiber_per_100g  calcium_per_100g  iron_per_100g  \
0             NaN             0.0               0.0   

In [8]:
# See all available data types
print("Available data types:")
print(food_df['data_type'].value_counts())
print("\n" + "="*50)

# Let's look at examples from each type
for data_type in food_df['data_type'].unique():
    print(f"\n{data_type} examples:")
    examples = food_df[food_df['data_type'] == data_type]['description'].head(5)
    for ex in examples:
        print(f"  - {ex}")
        

Available data types:
data_type
branded_food                1977398
sub_sample_food               62022
sr_legacy_food                 7793
market_acquistion              7215
survey_fndds_food              5432
sample_food                    3717
agricultural_acquisition        810
foundation_food                 411
experimental_food               114
Name: count, dtype: int64


branded_food examples:
  - WESSON Vegetable Oil 1 GAL
  - SWANSON BROTH BEEF
  - CAMPBELL'S SLOW KETTLE SOUP CLAM CHOWDER
  - CAMPBELL'S SLOW KETTLE SOUP CHEESE BROCCOLI
  - SWANSON BROTH CHICKEN

experimental_food examples:
  - Discrepancy between the Atwater factor predicted and empirically measured energy values of almonds in human diets
  - Food processing and structure impact the metabolizable energy of almonds
  - Metabolizable Energy from Cashew Nuts is Less than that Predicted by Atwater Factors
  - Measured energy value of pistachios in the human diet
  - Walnuts Consumed by Healthy Adults Provide Le

In [9]:
# Filter to useful food types
useful_types = ['sr_legacy_food', 'survey_fndds_food', 'foundation_food']
filtered_food_df = food_df[food_df['data_type'].isin(useful_types)]

print(f"Total foods: {len(filtered_food_df)}")
print("\nBreakdown:")
print(filtered_food_df['data_type'].value_counts())

# Rebuild the final dataset
foods = filtered_food_df[['fdc_id', 'description']].copy()
foods_with_nutrients = foods.merge(pivoted, on='fdc_id', how='left')
servings = food_portion_df[food_portion_df['fdc_id'].isin(foods['fdc_id'])]
servings = servings.groupby('fdc_id').first()[['gram_weight', 'modifier']].reset_index()
servings.columns = ['fdc_id', 'serving_size', 'serving_unit']
final_foods = foods_with_nutrients.merge(servings, on='fdc_id', how='left')

print(f"\nFinal dataset shape: {final_foods.shape}")
print("\nSample foods:")
print(final_foods[['description', 'calories_per_100g', 'protein_per_100g', 'serving_size', 'serving_unit']].head(20))

Total foods: 13636

Breakdown:
data_type
sr_legacy_food       7793
survey_fndds_food    5432
foundation_food       411
Name: count, dtype: int64

Final dataset shape: (13636, 19)

Sample foods:
                                          description  calories_per_100g  \
0   Pillsbury Golden Layer Buttermilk Biscuits, Ar...              307.0   
1   Pillsbury, Cinnamon Rolls with Icing, refriger...              330.0   
2   Kraft Foods, Shake N Bake Original Recipe, Coa...              377.0   
3      George Weston Bakeries, Thomas English Muffins              232.0   
4          Waffles, buttermilk, frozen, ready-to-heat              273.0   
5   Waffle, buttermilk, frozen, ready-to-heat, toa...              309.0   
6   Waffle, buttermilk, frozen, ready-to-heat, mic...              289.0   
7     Waffle, plain, frozen, ready-to-heat, microwave              298.0   
8   Pie Crust, Cookie-type, Graham Cracker, Ready ...              501.0   
9      Pie Crust, Cookie-type, Chocolate, Read

In [10]:
# Save the processed data to CSV
output_path = 'food_data/processed_foods.csv'
final_foods.to_csv(output_path, index=False)

print(f"Saved {len(final_foods)} foods to {output_path}")
print("\nReady to import into PostgreSQL!")
print("\nNext steps:")
print("1. Open pgAdmin")
print("2. Right-click on 'foods' table → Import/Export Data")
print("3. Select the processed_foods.csv file")
print("4. Make sure 'Header' is turned ON")
print("5. Click OK to import")

Saved 13636 foods to food_data/processed_foods.csv

Ready to import into PostgreSQL!

Next steps:
1. Open pgAdmin
2. Right-click on 'foods' table → Import/Export Data
3. Select the processed_foods.csv file
4. Make sure 'Header' is turned ON
5. Click OK to import


In [11]:
import os

# Check current working directory
print("Current directory:", os.getcwd())

# Save with absolute path to make sure it goes to the right place
output_path = os.path.join(os.getcwd(), 'food_data', 'processed_foods.csv')
print(f"\nSaving to: {output_path}")

final_foods.to_csv(output_path, index=False)

print(f"\nSaved! File should be at: {output_path}")

# Verify it exists
if os.path.exists(output_path):
    print("✓ File confirmed to exist!")
    file_size = os.path.getsize(output_path) / (1024 * 1024)  # Size in MB
    print(f"File size: {file_size:.2f} MB")
else:
    print("✗ File not found at that location")

Current directory: /Users/graysonorr/Desktop/JS:CSS:HTML/fitnesstracker/FitnessApp

Saving to: /Users/graysonorr/Desktop/JS:CSS:HTML/fitnesstracker/FitnessApp/food_data/processed_foods.csv

Saved! File should be at: /Users/graysonorr/Desktop/JS:CSS:HTML/fitnesstracker/FitnessApp/food_data/processed_foods.csv
✓ File confirmed to exist!
File size: 1.79 MB


In [12]:
# Check for any data issues
print("Checking data quality...")
print(f"\nTotal rows: {len(final_foods)}")
print(f"Columns: {len(final_foods.columns)}")
print("\nNull counts per column:")
print(final_foods.isnull().sum())

print("\n" + "="*50)
print("Sample of first few rows:")
print(final_foods.head())

# Check for long strings in serving_unit
print("\n" + "="*50)
print("Longest serving_unit values:")
print(final_foods['serving_unit'].str.len().nlargest(5))
max_serving = final_foods.loc[final_foods['serving_unit'].str.len().idxmax()]
print(f"\nLongest serving_unit: '{max_serving['serving_unit']}'")

Checking data quality...

Total rows: 13636
Columns: 19

Null counts per column:
fdc_id                        0
description                   0
protein_per_100g             13
fat_per_100g                 24
carbs_per_100g               60
calories_per_100g           277
sugar_per_100g            13476
fiber_per_100g              767
calcium_per_100g             99
iron_per_100g                94
potassium_per_100g          291
sodium_per_100g             134
vitamin_a_per_100g         1209
vitamin_d_per_100g         2958
vitamin_c_per_100g          752
cholesterol_per_100g        704
saturated_fat_per_100g      622
serving_size                592
serving_unit                655
dtype: int64

Sample of first few rows:
   fdc_id                                        description  \
0  167512  Pillsbury Golden Layer Buttermilk Biscuits, Ar...   
1  167513  Pillsbury, Cinnamon Rolls with Icing, refriger...   
2  167514  Kraft Foods, Shake N Bake Original Recipe, Coa...   
3  167515     G

In [13]:
# Check how many foods have sugar data
print("Foods with sugar data:")
print(final_foods['sugar_per_100g'].notna().sum())
print(f"out of {len(final_foods)} total foods")

print("\n" + "="*50)

# Check the original nutrient data
print("Sugar nutrient in food_nutrient_df:")
sugar_data = food_nutrient_df[food_nutrient_df['nutrient_id'] == 1063]
print(f"Total sugar entries: {len(sugar_data)}")

print("\n" + "="*50)

# Sample some foods and check their nutrients
sample_food_id = final_foods['fdc_id'].iloc[0]
print(f"\nNutrients for food {sample_food_id}:")
print(food_nutrient_df[food_nutrient_df['fdc_id'] == sample_food_id][['nutrient_id', 'amount']])

Foods with sugar data:
160
out of 13636 total foods

Sugar nutrient in food_nutrient_df:
Total sugar entries: 858


Nutrients for food 167512:
        nutrient_id    amount
446148         1051    35.500
446149         1005    41.180
446150         1093  1059.000
446151         1257     4.412
446152         1003     5.880
446153         1079     1.200
446154         2000     5.880
446155         1089     2.120
446156         1062  1286.000
446157         1004    13.240
446158         1258     2.941
446159         1253     0.000
446160         1007     3.500
446161         1008   307.000


In [28]:
# Check if the sugar data exists for our filtered foods
our_food_ids = final_foods['fdc_id'].tolist()
sugar_for_our_foods = food_nutrient_df[
    (food_nutrient_df['fdc_id'].isin(our_food_ids)) & 
    (food_nutrient_df['nutrient_id'] == 1063)
]

print(f"Sugar entries for our {len(final_foods)} foods: {len(sugar_for_our_foods)}")
print("\n" + "="*50)

# Let's look at which foods have sugar
foods_with_sugar = final_foods[final_foods['sugar_per_100g'].notna()]
print("\nSample foods WITH sugar data:")
print(foods_with_sugar[['description', 'sugar_per_100g']].head(10))

print("\n" + "="*50)

# And which don't
foods_without_sugar = final_foods[final_foods['sugar_per_100g'].isna()]
print("\nSample foods WITHOUT sugar data:")
print(foods_without_sugar[['description', 'calories_per_100g']].head(10))

Sugar entries for our 13636 foods: 160


Sample foods WITH sugar data:
                                            description  sugar_per_100g
7793                                 Hummus, commercial            0.34
7794  Milk, reduced fat, fluid, 2% milkfat, with add...            4.89
7797  Beans, snap, green, canned, regular pack, drai...            1.29
7798                                      Broccoli, raw            1.40
7799  Milk, lowfat, fluid, 1% milkfat, with added vi...            4.96
7800  Milk, nonfat, fluid, with added vitamin A and ...            5.05
7801   Milk, whole, 3.25% milkfat, with added vitamin D            4.81
7802                        Frankfurter, beef, unheated            1.26
7803        Nuts, almonds, dry roasted, with salt added            4.17
7805                                          Kale, raw            0.80


Sample foods WITHOUT sugar data:
                                         description  calories_per_100g
0  Pillsbury Golden Layer Butt