# Initial EDA of Nutrition5k Image Data

- Started Date: 9/22/2023
- By: Ben Fulroth

In [61]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import os
import re
import math
import time
import random

from time import sleep
from collections import Counter
from collections import defaultdict
from glob import glob
from tqdm import tqdm
from IPython.display import Image
import shutil
#from sci_nb_functools import nb_functools

### __Data Source:__ Nutrition5k-Realsense-overhead

- Note: Focus on overhead images taken by a robot to simplify
- Overhead images of food are stored in a public GCS bucket [here](https://console.cloud.google.com/storage/browser/nutrition5k_dataset)
- Bucket has a nested structure where the images we want are in a subfolder: `imagery/realsense_overhead`
- Within `imagery/realsense_overhead` are folders with names that match the meal id.
- Within each of these folders are 3 image files: `depth_color.png`, `depth_raw.png`, and `rgb.png`.
- We only want the RGB image, but it has the same file name (`rgb.png`) in every folder, so it has to be renamed.
- Rename each image to its folder name (the dish id) so we can look up its nutrition info.


### Downloading the images from GCS using `gsutil`

- Using `gsutil`, you can target a single directory or file on Google Cloud Storage for download. Here is a link to download the tool: [https://cloud.google.com/storage/docs/gsutil](https://cloud.google.com/storage/docs/gsutil)
- Only the folder containing the realsense_overhead images was downloaded locally, using the following command: `gsutil -m cp -r "gs://nutrition5k_dataset/nutrition5k_dataset/imagery/realsense_overhead" .`

### Pseudocode for Container

- Images are stored in a GCS bucket in a folder called img_original
- Images are downloaded, resized, and sent back to another folder called img_resized
- The entire bucket is version-controlled using DVC

In [63]:


# Source and destination directories
src_path_of_ori_dir = '/Users/fulroth/Downloads/realsense_overhead'
dst_of_save_dir = '/Users/fulroth/Downloads/realsense_curated'

# Prefix of every dish folder name, e.g. dish_1565379827.
DISH_PREFIX = 'dish_'
# Number of directories yielded by os.walk to process before stopping.
# The source root itself is the first one, so 10 means "root + 9 dish folders".
MAX_DIRS_TO_VISIT = 10


def dish_id_from_folder(folder_name):
    """Return ``folder_name`` without its leading ``dish_`` prefix.

    ``str.strip('dish_')`` is not used because it removes any of the
    characters d, i, s, h and _ from BOTH ends of the string instead of
    removing the prefix, which would mangle ids that start or end with one
    of those characters. Names without the prefix are returned unchanged.
    """
    if folder_name.startswith(DISH_PREFIX):
        return folder_name[len(DISH_PREFIX):]
    return folder_name


# shutil.copy does not create missing parent directories, so make sure the
# destination exists before copying anything into it.
os.makedirs(dst_of_save_dir, exist_ok=True)

i = 0

for path, dirc, files in os.walk(src_path_of_ori_dir):

    # The folder name carries the dish id
    folder_name = os.path.basename(path)

    # The first directory yielded by os.walk is the source root, not a dish
    # folder, so it is not printed.
    if i > 0:
        print(folder_name)

    for name in files:

        # Each dish folder also holds depth images; only the rgb one is kept,
        # saved as <dish id>.png so it can be matched to its nutrition info.
        if 'rgb' in name:
            shutil.copy(
                src=os.path.join(path, name),
                dst=os.path.join(dst_of_save_dir, dish_id_from_folder(folder_name) + '.png'),
            )

    i += 1

    if i == MAX_DIRS_TO_VISIT:
        break

dish_1565379827
dish_1563468269
dish_1563389786
dish_1561662458
dish_1559593794
dish_1562787783
dish_1558725353
dish_1574184971
dish_1557936555


### Loop through the curated images and resize

In [64]:
# Input folder for the resize step: the same directory the rgb images were
# copied into by the curation loop.
currated_img_dir = dst_of_save_dir
# Folder named for the resized images; it is only defined here and is not
# referenced again in the cells visible in this section.
resize_img_dir = '/Users/fulroth/Downloads/resized_currated_imgs'

# NOTE(review): this rebinds the name `Image`, replacing the
# `IPython.display.Image` imported in the first cell. From here on `Image`
# refers to the Pillow module.
from PIL import Image

def resize_image(image_path, target_size):
    """Open an image file and return a copy resized to ``target_size``.

    Args:
        image_path: Path of the image file; passed to ``Image.open``.
        target_size: Requested size as a ``(width, height)`` tuple; passed
            to ``Image.resize``.

    Returns:
        The new image produced by ``Image.resize``. The output has exactly
        ``target_size`` pixels, so the aspect ratio is NOT preserved unless
        ``target_size`` has the same ratio as the source image.
    """
    # Image.resize does not modify the image in place; it returns a new one.
    # The result has to be returned, otherwise the caller gets the original.
    # The context manager closes the source file once the copy is made.
    with Image.open(image_path) as image:
        return image.resize(target_size)


In [65]:
# Collect every curated PNG so their dimensions can be inspected.
ls_imgs = glob(os.path.join(currated_img_dir, '*.png'))

for img in ls_imgs:

    # Image.open keeps the underlying file open; the context manager closes
    # it as soon as the size has been read, so handles do not pile up when
    # the folder holds many images.
    with Image.open(img) as image:
        print(image.size)

(640, 480)
(640, 480)
(640, 480)
(640, 480)
(640, 480)
(640, 480)
(640, 480)
(640, 480)
(640, 480)
(640, 480)
(640, 480)
(640, 480)
(640, 480)
(640, 480)
(640, 480)
(640, 480)
(640, 480)
(640, 480)


__It turns out that all images are of the same size!__

### Let's move on to creating a TF Dataset