# Imports y definiciones

In [2]:
import os, re, json
import numpy as np
import pandas as pd
from ast import literal_eval
from collections import defaultdict

In [3]:
# Folder that holds the downloaded FooDB dumps
databases = 'C:/Users/Chris/Desktop/Databases'
display(os.listdir(databases))

# Pick the extracted JSON dump explicitly. os.listdir() returns entries in
# arbitrary order, so indexing its result with [0] could just as well select
# the .tar.gz archive; filter for directories whose name mentions "json" and
# sort them so the choice is deterministic.
json_dumps = sorted(
    entry for entry in os.listdir(databases)
    if 'json' in entry.lower() and os.path.isdir(f'{databases}/{entry}')
)
if not json_dumps:
    raise FileNotFoundError(f'No JSON database folder found in {databases}')

db = f'{databases}/{json_dumps[0]}'
db

['foodb_2020_04_07_json', 'foodb_2020_4_7_csv.tar.gz']

'C:/Users/Chris/Desktop/Databases/foodb_2020_04_07_json'

# Definición del diccionario de componentes a buscar (modificado en comparación con el script anterior)

In [4]:
# Dictionary and lists describing the compounds of interest
compound_dict = dict()
compound_nums = list()
compound_list_publicID = list()

# Load the compounds to use from the ';'-separated CSV file
with open('interest_drink_comps_codes.csv', 'r', encoding='utf8') as file:
    for line in file:
        # Split the row into its fields
        data = line.strip().split(';')
        # A blank line yields a single empty field; skip it instead of
        # failing on data[2] below
        if data == ['']:
            continue
        # First field is the key, the remaining fields are stored as-is
        compound_dict[data[0]] = data[1:]
        # Keep the (key, public ID) pair for the ID resolution below
        compound_list_publicID.append((data[0], data[2]))


def _resolve_numeric_ids(entries, json_path, key_field, id_field, source_type):
    """Resolve pending public IDs against one line-delimited JSON file.

    Parameters
    ----------
    entries : list of tuple
        Either unresolved ``(name, public_id)`` pairs or already resolved
        ``(name, numeric_id, source_type)`` triples.
    json_path : str
        File with one JSON object per line.
    key_field : str
        Field of each JSON object compared against the pending public IDs.
    id_field : str
        Field of each JSON object holding the numeric ID to store.
    source_type : str
        Label written as third element of every entry resolved here.

    Returns
    -------
    list of tuple
        Copy of ``entries`` in the same order, where every pending pair found
        in the file was replaced by ``(name, record[id_field], source_type)``.
        The first matching record wins; resolved triples are never touched.
    """
    resolved = list(entries)

    # Map each still-unresolved public ID to the positions that carry it, so
    # every JSON line costs one dictionary lookup instead of rebuilding and
    # scanning the whole list
    pending = dict()
    for position, entry in enumerate(resolved):
        if len(entry) == 2:
            pending.setdefault(entry[1], []).append(position)

    with open(json_path, 'r', encoding='utf8') as file:
        for line in file:
            # Nothing left to resolve: the rest of the file cannot change anything
            if not pending:
                break

            record = json.loads(line.strip())
            key = record[key_field]
            # Public IDs read from the CSV are always strings
            if not isinstance(key, str) or key not in pending:
                continue

            for position in pending.pop(key):
                resolved[position] = (resolved[position][0], record[id_field], source_type)

    return resolved


# Start from the public IDs and replace them by numeric IDs, in order:
#   1. compounds matched by their public ID,
#   2. compounds not matched above, looked up through their accession number,
#   3. nutrients matched by their public ID.
compound_list_ID = list(compound_list_publicID)

for json_name, key_field, id_field, source_type in (
        ('Compound.json', 'public_id', 'id', 'Compound'),
        ('AccessionNumber.json', 'number', 'compound_id', 'Compound'),
        ('Nutrient.json', 'public_id', 'id', 'Nutrient')):
    compound_list_ID = _resolve_numeric_ids(compound_list_ID, f'{db}/{json_name}',
                                            key_field, id_field, source_type)

# Entries that are still 2-tuples were not found in any file; later cells
# unpack three values per entry, so report them here where the cause is clear
unresolved = [entry for entry in compound_list_ID if len(entry) != 3]
if unresolved:
    print(f'WARNING: public IDs not found in the database: {unresolved}')

In [5]:
# Inspect the CSV contents: key -> remaining fields of the row
compound_dict

{'Protein': ['Protein', 'FDBN00002', 'Nutrient'],
 'Water': ['Water', 'FDB013390', 'Compound'],
 'Caffeine': ['Caffeine', 'FDB002100', 'Compound'],
 'Sugars': ['Sugars', 'FDB003715', 'Compound'],
 'Calcium, Ca': ['Calcium', 'FDB003513', 'Compound'],
 'Iron, Fe': ['Iron', 'FDB016251', 'Compound'],
 'Magnesium, Mg': ['Magnesium', 'FDB003518', 'Compound'],
 'Phosphorus, P': ['Phosphorus', 'FDB003520', 'Compound'],
 'Potassium, K': ['Potassium', 'FDB003521', 'Compound'],
 'Sodium, Na': ['Sodium', 'FDB003523', 'Compound'],
 'Zinc, Zn': ['Zinc', 'FDB003729', 'Compound'],
 'Copper, Cu': ['Copper', 'FDB003582', 'Compound'],
 'Selenium, Se': ['Selenium', 'FDB013400', 'Compound'],
 'Vitamin C': ['Ascorbic acid', 'FDB001223', 'Compound'],
 'Thiamin': ['Thiamine', 'FDB008424', 'Compound'],
 'Riboflavin': ['Riboflavine', 'FDB012160', 'Compound'],
 'Niacin': ['Nicotinic acid', 'FDB001014', 'Compound'],
 'Vitamin B6': ['Pyridoxine', 'FDB000574', 'Compound'],
 'Choline': ['Choline', 'FDB000710', 'Comp

In [6]:
# Inspect the (key, public ID) pairs collected from the CSV
compound_list_publicID

[('Protein', 'FDBN00002'),
 ('Water', 'FDB013390'),
 ('Caffeine', 'FDB002100'),
 ('Sugars', 'FDB003715'),
 ('Calcium, Ca', 'FDB003513'),
 ('Iron, Fe', 'FDB016251'),
 ('Magnesium, Mg', 'FDB003518'),
 ('Phosphorus, P', 'FDB003520'),
 ('Potassium, K', 'FDB003521'),
 ('Sodium, Na', 'FDB003523'),
 ('Zinc, Zn', 'FDB003729'),
 ('Copper, Cu', 'FDB003582'),
 ('Selenium, Se', 'FDB013400'),
 ('Vitamin C', 'FDB001223'),
 ('Thiamin', 'FDB008424'),
 ('Riboflavin', 'FDB012160'),
 ('Niacin', 'FDB001014'),
 ('Vitamin B6', 'FDB000574'),
 ('Choline', 'FDB000710'),
 ('Vitamin K', 'FDB023087'),
 ('Folic acid', 'FDB014504'),
 ('Vitamin B12, added', 'FDB022886')]

In [7]:
# Inspect the entries after resolution: (key, numeric ID, source type)
compound_list_ID

[('Protein', 2, 'Nutrient'),
 ('Water', 31063, 'Compound'),
 ('Caffeine', 2100, 'Compound'),
 ('Sugars', 3716, 'Compound'),
 ('Calcium, Ca', 3514, 'Compound'),
 ('Iron, Fe', 16258, 'Compound'),
 ('Magnesium, Mg', 31167, 'Compound'),
 ('Phosphorus, P', 3521, 'Compound'),
 ('Potassium, K', 3522, 'Compound'),
 ('Sodium, Na', 3524, 'Compound'),
 ('Zinc, Zn', 3730, 'Compound'),
 ('Copper, Cu', 3583, 'Compound'),
 ('Selenium, Se', 13403, 'Compound'),
 ('Vitamin C', 1223, 'Compound'),
 ('Thiamin', 8425, 'Compound'),
 ('Riboflavin', 12163, 'Compound'),
 ('Niacin', 1014, 'Compound'),
 ('Vitamin B6', 574, 'Compound'),
 ('Choline', 710, 'Compound'),
 ('Vitamin K', 23250, 'Compound'),
 ('Folic acid', 14507, 'Compound'),
 ('Vitamin B12, added', 23049, 'Compound')]

# Creación de las tablas en bruto

In [8]:
# Read the first line of the summary file, which holds the textual
# representation of a defaultdict
with open('summary/food_compounds.txt', 'r', encoding='utf8') as file:
    food_dict = file.readline().strip()

# The text starts with "defaultdict(<class 'X'>", which is not valid Python.
# Rewrite "<class 'X'>" to the bare name X so the string can be evaluated.
# The pattern is a raw string: "\(" and "\w" are invalid escape sequences in a
# normal string literal and trigger a warning on recent Python versions.
pattern = re.compile(r"^defaultdict\(<class '(\w+)'>")
header_match = pattern.match(food_dict)
if header_match is None:
    raise ValueError("summary/food_compounds.txt does not start with a defaultdict representation")

class_to = header_match.group(1)
food_dict = food_dict.replace(f"<class '{class_to}'>", class_to)

# SECURITY NOTE(review): eval() executes whatever the file contains. This is
# only acceptable while the file is produced by our own pipeline; never point
# this cell at a file from an untrusted source.
food_dict = eval(food_dict)

In [9]:
def _content_records(datalines, source_id, source_type):
    """Collect the content tuples of one compound within one food.

    Keeps the rows whose source ID and source type match and whose original
    unit is 'mg/100 g'. Each kept row becomes
    (content as float or None, orig_min, orig_max, orig_unit).
    """
    return [
        (
            float(row['orig_content']) if row['orig_content'] is not None else None,
            row['orig_min'],
            row['orig_max'],
            row['orig_unit'],
        )
        for row in datalines
        if row['source_id'] == source_id
        and row['source_type'] == source_type
        and row['orig_unit'] == 'mg/100 g'  # unit filter
    ]


# Table per food: food ID -> {compound name -> list of content tuples}
table_data = {
    food_id: {
        comp_name: _content_records(food_dict[food_id], source_id, source_type)
        for comp_name, source_id, source_type in compound_list_ID
    }
    for food_id in food_dict.keys()
}

In [10]:
# Persist the textual form of the raw table on a single line (no trailing newline)
with open('summary/raw_food_compounds_dict.txt', 'w', encoding='utf8') as file:
    print(table_data, end='', file=file)

# Comprobación de sanidad: verificar que todos los valores estén en mg/100 g

In [11]:
# Gather every distinct unit (fourth element of each content tuple) over all
# foods and compounds
units_total = {
    record[3]
    for compounds in table_data.values()
    for records in compounds.values()
    for record in records
}

units_total

{'mg/100 g'}

# Creando un diccionario de traducción food_id $\rightarrow$ food_name

In [12]:
# Reload the raw table that was written to disk
with open('summary/raw_food_compounds_dict.txt', 'r', encoding='utf8') as file:
    first_line = file.readline()
    table_data = literal_eval(first_line.strip())

# Translation dictionary food ID -> food name, restricted to the foods that
# appear in the raw table
with open(f'{db}/Food.json', 'r', encoding='utf8') as file:
    # One JSON object per line
    foods = (json.loads(raw_line.strip()) for raw_line in file)
    food_translate_dict = {
        food['id']: food['name']
        for food in foods
        if food['id'] in table_data.keys()
    }

In [13]:
# Inspect the food ID -> food name translation
food_translate_dict

{40: 'Pepper',
 58: 'Coffee',
 68: 'Turmeric',
 109: 'Mentha',
 159: 'Rosemary',
 165: 'Common sage',
 168: 'Winter savory',
 183: 'Common thyme',
 206: 'Ginger',
 219: 'Ginseng',
 246: 'Triticale',
 341: 'Coconut',
 372: 'Ginkgo nuts',
 384: 'Horseradish tree',
 562: 'Shiitake',
 569: 'Maitake',
 586: 'Cinnamon',
 666: 'Salt',
 748: 'Hibiscus tea',
 905: 'Root vegetables',
 947: 'Guarana'}

# Creación de las tablas sintetizadas brutas

In [14]:
# Reload the raw table from disk so this section can run from the stored file
with open('summary/raw_food_compounds_dict.txt', 'r', encoding='utf8') as file:
    first_line = file.readline()
    table_data = literal_eval(first_line.strip())

In [15]:
# Divisors applied to every content value before averaging. The values are in
# mg/100 g, so this presumably converts them to grams per gram
# (mg -> g, then per 100 g -> per g) -- TODO confirm the intended target unit.
mg_per_g = 1000
grams_in_reference = 100

# Make sure the output folder exists so the cell also works on a fresh checkout
raw_tables_dir = 'summary/foodTables_raw'
os.makedirs(raw_tables_dir, exist_ok=True)

# One "<food id>_<food name>.txt" file per food with "compound;mean value" rows
for food_id in table_data.keys():
    with open(f'{raw_tables_dir}/{food_id}_{food_translate_dict[food_id]}.txt',
              'w', encoding='utf8') as file:
        for comp_name in table_data[food_id]:
            records = table_data[food_id][comp_name]

            # Compounds without any measurement are left out of the file
            if records != []:
                # Only the content (first element of each tuple) is averaged.
                # Building the float array directly from that column avoids the
                # mixed float/None/str 2-D array and a second construction.
                # NOTE(review): a None content becomes NaN here, which makes the
                # mean of that compound NaN -- confirm this is intended.
                contents = np.array([record[0] for record in records], dtype=float)
                print(food_id, comp_name, contents)
                comp_value = np.mean(contents / mg_per_g / grams_in_reference)
                file.write(f'{comp_name};{comp_value}\n')

341 Protein [6400. 1200.    0.    0. 4400.  800.    0.    0.    0.    0.    0.    0.
 3330. 6880. 3130. 3350. 5300. 3630. 1170. 2290. 2020.  720. 1610. 5300.
 2880. 5450.    0.    0. 6400. 4400.]
341 Caffeine [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
341 Sugars [ 6400.     0.  3700.     0.     0.     0.     0.     0.  6230.  7350.
 36750. 51500.  3340.  2610. 43170.]
341 Calcium, Ca [10.8   1.   10.8   0.    0.    0.    2.    1.   14.   26.   11.   14.
 27.   11.    4.   16.   18.   24.    4.   26.   15.   27.35  0.    1.
 10.8  10.8  14.  ]
341 Iron, Fe [3.6  0.06 1.8  0.04 0.04 0.04 0.15 0.15 2.43 3.32 1.51 1.84 3.39 2.28
 0.13 1.64 3.3  0.29 0.81 3.36 1.92 2.8  0.04 0.06 3.6  1.8  2.43]
341 Magnesium, Mg [32.]
341 Phosphorus, P [187.    0.  117.    0.    0.    0.    0.    0.  113.  206.  100.  103.
 211.  122.   22.  100.   96.   20.   59.  209.  107.  161.5 113. ]
341 Potassium, K [750.     0.   331.     0.     0.     0.     2.     2.   356.   543.
 361.   324.   554.   325.   101.   2

In [16]:
# Inspect the raw records of one food (ID 586, listed as 'Cinnamon' in food_translate_dict)
display(table_data[586])

{'Protein': [(4000.0, None, None, 'mg/100 g'),
  (600.0, None, None, 'mg/100 g'),
  (4000.0, None, None, 'mg/100 g')],
 'Water': [],
 'Caffeine': [(0.0, None, None, 'mg/100 g')],
 'Sugars': [(0.0, None, None, 'mg/100 g')],
 'Calcium, Ca': [(1002.0, None, None, 'mg/100 g'),
  (1002.0, None, None, 'mg/100 g'),
  (181.0, '903.0', '1265.0', 'mg/100 g')],
 'Iron, Fe': [(8.32, None, None, 'mg/100 g'),
  (8.32, None, None, 'mg/100 g'),
  (8.595, '1.15', '18.34', 'mg/100 g')],
 'Magnesium, Mg': [(20.0, '47.0', '87.0', 'mg/100 g')],
 'Phosphorus, P': [(64.0, None, None, 'mg/100 g'),
  (17.0, '49.0', '83.0', 'mg/100 g')],
 'Potassium, K': [(431.0, None, None, 'mg/100 g'),
  (431.0, None, None, 'mg/100 g'),
  (104.0, '304.0', '512.0', 'mg/100 g')],
 'Sodium, Na': [(10.0, None, None, 'mg/100 g'),
  (10.0, None, None, 'mg/100 g'),
  (8.0, '3.0', '19.0', 'mg/100 g')],
 'Zinc, Zn': [(1.83, None, None, 'mg/100 g'),
  (1.83, None, None, 'mg/100 g'),
  (0.71, '1.34', '2.76', 'mg/100 g')],
 'Copper, Cu':

# Tablas a sacar

Estas tablas, en general, no poseen gran parte de la información relevante que se necesita, por lo cual no se considerarán en el MVP0. 

In [17]:
# Food IDs excluded from the final tables. According to the food_translate_dict
# listing these are 109 (Mentha), 168 (Winter savory), 219 (Ginseng),
# 905 (Root vegetables) and 947 (Guarana).
dont_consider = (109, 168, 219, 905, 947)

# Filtrando las características que no aparezcan en todas las tablas

In [18]:
# Start from every compound of interest and keep only those present in all
# the raw tables that are not excluded
compounds_mvp0 = [entry[0] for entry in compound_list_ID]

for filename in os.listdir('summary/foodTables_raw'):
    # The food ID is the part of the file name before the first underscore
    index_to_rev = int(filename.split('_')[0])

    # Excluded foods do not take part in the intersection
    if index_to_rev in dont_consider:
        continue

    with open(f'summary/foodTables_raw/{filename}', 'r', encoding='utf8') as file:
        # Compound names listed in this food's table (text before the first ';')
        aux_compounds = [line.split(';')[0] for line in file]

    # Drop the compounds that this food does not have
    compounds_mvp0 = [name for name in compounds_mvp0 if name in aux_compounds]

print(compounds_mvp0)

['Protein', 'Calcium, Ca', 'Iron, Fe', 'Phosphorus, P', 'Potassium, K', 'Sodium, Na', 'Zinc, Zn', 'Copper, Cu', 'Thiamin', 'Riboflavin', 'Niacin', 'Vitamin B6', 'Folic acid']


# Creación de tablas finales (filtradas por alimento sin información y componentes que no aparecen en todas las tablas)

In [19]:
# Make sure the output folder exists so the cell also works on a fresh checkout
final_tables_dir = 'summary/foodTables'
os.makedirs(final_tables_dir, exist_ok=True)

for filename in os.listdir('summary/foodTables_raw'):
    # The food ID is the part of the file name before the first underscore
    index_to_rev = int(filename.split('_')[0])

    # Excluded foods get no final table
    if index_to_rev in dont_consider:
        continue

    with open(f'summary/foodTables_raw/{filename}', 'r', encoding='utf8') as file:
        # Keep only the rows whose compound (text before the first ';') is
        # among the compounds shared by all the considered foods
        info_to_append = [line for line in file
                          if line.split(';')[0] in compounds_mvp0]

    # Write the final table under the same file name
    with open(f'{final_tables_dir}/{filename}', 'w', encoding='utf8') as file:
        file.writelines(info_to_append)