# 14.4 USDA Food Database

In [1]:
import pandas as pd
import numpy as np
import json

In [2]:
# Limit how many rows pandas prints for a DataFrame/Series to 10.
pd.set_option('display.max_rows', 10)

In [3]:
# Load the USDA food database. The context manager closes the file handle as
# soon as parsing finishes instead of leaving it open for the kernel's lifetime.
with open('usda_food/database.json') as db_file:
    db = json.load(db_file)

# Number of food records in the database.
len(db)

6636

In [4]:
# List the top-level fields of the first food record.
db[0].keys()

dict_keys(['id', 'description', 'tags', 'manufacturer', 'group', 'portions', 'nutrients'])

In [5]:
# Look at the first nutrient entry of the first food record.
db[0]['nutrients'][0]

{'description': 'Protein',
 'group': 'Composition',
 'units': 'g',
 'value': 25.18}

In [6]:
# Turn the nutrient records of the first food entry into a table and show it.
first_food_nutrients = db[0]['nutrients']
nutrients = pd.DataFrame(data=first_food_nutrients)
nutrients

Unnamed: 0,description,group,units,value
0,Protein,Composition,g,25.180
1,Total lipid (fat),Composition,g,29.200
2,"Carbohydrate, by difference",Composition,g,3.060
3,Ash,Other,g,3.280
4,Energy,Energy,kcal,376.000
...,...,...,...,...
157,Serine,Amino Acids,g,1.472
158,Cholesterol,Other,mg,93.000
159,"Fatty acids, total saturated",Other,g,18.584
160,"Fatty acids, total monounsaturated",Other,g,8.275


当把由字典组成的list转换为DataFrame的时候，我们可以指定要提取的字段列表。这里我们提取食品名，群（group），ID，制造商：

In [7]:
# Keep only the descriptive fields of each food record: `columns` restricts
# the resulting table to the listed dictionary keys.
info_keys = ['description', 'group', 'id', 'manufacturer']
info = pd.DataFrame(data=db, columns=info_keys)

# Preview the first five rows.
info.head(5)

Unnamed: 0,description,group,id,manufacturer
0,"Cheese, caraway",Dairy and Egg Products,1008,
1,"Cheese, cheddar",Dairy and Egg Products,1009,
2,"Cheese, edam",Dairy and Egg Products,1018,
3,"Cheese, feta",Dairy and Egg Products,1019,
4,"Cheese, mozzarella, part skim milk",Dairy and Egg Products,1028,


In [8]:
# Summarize the column dtypes and non-null counts of the food metadata table.
info.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6636 entries, 0 to 6635
Data columns (total 4 columns):
description     6636 non-null object
group           6636 non-null object
id              6636 non-null int64
manufacturer    5195 non-null object
dtypes: int64(1), object(3)
memory usage: 207.5+ KB


我们可以看到食物群的分布，使用value_counts:

In [None]:
# Count foods per food group and show the ten most common groups.
# Series.value_counts replaces the top-level pd.value_counts helper, which is
# deprecated in recent pandas releases; counts come back in descending order.
info['group'].value_counts().head(10)

Vegetables and Vegetable Products    812
Beef Products                        618
Baked Products                       496
Breakfast Cereals                    403
Fast Foods                           365
Legumes and Legume Products          365
Lamb, Veal, and Game Products        345
Sweets                               341
Fruits and Fruit Juices              328
Pork Products                        328
Name: group, dtype: int64

这里我们对所有的nutrient数据做一些分析，把每种食物的nutrient部分组合成一个大表格。首先，把每个食物的nutrient列表变为DataFrame，并添加一列保存该食物的id，然后把这个DataFrame添加到一个list中，最后使用concat把它们联结到一起：

In [None]:
# Build one DataFrame per food (its nutrient records plus the food's id),
# collect them in a list, and concatenate once at the end. Growing a DataFrame
# inside the loop re-copies every accumulated row on each iteration, and
# DataFrame.append is no longer available in pandas 2.0+.
nutrient_frames = [
    pd.DataFrame(food['nutrients']).assign(id=food['id'])
    for food in db
]

# pd.concat raises on an empty list, so fall back to an empty frame
# (the result the original accumulation loop produced for an empty database).
nutrients_all = (
    pd.concat(nutrient_frames, ignore_index=True)
    if nutrient_frames
    else pd.DataFrame()
)

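> 译者：如果在循环里每次都调用concat（或append）来扩充同一个DataFrame，每一步都会复制已有的全部数据，所以非常耗时（译者实测，在循环中调用concat的用时几乎是在循环中调用append的两倍）。需要注意的是，DataFrame.append已在pandas 1.4中被弃用，并在pandas 2.0中被移除；更快也更通用的做法是按作者所说，先把每个食物的DataFrame收集到一个list中，最后只调用一次pd.concat。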

一切正常的话出来的效果是这样的：

In [None]:
# Display the combined nutrient table.
nutrients_all

这个DataFrame中有一些重复的部分，看一下有多少重复的行：

In [None]:
# Count the rows that are exact repeats of an earlier row.
nutrients_all.duplicated().sum() # number of duplicates

把重复的部分去掉：

In [None]:
# Keep only the first occurrence of each fully identical row, then display
# the de-duplicated table.
is_repeat = nutrients_all.duplicated()
nutrients_all = nutrients_all[~is_repeat]
nutrients_all

为了与info_keys中的group和description区别开，我们把列名更改一下：

In [None]:
# Old column name -> new column name for the food metadata table.
col_mapping = dict(description='food', group='fgroup')

In [None]:
# Rename the food-level columns so they no longer share names with the
# nutrient table's `description` and `group` columns. `copy=False` is omitted:
# it never affected the result, and newer pandas releases deprecate the keyword.
info = info.rename(columns=col_mapping)
info.info()

In [None]:
# Old column name -> new column name for the nutrient table.
col_mapping = dict(description='nutrient', group='nutgroup')

In [None]:
# Rename the nutrient-level columns using the mapping defined in the previous
# cell, then display the table. `copy=False` is omitted: it never affected the
# result, and newer pandas releases deprecate the keyword.
nutrients_all = nutrients_all.rename(columns=col_mapping)
nutrients_all

上面所有步骤结束后，我们可以把info和nutrients_all合并（merge）：

In [None]:
# Attach the per-food metadata to every nutrient row. The outer join on `id`
# also keeps ids that appear in only one of the two tables.
ndata = nutrients_all.merge(info, how='outer', on='id')
ndata.info()

In [None]:
# Spot-check a single row of the merged table, selected by position.
ndata.iloc[30000]

我们可以对食物群（food group）和营养类型（nutrient type）分组后，对中位数进行绘图：

In [None]:
# Median (0.5 quantile) of `value` for every (nutrient, food group) pair.
value_by_nutrient_and_group = ndata.groupby(['nutrient', 'fgroup'])['value']
result = value_by_nutrient_and_group.quantile(q=0.5)

In [None]:
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
# Horizontal bar chart of the median zinc value per food group, smallest first.
zinc_medians = result['Zinc, Zn'].sort_values()
zinc_medians.plot.barh(figsize=(10, 8))

我们还可以找到每一种营养成分含量最多的食物是什么：

In [None]:
# Group the merged table by nutrient group and nutrient name.
by_nutrient = ndata.groupby(['nutgroup', 'nutrient'])


def get_maximum(x):
    """Return the row of `x` whose `value` entry is the largest."""
    return x.loc[x.value.idxmax()]


def get_minimum(x):
    """Return the row of `x` whose `value` entry is the smallest."""
    return x.loc[x.value.idxmin()]


# For every group keep the value and food name of its top row, truncating the
# food name to 50 characters so the table stays narrow.
max_foods = (
    by_nutrient.apply(get_maximum)[['value', 'food']]
    .assign(food=lambda top_rows: top_rows.food.str[:50])
)

因为得到的DataFrame太大，这里只输出'Amino Acids'(氨基酸)的营养群（nutrient group）:

In [None]:
# Show the top food for each nutrient within the 'Amino Acids' nutrient group.
max_foods['food'].loc['Amino Acids']