<h1> Libraries </h1>

In [1]:
# basic libraries
import time
import numpy as np

In [2]:
from pyspark import SparkContext
# Create the Spark context in local mode ('local[*]' master) with application name 'Recipe'
sc = SparkContext('local[*]', 'Recipe')

In [3]:
# spark sql related
from pyspark.sql import DataFrameWriter, SQLContext, Row
from pyspark.sql.types import *

# SQL entry point built on the Spark context; `sql` is a shorthand for sqlContext.sql
sqlContext = SQLContext(sc)
sql = sqlContext.sql

In [4]:
def add(x, y):
    """Return x + y (passed to reduceByKey as the combining function)."""
    return x + y


def conv2string(s):
    """Encode a unicode value to UTF-8 bytes; return any other value unchanged."""
    if isinstance(s, unicode):
        return s.encode("UTF-8")
    return s

<h1> Get Ingredient Data </h1>

In [5]:
# Load the ingredient CSV as an RDD of field lists using a plain comma split
# (no quote handling; the per-row field count is verified in a later cell)
ingredient_txt = sc.textFile('ingre7.csv').map(lambda t: t.split(','))

In [6]:
# The first record of the ingredient RDD is the CSV header row
ingredient_header = ingredient_txt.first()
print(ingredient_header)

[u'id', u'ing_name', u'prot.ing.g.', u'fat.ing.g.', u'total_cab.ing.g.', u'modified_cal.ing.kcal.']


In [7]:
# Drop the header row so only data records remain (rebinds ingredient_txt)
ingredient_txt = ingredient_txt.filter(lambda t: t != ingredient_header)

In [8]:
# Number of data rows versus number of distinct ingredient names (column 1)
ingredient_txt_num = ingredient_txt.count()
ingredient_cnt = ingredient_txt.map(lambda row: row[1]).distinct().count()
print("total {} data with {} distinct ingredient".format(ingredient_txt_num, ingredient_cnt))

total 7404 data with 4449 distinct ingredient


In [9]:
# Sanity check on the comma split: every ingredient row must have exactly 6 fields
print("Error" if ingredient_txt.filter(lambda row: len(row) != 6).count() != 0 else "Checked")

Checked


In [10]:
def tran2np(t):
    """Convert the four nutrition fields of an ingredient row to a numpy array.

    Fields t[2]..t[5] are parsed as floats rounded to 5 decimals; the literal
    string 'NA' is replaced by 0. A trailing 1 is appended so that summing
    these arrays also accumulates a row count in the last slot.
    """
    values = []
    for idx in (2, 3, 4, 5):
        field = t[idx]
        if field != 'NA':
            values.append(round(float(field), 5))
        else:
            values.append(0)
    values.append(1)
    return np.array(values)

# Average the four nutrition columns per ingredient name: sum the per-row
# vectors (whose last slot counts rows), divide the sums by that count,
# round to 3 decimals and collect the result as a {name: array} dict.
ingredient_cal = (
    ingredient_txt
    .map(lambda row: (row[1], tran2np(row)))
    .reduceByKey(add)
    .mapValues(lambda total: np.round(total[:-1] / total[-1], 3))
    .collectAsMap()
)

In [11]:
# Show three ingredients from the map: protein, fat, carbohydrate, total calories
for name in list(ingredient_cal)[:3]:
    nutrition = ingredient_cal[name]
    print("{:>4}:".format(conv2string(name)))
    print("{:>3.4f}, {:>3.4f}, {:>3.4f}, {:>3.4f}".format(*nutrition[:4]))

皇帝魚:
17.3000, 6.1500, 0.1500, 129.3440
貓竹:
3.9580, 0.1500, 7.3010, 36.0400
支骨湯:
0.6930, 0.6400, 0.0000, 8.7320


<h1> Get Recipe Data </h1>

In [12]:
# Load the recipe CSV: strip every double quote from each line, then split on commas
# (the per-row field count is verified in a later cell)
recipe_txt = sc.textFile('recipe.csv').map(lambda t: t.replace('\"', "")).map(lambda t: t.split(','))

In [13]:
# The first record of the recipe RDD is the CSV header row
recipe_header = recipe_txt.first()
print(recipe_header)

[u'recipe_name', u'filter', u'category', u'cook', u'ing', u'img', u'link', u'note']


In [14]:
# Drop the header row so only recipe records remain (rebinds recipe_txt)
recipe_txt = recipe_txt.filter(lambda t: t != recipe_header)

In [15]:
# Sanity check on the comma split: every recipe row must have exactly 8 fields
print("Error" if recipe_txt.filter(lambda row: len(row) != 8).count() != 0 else "Checked")

Checked


In [16]:
# Print each field of the first recipe row on its own line
testre = recipe_txt.first()
for field in testre:
    print(field)

煎蛋捲
腎臟病
早餐
煎
蛋:2 個=100|水:2 湯匙=30|黃油:2 湯匙=30|肉:0.5 罐=50
https://www.nwkidney.org/wp-content/uploads/2014/07/40-Second-Omelet.jpg
https://www.nwkidney.org/recipe/40-second-omlette/



In [17]:
def _ingredient_grams(quantity):
    """Return the gram weight encoded in a quantity string, or None if there is none.

    Two notations carry a weight:
      * "<amount> <unit>=<grams>"  -> the number after '='
      * "<amount> g"               -> the amount itself
    Quantities containing 少許 or 適量, quantities in any other unit without
    an '=' part, and text that cannot be parsed as a number all yield None.
    """
    if "=" in quantity:
        grams_text = quantity.split('=')[1]
    else:
        quantity_text = conv2string(quantity)
        if '少許' in quantity_text or '適量' in quantity_text:
            return None
        parts = quantity.split(' ')
        # Guard against a quantity with no space-separated unit (e.g. "30").
        if len(parts) < 2 or parts[1] != 'g':
            return None
        grams_text = parts[0]
    try:
        return float(grams_text)
    except ValueError:
        return None


def cal_cal(re):
    """Append the total nutrition of a recipe to its row.

    Args:
        re: recipe row as a list; re[4] holds the ingredients as
            "name:quantity" entries separated by '|'.

    Returns:
        A new list: the original row followed by four floats, the sums of the
        per-100 g vectors in ingredient_cal scaled by each ingredient's weight
        in grams.

    Ingredients that are not in ingredient_cal, have no ':' separator, or have
    no usable gram weight contribute nothing instead of raising.
    """
    totals = np.zeros(4)
    for ing in re[4].split('|'):
        ingl = ing.split(':')
        if len(ingl) < 2:
            # Empty or malformed entry (e.g. an empty ingredient field).
            continue

        # Direct dict lookup instead of scanning ingredient_cal.keys().
        per_100g = ingredient_cal.get(ingl[0])
        if per_100g is None:
            continue

        grams = _ingredient_grams(ingl[1])
        if grams is None:
            continue

        totals += per_100g * grams / 100.0
    return re + totals.tolist()

In [18]:
# Apply cal_cal to every recipe row, producing rows extended with four nutrition totals
sr_txt = recipe_txt.map(cal_cal)

In [19]:
# First recipe after cal_cal: the original fields followed by protein, fat,
# carbohydrate and total calories
for field in sr_txt.first():
    print(field)

煎蛋捲
腎臟病
早餐
煎
蛋:2 個=100|水:2 湯匙=30|黃油:2 湯匙=30|肉:0.5 罐=50
https://www.nwkidney.org/wp-content/uploads/2014/07/40-Second-Omelet.jpg
https://www.nwkidney.org/recipe/40-second-omlette/

23.1267
36.3686
2.3109
425.0703


<h1> Write Total Recipe File </h1>

In [20]:
# Materialise every computed recipe row as a Python list on the driver
srctotal_txt = sr_txt.collect()

In [21]:
# Write each recipe to recipe_cal.csv as one comma-separated line: a 1-based
# row number, the 8 recipe fields, then the 4 nutrition totals (no header row).
with open("recipe_cal.csv", "wb") as opf:
    for row_no, rc in enumerate(srctotal_txt, 1):
        columns = [str(row_no)]
        columns.extend(conv2string(rc[pos]) for pos in range(8))
        columns.extend(rc[pos] for pos in range(8, 12))
        opf.write(','.join('{}'.format(col) for col in columns) + '\n')