# Section 1 (Regression)

In [4]:
import gzip
from collections import defaultdict
import math
import scipy.optimize
import numpy
import string
import random
from sklearn import linear_model

In [5]:
def parse(f):
    for l in gzip.open(f):
        yield eval(l)

In [6]:
# Download data from below:
# https://cseweb.ucsd.edu/classes/fa21/cse258-b/files/
dataset = list(parse("trainRecipes.json.gz"))

In [7]:
len(dataset)

200000

In [8]:
train = dataset[:150000]
valid = dataset[150000:175000]
test = dataset[175000:]

In [9]:
dataset[1]

{'name': 'double delicious cookie bars',
 'minutes': 40,
 'contributor_id': '26865936',
 'submitted': '2007-08-27',
 'steps': 'preheat oven to 350f\tin 13x9-inch baking pan , melt butter in oven\tsprinkle crumbs evenly over butter\tpour milk evenly over crumbs\ttop with remaining ingredients\tpress down firmly\tbake 25-30 minutes or until lightly browned\tcool completely , chill if desired , and cut into bars',
 'description': 'from "all time favorite recipes". for fun, try substituting butterscotch or white chocolate chips for the semi-sweet and/or peanut butter chips. make sure you cool it completely or the bottom will crumble!',
 'ingredients': ['butter',
  'graham cracker crumbs',
  'sweetened condensed milk',
  'semi-sweet chocolate chips',
  'peanut butter chips'],
 'recipe_id': '98015212'}

In [29]:
import dateutil.parser
year_list = [(dateutil.parser.parse(d['submitted']).year,dateutil.parser.parse(d['submitted']).month) for d in dataset]
print(min(year_list)) ## 1999 Aug
print(max(year_list)) ## 2018 Dec 

(1999, 8)
(2018, 12)


In [38]:
from collections import Counter
ingredients_list = []
for d in dataset :
    ingredients_list += d['ingredients']
ingredients_count = Counter(ingredients_list)
#print(ingredients_count)
#print(ingredients_count.most_common(50))
top_50_ingredients = [i[0] for i in ingredients_count.most_common(50)]
print(top_50_ingredients)

['salt', 'butter', 'sugar', 'onion', 'water', 'eggs', 'olive oil', 'flour', 'milk', 'garlic cloves', 'pepper', 'brown sugar', 'garlic', 'all-purpose flour', 'baking powder', 'egg', 'salt and pepper', 'parmesan cheese', 'lemon juice', 'baking soda', 'vegetable oil', 'vanilla', 'black pepper', 'cinnamon', 'tomatoes', 'sour cream', 'garlic powder', 'vanilla extract', 'oil', 'honey', 'garlic clove', 'cream cheese', 'onions', 'celery', 'cheddar cheese', 'unsalted butter', 'mayonnaise', 'soy sauce', 'chicken broth', 'paprika', 'extra virgin olive oil', 'worcestershire sauce', 'fresh parsley', 'cornstarch', 'fresh ground black pepper', 'parsley', 'carrots', 'chili powder', 'ground cinnamon', 'bacon']


### Question 1
<img src="question1.png">

In [43]:
def feat1a(d):
    return [len(d['steps'])]

print(feat1a(dataset[0]))

[743]


In [57]:
def feat1b(d): ## 1999 to 2018 -> 20 dimensional vector
    year_feat = [0]*20
    year_feat[dateutil.parser.parse(d['submitted']).year - 1999] = 1
    month_feat = [0]*12
    month_feat[dateutil.parser.parse(d['submitted']).month-1] = 1
    return year_feat+month_feat

print(feat1b(dataset[0]))

[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0]


In [42]:
def feat1c(d):
    return [int(i in d['ingredients']) for i in top_50_ingredients]

print(feat1c(dataset[0]))

[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [55]:
def feat(d, a = True, b = True, c = True):
    # Hint: for Questions 1 and 2, might be useful to set up a function like this
    #       which allows you to "select" which features are included
    feat = []
    if a==True:
        feat += feat1a(d)
    if b==True:
        feat += feat1b(d)
    if c==True:
        feat += feat1c(d)
    return feat
    

In [59]:
import numpy as np
def MSE(y, ypred):
    return np.square(y - ypred).mean(axis=0)
    # Can use library if you prefer

In [68]:
def experiment(a = True, b = True, c = True, mod= linear_model.LinearRegression()):
    X_train = [feat(d, a, b, c) for d in train]
    y_train = [d['minutes'] for d in train]
    X_test = [feat(d, a, b, c) for d in test]
    y_test = [d['minutes'] for d in test]
    mod.fit(X_train, y_train)
    ypred = model.predict(X_test)
    mse = MSE(y_test, ypred)
    print(mse)

    # Hint: might be useful to write this function which extracts features and 
    #       computes the performance of a particular model on those features

Answer for Question 1:
a. feat1a(dataset[0]) = [743]
b. feat1b(dataset[0]) = [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0]
c. feat1c(dataset[0]) = [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

### Question 2
<img src="question2.png">

In [70]:
# all features
model = linear_model.LinearRegression()
experiment(True, True, True, model)

5880.429911272418


In [71]:
# ablation feature 1a
model = linear_model.LinearRegression()
experiment(False, True, True, model)

5992.733953105812


In [72]:
# ablation feature 1b
model = linear_model.LinearRegression()
experiment(True, False, True, model)

5888.916117624437


In [73]:
# ablation feature 1c
model = linear_model.LinearRegression()
experiment(True, True, False, model)

6240.125264343656


Answer for Question 2:

a. MSE on test ser for ablation feature 1a : 5992.733953105812

b. MSE on test ser for ablation feature 1b : 5888.916117624437

c. MSE on test ser for ablation feature 1c : 6240.125264343656

d. feature 1c is most important cause MSE explodes the most when feature 1c is excluded

### Question 3

In [29]:
def pipeline():
    for lamb in [0.001, 0.01, 0.1, 1, 10, 100, 1000]:
        

### Question 4
<img src="question4.png">

Answer for Question 4:

a. Since the most cooking time lie in 15-30 minutes, and only few (long tail) has 8+ hour range, the model may find it hard to predict the long tail data, and might have huge penalization on those 8+ hour recipes.

b. Suggestion1 : I would try to replicate the data in tails so that the cooking time has more equal distribution.

c. Suggestion2 : 

# Section 2 (Classification)

### Question 5
<img src="question5.png">

In [137]:
def BER(predictions, y):
    # Implement following this logic or otherwise
    TP = sum([(p and l) for (p,l) in zip(predictions, y)])
    FP = sum([(p and not(l)) for (p,l) in zip(predictions, y)])
    TN = sum([(not(p) and not(l)) for (p,l) in zip(predictions, y)])
    FN = sum([(not(p) and l) for (p,l) in zip(predictions, y)])
    BER = 0.5 * (FN / (TP + FN) + FP / (TN + FP))
    #print(TP, FP, TN, FN)
    return BER

In [138]:
def feat2(d, dict_size, mostPopularInd):
    fIng = [0] * dict_size
    for i in d['ingredients']:
        if i == 'butter':
            continue
        if i in mostPopularInd:
            #print("*")
            fIng[mostPopularInd[i]] = 1
    return fIng

In [139]:
def experiment(reg = 1, dict_size = 50):
    # Hint: run an experiment with a particular regularization strength, and a particular one-hot encoding size
    # extract features...
    # (etc.) 
    mostPopularInd = dict(zip([i[0] for i in ingredients_count.most_common(dict_size)], range(dict_size)))
    #print(mostPopularInd)
    X_train = [feat2(d, dict_size, mostPopularInd) for d in train]
    y_train = [int('butter' in d['ingredients']) for d in train]

    X_valid = [feat2(d, dict_size, mostPopularInd) for d in valid]
    y_valid = [int('butter' in d['ingredients']) for d in valid]
    
    X_test = [feat2(d, dict_size, mostPopularInd) for d in test]
    y_test = [int('butter' in d['ingredients']) for d in test]
    
    mod = linear_model.LogisticRegression(C=reg, class_weight='balanced', solver = 'lbfgs')
    mod.fit(X_train, y_train)
    
    ypred_train = mod.predict(X_train)
    ypred_valid = mod.predict(X_valid)
    ypred_test = mod.predict(X_test)
    
    ber_train = BER(ypred_train, y_train)
    ber_valid = BER(ypred_valid, y_valid)
    ber_test = BER(ypred_test, y_test)
    print(ber_train, ber_valid, ber_test)
    # (etc.)

experiment(reg = 1, dict_size = 50)

0.2900098441634009 0.2894992495605814 0.28898437523315856


Answer for Question 5:

a. BER = 0.28898437523315856

### Question 6
<img src="question6.png">

In [140]:
def pipeline():
    for C in [0.01, 1, 100]:
        for dsize in [50, 100, 500]:
            print(C,dsize)
            experiment(C, dsize)
            # Example values, can pick any others...
            
pipeline()

0.01 50
0.29062441573723236 0.29064380312920035 0.28965326046777945
0.01 100
0.26406086145975616 0.2648462891599574 0.26650275754168856
0.01 500
0.22849675171583544 0.23055365206358197 0.2303930628502352
1 50
0.2900098441634009 0.2894992495605814 0.28898437523315856
1 100
0.2620917378493318 0.2645598522496887 0.266360126687606
1 500


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.22340670702116017 0.22516993055836515 0.22551948967394542
100 50
0.28992859694645495 0.289556389439567 0.28895820818270845
100 100
0.26214510583908995 0.2645598522496887 0.2662752658593643
100 500


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.2233441739959211 0.22504205944626504 0.22555910442106264


Answer for Question 6:

a. As above <img src="answer6.png">

b. Selected Model : C=100, dsize=500

   test BER =  0.22555910442106264

In [40]:
### Question 7
#(open ended)

# Section 3 (Recommender Systems)

### Question 8
<img src="question8.png">

In [96]:
# Utility data structures
ingsPerItem = defaultdict(set)
itemsPerIng = defaultdict(set)

In [97]:
for d in dataset:
    r = d['recipe_id']
    for i in d['ingredients']:
        ingsPerItem[r].add(i)
        itemsPerIng[i].add(r)

In [98]:
def Jaccard(s1, s2):
    numer = len(s1.intersection(s2))
    denom = len(s1.union(s2))
    if denom == 0:
        return 0
    return numer / denom

In [101]:
def mostSimilar8(i, N):
    similarities = []
    ings = ingsPerItem[i]
    for i2 in ingsPerItem.keys():
        if i2 == i: continue
        sim = Jaccard(ings, ingsPerItem[i2])
        #sim = Pearson(i, i2) # Could use alternate similarity metrics straightforwardly
        similarities.append((sim,i2))
    similarities.sort(reverse=True)
    return similarities[:N]

print(mostSimilar8('06432987',5))

[(0.4166666666666667, '68523854'), (0.38461538461538464, '12679596'), (0.36363636363636365, '79675099'), (0.36363636363636365, '56301588'), (0.35714285714285715, '87359281')]


Answer for Question 8:

a. Five most similar recipes to '06432987'

Jaccard Similarity, recipe ID

(0.4166666666666667, '68523854')

(0.38461538461538464, '12679596')

(0.36363636363636365, '79675099')

(0.36363636363636365, '56301588')

(0.35714285714285715, '87359281')

### Question 9
<img src="question9.png">

In [104]:
def mostSimilar9(i, N):
    similarities = []
    items = itemsPerIng[i]
    for i2 in itemsPerIng.keys():
        if i2 == i: continue
        sim = Jaccard(items, itemsPerIng[i2])
        #sim = Pearson(i, i2) # Could use alternate similarity metrics straightforwardly
        similarities.append((sim,i2))
    similarities.sort(reverse=True)
    return similarities[:N]

print(mostSimilar9('butter',5))

[(0.22315311514274808, 'salt'), (0.2056685424969639, 'flour'), (0.19100394157199166, 'eggs'), (0.17882420717656095, 'sugar'), (0.17040052045973944, 'milk')]


Answer for Question 9:

a. Five most similar ingredients to 'butter'

Jaccard Similarity, Ingredient

(0.22315311514274808, 'salt')

(0.2056685424969639, 'flour')

(0.19100394157199166, 'eggs')

(0.17882420717656095, 'sugar')

(0.17040052045973944, 'milk')

### Question 10
<img src="question10.png">

In [153]:
# simply Jaccard
def rec_recipe(ing_list,N): 
    similarities = []
    for i2 in ingsPerItem.keys():
        if i2 == i: continue
        sim = Jaccard(set(ing_list), ingsPerItem[i2])
        #sim = Pearson(i, i2) # Could use alternate similarity metrics straightforwardly
        similarities.append((sim,i2))
    similarities.sort(reverse=True)
    return similarities[:N]

def random_ing(N):
    ing_list = random.sample(itemsPerIng.keys(),N)
    print("recommend recipes using ingredients: ")
    print(ing_list)
    print()
    return ing_list
    
def list_rec_recipe(M, N):
    rec_recipes =  rec_recipe(random_ing(5),5)
    for sim , recipe_id in rec_recipes: 
        print("sim = ", sim , ", recipe_id = ", recipe_id)
        print(ingsPerItem[recipe_id])
    print()
    print()
        
list_rec_recipe(10,5)
list_rec_recipe(6,10)

since Python 3.9 and will be removed in a subsequent version.
  ing_list = random.sample(itemsPerIng.keys(),N)


recommend recipes using ingredients: 
['rib eye steak', 'caesar salad dressing mix', 'dry linguine', 'wine-cured sauerkraut', 'cilantro chutney']

sim =  0.1111111111111111 , recipe_id =  07037811
{'dry white wine', 'lemon juice', 'olive oil', 'caesar salad dressing mix', 'salmon steaks'}
sim =  0.1111111111111111 , recipe_id =  02190119
{'cheez whiz', 'onion', 'rib eye steak', 'butter', 'hoagie rolls'}
sim =  0.09090909090909091 , recipe_id =  73315294
{'sour cream', 'milk', 'caesar salad dressing mix', 'italian cut green beans', 'italian style breadcrumbs', 'cheese ravioli', 'cream of mushroom soup'}
sim =  0.09090909090909091 , recipe_id =  35328315
{'wine-cured sauerkraut', 'honey mustard', 'apple', 'apple juice', 'potatoes', 'caraway seed', 'lean pork'}
sim =  0.09090909090909091 , recipe_id =  30688537
{'low-fat creme fraiche', 'baby spinach', 'dry linguine', 'smoked haddock', 'lemon, zest of', 'small capers', 'cherry tomatoes'}


recommend recipes using ingredients: 
['whole gra

In [None]:
# More creative system

In [None]:
Answer for Question 10:

a. 