In [40]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import requests
import psycopg2
import scraper
import process_words
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans

In [5]:
conn = psycopg2.connect(dbname='food_db')
df = pd.read_sql('SELECT * FROM recipes', con=conn)

In [6]:
df.head()

Unnamed: 0,id,post_date,title,foods
0,27,2018-02-09,Long Life Noodles with Shrimp and Greens,"teaspoon sesame oil, for drizzling stalk green..."
1,28,2018-02-09,Ginger-Onion Whole Steamed Fish,"stalks green onions, cut into 3-inch segments,..."
2,29,2018-02-09,Smacked Cucumber,"teaspoon sugar cloves garlic, crushed tablespo..."
3,30,2018-02-08,Neck Bones and Lima Beans,"salt and pepper, to taste tablespoons canola o..."
4,31,2018-02-07,Angel Wings (Faworki),pinch sea salt confectioner’s sugar for servin...


In [7]:
food_stems = [process_words.clean_one_doc(doc) for doc in df.foods]
title_stems = [process_words.clean_one_doc(doc) for doc in df.title]
df['food_stems'] = food_stems
df['title_stems'] = title_stems

In [8]:
df.head()

Unnamed: 0,id,post_date,title,foods,food_stems,title_stems
0,27,2018-02-09,Long Life Noodles with Shrimp and Greens,"teaspoon sesame oil, for drizzling stalk green...","[sesame, oil, drizzling, stalk, green, onion, ...","[long, life, noodles, shrimp, greens]"
1,28,2018-02-09,Ginger-Onion Whole Steamed Fish,"stalks green onions, cut into 3-inch segments,...","[stalks, green, onions, segments, kosher, salt...","[ginger, onion, whole, steamed, fish]"
2,29,2018-02-09,Smacked Cucumber,"teaspoon sugar cloves garlic, crushed tablespo...","[sugar, cloves, garlic, crushed, soy, sauce, r...","[smacked, cucumber]"
3,30,2018-02-08,Neck Bones and Lima Beans,"salt and pepper, to taste tablespoons canola o...","[salt, pepper, canola, oil, ground, sage, froz...","[neck, bones, lima, beans]"
4,31,2018-02-07,Angel Wings (Faworki),pinch sea salt confectioner’s sugar for servin...,"[sea, salt, confectioners, sugar, sour, cream,...","[angel, wings, faworki]"


In [20]:
years = set(date.year for date in df.post_date)
food_yrs = {}
for yr in years:
    food_yrs[yr] = df[df['post_date'].dt.year.values == yr]['food_stems'].index

In [30]:
vectorizer = TfidfVectorizer(stop_words='english', tokenizer=process_words.clean_one_doc)
food_stems9 = df.iloc[food_yrs[2009]]['foods']
tfidf9 = vectorizer.fit_transform(food_stems9)

In [35]:
features = vectorizer.get_feature_names()

In [41]:
kmeans = KMeans()
kmeans.fit(tfidf9)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=8, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [44]:
# Find the top 10 features for each cluster.
top_centroids = kmeans.cluster_centers_.argsort()[:,-1:-11:-1]
print ("top features for each cluster:")
for num, centroid in enumerate(top_centroids):
    print ("%d: %s" % (num, ", ".join(features[i] for i in centroid)))

top features for each cluster:
0: chicken, fresh, kosher, mustard, salt, stock, oil, olive, cider, pepper
1: juice, lime, fresh, lemon, ice, grapefruit, squeezed, half, zest, vodka
2: sauce, rice, beef, ginger, water, cloves, green, roughly, sesame, cilantro
3: sugar, baking, vanilla, flour, butter, egg, extract, powder, purpose, milk
4: pepper, sweet, potatoes, cream, butter, paprika, smoked, salt, ground, onion
5: red, oil, olive, garlic, pepper, vinegar, salt, cloves, bunch, fresh
6: peeled, pepper, pieces, salt, fresh, bacon, seeded, oil, onion, black
7: chocolate, powder, cocoa, baking, sugar, butter, flour, unsalted, chips, extract


In [58]:
centroids, silhouette = make_clusters(tfidf9, features, 10)

Cluster 0
pepper || 0.06374857230771823
chicken || 0.06221120930867935
butter || 0.06179129021327038
white || 0.05879279641154481
fresh || 0.05630972447574284
salt || 0.05384809332350417
diced || 0.050100502100046386
parmesan || 0.04986974920534763
onion || 0.04294568831843574
stock || 0.04237838524554043
----------------------
Cluster 1
baking || 0.1658377855051646
sugar || 0.13928175957233896
flour || 0.1316387167893058
powder || 0.1285720864403883
butter || 0.1165714333417407
vanilla || 0.09570924256705607
purpose || 0.09216007203140662
chocolate || 0.08855758704775482
extract || 0.08825501053020675
eggs || 0.07804056634543223
----------------------
Cluster 2
pepper || 0.07851636474597579
oil || 0.07612047516845562
salt || 0.06837951589600193
olive || 0.0663274596481908
ground || 0.0652163481331282
red || 0.05999784076014654
garlic || 0.05833548507475915
fresh || 0.056037408330727
black || 0.048592725055062375
cloves || 0.04395306888911238
----------------------
Cluster 3
juice || 0

In [57]:
def make_clusters(X, features, n_features, best_k=None):
    maxk = len(features)//20
    silhouette = np.zeros(maxk)
    if best_k == None:
        for k in range(1, maxk):
            km = KMeans(k)
            y = km.fit_predict(X)
            if k > 1:
                silhouette[k] = silhouette_score(X, y)
        best_k = np.argmax(silhouette) + 2

    kmeans = KMeans(n_clusters=best_k).fit(X)
    centroids = kmeans.cluster_centers_

    for i, c in enumerate(centroids):
        ind = c.argsort()[::-1][:n_features]
        print('Cluster {}'.format(i))
        for i in ind:
            print('{} || {}'.format(features[i], c[i]))
        print('----------------------')
    return centroids, silhouette

In [143]:
food_vocab_dict = {word: i for i, word in enumerate(food_vocab)}
title_vocab_dict = {word: i for i, word in enumerate(title_vocab)}

In [145]:
food_counts = np.zeros((len(food_stems), len(food_vocab)))
for doc_id, words in enumerate(food_stems):
    for word in words:
        word_id = food_vocab_dict[word]
        food_counts[doc_id][word_id] += 1

In [177]:
vectorizer = TfidfVectorizer(stop_words='english', tokenizer=process_words.clean_one_doc)
vectors = vectorizer.fit_transform(df.foods).toarray()
words = vectorizer.get_feature_names()

In [181]:
def get_top_values(lst, n, labels):
    '''
    INPUT: LIST, INTEGER, LIST
    OUTPUT: LIST

    Given a list of values, find the indices with the highest n values.
    Return the labels for each of these indices.

    e.g.
    lst = [7, 3, 2, 4, 1]
    n = 2
    labels = ["cat", "dog", "mouse", "pig", "rabbit"]
    output: ["cat", "pig"]
    '''
    return [labels[i] for i in np.argsort(lst)[-1:-n-1:-1]]

In [183]:
avg = np.sum(vectors, axis=0) / np.sum(vectors > 0, axis=0)
print ("top 10 by average tf-idf")
print (get_top_values(avg, 10, words))

top 10 by average tf-idf
['montasio', 'papad', 'grapefuit', 'coconuts', 'kahla', 'chicharrones', 'passionfruit', 'jamn', 'kapika', 'caramels']


In [184]:
total = np.sum(vectors, axis=0)
print ("top 10 by total tf-idf")
print (get_top_values(total, 10, words))

top 10 by total tf-idf
['sugar', 'salt', 'oil', 'fresh', 'butter', 'pepper', 'flour', 'ground', 'olive', 'lemon']


In [185]:
vectors.shape

(4858, 5383)

In [187]:
len(words)

5383

In [193]:
len(vectors[:1][0])

5383

In [199]:
i = np.argsort(avg)[-1:-10-1:-1]
avg[i]

array([ 1.        ,  0.91981864,  0.91795628,  0.88133154,  0.77065948,
        0.73876459,  0.72868118,  0.72657354,  0.69608586,  0.68041406])

In [201]:
[words[i] for i in i]

['montasio',
 'papad',
 'grapefuit',
 'coconuts',
 'kahla',
 'chicharrones',
 'passionfruit',
 'jamn',
 'kapika',
 'caramels']

In [202]:
df.head()

Unnamed: 0,id,post_date,title,foods,food_stems,title_stems
0,27,2018-02-09,Long Life Noodles with Shrimp and Greens,"teaspoon sesame oil, for drizzling stalk green...","[sesame, oil, drizzling, stalk, green, onion, ...","[long, life, noodles, shrimp, greens]"
1,28,2018-02-09,Ginger-Onion Whole Steamed Fish,"stalks green onions, cut into 3-inch segments,...","[stalks, green, onions, segments, kosher, salt...","[ginger, onion, whole, steamed, fish]"
2,29,2018-02-09,Smacked Cucumber,"teaspoon sugar cloves garlic, crushed tablespo...","[sugar, cloves, garlic, crushed, soy, sauce, r...","[smacked, cucumber]"
3,30,2018-02-08,Neck Bones and Lima Beans,"salt and pepper, to taste tablespoons canola o...","[salt, pepper, canola, oil, ground, sage, froz...","[neck, bones, lima, beans]"
4,31,2018-02-07,Angel Wings (Faworki),pinch sea salt confectioner’s sugar for servin...,"[sea, salt, confectioners, sugar, sour, cream,...","[angel, wings, faworki]"


In [246]:
set(date.year for date in df.post_date)

{2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018}

In [244]:
df[df['post_date'].dt.year.values == 2009]['food_stems'].values.shape

(225,)

In [236]:
df['post_date'].dt.year.values == 2018

array([ True,  True,  True, ..., False, False, False], dtype=bool)