In [52]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import requests
import psycopg2
import scraper
import process_words
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans

In [53]:
conn = psycopg2.connect(dbname='food_db')
df = pd.read_sql('SELECT * FROM recipes', con=conn)

In [56]:
years, vectors = process_words.vectorize_all(df, 'foods')

In [76]:
(X, word_list) = vectors[2010]

In [77]:
best_k = process_words.get_best_k(X)
best_k

4

In [78]:
cluster_df, labels = process_words.make_one_cluster_df(
    X, word_list, best_k, 2010)

In [82]:
top_words, label_list = process_words.make_top_words(df, 'foods', 20)

In [83]:
top_words

Unnamed: 0,2009_1,s2009_1,2009_2,s2009_2,2009_3,s2009_3,2009_4,s2009_4,2010_1,s2010_1,...,2017_4,s2017_4,2018_1,s2018_1,2018_2,s2018_2,2018_3,s2018_3,2018_4,s2018_4
0,juice,0.063696,baking,0.143544,oil,0.108137,chicken,0.072773,sauce,0.054972,...,flour,0.146027,rice,0.109587,sugar,0.175476,cloves,0.077091,olive,0.118022
1,cream,0.054784,sugar,0.140354,olive,0.098059,diced,0.066811,oil,0.03337,...,sugar,0.126595,chicken,0.095214,milk,0.108353,onion,0.063744,black,0.115431
2,butter,0.050882,flour,0.127876,garlic,0.075668,black,0.060346,garlic,0.032082,...,yeast,0.120357,coconut,0.094279,egg,0.10127,garlic,0.062687,oil,0.110486
3,pieces,0.050241,powder,0.115747,red,0.073382,mustard,0.058611,butter,0.031534,...,unsalted,0.113696,pomegranate,0.089266,vanilla,0.0888,half,0.062262,chicken,0.108572
4,milk,0.039566,butter,0.111616,vinegar,0.070059,stock,0.053327,red,0.031376,...,butter,0.107746,curry,0.087534,chocolate,0.087013,oil,0.06194,kosher,0.106651
5,lemon,0.039547,vanilla,0.108539,virgin,0.066313,kosher,0.052493,sugar,0.031362,...,purpose,0.101295,water,0.082237,purpose,0.085293,red,0.056976,virgin,0.087843
6,lime,0.037919,extract,0.087192,cloves,0.050388,oil,0.050703,onion,0.030661,...,water,0.097072,tomatoes,0.076563,flour,0.084669,cheese,0.054797,lemon,0.07605
7,tomatoes,0.035357,purpose,0.084308,cheese,0.047466,onion,0.048192,ginger,0.030334,...,cold,0.072957,oil,0.067215,cream,0.080807,roughly,0.050683,juice,0.067696
8,whole,0.034584,egg,0.081219,onion,0.044789,garlic,0.047242,chicken,0.029378,...,fine,0.07206,free,0.066679,powder,0.079601,white,0.050204,vinegar,0.06453
9,sugar,0.032978,chocolate,0.074694,bunch,0.042266,cloves,0.04411,rice,0.029227,...,egg,0.069936,cilantro,0.066165,granulated,0.077734,whole,0.047719,dijon,0.05916


In [3]:
food_stems = [process_words.clean_one_doc(doc) for doc in df.foods]
title_stems = [process_words.clean_one_doc(doc) for doc in df.title]
df['food_stems'] = food_stems
df['title_stems'] = title_stems

In [33]:
df.head()

Unnamed: 0,id,post_date,title,foods,food_stems,title_stems
0,27,2018-02-09,Long Life Noodles with Shrimp and Greens,"teaspoon sesame oil, for drizzling stalk green...","[sesame, oil, drizzling, stalk, green, onion, ...","[long, life, noodles, shrimp, greens]"
1,28,2018-02-09,Ginger-Onion Whole Steamed Fish,"stalks green onions, cut into 3-inch segments,...","[stalks, green, onions, segments, kosher, shao...","[ginger, onion, whole, steamed, fish]"
2,29,2018-02-09,Smacked Cucumber,"teaspoon sugar cloves garlic, crushed tablespo...","[sugar, cloves, garlic, crushed, soy, sauce, r...","[smacked, cucumber]"
3,30,2018-02-08,Neck Bones and Lima Beans,"salt and pepper, to taste tablespoons canola o...","[canola, oil, sage, frozen, lima, beans, water...","[neck, bones, lima, beans]"
4,31,2018-02-07,Angel Wings (Faworki),pinch sea salt confectioner’s sugar for servin...,"[sea, confectioners, sugar, sour, cream, egg, ...","[angel, wings, faworki]"


In [5]:
food_yrs, food_vectors = process_words.vectorize_all(df, 'foods')

In [6]:
all_words = []
for i, vec in enumerate(food_vectors.values()):
    all_words += list(vec)[1]
len(all_words)

18093

In [7]:
best_ks = {}
for yr in food_vectors:
    X = food_vectors[yr][0]
    k, sil = process_words.get_best_k(X, maxk=20)
    best_ks[yr] = k

In [8]:
best_ks

{2009: 4,
 2010: 4,
 2011: 4,
 2012: 4,
 2013: 4,
 2014: 4,
 2015: 4,
 2016: 4,
 2017: 4,
 2018: 4}

In [44]:
def make_cluster_df(year):
    (X, features) = food_vectors[year]
    cluster_df = pd.DataFrame(features)
    centroids, labels = process_words.make_clusters(X, features, best_k=best_ks[year])
    centroids_sorted = centroids.argsort()[:,-1::-1]
    for i, c in enumerate(centroids_sorted):
        cluster_df['{}_{}'.format(year, i+1)] = np.array(features)[centroids_sorted[i]]
        cluster_df['s{}_{}'.format(year, i+1)] = centroids[i][centroids_sorted[i]]
    cluster_df.drop(columns=[0], inplace=True)
    return cluster_df, labels

In [23]:
cluster09, labels = make_cluster_df(2009)

In [29]:
sum(labels == 1)

51

In [45]:
(cluster09, cluster10, cluster11, cluster12, cluster13, 
 cluster14, cluster15, cluster16, cluster17, cluster18) = map(
    make_cluster_df, (2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018))

In [49]:
top_words = cluster09[0].iloc[0:20]
for df_i in (cluster10, cluster11, cluster12, cluster13, 
          cluster14, cluster15, cluster16, cluster17, cluster18):
    top_words = pd.concat([top_words, df_i[0].iloc[0:20]], axis=1)

In [50]:
top_words

Unnamed: 0,2009_1,s2009_1,2009_2,s2009_2,2009_3,s2009_3,2009_4,s2009_4,2010_1,s2010_1,...,2017_4,s2017_4,2018_1,s2018_1,2018_2,s2018_2,2018_3,s2018_3,2018_4,s2018_4
0,tomatoes,0.081715,baking,0.156303,oil,0.064272,juice,0.090352,red,0.054251,...,flour,0.159731,flakes,0.166063,onion,0.096853,sugar,0.162058,black,0.101437
1,oil,0.074856,sugar,0.141254,olive,0.058467,butter,0.055007,garlic,0.051451,...,yeast,0.159225,garnish,0.151654,sauce,0.087331,milk,0.110865,chicken,0.096822
2,garlic,0.073734,flour,0.12613,vinegar,0.057576,lemon,0.053195,onion,0.048919,...,sugar,0.130373,crushed,0.125092,oil,0.082539,egg,0.082502,olive,0.090768
3,paprika,0.067393,powder,0.126036,red,0.055523,sugar,0.05274,oil,0.045909,...,purpose,0.123033,cucumber,0.110583,cloves,0.075608,vanilla,0.079452,oil,0.08914
4,olive,0.060946,butter,0.108768,black,0.052804,lime,0.049578,diced,0.040929,...,unsalted,0.117963,chili,0.107001,curry,0.074825,whole,0.078576,kosher,0.08848
5,cloves,0.060238,vanilla,0.104255,garlic,0.04411,pieces,0.046894,green,0.03826,...,water,0.116749,soy,0.091151,vegetable,0.07039,chocolate,0.077854,lemon,0.071042
6,cumin,0.055411,purpose,0.091803,virgin,0.040922,ice,0.041794,sauce,0.038107,...,butter,0.108894,red,0.090281,garlic,0.069882,cream,0.072301,virgin,0.059395
7,smoked,0.054141,extract,0.083082,chicken,0.040907,water,0.039128,vinegar,0.03757,...,active,0.092107,trimmed,0.087842,roughly,0.069254,powder,0.071222,parsley,0.055162
8,onion,0.047531,chocolate,0.081334,kosher,0.039886,milk,0.035773,cloves,0.036912,...,instant,0.091287,cucumbers,0.084571,ginger,0.066522,granulated,0.069552,juice,0.054059
9,coriander,0.047083,eggs,0.077631,onion,0.039526,zest,0.033624,wine,0.036293,...,egg,0.08995,persian,0.084571,water,0.065339,baking,0.069114,potatoes,0.049891


In [36]:
cluster09.iloc[0:20]

Unnamed: 0,words_1,scores_1,words_2,scores_2,words_3,scores_3,words_4,scores_4
0,butter,0.056785,oil,0.103927,sugar,0.101123,baking,0.185096
1,diced,0.051091,red,0.099764,juice,0.094931,powder,0.145571
2,chicken,0.048107,olive,0.097771,lemon,0.066747,flour,0.13651
3,onion,0.045943,garlic,0.081273,vanilla,0.065705,sugar,0.135343
4,oil,0.044362,vinegar,0.064201,cinnamon,0.051691,butter,0.116599
5,black,0.043546,cloves,0.062607,milk,0.049566,purpose,0.098047
6,cream,0.042341,virgin,0.042943,water,0.049082,chocolate,0.091051
7,whole,0.038886,bunch,0.042918,extract,0.043602,vanilla,0.087293
8,olive,0.035907,wine,0.042323,pieces,0.043372,eggs,0.085491
9,kosher,0.035069,tomatoes,0.035027,lime,0.041654,soda,0.084562


In [12]:
cluster10.head(20)

Unnamed: 0,words_1,scores_1,words_2,scores_2,words_3,scores_3,words_4,scores_4
0,red,0.055855,cheese,0.073564,lemon,0.096223,sugar,0.154277
1,garlic,0.052433,butter,0.059059,oil,0.078337,baking,0.089881
2,onion,0.04729,flour,0.049502,olive,0.077651,flour,0.089249
3,oil,0.044306,kosher,0.044095,juice,0.076685,butter,0.086139
4,diced,0.044159,bread,0.040561,garlic,0.050193,vanilla,0.080514
5,vinegar,0.043726,cream,0.040181,virgin,0.049237,unsalted,0.079356
6,cloves,0.041961,unsalted,0.040074,sea,0.042855,extract,0.066382
7,sauce,0.040604,white,0.040018,zest,0.040114,egg,0.062032
8,wine,0.036191,oil,0.039173,black,0.035478,purpose,0.059338
9,chicken,0.035887,garlic,0.034697,mint,0.034882,powder,0.056179


In [13]:
cluster11.head(20)

Unnamed: 0,words_1,scores_1,words_2,scores_2,words_3,scores_3,words_4,scores_4
0,water,0.086352,sugar,0.12626,olive,0.079145,garlic,0.056946
1,sugar,0.063742,flour,0.095222,oil,0.074431,oil,0.053968
2,juice,0.057566,butter,0.089834,lemon,0.064769,onion,0.048051
3,lemon,0.052071,baking,0.088236,cheese,0.064106,chicken,0.047344
4,ice,0.039847,vanilla,0.084352,garlic,0.055367,cloves,0.043573
5,garnish,0.031357,powder,0.066629,virgin,0.055093,sauce,0.042779
6,basil,0.03068,unsalted,0.065025,juice,0.042018,black,0.038822
7,white,0.030409,extract,0.063405,parsley,0.040838,white,0.035618
8,lime,0.030161,milk,0.060035,cloves,0.03976,red,0.035152
9,leaves,0.029893,egg,0.057743,bread,0.038063,cilantro,0.033733


In [14]:
len(cluster12)

1749

In [15]:
cluster13.head(20)

Unnamed: 0,words_1,scores_1,words_2,scores_2,words_3,scores_3,words_4,scores_4
0,oil,0.077457,vanilla,0.116844,juice,0.054306,flour,0.17977
1,olive,0.063629,sugar,0.102473,lemon,0.053709,sugar,0.0998
2,garlic,0.060771,cream,0.101691,sugar,0.048468,baking,0.09839
3,black,0.043556,chocolate,0.097936,lime,0.04179,purpose,0.093088
4,red,0.042351,extract,0.094718,water,0.033614,butter,0.086279
5,cloves,0.041367,heavy,0.076479,orange,0.032301,powder,0.058828
6,onion,0.04016,butter,0.058314,whole,0.030514,eggs,0.058202
7,virgin,0.034511,milk,0.056208,milk,0.028597,egg,0.057567
8,leaves,0.034358,dark,0.053091,butter,0.028506,unsalted,0.052322
9,white,0.033033,coconut,0.046719,syrup,0.027883,water,0.051358


In [16]:
cluster14.head(20)

Unnamed: 0,words_1,scores_1,words_2,scores_2,words_3,scores_3,words_4,scores_4
0,sugar,0.143181,sugar,0.046901,chocolate,0.183226,oil,0.070892
1,flour,0.13386,juice,0.044131,sugar,0.117012,olive,0.063861
2,baking,0.102958,lemon,0.043841,cream,0.110286,garlic,0.056584
3,butter,0.092081,syrup,0.043278,dark,0.091634,black,0.044872
4,purpose,0.087795,coconut,0.042973,heavy,0.080657,red,0.044297
5,vanilla,0.073006,water,0.03721,butter,0.070567,leaves,0.041504
6,egg,0.072642,orange,0.036631,extract,0.067437,cloves,0.038608
7,unsalted,0.071817,maple,0.029781,vanilla,0.065764,onion,0.03648
8,milk,0.065774,honey,0.028544,egg,0.063198,diced,0.035759
9,powder,0.060981,ginger,0.025422,milk,0.059599,vinegar,0.03554


In [17]:
cluster15.head(20)

Unnamed: 0,words_1,scores_1,words_2,scores_2,words_3,scores_3,words_4,scores_4
0,sauce,0.098947,sugar,0.14279,sugar,0.056497,olive,0.082117
1,oil,0.07306,flour,0.115377,water,0.053887,oil,0.078726
2,garlic,0.068938,butter,0.109229,juice,0.039771,garlic,0.063806
3,diced,0.061556,baking,0.095887,cream,0.037591,lemon,0.054515
4,rice,0.057525,vanilla,0.095818,milk,0.036877,black,0.049432
5,red,0.055617,unsalted,0.089897,syrup,0.034981,cloves,0.046225
6,vinegar,0.053094,purpose,0.084182,lemon,0.03151,leaves,0.04508
7,soy,0.051452,extract,0.076246,butter,0.031369,virgin,0.043094
8,onion,0.050483,powder,0.074978,coconut,0.02516,cheese,0.040055
9,ginger,0.04961,eggs,0.067295,chocolate,0.025135,parsley,0.039387


In [18]:
cluster16.head(20)

Unnamed: 0,words_1,scores_1,words_2,scores_2,words_3,scores_3,words_4,scores_4
0,yeast,0.113401,oil,0.071414,sugar,0.0684,baking,0.144184
1,flour,0.110011,olive,0.054347,cream,0.053929,sugar,0.141503
2,sugar,0.095957,garlic,0.050358,coconut,0.050922,flour,0.134991
3,active,0.090402,black,0.048157,vanilla,0.05058,vanilla,0.118222
4,egg,0.086239,red,0.043887,milk,0.050051,extract,0.117597
5,dough,0.082167,sauce,0.03903,heavy,0.046064,butter,0.117075
6,butter,0.078805,diced,0.037236,syrup,0.044751,purpose,0.10133
7,water,0.074639,onion,0.037128,juice,0.043378,unsalted,0.099326
8,purpose,0.071592,cloves,0.034951,water,0.040573,powder,0.097318
9,dry,0.071416,juice,0.033149,orange,0.039282,eggs,0.087609


In [19]:
cluster17.head(20)

Unnamed: 0,words_1,scores_1,words_2,scores_2,words_3,scores_3,words_4,scores_4
0,lemon,0.074926,yeast,0.196231,oil,0.062264,sugar,0.137792
1,juice,0.056363,flour,0.156297,garlic,0.056761,flour,0.098166
2,oil,0.049923,water,0.128646,olive,0.049252,butter,0.087874
3,orange,0.047372,sugar,0.120019,black,0.044998,vanilla,0.086209
4,syrup,0.035004,instant,0.114863,sauce,0.042263,baking,0.084696
5,sugar,0.033154,active,0.107761,leaves,0.04069,extract,0.080012
6,apple,0.030105,unsalted,0.106822,onion,0.039628,powder,0.077888
7,olive,0.02972,butter,0.100738,white,0.039177,unsalted,0.077248
8,cubes,0.027704,purpose,0.099636,red,0.038262,chocolate,0.072861
9,spice,0.027598,dry,0.086696,cheese,0.037392,purpose,0.066119


In [20]:
cluster18.head(20)

Unnamed: 0,words_1,scores_1,words_2,scores_2,words_3,scores_3,words_4,scores_4
0,milk,0.132315,oil,0.094508,sugar,0.224439,half,0.117887
1,chocolate,0.103359,olive,0.075808,egg,0.120514,parmesan,0.094546
2,whole,0.092548,chicken,0.074381,cream,0.114477,frozen,0.093192
3,ginger,0.080827,black,0.065346,granulated,0.099867,sprigs,0.090191
4,pomegranate,0.069429,kosher,0.062391,flour,0.099576,thyme,0.087505
5,star,0.064966,virgin,0.048789,purpose,0.098598,cloves,0.086755
6,anise,0.064966,juice,0.04877,vanilla,0.097868,black,0.082953
7,molasses,0.059369,lemon,0.047685,baking,0.097714,potatoes,0.082353
8,cinnamon,0.05756,onion,0.04671,heavy,0.088514,white,0.080888
9,fruit,0.057011,sauce,0.045783,powder,0.084966,garlic,0.080188


In [32]:
len(cluster17)

1688

In [178]:
cluster09.to_json('data/cluster09.json')