In [97]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import requests
import psycopg2
import scraper
import process_words
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans

In [2]:
# Load the whole `recipes` table from the local `food_db` Postgres database.
# pd.read_sql returns a fully materialized DataFrame, so the connection is
# closed as soon as the query finishes (even if it raises) rather than being
# left open for the lifetime of the kernel.
conn = psycopg2.connect(dbname='food_db')
try:
    df = pd.read_sql('SELECT * FROM recipes', con=conn)
finally:
    conn.close()

In [161]:
# Pass every ingredient string and every title through
# process_words.clean_one_doc, then keep the cleaned results alongside the raw
# text as two new columns on df.
food_stems = list(map(process_words.clean_one_doc, df['foods']))
title_stems = list(map(process_words.clean_one_doc, df['title']))
df['food_stems'], df['title_stems'] = food_stems, title_stems

In [162]:
# Spot-check the first rows, including the derived food_stems / title_stems columns.
df.head()

Unnamed: 0,id,post_date,title,foods,food_stems,title_stems
0,27,2018-02-09,Long Life Noodles with Shrimp and Greens,"teaspoon sesame oil, for drizzling stalk green...","[sesame, oil, drizzling, stalk, green, onion, ...","[long, life, noodles, shrimp, greens]"
1,28,2018-02-09,Ginger-Onion Whole Steamed Fish,"stalks green onions, cut into 3-inch segments,...","[stalks, green, onions, segments, kosher, shao...","[ginger, onion, whole, steamed, fish]"
2,29,2018-02-09,Smacked Cucumber,"teaspoon sugar cloves garlic, crushed tablespo...","[sugar, cloves, garlic, crushed, soy, sauce, r...","[smacked, cucumber]"
3,30,2018-02-08,Neck Bones and Lima Beans,"salt and pepper, to taste tablespoons canola o...","[canola, oil, sage, frozen, lima, beans, water...","[neck, bones, lima, beans]"
4,31,2018-02-07,Angel Wings (Faworki),pinch sea salt confectioner’s sugar for servin...,"[sea, confectioners, sugar, sour, cream, egg, ...","[angel, wings, faworki]"


In [163]:
# Vectorize the raw 'foods' text with process_words.vectorize_all.
# Later cells look food_vectors up by year and unpack each entry as
# (X, features); food_yrs is not referenced again in the visible cells.
food_yrs, food_vectors = process_words.vectorize_all(df, 'foods')

In [164]:
# Pool the feature vocabulary of every year into one flat list and show its
# length. Nothing is de-duplicated, so a word that occurs in several years is
# counted once per year.
all_words = []
for year_vec in food_vectors.values():
    # Each entry is a (matrix, features) pair; only the feature names are pooled.
    all_words.extend(year_vec[1])
len(all_words)

18093

In [165]:
# For each year, ask process_words.get_best_k (searching up to maxk=20) which
# cluster count to use on that year's matrix, and remember only the chosen k.
best_ks = {}
for yr, year_vec in food_vectors.items():
    chosen_k, _sil_score = process_words.get_best_k(year_vec[0], maxk=20)
    best_ks[yr] = chosen_k

In [166]:
# Show the cluster count selected for each year.
# NOTE(review): the saved output reports the same k for every year — confirm
# that get_best_k is really searching the full range rather than being clamped.
best_ks

{2009: 4,
 2010: 4,
 2011: 4,
 2012: 4,
 2013: 4,
 2014: 4,
 2015: 4,
 2016: 4,
 2017: 4,
 2018: 4}

In [167]:
def make_cluster_df(year):
    """Tabulate each cluster's vocabulary for one year, ranked by centroid value.

    Reads the notebook-level ``food_vectors`` and ``best_ks`` mappings and calls
    ``process_words.make_clusters`` with that year's matrix, feature names and
    chosen cluster count.

    Parameters
    ----------
    year
        Key used to look up both ``food_vectors`` and ``best_ks``.

    Returns
    -------
    pandas.DataFrame
        One row per feature. For the i-th centroid (1-based) there is a
        ``words_i`` column holding the feature names and a ``scores_i`` column
        holding the matching centroid values, both sorted from the largest
        centroid value to the smallest.
    """
    X, features = food_vectors[year]
    centroids = process_words.make_clusters(X, features, best_k=best_ks[year])

    # argsort is ascending; reversing each row yields a descending ranking.
    ranking = centroids.argsort()[:, ::-1]
    # Convert once so the names can be fancy-indexed for every cluster.
    feature_names = np.array(features)

    columns = {}
    for cluster_no, (weights, order) in enumerate(zip(centroids, ranking), start=1):
        columns['words_{}'.format(cluster_no)] = feature_names[order]
        columns['scores_{}'.format(cluster_no)] = weights[order]

    # Build the frame in one step rather than seeding it with a throwaway
    # column that has to be dropped in place afterwards. The explicit index
    # keeps one row per feature even if no centroid is returned.
    return pd.DataFrame(columns, index=pd.RangeIndex(len(features)))

In [168]:
# Build one cluster table per year, 2009 through 2018, and bind each to its
# own clusterYY name for the preview cells that follow.
(cluster09, cluster10, cluster11, cluster12, cluster13,
 cluster14, cluster15, cluster16, cluster17, cluster18) = (
    make_cluster_df(cluster_year) for cluster_year in range(2009, 2019))

In [169]:
# 2009: per-cluster terms, ranked from highest to lowest centroid score.
cluster09.head(20)

Unnamed: 0,words_1,scores_1,words_2,scores_2,words_3,scores_3,words_4,scores_4
0,juice,0.085357,baking,0.152905,butter,0.08171,oil,0.079456
1,half,0.060204,sugar,0.140149,chicken,0.073719,red,0.072204
2,sugar,0.057238,flour,0.133798,potatoes,0.060983,olive,0.069756
3,whole,0.055707,powder,0.123296,pieces,0.06021,garlic,0.069668
4,lemon,0.05349,butter,0.116993,black,0.057849,vinegar,0.056987
5,milk,0.052639,vanilla,0.101404,cream,0.057065,cloves,0.05609
6,lime,0.047513,extract,0.092879,kosher,0.049778,white,0.038704
7,cream,0.043111,purpose,0.086747,yellow,0.046675,onion,0.037738
8,honey,0.042884,chocolate,0.079566,diced,0.044814,wine,0.036749
9,ice,0.039158,egg,0.073758,onion,0.042514,sauce,0.035354


In [170]:
# 2010: per-cluster terms, ranked from highest to lowest centroid score.
cluster10.head(20)

Unnamed: 0,words_1,scores_1,words_2,scores_2,words_3,scores_3,words_4,scores_4
0,red,0.066245,olive,0.07411,butter,0.080121,sugar,0.138388
1,garlic,0.062162,oil,0.073857,unsalted,0.063494,flour,0.079316
2,onion,0.058684,lemon,0.07195,kosher,0.060484,baking,0.076925
3,diced,0.054473,garlic,0.053482,white,0.056746,vanilla,0.076301
4,oil,0.05413,juice,0.046743,cheese,0.049881,butter,0.06827
5,cloves,0.048225,sea,0.041628,flour,0.042614,unsalted,0.059868
6,olive,0.044956,virgin,0.04107,cream,0.039772,extract,0.059018
7,wine,0.044626,cheese,0.03959,chives,0.038024,egg,0.056548
8,sauce,0.04424,black,0.036299,dice,0.036889,cream,0.050426
9,green,0.040093,cloves,0.035281,eggs,0.035351,purpose,0.049408


In [171]:
# 2011: per-cluster terms, ranked from highest to lowest centroid score.
cluster11.head(20)

Unnamed: 0,words_1,scores_1,words_2,scores_2,words_3,scores_3,words_4,scores_4
0,water,0.112094,sugar,0.118469,onion,0.060881,olive,0.065926
1,sugar,0.095228,flour,0.101211,sauce,0.060743,oil,0.065915
2,juice,0.057868,baking,0.091129,garlic,0.058201,lemon,0.054123
3,lemon,0.053328,butter,0.089814,oil,0.057003,garlic,0.05128
4,orange,0.051095,vanilla,0.080673,red,0.05273,leaves,0.042545
5,ice,0.047265,unsalted,0.066387,cilantro,0.051509,cheese,0.04133
6,ginger,0.038948,powder,0.06563,cloves,0.048733,black,0.039428
7,raspberries,0.037399,milk,0.062259,cumin,0.041458,juice,0.036312
8,lime,0.035496,egg,0.060542,soy,0.038406,white,0.035683
9,organic,0.032425,extract,0.060098,diced,0.036537,cloves,0.035083


In [172]:
# 2012: per-cluster terms, ranked from highest to lowest centroid score.
cluster12.head(20)

Unnamed: 0,words_1,scores_1,words_2,scores_2,words_3,scores_3,words_4,scores_4
0,cream,0.117574,oil,0.046236,flour,0.113665,oil,0.083641
1,sugar,0.094616,black,0.04451,sugar,0.109337,olive,0.08305
2,heavy,0.093002,water,0.03979,baking,0.096959,garlic,0.065673
3,milk,0.078771,garlic,0.034279,butter,0.090021,lemon,0.052981
4,vanilla,0.065812,kosher,0.033959,purpose,0.086684,juice,0.048834
5,chocolate,0.059495,olive,0.032247,unsalted,0.071012,diced,0.044587
6,syrup,0.055996,leaves,0.03204,vanilla,0.060125,virgin,0.044109
7,whole,0.04643,red,0.031095,powder,0.060034,leaves,0.043455
8,bean,0.043619,sauce,0.030695,egg,0.05929,onion,0.043061
9,light,0.041753,juice,0.029086,extract,0.057533,red,0.042759


In [138]:
# 2013: per-cluster terms, ranked from highest to lowest centroid score.
cluster13.head(20)

Unnamed: 0,words_1,scores_1,words_2,scores_2,words_3,scores_3,words_4,scores_4
0,lime,0.039179,oil,0.072443,flour,0.159992,vanilla,0.109369
1,juice,0.0377,pepper,0.071373,sugar,0.107314,sugar,0.09999
2,lemon,0.03759,olive,0.060813,baking,0.10371,cream,0.098029
3,sugar,0.035358,fresh,0.05832,purpose,0.089207,chocolate,0.080549
4,milk,0.035222,garlic,0.057947,butter,0.08579,extract,0.079148
5,whole,0.033621,salt,0.055904,salt,0.063534,heavy,0.073032
6,fresh,0.032767,ground,0.046855,powder,0.062688,granulated,0.059695
7,honey,0.032075,black,0.042816,egg,0.061844,coconut,0.049718
8,butter,0.031493,red,0.039893,eggs,0.056712,butter,0.047107
9,orange,0.031379,cloves,0.039513,ground,0.056354,salt,0.046025


In [139]:
# 2014: per-cluster terms, ranked from highest to lowest centroid score.
# NOTE(review): the saved output for this cell has five clusters (words_5 /
# scores_5) while best_ks lists 4 for 2014, and this cell's execution count is
# older than the best_ks cell's — the output looks stale. Restart the kernel
# and run all cells before relying on it.
cluster14.head(20)

Unnamed: 0,words_1,scores_1,words_2,scores_2,words_3,scores_3,words_4,scores_4,words_5,scores_5
0,sauce,0.086889,lemon,0.042936,sugar,0.127348,flour,0.142466,pepper,0.085385
1,seeds,0.061352,juice,0.040783,chocolate,0.112523,sugar,0.1349,olive,0.076165
2,sesame,0.05814,syrup,0.038171,cream,0.104953,baking,0.10562,oil,0.075385
3,garlic,0.057483,coconut,0.034031,vanilla,0.089273,purpose,0.09593,salt,0.060097
4,soy,0.057031,orange,0.033296,extract,0.085751,butter,0.092944,garlic,0.05595
5,green,0.05669,sugar,0.032245,heavy,0.071076,unsalted,0.07163,fresh,0.054167
6,vinegar,0.05546,salt,0.02967,room,0.066844,cinnamon,0.067523,ground,0.052236
7,rice,0.054945,butter,0.028667,temperature,0.066809,ground,0.066665,black,0.049567
8,oil,0.051388,fresh,0.028087,egg,0.065075,powder,0.066592,red,0.046535
9,peeled,0.044477,milk,0.025008,butter,0.06001,salt,0.064512,leaves,0.044697


In [159]:
# 2015: per-cluster terms, ranked from highest to lowest centroid score.
cluster15.head(20)

Unnamed: 0,words_1,scores_1,words_2,scores_2,words_3,scores_3,words_4,scores_4
0,water,0.068408,cream,0.119556,sugar,0.147921,garlic,0.068972
1,sugar,0.053125,chocolate,0.107122,flour,0.131842,cloves,0.048381
2,juice,0.044758,heavy,0.090981,baking,0.119017,black,0.045731
3,lemon,0.038264,sugar,0.089614,purpose,0.097037,red,0.045575
4,flour,0.036362,vanilla,0.087467,vanilla,0.089994,sauce,0.043237
5,honey,0.034044,milk,0.08582,powder,0.082102,leaves,0.041273
6,dry,0.033516,extract,0.067089,unsalted,0.080332,onion,0.041232
7,yeast,0.029515,dark,0.054405,eggs,0.073188,vinegar,0.040953
8,unsalted,0.028686,syrup,0.052806,extract,0.072399,lemon,0.03687
9,orange,0.027915,chips,0.045881,egg,0.0617,juice,0.035596


In [160]:
# 2016: per-cluster terms, ranked from highest to lowest centroid score.
cluster16.head(20)

Unnamed: 0,words_1,scores_1,words_2,scores_2,words_3,scores_3,words_4,scores_4
0,garlic,0.04387,yeast,0.219715,sugar,0.115564,baking,0.173157
1,black,0.043791,active,0.172551,vanilla,0.089219,flour,0.139208
2,red,0.039219,flour,0.144659,extract,0.076698,sugar,0.127961
3,sauce,0.034712,dry,0.136313,egg,0.074748,powder,0.122932
4,juice,0.032991,water,0.123156,cream,0.064577,purpose,0.108862
5,onion,0.032334,warm,0.110928,milk,0.054444,soda,0.099206
6,diced,0.03225,sugar,0.09942,heavy,0.049197,vanilla,0.09347
7,cloves,0.031098,purpose,0.09825,coconut,0.044701,eggs,0.089039
8,white,0.029974,instant,0.091055,granulated,0.043165,extract,0.088016
9,water,0.028828,egg,0.070093,unsalted,0.041378,unsalted,0.085353


In [142]:
# 2017: per-cluster terms, ranked from highest to lowest centroid score.
cluster17.head(20)

Unnamed: 0,words_1,scores_1,words_2,scores_2,words_3,scores_3,words_4,scores_4
0,flour,0.155267,sugar,0.130912,lemon,0.063554,oil,0.075199
1,yeast,0.148507,flour,0.090733,juice,0.052176,pepper,0.062142
2,purpose,0.122122,vanilla,0.088781,fresh,0.046805,ground,0.059017
3,sugar,0.120026,baking,0.086857,slices,0.043412,garlic,0.058712
4,unsalted,0.114646,extract,0.082551,orange,0.038508,olive,0.053926
5,water,0.1135,butter,0.080337,salt,0.037935,salt,0.050971
6,butter,0.105519,powder,0.07784,sugar,0.035529,sauce,0.045849
7,active,0.088182,chocolate,0.07491,pepper,0.034085,onion,0.044366
8,instant,0.087013,unsalted,0.067494,syrup,0.03117,red,0.041873
9,salt,0.085732,eggs,0.064395,apple,0.030152,peeled,0.041148


In [143]:
# 2018: per-cluster terms, ranked from highest to lowest centroid score.
cluster18.head(20)

Unnamed: 0,words_1,scores_1,words_2,scores_2,words_3,scores_3,words_4,scores_4
0,slices,0.169013,sugar,0.208378,sauce,0.096734,freshly,0.099682
1,cheese,0.152466,egg,0.109962,soy,0.093347,ground,0.089155
2,pepper,0.098931,cream,0.096174,flakes,0.091562,black,0.074983
3,whole,0.094461,baking,0.09112,pepper,0.08744,pepper,0.073048
4,potato,0.093223,purpose,0.090167,fresh,0.083214,oil,0.072952
5,bread,0.085225,flour,0.089766,red,0.076114,chicken,0.071202
6,chips,0.082105,vanilla,0.089043,water,0.066891,kosher,0.068625
7,american,0.076862,powder,0.086098,crushed,0.06443,salt,0.062666
8,soft,0.076862,granulated,0.08404,vinegar,0.062029,white,0.060718
9,hamburger,0.076862,milk,0.084017,garlic,0.06177,olive,0.05566
