In [148]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import requests
import psycopg2
import scraper
import process_words
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer


In [103]:
df.head()

Unnamed: 0,id,post_date,title,foods
0,27,2018-02-09,Long Life Noodles with Shrimp and Greens,"teaspoon sesame oil, for drizzling stalk green..."
1,28,2018-02-09,Ginger-Onion Whole Steamed Fish,"stalks green onions, cut into 3-inch segments,..."
2,29,2018-02-09,Smacked Cucumber,"teaspoon sugar cloves garlic, crushed tablespo..."
3,30,2018-02-08,Neck Bones and Lima Beans,"salt and pepper, to taste tablespoons canola o..."
4,31,2018-02-07,Angel Wings (Faworki),pinch sea salt confectioner’s sugar for servin...


In [136]:
food_stems = [process_words.clean_one_doc(doc) for doc in df.foods]
title_stems = [process_words.clean_one_doc(doc) for doc in df.title]
df['food_stems'] = food_stems
df['title_stems'] = title_stems

In [137]:
df.head()

Unnamed: 0,id,post_date,title,foods,food_stems,title_stems
0,27,2018-02-09,Long Life Noodles with Shrimp and Greens,"teaspoon sesame oil, for drizzling stalk green...","[sesame, oil, drizzling, stalk, green, onion, ...","[long, life, noodles, shrimp, greens]"
1,28,2018-02-09,Ginger-Onion Whole Steamed Fish,"stalks green onions, cut into 3-inch segments,...","[stalks, green, onions, segments, kosher, salt...","[ginger, onion, whole, steamed, fish]"
2,29,2018-02-09,Smacked Cucumber,"teaspoon sugar cloves garlic, crushed tablespo...","[sugar, cloves, garlic, crushed, soy, sauce, r...","[smacked, cucumber]"
3,30,2018-02-08,Neck Bones and Lima Beans,"salt and pepper, to taste tablespoons canola o...","[salt, pepper, canola, oil, ground, sage, froz...","[neck, bones, lima, beans]"
4,31,2018-02-07,Angel Wings (Faworki),pinch sea salt confectioner’s sugar for servin...,"[sea, salt, confectioners, sugar, sour, cream,...","[angel, wings, faworki]"


In [138]:
food_vocab = list(set([word for doc in food_stems for word in doc]))
title_vocab = list(set([word for doc in title_stems for word in doc]))

In [143]:
food_vocab_dict = {word: i for i, word in enumerate(food_vocab)}
title_vocab_dict = {word: i for i, word in enumerate(title_vocab)}

In [145]:
food_counts = np.zeros((len(food_stems), len(food_vocab)))
for doc_id, words in enumerate(food_stems):
    for word in words:
        word_id = food_vocab_dict[word]
        food_counts[doc_id][word_id] += 1

In [177]:
vectorizer = TfidfVectorizer(stop_words='english', tokenizer=process_words.clean_one_doc)
vectors = vectorizer.fit_transform(df.foods).toarray()
words = vectorizer.get_feature_names()

In [181]:
def get_top_values(lst, n, labels):
    '''
    INPUT: LIST, INTEGER, LIST
    OUTPUT: LIST

    Given a list of values, find the indices with the highest n values.
    Return the labels for each of these indices.

    e.g.
    lst = [7, 3, 2, 4, 1]
    n = 2
    labels = ["cat", "dog", "mouse", "pig", "rabbit"]
    output: ["cat", "pig"]
    '''
    return [labels[i] for i in np.argsort(lst)[-1:-n-1:-1]]

In [183]:
avg = np.sum(vectors, axis=0) / np.sum(vectors > 0, axis=0)
print ("top 10 by average tf-idf")
print (get_top_values(avg, 10, words))

top 10 by average tf-idf
['montasio', 'papad', 'grapefuit', 'coconuts', 'kahla', 'chicharrones', 'passionfruit', 'jamn', 'kapika', 'caramels']


In [184]:
total = np.sum(vectors, axis=0)
print ("top 10 by total tf-idf")
print (get_top_values(total, 10, words))

top 10 by total tf-idf
['sugar', 'salt', 'oil', 'fresh', 'butter', 'pepper', 'flour', 'ground', 'olive', 'lemon']


In [185]:
vectors.shape

(4858, 5383)

In [187]:
len(words)

5383

In [193]:
len(vectors[:1][0])

5383

In [199]:
i = np.argsort(avg)[-1:-10-1:-1]
avg[i]

array([ 1.        ,  0.91981864,  0.91795628,  0.88133154,  0.77065948,
        0.73876459,  0.72868118,  0.72657354,  0.69608586,  0.68041406])

In [201]:
[words[i] for i in i]

['montasio',
 'papad',
 'grapefuit',
 'coconuts',
 'kahla',
 'chicharrones',
 'passionfruit',
 'jamn',
 'kapika',
 'caramels']

In [202]:
df.head()

Unnamed: 0,id,post_date,title,foods,food_stems,title_stems
0,27,2018-02-09,Long Life Noodles with Shrimp and Greens,"teaspoon sesame oil, for drizzling stalk green...","[sesame, oil, drizzling, stalk, green, onion, ...","[long, life, noodles, shrimp, greens]"
1,28,2018-02-09,Ginger-Onion Whole Steamed Fish,"stalks green onions, cut into 3-inch segments,...","[stalks, green, onions, segments, kosher, salt...","[ginger, onion, whole, steamed, fish]"
2,29,2018-02-09,Smacked Cucumber,"teaspoon sugar cloves garlic, crushed tablespo...","[sugar, cloves, garlic, crushed, soy, sauce, r...","[smacked, cucumber]"
3,30,2018-02-08,Neck Bones and Lima Beans,"salt and pepper, to taste tablespoons canola o...","[salt, pepper, canola, oil, ground, sage, froz...","[neck, bones, lima, beans]"
4,31,2018-02-07,Angel Wings (Faworki),pinch sea salt confectioner’s sugar for servin...,"[sea, salt, confectioners, sugar, sour, cream,...","[angel, wings, faworki]"


In [250]:
years = set(date.year for date in df.post_date)
#food_yrs = np.zeros(len(years))
for i, yr in enumerate(years):
    food_yrs = df[df['post_date'].dt.year.values == yr]['food_stems'].values

In [251]:
food_yrs

array([ ['unsalted', 'butter', 'cinnamon', 'angostura', 'bitters', 'salt', 'popped', 'popcorn', 'granulated', 'sugar'],
       ['dill', 'pickles', 'cooked', 'dark', 'meat', 'chicken', 'sized', 'potatoes', 'eggs', 'carrot', 'vinegar', 'based', 'hot', 'sauce', 'tobasco', 'tinned', 'green', 'peas', 'sized', 'onion'],
       ['flour', 'onion', 'powder', 'garlic', 'powder', 'paprika', 'cayenne', 'salt', 'sprinkling', 'canola', 'oil', 'frying', 'russet', 'potatoes'],
       ['lime', 'juice', 'sea', 'salt', 'coconut', 'water', 'cashews', 'soaked', 'overnight', 'drained', 'ripe', 'bananas'],
       ['olive', 'oil', 'butter', 'arborio', 'rice', 'milliliters', 'good', 'vegetable', 'stock', 'saffron', 'onion', 'parmesan', 'salt', 'pepper', 'milliliters', 'white', 'wine', 'lemon'],
       ['handful', 'flat', 'leaf', 'parsley', 'celery', 'stalk', 'salt', 'water', 'carrot', 'onion', 'olive', 'oil', 'bay', 'leaf', 'sprig', 'rosemary', 'brown', 'lentils'],
       ['packed', 'bonito', 'flakes', 'katsuo

In [246]:
set(date.year for date in df.post_date)

{2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018}

In [244]:
df[df['post_date'].dt.year.values == 2009]['food_stems'].values.shape

(225,)

In [236]:
df['post_date'].dt.year.values == 2018

array([ True,  True,  True, ..., False, False, False], dtype=bool)