In [90]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
import copy as cp
import math

### Create TF-IDF matrix


https://www.geeksforgeeks.org/tf-idf-model-for-page-ranking/

1. Create a matrix of TF, term frequency (this is same as bag of words) --->   "binaryIngredientsMat"
2. Normalize the TF matrix ---> "binaryIngredientsMatNorm"
3. Create IDF for each ingredient ---> "idf"
4. Multiply TF * IDF to obtain the TF-IDF matrix ---> "tf_idf"

The values in " " are the variable names in this script

### 1. Load Data + Create TF
Here, TF = term frequency. This is the same as Bag of Words, or Binary Matrix generated from Mingyu's BMF script: 

In [20]:
# Load the recipes from train.json (path is relative to the working directory).
# Later cells read the 'ingredients' column, which holds a list of ingredient names per recipe.
data = pd.read_json('train.json')

In [71]:
# Pull the per-recipe ingredient lists out of the frame once. Iterating over the
# column directly (instead of data.loc[i, 'ingredients'] for i in range(n)) avoids a
# label lookup per row and does not depend on `data` having a default 0..n-1 index.
recipeIngredients = list(data['ingredients'])

# get the sorted list of distinct ingredients across all recipes
ingredientList = sorted({ingredient for ingredients in recipeIngredients for ingredient in ingredients})
# the mapping between ingredient and its column index
ingredient2index = {ingredient: index for index, ingredient in enumerate(ingredientList)}
# create a binary matrix indicating whether or not an ingredient is in a recipe
# (rows = recipes in file order, columns = ingredients in sorted order)
binaryIngredientsMat = np.zeros((len(recipeIngredients), len(ingredientList)))
for iRecipe, ingredients in enumerate(recipeIngredients):
    binaryIngredientsMat[iRecipe, [ingredient2index[ingredient] for ingredient in ingredients]] = 1

dataBinaryIngredients = pd.DataFrame(binaryIngredientsMat, columns=ingredientList)
dataBinaryIngredients.head()

Unnamed: 0,( oz.) tomato sauce,( oz.) tomato paste,(10 oz.) frozen chopped spinach,"(10 oz.) frozen chopped spinach, thawed and squeezed dry",(14 oz.) sweetened condensed milk,(14.5 oz.) diced tomatoes,(15 oz.) refried beans,1% low-fat buttermilk,1% low-fat chocolate milk,1% low-fat cottage cheese,...,yukon gold potatoes,yuzu,yuzu juice,za'atar,zest,zesty italian dressing,zinfandel,ziti,zucchini,zucchini blossoms
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### 2. Normalize TF Matrix
* For each ingredient in each recipe, divide by the total number of ingredients in that recipe. (Normalize by total number of 'words' in 'document')

* Another way to think of this: for each row, for each ingredient, divide by the total number of ingredients for that row.

In [107]:
%%time

n_ingredients = dataBinaryIngredients.shape[1]
n_recipes = dataBinaryIngredients.shape[0]

# Number of ingredients in each recipe, kept as an (n_recipes, 1) column so that it
# broadcasts across every ingredient column of the matrix.
row_sums = binaryIngredientsMat.sum(axis=1, keepdims=True)

# Normalised TF: every row is divided by its own ingredient count, in one vectorised
# operation instead of a Python loop over rows. A recipe with no ingredients has a
# row sum of 0; its row is left as all zeros rather than NaN (0/0), which would
# otherwise propagate into the TF-IDF matrix.
binaryIngredientsMatNorm = np.divide(
    binaryIngredientsMat,
    row_sums,
    out=np.zeros_like(binaryIngredientsMat),
    where=row_sums != 0,
)

CPU times: user 1.52 s, sys: 740 ms, total: 2.26 s
Wall time: 1.2 s


### 3. Create IDF
Inverse Document Frequency is a measure of how common that term (or ingredient) is in all documents. 

IDF is a vector of ingredients (not a matrix!)

IDF(ingredient) = log( N / df(t))
Where: 
* N = total number of recipes
* df(t) = total number of recipes that contain the ingredient
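* We take the log to dampen the ratio: N / df(t) becomes very large for rare ingredients, and the log compresses it while preserving the ordering (log is monotonically increasing)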

High IDF score means the ingredient is less frequently used in all recipes, and thus might have more value.

Low IDF score means the ingredient is more frequently used in all recipes, and thus might have less value. 

In [108]:
# idf(t) = log( N / df(t) )
# idf(ingredient) = log( N_recipes / N_recipes_with_that_ingredient )

# Document frequency: the number of recipes containing each ingredient, i.e. the
# column sums of the 0/1 matrix (one value per ingredient).
n_recipes_with_ingredient = binaryIngredientsMat.sum(axis=0)

# Each ingredient gets one IDF value. Computed for all ingredients at once rather
# than looping over columns in Python.
idf = np.log(n_recipes / n_recipes_with_ingredient)
print(idf)

[8.19307344 8.39374413 9.49235642 ... 7.22367288 3.79750258 9.89782153]


### 4. Create TF-IDF matrix
For each ingredient column, multiply the TF * IDF(ingredient): 

binaryIngredientsMatNorm[:,ingredient] * idf[ingredient]

In [111]:
# Calculate TF-IDF for every term.
# `idf` holds one value per ingredient column, so multiplying the normalised TF matrix
# by it broadcasts the vector across rows: column j is scaled by idf[j], exactly the
# product an explicit per-column loop would compute. The result is a new array; the
# normalised TF matrix is left untouched.
tf_idf_mat = binaryIngredientsMatNorm * idf
        

In [120]:
# Wrap the TF-IDF matrix in a pandas DataFrame labelled by ingredient name, and save it.
import pickle  # not referenced in this cell; DataFrame.to_pickle does the serialisation

# Allow wide previews to show up to 200 columns.
pd.set_option("display.max_columns", 200)

tf_idf = pd.DataFrame(data=tf_idf_mat, columns=ingredientList)
tf_idf.to_pickle("tf_idf.pkl")

In [118]:
# Check results: for the ingredient 'salt', recipes with more ingredients should get a
# lower tf-idf value than recipes with fewer ingredients. This is indeed the case:
# recipe 3 has only 4 ingredients, so its 'salt' value is the highest at ~0.20, whereas
# recipes 1, 2 and 4 have many more ingredients, so their 'salt' values are much lower
# (~0.04-0.07). Recipe 0 does not contain salt, so its value is 0.
tf_idf['salt'].head()

0    0.000000
1    0.071834
2    0.065848
3    0.197545
4    0.039509
Name: salt, dtype: float64

In [119]:
# Show the raw ingredient lists of the first five recipes, to compare against the 'salt' values above.
data['ingredients'].head()

0    [romaine lettuce, black olives, grape tomatoes...
1    [plain flour, ground pepper, salt, tomatoes, g...
2    [eggs, pepper, salt, mayonaise, cooking oil, g...
3                  [water, vegetable oil, wheat, salt]
4    [black pepper, shallots, cornflour, cayenne pe...
Name: ingredients, dtype: object