# Word frequency analysis on the evaluation database (currently the BBC database)

Importing libraries and a premade Recipes class

In [1]:
import pandas as pd
from recipes import Recipes

creating recipes object and importing data

In [2]:
# Create the Recipes object (class imported from the local `recipes` module)
# that every later cell in this notebook works on.
cuukin = Recipes()

In [3]:
# Load the listings and the evaluation data from their folders
# (paths are passed as given; presumably relative to the working directory).
cuukin.import_data(
    listings_folder='listings',
    evaluation_folder='evaluation',
)

checking tables for anomalies

In [4]:
# Show the first three rows of the ingredients listing to eyeball its columns.
cuukin.listings['ingredients'].head(3)

Unnamed: 0_level_0,name,created_at,updated_at,badge_id,score_1,score_2,score_3,cupboard
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
191,Smoked salmon,2021-04-26 10:52:44.769000+00:00,2021-05-21 21:23:09.963000+00:00,13,5,1,0,False
190,Smoked mackerel,2021-04-26 10:52:10.914000+00:00,2021-05-21 21:23:09.957000+00:00,13,5,1,0,False
189,Peanut butter,2021-04-26 10:48:18.280000+00:00,2021-05-21 21:23:09.951000+00:00,17,5,1,0,False


In [5]:
# Show the first three recipe method rows. Note in the output that the index
# and recipe_id are displayed as floats (0.0, 1.0, ...) at this point.
cuukin.evaluation_data['recipe_methods'].head(3)

Unnamed: 0_level_0,recipe_id,description
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,0.0,Heat the oil in a frying pan and gently fry th...
1.0,0.0,"Add the chickpeas, harissa and tomatoes and co..."
2.0,0.0,Add a squeeze of lemon juice and season with s...


Preprocessing data

In [6]:
# Drop rows containing missing values, then cast both the index and the
# recipe_id column to integers (they are displayed as floats such as 0.0
# before this step).
# A new frame is built instead of mutating with inplace=True, so the stored
# table is only replaced once every step has succeeded.
recipe_methods = cuukin.evaluation_data['recipe_methods'].dropna(axis='index')
recipe_methods.index = recipe_methods.index.astype('int')
cuukin.evaluation_data['recipe_methods'] = recipe_methods.astype({'recipe_id': 'int'})
cuukin.evaluation_data['recipe_methods'].head(3)

Unnamed: 0_level_0,recipe_id,description
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0,Heat the oil in a frying pan and gently fry th...
1,0,"Add the chickpeas, harissa and tomatoes and co..."
2,0,Add a squeeze of lemon juice and season with s...


Running word frequency analysis (can take some minutes)

In [7]:
# Long-running step (minutes). The result is read from
# cuukin.word_distribution in the following cell -- presumably populated
# by this call; confirm in recipes.py.
cuukin.word_frequency_analysis()

In [8]:
# First ten rows of the word distribution; in the output they appear in
# descending order of frequency.
cuukin.word_distribution.head(10)

Unnamed: 0_level_0,frequency,relative_frequency
words,Unnamed: 1_level_1,Unnamed: 2_level_1
minute,15791,3.144365
add,14922,2.971326
heat,10553,2.101354
�,10001,1.991438
cook,9668,1.925129
pan,8024,1.59777
stir,7522,1.49781
bowl,6758,1.345679
oil,6647,1.323576
serve,6166,1.227798


Transforming the listings into sets of lemmatised tokens for faster membership tests

In [9]:
from recipes import lemmatize
from operator import add
from functools import reduce
from itertools import chain


def lemma_vocabulary(names):
    """Lemmatise every distinct name and flatten the results into a set.

    Parameters
    ----------
    names : iterable of str
        Listing names (duplicates are removed before lemmatising).

    Returns
    -------
    tuple (list, set)
        The per-name results of ``lemmatize`` and the set of all items they
        contain, which gives constant-time membership tests.
    """
    lemma_lists = [lemmatize(name) for name in set(names)]
    # chain.from_iterable flattens in linear time and, unlike
    # reduce(add, ...) without an initial value, also works when the
    # listing is empty.
    return lemma_lists, set(chain.from_iterable(lemma_lists))


ingredients_list, ingredients_set = lemma_vocabulary(cuukin.listings['ingredients']['name'])
tools_list, tools_set = lemma_vocabulary(cuukin.listings['tools']['name'])
techniques_list, techniques_set = lemma_vocabulary(cuukin.listings['techniques']['name'])


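Automatic tagging function for each word. A word receives the tag of the first listing set that contains it, checked in the order ingredients, tools, techniques; words found in none of them are tagged 'others'.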

In [10]:
# adding easy tags in the word frequency distribution
def classify_word(words, ingredients=None, tools=None, techniques=None):
    """Tag each word as 'ingredients', 'tools', 'techniques' or 'others'.

    Parameters
    ----------
    words : iterable
        Entries to classify. Each entry is either a plain string or a
        sequence (e.g. a 1-tuple from the word distribution index) whose
        first element is the word.
    ingredients, tools, techniques : container of str, optional
        Vocabularies to match against. When omitted, the notebook-level
        ``ingredients_set``, ``tools_set`` and ``techniques_set`` are used.

    Returns
    -------
    list of str
        One tag per entry, in input order. When a word belongs to several
        vocabularies the first match wins, in the order ingredients, tools,
        techniques.
    """
    # Fall back to the notebook globals only when no explicit vocabulary is
    # given, so existing calls classify_word(index) keep working.
    if ingredients is None:
        ingredients = ingredients_set
    if tools is None:
        tools = tools_set
    if techniques is None:
        techniques = techniques_set

    vocabularies = (
        ('ingredients', ingredients),
        ('tools', tools),
        ('techniques', techniques),
    )

    results = []
    for word in words:
        # Index entries may be tuples; a plain string must be used whole,
        # otherwise word[0] would reduce it to its first character.
        token = word if isinstance(word, str) else word[0]
        for tag, vocabulary in vocabularies:
            if token in vocabulary:
                results.append(tag)
                break
        else:
            results.append('others')
    return results


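Printing and checking generated tables. Some tags in the outputs below look questionable (e.g. 'bake' and 'brown' as ingredients, 'fry' and 'cut' as tools, 'water' and 'plate' as techniques); this is presumably because multi-word listing names are split into individual tokens before matching, so the tables should be reviewed manually.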

In [16]:
# Tag every word of the distribution and store the tags in a new 'type' column.
word_types = classify_word(cuukin.word_distribution.index)
cuukin.word_distribution = cuukin.word_distribution.assign(type=word_types)
cuukin.word_distribution.head(30)

Unnamed: 0_level_0,frequency,relative_frequency,type
words,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
minute,15791,3.144365,others
add,14922,2.971326,others
heat,10553,2.101354,others
�,10001,1.991438,others
cook,9668,1.925129,others
pan,8024,1.59777,tools
stir,7522,1.49781,techniques
bowl,6758,1.345679,tools
oil,6647,1.323576,ingredients
serve,6166,1.227798,others


In [12]:
# First ten rows of the distribution that are tagged as ingredients.
cuukin.word_distribution.loc[lambda dist: dist['type'] == 'ingredients'].head(10)

Unnamed: 0_level_0,frequency,relative_frequency,type
words,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
oil,6647,1.323576,ingredients
salt,5886,1.172043,ingredients
pepper,4518,0.899642,ingredients
bake,3807,0.758065,ingredients
sauce,3666,0.729988,ingredients
butter,3661,0.728992,ingredients
onion,3624,0.721625,ingredients
brown,3504,0.69773,ingredients
egg,3501,0.697133,ingredients
sugar,3500,0.696933,ingredients


In [13]:
# First ten rows of the distribution that are tagged as tools.
cuukin.word_distribution.loc[lambda dist: dist['type'] == 'tools'].head(10)

Unnamed: 0_level_0,frequency,relative_frequency,type
words,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
pan,8024,1.59777,tools
bowl,6758,1.345679,tools
oven,5777,1.150339,tools
fry,4900,0.975707,tools
spoon,2828,0.563122,tools
saucepan,2644,0.526483,tools
tray,2283,0.4546,tools
whisk,2239,0.445838,tools
cut,1853,0.368977,tools
dish,1731,0.344683,tools


In [17]:
# First twenty rows of the distribution that are tagged as techniques.
cuukin.word_distribution.loc[lambda dist: dist['type'] == 'techniques'].head(20)

Unnamed: 0_level_0,frequency,relative_frequency,type
words,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
stir,7522,1.49781,techniques
water,5418,1.078853,techniques
mix,4244,0.845082,techniques
season,3502,0.697332,techniques
boil,2926,0.582636,techniques
simmer,2855,0.568499,techniques
plate,1720,0.342493,techniques
reduce,1493,0.297292,techniques
frying,1175,0.233971,techniques
blend,901,0.179411,techniques


Exporting everything to csv

In [15]:
from pathlib import Path

# to_csv does not create missing folders, so make sure the target exists
# before writing anything.
output_dir = Path('word_frequency_analysis')
output_dir.mkdir(parents=True, exist_ok=True)

# Full table first, then one file per tag (same file names as before).
cuukin.word_distribution.to_csv(output_dir / 'word_frequency.csv')
for word_type in ('ingredients', 'tools', 'techniques'):
    tagged_words = cuukin.word_distribution[cuukin.word_distribution['type'] == word_type]
    tagged_words.to_csv(output_dir / 'word_frequency_{}.csv'.format(word_type))