In [None]:
import re
import csv
import numpy as np
import pandas as pd
import inflect
import matplotlib.pyplot as plt

In [None]:
# Shared inflect engine; clean_ingredient() uses its singular_noun() method
# to turn plural ingredient names into their singular form.
engine = inflect.engine()

In [None]:
# Scratch probe: displays what inflect returns for a whitespace-only word.
# The result is not stored or used by any other cell.
engine.singular_noun(' ')

In [None]:
# Raw recipe dump, decoded as Latin-1. The handle is intentionally left open
# here because a later cell calls ingredients.readlines() on it.
ingredients = open("data/recipeClean/ingredients.txt", mode='r', encoding="ISO-8859-1")

# The csv module expects its file object to be opened with newline='' so it
# can interpret line endings (including those inside quoted fields) itself.
with open('data/to_remove.csv', 'r', newline='') as f:
    reader = csv.reader(f)
    # Flatten every cell of every row into one lower-cased list of words
    # that should be stripped from ingredient strings.
    to_remove = [item.lower() for row in reader for item in row]

In [None]:
# Read every line of the recipe dump, then release the file handle right away:
# the name `ingredients` is rebound to an array further down, after which the
# open file could no longer be closed explicitly.
try:
    content = ingredients.readlines()
finally:
    ingredients.close()

In [None]:
# Split each tab-separated line into a recipe id, a title and the raw
# ingredient string. One slot per input line, so the three arrays stay
# index-aligned with `content`.
n_lines = len(content)
ids = np.zeros(n_lines, dtype=object)
titles = np.zeros(n_lines, dtype=object)
ingredients = np.zeros(n_lines, dtype=object)
for row, raw_line in enumerate(content):
    fields = raw_line.split('\t')
    # Lines with more than four fields carry the title in field 3 and the
    # ingredients in field 4; shorter lines take the title from field 2 and
    # get an empty ingredient string.
    has_ingredients = len(fields) > 4
    ids[row] = fields[0]
    titles[row] = fields[3] if has_ingredients else fields[2]
    ingredients[row] = fields[4] if has_ingredients else ""

In [None]:
# A DataFrame built from a 1-D array labels its only column with the integer
# 0. The rename key therefore has to be the integer 0: a string key '0'
# matches nothing and rename() silently leaves the column unnamed.
titles_pd = pd.DataFrame(titles)
titles_pd = titles_pd.rename(columns={0: 'Titles'})
titles_pd.head(5)

In [None]:
# Wrap the raw ingredient strings (one per recipe line) in a single-column
# DataFrame.
ingredients_pd = pd.DataFrame(ingredients)

In [None]:
def split_ingredient(ingredient):
    """Break a pipe-delimited ingredient string into its individual entries.

    Returns a list of substrings; a string without '|' yields a one-element
    list containing the string itself.
    """
    pieces = ingredient.split(sep='|')
    return pieces

In [None]:
def clean_ingredient(ingredient):
    """Normalise one raw ingredient entry to a bare, singular ingredient name.

    The entry is passed through the remove_* helpers in a fixed order
    (parentheses, special characters, single-letter units, digits, text after
    the first comma, stop words, "-ed" words, alternatives), lower-cased,
    stripped of a leading dash / conjunction / extra spaces and finally
    singularised with inflect.

    Returns the cleaned string, which may be empty if nothing is left.
    """
    ingredient = remove_par(ingredient)
    ingredient = remove_special_char(ingredient)
    ingredient = remove_letter(ingredient)
    ingredient = remove_number(ingredient)
    ingredient = remove_whitespace_comma(ingredient)
    ingredient = remove_useless_words(ingredient)
    ingredient = remove_adjective(ingredient)
    ingredient = remove_alternative(ingredient)
    ingredient = ingredient.lower()
    ingredient = remove_minus(ingredient)
    ingredient = remove_conjonction(ingredient)
    ingredient = remove_space(ingredient)

    # Nothing left to singularise. Returning early also keeps empty strings
    # away from inflect; NOTE(review): some inflect releases appear to
    # validate that the word is non-empty and raise otherwise - confirm.
    if not ingredient:
        return ingredient

    # singular_noun() returns a falsy value when it does not singularise the
    # word; call it once and fall back to the cleaned text.
    singular = engine.singular_noun(ingredient)
    return singular if singular else ingredient
    
def remove_par(ingredient):
    """Drop every parenthesised remark that is preceded by a space.

    The leading space is removed together with the parentheses. A
    parenthetical at the very start of the string (no space in front) is left
    untouched.
    """
    parenthetical = re.compile(r" \([^)]*\)")
    return parenthetical.sub("", ingredient)

def remove_special_char(ingredient):
    """Delete slashes, newlines and the characters % : * . # from the text."""
    unwanted = re.compile(r'/|\n|%|:|\*|\.|#')
    return unwanted.sub('', ingredient)

def remove_letter(ingredient):
    """Replace stand-alone single letters (s, t/T, c, a, g) with one space.

    The patterns are applied in order. "s", "t"/"T" and "c" are matched when
    delimited by spaces or by the start/end of the string; "a" and "g" are
    only matched when followed by a space.
    """
    single_letter_patterns = (
        r'(\A| )s( |\Z)',
        r'(\A| )(t|T)( |\Z)',
        r'(\A| )c( |\Z)',
        r'(\A| )a ',
        r'(\A| )g ',
    )
    for pattern in single_letter_patterns:
        ingredient = re.sub(pattern, ' ', ingredient)
    return ingredient

def remove_number(ingredient):
    """Strip every run of digits from the text, leaving other characters as is."""
    digits = re.compile(r"\d+")
    return digits.sub('', ingredient)

def remove_whitespace_comma(ingredient):
    """Trim leading whitespace and keep only the text before the first comma."""
    trimmed = ingredient.lstrip()
    before_comma, _sep, _rest = trimmed.partition(',')
    return before_comma

def remove_useless_words(ingredient, words_to_remove=None):
    """Drop every space-separated word that appears in the stop-word list.

    The comparison is case-insensitive: the stop-word list is stored
    lower-cased, while this helper runs before the ingredient text itself is
    lower-cased, so capitalised stop words would otherwise slip through.

    Parameters
    ----------
    ingredient : str
        Ingredient text; words are separated by single spaces.
    words_to_remove : collection of str, optional
        Lower-cased words to drop. Defaults to the notebook-level
        ``to_remove`` list.

    Returns
    -------
    str
        The remaining words joined by single spaces, original casing kept.
    """
    if words_to_remove is None:
        words_to_remove = to_remove
    kept_words = [
        word for word in ingredient.split(' ')
        if word.lower() not in words_to_remove
    ]
    return ' '.join(kept_words)

def remove_adjective(ingredient):
    """Delete every word ending in "ed" together with the whitespace after it.

    A trailing "-ed" word with nothing after it is kept, because the pattern
    requires a following whitespace character.
    """
    ed_word = re.compile(r'\w+ed\s')
    return ed_word.sub('', ingredient)

def remove_alternative(ingredient):
    """Keep only the first option of an "X or Y" ingredient."""
    first_choice, _sep, _others = ingredient.partition(' or ')
    return first_choice

def remove_minus(ingredient):
    """Strip a single dash from the very start of the text, if present."""
    leading_dash = re.compile(r'\A-')
    return leading_dash.sub('', ingredient)

def remove_conjonction(ingredient):
    """Strip a leading "of ", "and " or "to " from the text."""
    leading_word = re.compile(r'\Aof |\Aand |\Ato ')
    return leading_word.sub('', ingredient)

def remove_space(ingredient):
    """Collapse runs of spaces to one space, then drop a leading space.

    A single trailing space, if any, is left in place.
    """
    collapsed = re.compile(r'( )+').sub(' ', ingredient)
    return re.compile(r'\A ').sub('', collapsed)

In [None]:
# Spot check: cleaned ingredient names of the first recipe.
[clean_ingredient(part) for part in split_ingredient(ingredients[0])]

In [None]:
#for ingredient in ingredients:
#    print(list(map(lambda x: clean_ingredient(x), split_ingredient(ingredient))))

In [None]:
# Disabled debugging loop kept as a string literal: it would print the cleaned
# and the original ingredients of every recipe side by side. Being a plain
# string expression, it executes nothing.
"""for i in range(len(ingredients)):
    print("cleaned: ")
    print(list(map(lambda x: clean_ingredient(x), split_ingredient(ingredients[i]))))
    print("original: ")
    print(ingredients[i])
    #a = split_ingredient(ingredients[i])
    #for b in a:  
    #    if('tb' in b):
    #        print(b)
    #        print('\n')"""

In [None]:
# Clean every recipe's ingredient list.
#
# The previous filter chained its tests with `or` ("x != '' or x != ..."),
# which is true for every value, so nothing was ever removed. Junk tokens are
# now dropped with a membership test.
#
# clean_recipes must stay index-aligned with `ids` and `titles` because the
# three are zipped together later, so exactly one entry is appended per
# recipe. A recipe with no usable ingredient is stored as [''], the sentinel
# the later filtering cell removes.
junk_tokens = ('', '\'', '\\n')
clean_recipes = []
for raw_recipe in ingredients:
    cleaned = [clean_ingredient(part) for part in split_ingredient(raw_recipe)]
    cleaned = [token for token in cleaned if token not in junk_tokens]
    clean_recipes.append(cleaned or [''])

In [None]:
# Number of cleaned recipes collected above.
len(clean_recipes)

In [None]:
# zip() silently truncates to the shortest input, which would pair recipes
# with the wrong id/title if the three sequences ever got out of step.
# Fail loudly instead.
assert len(ids) == len(titles) == len(clean_recipes), (
    "ids, titles and clean_recipes must have one entry per recipe"
)
ids_titles_recipes = list(zip(ids, titles, clean_recipes))

In [None]:
# Drop recipes whose ingredient list is just a single empty string.
ids_titles_recipes = list(
    filter(lambda entry: entry[2] != [''], ids_titles_recipes)
)

In [None]:
ids_clean = np.array([entry[0] for entry in ids_titles_recipes])
titles_clean = np.array([entry[1] for entry in ids_titles_recipes])

# Recipes are lists of different lengths. Handing such a ragged nested list to
# np.array() raises on recent NumPy releases (and would yield a 2-D array if
# every recipe happened to have the same length). Fill a 1-D object array
# element by element so each slot always holds one ingredient list.
recipes_clean = np.empty(len(ids_titles_recipes), dtype=object)
for row, entry in enumerate(ids_titles_recipes):
    recipes_clean[row] = entry[2]

In [None]:
# One row per recipe: its id, its title and its list of cleaned ingredients.
df = pd.DataFrame(
    data={
        'id': ids_clean,
        'title': titles_clean,
        'recipe': recipes_clean,
    }
)
df

In [None]:
# Persist the cleaned recipes; the path is relative to the working directory.
df.to_json("clean_cookies_recipes.json")

In [None]:
# Series of ingredient lists, one per recipe; reused by later cells.
df_test = df['recipe']

# Long format: expand each list into columns, stack them into one value per
# (recipe, position) pair, then keep only the value column (labelled 0).
df_recipe = (
    df_test
    .apply(pd.Series)
    .stack()
    .to_frame()
    .reset_index()
    [0]
    .to_frame()
)

In [None]:
# Display the long-format ingredient frame.
df_recipe

In [None]:
# Wide view: one row per recipe, one column per ingredient position.
recipes_list = [recipe for recipe in df_test.values]
b = pd.DataFrame(recipes_list)
b

In [None]:
# Give the single value column (labelled with the integer 0) a readable name.
df_recipe = df_recipe.rename(columns={0: 'ingredient'})

In [None]:
# Distinct ingredient names across all recipes, and how many there are.
df_unique = df_recipe['ingredient'].unique()
len(df_unique)

In [None]:
# Display the per-recipe ingredient lists.
df_test

In [None]:
# Flatten the per-recipe lists into one list of ingredient occurrences.
ls = [ingr for recipe in df_test for ingr in recipe]

In [None]:
# Series of every ingredient occurrence, ready for counting.
se = pd.Series(ls)

In [None]:
# Occurrence count per distinct ingredient, as a one-column frame indexed by
# ingredient name.
count_table = se.value_counts().to_frame()
#count_table[count_table[0] > 10]
# Histogram of those counts.
count_table.hist(bins=500, figsize=(15, 8))
plt.show()

In [None]:
# Compare counts of singular/plural spellings to check whether
# singularisation merged them.
a = count_table.reset_index()
names_to_check = ['cucumbers', 'cucumber', 'tomato', 'tomatoes', 'eggs', 'egg']
for position, name in enumerate(names_to_check):
    # Blank separator between consecutive results, none after the last one.
    if position:
        print('\n')
    print(a[a['index'] == name])

In [None]:
# Plot the ingredient counts on a logarithmic y axis.
count_table.plot(logy=True)
plt.show()