In [1]:
import pandas
import collections # for Counter
import time

Counter is a type of dictionary where elements are stored as keys and their counts are stored as values.  
https://docs.python.org/2/library/collections.html#collections.Counter

Let's define some functions we'll need.

In [2]:
def fix_string_in_list(li):
    """Return a new list holding each string of li with its spaces turned into underscores; li itself is not changed."""
    underscored = []
    for text in li:
        underscored.append(text.replace(' ', '_'))
    return underscored

In [3]:
def concatenate_lists(list_container):
    """Flatten one level: return a single new list with the items of every list in list_container, in order."""
    return [item for item_list in list_container for item in item_list]

In [4]:
def count_items(li):
    """Count the number of occurrences of each item in li.

    Parameters:
        li: an iterable of hashable items (a list of ingredient names in this notebook).

    Returns:
        A collections.Counter mapping each distinct item to how many times it was seen.
    """
    # Let Counter do the tallying instead of a hand-written loop.
    # iter() makes Counter treat li purely as a stream of items: handed a mapping
    # directly, Counter would copy its values as counts instead of counting its keys.
    return collections.Counter(iter(li))

In [5]:
def remove_singles(cntr):
    """Delete, in place, every entry of cntr whose count is exactly one and return that same counter object."""
    # Gather the keys first so the counter is never resized while it is being iterated.
    once_only = [key for key, tally in cntr.items() if tally == 1]
    for key in once_only:
        del cntr[key]
    return cntr

In [6]:
def most_common_items(cntr, n):
    """Return just the items (without their counts) of the n highest-count entries of the counter, most frequent first."""
    ranked_pairs = cntr.most_common(n)
    top_items = []
    for pair in ranked_pairs:
        top_items.append(pair[0])
    return top_items

The approach for "featurizing" the ingredients column is taken from https://datascience.stackexchange.com/questions/11797/split-a-list-of-values-into-columns-of-a-dataframe

In [7]:
def value2attribute(df, value_list, source_col):
    """Add one binary (0/1) column per value, flagging rows whose source_col cell contains that value.

    Parameters:
        df: dataframe to featurize; it is left unmodified.
        value_list: values to turn into columns. Repeated values are collapsed into a
            single column, and the first occurrence decides the column order.
        source_col: name of the column whose cells are tested with `value in cell`.

    Returns:
        A copy of df with one extra integer column per distinct value, holding 1 where the
        value is contained in that row's source_col cell and 0 otherwise.
    """
    # Repeated names would otherwise become duplicate column labels, so drop
    # repeats while keeping first-seen order.
    unique_values = list(dict.fromkeys(value_list))
    new_df = df.copy()
    # Snapshot the source cells once: the membership test only needs this one
    # column, so there is no reason to build a row object for every row and value.
    source_cells = list(new_df[source_col])
    for value in unique_values:
        new_df[value] = [int(value in cell) for cell in source_cells]
    return new_df

First let's load in the file.

In [8]:
# Parse the records-oriented JSON file 'train.json' into a DataFrame (typ='frame').
# The path is relative, so the file must sit in the kernel's working directory.
recipe_data = pandas.read_json('train.json', orient='records', typ='frame')

Let's replace any spaces in ingredient names with underscores to make things a little easier later.

In [9]:
# Overwrites the 'ingredients' column with the underscored version of each cell.
# Assumes every cell is a list of ingredient strings — TODO confirm against train.json.
recipe_data['ingredients'] = recipe_data['ingredients'].apply(fix_string_in_list)

Make a list of every ingredient in every recipe.

In [10]:
# One flat list built from every recipe's ingredient list; repeats are kept,
# which is what makes the counting in the next step meaningful.
all_ingredients = concatenate_lists(recipe_data['ingredients'])

Now let's count the number of times each ingredient occurs.

In [11]:
# Counter mapping each ingredient name to how many times it appears in all_ingredients.
ingrd_cnt = count_items(all_ingredients)

Ingredients only appearing once may not be useful, so we can try removing them.

In [12]:
# remove_singles edits the Counter it is given and returns that same object,
# so from here on ingrd_cnt no longer contains ingredients that occur only once.
ingrd_cnt = remove_singles(ingrd_cnt)

Here we'll take a number, num_els, of the most common ingredients.


In [13]:
# num_els is also the slice bound used by the least-common cell further down,
# so changing it here changes both selections.
num_els = 20
# Names of the num_els most frequent ingredients, most frequent first.
most_common = most_common_items(ingrd_cnt, num_els)

If we think the original ~40k rows is too much, we could randomly sample a fraction of the data.  
random_state is a random seed to make sure that we get the same set every time we run it.  
https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.sample.html#pandas.DataFrame.sample

In [14]:
#smaller_recipe_data = recipe_data.sample(frac=0.5, random_state=42)
#smaller_recipe_data = smaller_recipe_data.reset_index(drop=True)

Create a new dataframe with the most common ingredients 'featurized'. Weka has trouble loading this csv with the ingredients column, so let's drop it - we won't need it for learning anyway.  Then we'll write it to a CSV file.

In [15]:
# Featurize the most common ingredients, then discard the raw list column
# before the frame is written out.
new_recipe_data = (
    value2attribute(recipe_data, most_common, 'ingredients')
    .drop(columns='ingredients')
)
new_recipe_data.to_csv('top20-a.csv', index=False)

Now we will randomly select 20 ingredients instead of choosing the 20 most common.  To do so, we'll convert the list of all ingredients to a pandas series so that we can sample it using a random seed (to make sure that we get the same set every time we run it).  We then convert the resulting series of 20 pseudorandom ingredients back to a list.

In [16]:
rand_len = 20
# all_ingredients still contains every repeat, so this draws ingredient *occurrences*:
# frequent ingredients are more likely to be picked and the same name can be drawn more than once.
# NOTE(review): if rand_len distinct ingredients are wanted, sample from the unique names instead — confirm intent.
all_ingredients_df = pandas.Series(all_ingredients)  # a Series, despite the _df suffix
# The fixed random_state makes the draw repeatable for the same input data.
rand_ingrd_df = all_ingredients_df.sample(n=rand_len, random_state=42)
rand_ingrd_list = rand_ingrd_df.tolist()

We'll restrict our data to this random set of ingredients.

In [17]:
# Featurize the randomly drawn ingredients, then discard the raw list column
# before the frame is written out.
rand_recipe_data = (
    value2attribute(recipe_data, rand_ingrd_list, 'ingredients')
    .drop(columns='ingredients')
)
rand_recipe_data.to_csv('rand20-a.csv', index=False)

Maybe we can get better results from the WEKA algorithms if we combine the ingredient columns used for new_recipe_data and rand_recipe_data into a single dataframe and write the resulting dataframe to a csv.

In [18]:
# Union of the two ingredient selections with duplicates removed. The result is
# sorted because converting a set of strings straight to a list gives an order
# that can differ from one interpreter run to the next, which would shuffle the
# CSV's columns between otherwise identical runs.
new_ingrd_list = sorted(set(most_common) | set(rand_ingrd_list))
new_and_rand_recipe_data = value2attribute(recipe_data, new_ingrd_list, 'ingredients')
new_and_rand_recipe_data = new_and_rand_recipe_data.drop(columns='ingredients')
new_and_rand_recipe_data.to_csv('top20andrand20-a.csv', index=False)

Let's find the least commonly occurring ingredients (excluding those that only appear once.)

In [19]:
# most_common() with no argument returns every (ingredient, count) pair ordered from most
# to least frequent; the [:-num_els-1:-1] slice walks that list backwards from its end,
# giving the num_els rarest ingredients, rarest first. Ingredients that occur only once
# are not candidates because they were already removed from ingrd_cnt.
least_common = [ingrd for ingrd,cnt in ingrd_cnt.most_common()[:-num_els-1:-1]]
least_data = value2attribute(recipe_data, least_common, 'ingredients')
least_data = least_data.drop(columns='ingredients')

Combine the most common and least common (excluding those appearing only once) ingredients and write to CSV file.

In [20]:
# Both frames were derived from recipe_data, so matching rows on ('cuisine', 'id') lines the
# most-common and least-common feature columns up side by side.
# NOTE(review): assumes 'cuisine' and 'id' are the only columns the two frames share and that
# 'id' is unique per recipe; otherwise merge adds _x/_y suffixes or multiplies rows — confirm.
most_least_data = pandas.merge(new_recipe_data, least_data, how='inner', on=['cuisine', 'id'])
most_least_data.to_csv('most_and_least.csv', index=False)

What if we group by cuisine and take top *n* from each type?

In [21]:
cuisine_groups = recipe_data.groupby(by='cuisine')
# Iterating a groupby object yields (group key, sub-dataframe) pairs; unpack them
# by name rather than indexing the tuple.
cuisine_ingrds = {}
for cuisine, cuisine_recipes in cuisine_groups:
    items = concatenate_lists(cuisine_recipes['ingredients'])
    ingrds = count_items(items)
    top5 = most_common_items(ingrds, 5)
    cuisine_ingrds[cuisine] = top5
# Now combine them into one list with no duplicates that we can 'featurize'.
# The union is sorted because turning a set of strings straight into a list gives
# an order that can differ from one interpreter run to the next, which would
# shuffle the CSV's columns between otherwise identical runs.
top_from_each = sorted(set().union(*cuisine_ingrds.values()))
from_each_data = value2attribute(recipe_data, top_from_each, 'ingredients')
# Unlike the other exports in this notebook, this one drops 'id' as well as 'ingredients'.
from_each_data = from_each_data.drop(columns=['id', 'ingredients'])
from_each_data.to_csv('top_from_each.csv', index=False)

Some ingredients, e.g. salt, appear in almost every type of cuisine.  Maybe it's not very useful to include ingredients that are so common.  What if we took the top *n* ingredients from each type, combined them and then subtracted the *m* ingredients that are most common across all types of cuisine?