In [1]:
import pandas
import collections # for Counter
import time

Counter is a type of dictionary where elements are stored as keys and their counts are stored as values.  
https://docs.python.org/2/library/collections.html#collections.Counter

In [2]:
recipe_data = pandas.read_json('train.json', orient='records', typ='frame')

In [3]:
recipe_data.head()

Unnamed: 0,cuisine,id,ingredients
0,greek,10259,"[romaine lettuce, black olives, grape tomatoes..."
1,southern_us,25693,"[plain flour, ground pepper, salt, tomatoes, g..."
2,filipino,20130,"[eggs, pepper, salt, mayonaise, cooking oil, g..."
3,indian,22213,"[water, vegetable oil, wheat, salt]"
4,indian,13162,"[black pepper, shallots, cornflour, cayenne pe..."


Let's replace any spaces in ingredient names with underscores to make things a little easier later.

In [4]:
#def fix_string_in_list(li):
#    """This functino replace spaces with underscores in strings in provided list and returns a new list with the modified strings."""
#    return [s.replace(' ', '_') for s in li]
recipe_data['ingredients'] = recipe_data['ingredients'].apply(lambda li: [s.replace(' ', '_') for s in li])

Make a list of every ingredient in every recipe.

In [5]:
all_ingredients = []
for ingrds in recipe_data['ingredients']:
    all_ingredients += ingrds

Now let's count the number of times each ingredient occurs.

In [6]:
ingrd_cnt = collections.Counter()

for ingrd in all_ingredients:
    ingrd_cnt[ingrd] += 1

Here we can take any number, num_els, most common ingredients.  Or we could take a random sample.


In [7]:
num_els = 20
most_common = [ingrd for ingrd,cnt in ingrd_cnt.most_common(num_els)]
print(most_common)

['salt', 'onions', 'olive_oil', 'water', 'garlic', 'sugar', 'garlic_cloves', 'butter', 'ground_black_pepper', 'all-purpose_flour', 'pepper', 'vegetable_oil', 'eggs', 'soy_sauce', 'kosher_salt', 'green_onions', 'tomatoes', 'large_eggs', 'carrots', 'unsalted_butter']


"featurizing" the ingredients column taken from https://datascience.stackexchange.com/questions/11797/split-a-list-of-values-into-columns-of-a-dataframe

In [8]:
def value2attribute(df, value_list, source_col):
    """This function takes a list of values and adds them to the dataframe as binary attributes."""
    new_df = pandas.concat([df, pandas.DataFrame(columns=value_list)], sort=False)
    for value in value_list:
        new_df[value] = new_df.apply(lambda x: int(value in x[source_col]), axis=1)
    return new_df

Let's randomly sample half of the data, I think the original 40k rows may be too much.  
If it seems to run fine in Weka we can use the full dataset, or choose any percentage by setting frac.  
random_state is a random seed to make sure that we get the same set every time we run it.  
https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.sample.html#pandas.DataFrame.sample

In [9]:
#smaller_recipe_data = recipe_data.sample(frac=0.5, random_state=42)
#smaller_recipe_data = smaller_recipe_data.reset_index(drop=True)
#smaller_recipe_data.shape

In [10]:
start_time = time.time()
new_recipe_data = value2attribute(recipe_data, most_common, 'ingredients')
print(time.time() - start_time)

14.661777019500732


In [11]:
new_recipe_data.head()

Unnamed: 0,cuisine,id,ingredients,salt,onions,olive_oil,water,garlic,sugar,garlic_cloves,...,pepper,vegetable_oil,eggs,soy_sauce,kosher_salt,green_onions,tomatoes,large_eggs,carrots,unsalted_butter
0,greek,10259.0,"[romaine_lettuce, black_olives, grape_tomatoes...",0,0,0,0,1,0,0,...,1,0,0,0,0,0,0,0,0,0
1,southern_us,25693.0,"[plain_flour, ground_pepper, salt, tomatoes, g...",1,0,0,0,0,0,0,...,0,1,1,0,0,0,1,0,0,0
2,filipino,20130.0,"[eggs, pepper, salt, mayonaise, cooking_oil, g...",1,0,0,0,0,0,0,...,1,0,1,1,0,0,0,0,0,0
3,indian,22213.0,"[water, vegetable_oil, wheat, salt]",1,0,0,1,0,0,0,...,0,1,0,0,0,0,0,0,0,0
4,indian,13162.0,"[black_pepper, shallots, cornflour, cayenne_pe...",1,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Weka has trouble loading this csv with the ingredients column, so let's drop it. We won't need it for learning anyway.

In [12]:
new_recipe_data = new_recipe_data.drop(columns='ingredients')

Looks good.  Let's save it to a csv file.

In [13]:
new_recipe_data.to_csv('top20.csv', index=False)

Now we will randomly select 20 ingredients instead of choosing the 20 most common.  To do so, we'll convert the list of all ingredients to a pandas series so that we can sample it using a random seed (to make sure that we get the same set every time we run it).  We then convert the resulting series of 20 pseudorandom ingredients back to a list.

In [14]:
rand_len = 20
all_ingredients_df = pandas.Series(all_ingredients)
rand_ingrd_df = all_ingredients_df.sample(n=rand_len, random_state=42)
rand_ingrd_list = rand_ingrd_df.tolist()
print(rand_ingrd_list)

['white_sugar', 'refried_beans', 'eggs', 'ham', 'unsalted_butter', 'purple_onion', 'red_wine_vinegar', 'cajun_seasoning', 'large_eggs', 'apricot_nectar', 'sugar', 'cinnamon', 'peanut_oil', 'onions', 'bay_leaf', 'water_chestnuts', 'diced_tomatoes', 'beef', 'garlic_cloves', 'cabbage']


We'll restrict our data to this random set of ingredients.

In [15]:
start_time = time.time()
rand_recipe_data = value2attribute(recipe_data, rand_ingrd_list, 'ingredients')
print(time.time() - start_time)

14.580367088317871


In [16]:
rand_recipe_data = rand_recipe_data.drop(columns='ingredients')
rand_recipe_data.head()

Unnamed: 0,cuisine,id,white_sugar,refried_beans,eggs,ham,unsalted_butter,purple_onion,red_wine_vinegar,cajun_seasoning,...,sugar,cinnamon,peanut_oil,onions,bay_leaf,water_chestnuts,diced_tomatoes,beef,garlic_cloves,cabbage
0,greek,10259.0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,southern_us,25693.0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,filipino,20130.0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,indian,22213.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,indian,13162.0,0,0,0,0,0,0,0,0,...,0,0,0,1,1,0,0,0,0,0


In [17]:
rand_recipe_data.to_csv('rand20.csv', index=False)

Maybe we can get better results from the WEKA algorithms if we join the new_recipe_data and rand_recipe_data dataframes and write the resulting dataframe to a csv.

In [18]:
new_and_rand_recipe_data = pandas.merge(new_recipe_data, rand_recipe_data, how='inner', on=['cuisine', 'id'])

Print out the first few lines of the merged dataframe transposed so that it's easier to see if we have the correct columns.

In [19]:
new_and_rand_recipe_data.head().T

Unnamed: 0,0,1,2,3,4
cuisine,greek,southern_us,filipino,indian,indian
id,10259,25693,20130,22213,13162
salt,0,1,1,1,1
onions_x,0,0,0,0,1
olive_oil,0,0,0,0,0
water,0,0,0,1,1
garlic,1,0,0,0,0
sugar_x,0,0,0,0,0
garlic_cloves_x,0,0,0,0,0
butter,0,0,1,0,1


The dataframe looks the way we wanted, so we'll write it to a csv.

In [20]:
new_and_rand_recipe_data.to_csv('top20andrand20.csv', index=False)