In [1]:
from gensim.models import word2vec

# Stream the corpus lazily from disk; each line of the file is one
# "sentence" (here: presumably one shopping basket of whitespace-separated items).
sentences = word2vec.LineSentence(source='data/groceries.txt')

In [2]:
# Train item embeddings on the baskets: 500 training passes over the corpus,
# and items seen fewer than 10 times in total are left out of the vocabulary.
model = word2vec.Word2Vec(
    sentences=sentences,
    min_count=10,
    iter=500,
)

In [3]:
# Nearest neighbours of 'pork' in the embedding space, as (item, similarity) pairs.
model.wv.most_similar(positive=['pork'])

[('turkey', 0.5346691608428955),
 ('ham', 0.48362135887145996),
 ('citrus_fruit', 0.4777030944824219),
 ('pip_fruit', 0.4771662950515747),
 ('chicken', 0.473264217376709),
 ('tropical_fruit', 0.4480413794517517),
 ('curd', 0.43360066413879395),
 ('onions', 0.42378467321395874),
 ('frankfurter', 0.415897011756897),
 ('butter', 0.4118388593196869)]

In [4]:
# Nearest neighbours of 'beef' in the embedding space, as (item, similarity) pairs.
model.wv.most_similar(positive='beef')

[('chicken', 0.5476158857345581),
 ('pip_fruit', 0.510686457157135),
 ('citrus_fruit', 0.4852493107318878),
 ('onions', 0.47696545720100403),
 ('sausage', 0.4632596969604492),
 ('meat', 0.45061126351356506),
 ('tropical_fruit', 0.43038880825042725),
 ('whole_milk', 0.43035203218460083),
 ('whipped/sour_cream', 0.42627644538879395),
 ('turkey', 0.41494014859199524)]

In [5]:
# Items the model rates most probable alongside beer and red wine,
# as (item, probability) pairs.
model.predict_output_word(
    context_words_list=['bottled_beer', 'red/blush_wine'],
)

[('liquor', 0.19797058),
 ('prosecco', 0.069738224),
 ('sparkling_wine', 0.044733528),
 ('liquor_(appetizer)', 0.028634),
 ('white_wine', 0.025581764),
 ('rum', 0.023810206),
 ('tea', 0.01974846),
 ('brandy', 0.016974358),
 ('house_keeping_products', 0.016868452),
 ('tidbits', 0.015944617)]

In [6]:
# Items the model rates most probable alongside hamburger meat and soda,
# as (item, probability) pairs.
model.predict_output_word(
    context_words_list=['hamburger_meat', 'soda'],
)

[('Instant_food_products', 0.05131628),
 ('meat_spreads', 0.028350057),
 ('canned_vegetables', 0.02785824),
 ('bathroom_cleaner', 0.02623309),
 ('pasta', 0.023944926),
 ('sweet_spreads', 0.019742757),
 ('rum', 0.019495727),
 ('mayonnaise', 0.017532615),
 ('sauces', 0.015575802),
 ('softener', 0.015541684)]

In [7]:
# Items the model rates most probable alongside ham and white bread,
# as (item, probability) pairs.
model.predict_output_word(
    context_words_list=['ham', 'white_bread'],
)

[('processed_cheese', 0.21027786),
 ('hair_spray', 0.021439953),
 ('frozen_potato_products', 0.021238346),
 ('sweet_spreads', 0.020355126),
 ('condensed_milk', 0.018533614),
 ('soft_cheese', 0.017959382),
 ('spread_cheese', 0.017201949),
 ('hard_cheese', 0.01692048),
 ('honey', 0.01568072),
 ('dessert', 0.014630928)]

In [8]:
# Same query with a four-item context (vegetables plus dairy),
# returning (item, probability) pairs.
model.predict_output_word(
    context_words_list=[
        'root_vegetables',
        'other_vegetables',
        'whole_milk',
        'yogurt',
    ],
)

[('herbs', 0.024234567),
 ('liver_loaf', 0.020382937),
 ('onions', 0.019050032),
 ('turkey', 0.018266374),
 ('specialty_cheese', 0.018154046),
 ('packaged_fruit/vegetables', 0.015720222),
 ('spread_cheese', 0.01479917),
 ('frozen_fish', 0.014211356),
 ('berries', 0.014141317),
 ('nuts/prunes', 0.013030531)]

In [9]:
# Items the model rates most probable alongside curd and sugar,
# as (item, probability) pairs.
model.predict_output_word(
    context_words_list=['curd', 'sugar'],
)

[('flour', 0.07940076),
 ('pudding_powder', 0.050001796),
 ('baking_powder', 0.022790017),
 ('cereals', 0.02205061),
 ('cream', 0.020248242),
 ('jam', 0.018242367),
 ('Instant_food_products', 0.016521702),
 ('roll_products_', 0.015954617),
 ('sauces', 0.01508828),
 ('frozen_dessert', 0.014888518)]

In [10]:
# Items the model rates most probable alongside soda and salty snacks,
# as (item, probability) pairs.
model.predict_output_word(
    context_words_list=['soda', 'salty_snack'],
)

[('popcorn', 0.07213024),
 ('nut_snack', 0.036569893),
 ('kitchen_towels', 0.025195248),
 ('chewing_gum', 0.017510926),
 ('tidbits', 0.01710649),
 ('specialty_bar', 0.016457327),
 ('cake_bar', 0.015674839),
 ('dental_care', 0.01430732),
 ('finished_products', 0.013805098),
 ('canned_vegetables', 0.013581929)]

In [11]:
# Items the model rates most probable alongside sugar and baking powder,
# as (item, probability) pairs.
model.predict_output_word(
    context_words_list=['sugar', 'baking_powder'],
)

[('flour', 0.10866481),
 ('pudding_powder', 0.065971844),
 ('cooking_chocolate', 0.03960717),
 ('rice', 0.03666494),
 ('vinegar', 0.02258706),
 ('soups', 0.022316474),
 ('cocoa_drinks', 0.021146132),
 ('meat_spreads', 0.020928469),
 ('salt', 0.019873263),
 ('specialty_fat', 0.019028228)]

In [12]:
# Pairwise embedding similarity between the two context items and the top
# predicted item, one value per line, to compare against the prediction above.
print(
    *(
        model.wv.similarity(left, right)
        for left, right in (
            ('bottled_beer', 'red/blush_wine'),
            ('bottled_beer', 'liquor'),
            ('red/blush_wine', 'liquor'),
        )
    ),
    sep='\n',
)

-0.004702939841396327
0.005525113270155493
0.10728827263327184


In [13]:
# Pairwise embedding similarity between the two context items and the top
# predicted item, one value per line.
print(
    *(
        model.wv.similarity(left, right)
        for left, right in (
            ('hamburger_meat', 'soda'),
            ('hamburger_meat', 'Instant_food_products'),
            ('soda', 'Instant_food_products'),
        )
    ),
    sep='\n',
)

-0.15367850927726345
-0.2154847465911408
0.041070670637925244


In [14]:
# Pairwise embedding similarity between the two context items and the top
# predicted item, one value per line.
print(
    *(
        model.wv.similarity(left, right)
        for left, right in (
            ('ham', 'white_bread'),
            ('ham', 'processed_cheese'),
            ('white_bread', 'processed_cheese'),
        )
    ),
    sep='\n',
)

0.06907582171827166
0.15916877271399876
-0.15978167328094206


In [15]:
from collections import Counter
import itertools

# Count, for every unordered triple of items, how many baskets contain it.
# - Items are de-duplicated per basket so a token repeated on one line cannot
#   produce degenerate triples such as (a, a, b) or count a real triple twice.
# - Items are sorted so each triple has a single canonical key; lookups must
#   sort their three items the same way.
# - A generator feeds Counter directly, avoiding a full in-memory list of triples.
tri_counter = Counter(
    triple
    for basket in sentences
    for triple in itertools.combinations(sorted(set(basket)), 3)
)

In [16]:
# Every unordered pair of in-vocabulary items (a one-shot iterator).
pairs = itertools.combinations(model.wv.vocab, 2)

# For each pair, keep the predicted items whose probability is at least 0.05
# and whose (pair + item) triple occurs in at least 10 baskets, then list the
# resulting (pair, item, probability) rows from most to least probable.
sorted(
    (
        (pair, candidate, score)
        for pair in pairs
        for candidate, score in model.predict_output_word(pair)
        if score >= 0.05
        and tri_counter[tuple(sorted(pair + (candidate,)))] >= 10
    ),
    key=lambda row: row[2],
    reverse=True,
)


[(('white_bread', 'ham'), 'processed_cheese', 0.21027786),
 (('bottled_beer', 'red/blush_wine'), 'liquor', 0.19797058),
 (('bottled_beer', 'liquor'), 'red/blush_wine', 0.14986311),
 (('sugar', 'baking_powder'), 'flour', 0.10866481),
 (('curd', 'sugar'), 'flour', 0.07940076),
 (('soda', 'ham'), 'processed_cheese', 0.07328585),
 (('rolls/buns', 'hamburger_meat'), 'Instant_food_products', 0.07258417),
 (('margarine', 'sugar'), 'flour', 0.07237357),
 (('soda', 'salty_snack'), 'popcorn', 0.07213024),
 (('flour', 'sugar'), 'baking_powder', 0.067636095),
 (('flour', 'baking_powder'), 'sugar', 0.06714855),
 (('soda', 'popcorn'), 'salty_snack', 0.06605672),
 (('sugar', 'whipped/sour_cream'), 'flour', 0.06347879),
 (('tropical_fruit', 'white_bread'), 'processed_cheese', 0.06131325),
 (('white_bread', 'processed_cheese'), 'ham', 0.05810209),
 (('whole_milk', 'Instant_food_products'), 'hamburger_meat', 0.057413436),
 (('citrus_fruit', 'sugar'), 'flour', 0.057178356),
 (('white_bread', 'soda'), 'pr