In [1]:
from gensim.models import word2vec

# Stream the basket file lazily; presumably each line of groceries.txt is one
# transaction with whitespace-separated item tokens (e.g. 'pip_fruit') -- TODO confirm.
sentences = word2vec.LineSentence(source = 'groceries.txt')

In [2]:
# Train item embeddings on the baskets. min_count = 1 keeps every item, even
# ones that appear only once; iter = 500 is the number of training passes.
# seed plus workers = 1 pin the run so that re-executing the notebook yields the
# same vectors: gensim documents that multi-threaded training is not
# deterministic. For reproducibility across interpreter launches PYTHONHASHSEED
# must also be fixed in the environment.
model = word2vec.Word2Vec(
    sentences,
    iter = 500,
    min_count = 1,
    seed = 1,
    workers = 1,
)

In [3]:
# Ten items whose learned vectors are closest to 'pork'.
model.wv.most_similar(positive = ['pork'], topn = 10)

[('turkey', 0.5547687411308289),
 ('ham', 0.49448296427726746),
 ('pip_fruit', 0.46879759430885315),
 ('tropical_fruit', 0.4383287727832794),
 ('butter', 0.43373265862464905),
 ('frankfurter', 0.4334157109260559),
 ('root_vegetables', 0.4249211549758911),
 ('citrus_fruit', 0.4246293306350708),
 ('chicken', 0.42378148436546326),
 ('sausage', 0.41153857111930847)]

In [4]:
# Ten items whose learned vectors are closest to 'beef'.
model.wv.most_similar(positive = ['beef'], topn = 10)

[('pip_fruit', 0.5338480472564697),
 ('chicken', 0.5199225544929504),
 ('citrus_fruit', 0.4966673254966736),
 ('whipped/sour_cream', 0.47635287046432495),
 ('meat', 0.46787431836128235),
 ('curd', 0.46770867705345154),
 ('onions', 0.4644170105457306),
 ('sausage', 0.45907285809516907),
 ('tropical_fruit', 0.4336831569671631),
 ('whole_milk', 0.4250752627849579)]

In [5]:
# Top ten items the model predicts for a basket that already holds
# bottled beer and red/blush wine; the result is (item, probability) pairs.
model.predict_output_word(
    ['bottled_beer', 'red/blush_wine'],
    topn = 10,
)

[('liquor', 0.22384332),
 ('prosecco', 0.04933687),
 ('sparkling_wine', 0.0345262),
 ('white_wine', 0.024850508),
 ('liquor_(appetizer)', 0.023338959),
 ('tidbits', 0.021622568),
 ('rum', 0.01992436),
 ('baby_food', 0.015693987),
 ('house_keeping_products', 0.015246705),
 ('tea', 0.014245467)]

In [6]:
# Top ten items the model predicts given hamburger meat and soda as context.
model.predict_output_word(
    ['hamburger_meat', 'soda'],
    topn = 10,
)

[('Instant_food_products', 0.054281656),
 ('canned_vegetables', 0.029985178),
 ('pasta', 0.025487985),
 ('sweet_spreads', 0.025283454),
 ('meat_spreads', 0.022208596),
 ('bathroom_cleaner', 0.021768838),
 ('sauces', 0.016184878),
 ('potato_products', 0.016023112),
 ('rum', 0.015607255),
 ('softener', 0.015018263)]

In [7]:
# Top ten items the model predicts given ham and white bread as context.
model.predict_output_word(
    ['ham', 'white_bread'],
    topn = 10,
)

[('processed_cheese', 0.20990367),
 ('sweet_spreads', 0.024131883),
 ('spread_cheese', 0.023222428),
 ('hard_cheese', 0.020685839),
 ('honey', 0.017153246),
 ('frozen_potato_products', 0.016769795),
 ('hair_spray', 0.016084857),
 ('condensed_milk', 0.016035773),
 ('soft_cheese', 0.014342537),
 ('dessert', 0.014085873)]

In [8]:
# Top ten items the model predicts for a larger, four-item context
# (two vegetable categories plus whole milk and yogurt).
model.predict_output_word(
    [
        'root_vegetables',
        'other_vegetables',
        'whole_milk',
        'yogurt',
    ],
    topn = 10,
)

[('herbs', 0.024541182),
 ('liver_loaf', 0.019327056),
 ('turkey', 0.01775743),
 ('onions', 0.01760579),
 ('specialty_cheese', 0.014991459),
 ('packaged_fruit/vegetables', 0.014529809),
 ('spread_cheese', 0.012931713),
 ('meat', 0.012434797),
 ('beef', 0.011924307),
 ('butter_milk', 0.011828974)]

In [9]:
# Top ten items the model predicts given curd and sugar as context.
model.predict_output_word(
    ['curd', 'sugar'],
    topn = 10,
)

[('flour', 0.076272935),
 ('pudding_powder', 0.055790607),
 ('baking_powder', 0.026003197),
 ('cereals', 0.021788204),
 ('cream', 0.017001987),
 ('jam', 0.014941054),
 ('organic_products', 0.014818382),
 ('curd_cheese', 0.014737139),
 ('frozen_dessert', 0.013503349),
 ('tidbits', 0.013300725)]

In [10]:
# Top ten items the model predicts given soda and a salty snack as context.
model.predict_output_word(
    ['soda', 'salty_snack'],
    topn = 10,
)

[('popcorn', 0.05830234),
 ('nut_snack', 0.046429735),
 ('chewing_gum', 0.0213278),
 ('kitchen_towels', 0.019580541),
 ('cake_bar', 0.018023761),
 ('prosecco', 0.016160522),
 ('make_up_remover', 0.0150430715),
 ('frozen_chicken', 0.01499828),
 ('tidbits', 0.014298808),
 ('canned_vegetables', 0.014091631)]

In [11]:
# Top ten items the model predicts given sugar and baking powder as context.
model.predict_output_word(
    ['sugar', 'baking_powder'],
    topn = 10,
)

[('flour', 0.11954326),
 ('cooking_chocolate', 0.046284538),
 ('pudding_powder', 0.03714784),
 ('vinegar', 0.029273923),
 ('rice', 0.028818287),
 ('salt', 0.02470855),
 ('cocoa_drinks', 0.024143243),
 ('liqueur', 0.021240951),
 ('specialty_fat', 0.019339766),
 ('meat_spreads', 0.018915236)]

In [12]:
# Pairwise vector similarity among the beer/wine context items and 'liquor',
# the top prediction for that context. One value is printed per pair, in order.
for item_a, item_b in [
    ('bottled_beer', 'red/blush_wine'),
    ('bottled_beer', 'liquor'),
    ('red/blush_wine', 'liquor'),
]:
    print(model.wv.similarity(item_a, item_b))

-0.005118543658904618
0.020952063557078016
0.11583180970518396


In [13]:
# Pairwise vector similarity among the hamburger-meat/soda context items and
# 'Instant_food_products', the top prediction for that context.
for item_a, item_b in [
    ('hamburger_meat', 'soda'),
    ('hamburger_meat', 'Instant_food_products'),
    ('soda', 'Instant_food_products'),
]:
    print(model.wv.similarity(item_a, item_b))

-0.13100304145445874
-0.2033228128994272
0.03914292732245933


In [14]:
# Pairwise vector similarity among the ham/white-bread context items and
# 'processed_cheese', the top prediction for that context.
for item_a, item_b in [
    ('ham', 'white_bread'),
    ('ham', 'processed_cheese'),
    ('white_bread', 'processed_cheese'),
]:
    print(model.wv.similarity(item_a, item_b))

0.11314414615219766
0.11970613254290825
-0.15694054499998372


In [15]:
from collections import Counter
import itertools

# Count how many baskets contain each unordered item triple. Sorting a basket
# first gives every triple one canonical (alphabetical) tuple key, so lookups
# must sort their three items the same way.
tri_counter = Counter(
    triple
    for basket in sentences
    for triple in itertools.combinations(sorted(basket), 3)
)

In [16]:
# Use every unordered pair of vocabulary items as a two-item context.
# Note: combinations() is a one-shot iterator, so it is rebuilt each time this cell runs.
pairs = itertools.combinations(model.wv.vocab.keys(), 2)

# Keep (context pair, predicted item, probability) rows where the model is
# reasonably confident (probability >= 0.05) and the three items really do
# co-occur in at least 10 baskets; list the strongest predictions first.
sorted(
    (
        (context, predicted, score)
        for context in pairs
        for predicted, score in model.predict_output_word(context)
        if score >= 0.05
        if tri_counter[tuple(sorted([context[0], context[1], predicted]))] >= 10
    ),
    key = lambda row: -row[2],
)


[(('bottled_beer', 'red/blush_wine'), 'liquor', 0.22384332),
 (('white_bread', 'ham'), 'processed_cheese', 0.20990367),
 (('bottled_beer', 'liquor'), 'red/blush_wine', 0.16274776),
 (('sugar', 'baking_powder'), 'flour', 0.11954326),
 (('curd', 'sugar'), 'flour', 0.076272935),
 (('margarine', 'sugar'), 'flour', 0.07422828),
 (('flour', 'sugar'), 'baking_powder', 0.07345509),
 (('sugar', 'whipped/sour_cream'), 'flour', 0.072731614),
 (('rolls/buns', 'hamburger_meat'), 'Instant_food_products', 0.06818052),
 (('sugar', 'root_vegetables'), 'flour', 0.0641469),
 (('tropical_fruit', 'white_bread'), 'processed_cheese', 0.061861355),
 (('soda', 'ham'), 'processed_cheese', 0.06138085),
 (('white_bread', 'processed_cheese'), 'ham', 0.061199907),
 (('whole_milk', 'ham'), 'processed_cheese', 0.059773713),
 (('beef', 'root_vegetables'), 'herbs', 0.059243686),
 (('sugar', 'whipped/sour_cream'), 'baking_powder', 0.05871357),
 (('soda', 'salty_snack'), 'popcorn', 0.05830234),
 (('soda', 'popcorn'), 'sa