In [1]:
import pandas as pd
from matplotlib import pyplot as plt

In [4]:
import numpy as np
from gensim.models import Word2Vec

In [60]:
books = pd.read_csv('goodbooks-10k-master/books.csv')
books[['book_id','title','authors']].head(30)

Unnamed: 0,book_id,title,authors
0,1,"The Hunger Games (The Hunger Games, #1)",Suzanne Collins
1,2,Harry Potter and the Sorcerer's Stone (Harry P...,"J.K. Rowling, Mary GrandPré"
2,3,"Twilight (Twilight, #1)",Stephenie Meyer
3,4,To Kill a Mockingbird,Harper Lee
4,5,The Great Gatsby,F. Scott Fitzgerald
5,6,The Fault in Our Stars,John Green
6,7,The Hobbit,J.R.R. Tolkien
7,8,The Catcher in the Rye,J.D. Salinger
8,9,"Angels & Demons (Robert Langdon, #1)",Dan Brown
9,10,Pride and Prejudice,Jane Austen


In [220]:
books[books['title'].str.contains('Eye of the World')][['book_id','title','authors']]

Unnamed: 0,book_id,title,authors
329,330,"The Eye of the World (Wheel of Time, #1)",Robert Jordan
6217,6218,"From the Two Rivers: The Eye of the World, Par...",Robert Jordan


In [97]:
ratings = pd.read_csv('goodbooks-10k-master/ratings.csv')
ratings.head()

Unnamed: 0,user_id,book_id,rating
0,1,258,5
1,2,4081,4
2,2,260,5
3,2,9296,5
4,2,2318,3


In [42]:
book_lookup_dict = {}
for index, row in books.iterrows():
    book_lookup_dict[row['book_id']] = row['title']

In [47]:
user_ratings_dict = {}
for index, row in ratings.iterrows():
    u=row['user_id']
    if u in user_ratings_dict:
        user_ratings_dict[u].append(book_lookup_dict[row['book_id']])
    else:
        user_ratings_dict[u] = [book_lookup_dict[row['book_id']]]

In [50]:
user_ratings_dict[250][0:20]

['Anne of Green Gables (Anne of Green Gables, #1)',
 'Anne of Avonlea (Anne of Green Gables, #2)',
 'The Little Prince',
 "Bridget Jones's Diary (Bridget Jones, #1)",
 'Bridget Jones: The Edge of Reason (Bridget Jones, #2)',
 'Harry Potter and the Goblet of Fire (Harry Potter, #4)',
 'Harry Potter and the Order of the Phoenix (Harry Potter, #5)',
 'Harry Potter and the Half-Blood Prince (Harry Potter, #6)',
 'Harry Potter and the Prisoner of Azkaban (Harry Potter, #3)',
 'Me Talk Pretty One Day',
 'Animal Farm',
 'Of Mice and Men',
 'A Tree Grows in Brooklyn',
 'The Giver (The Giver, #1)',
 "The Memory Keeper's Daughter",
 "She's Come Undone",
 'A Million Little Pieces',
 'A Map of the World',
 'Atlas Shrugged',
 'The Fountainhead']

In [51]:
user_ratings_list = list(user_ratings_dict.values())

In [52]:
len(user_ratings_list)

53424

### word2vec training on user sequences

In [68]:
w2v_model = Word2Vec(iter=5, 
        size=150,
        window=5,
        min_count=1,
        sg=1,            
        workers=8)



In [69]:
w2v_model.build_vocab(user_ratings_list)

In [70]:
w2v_model.train(user_ratings_list, total_examples=w2v_model.corpus_count, epochs=w2v_model.iter)

29809986

In [71]:
w2v_model.most_similar('Anne of Avonlea (Anne of Green Gables, #2)')

  """Entry point for launching an IPython kernel.


[("Anne's House of Dreams (Anne of Green Gables, #5)", 0.8137673139572144),
 ('Anne of Windy Poplars (Anne of Green Gables, #4)', 0.7718566656112671),
 ('Anne of the Island (Anne of Green Gables, #3)', 0.7575632929801941),
 ('Anne of Ingleside (Anne of Green Gables, #6)', 0.7481300830841064),
 ('Rilla of Ingleside (Anne of Green Gables, #8)', 0.6845377683639526),
 ('Rainbow Valley (Anne of Green Gables, #7)', 0.6839284896850586),
 ('Emily of New Moon (Emily, #1)', 0.5995081663131714),
 ('Little Men (Little Women, #2)', 0.5817627310752869),
 ('The Blue Castle ', 0.5744936466217041),
 ('Emily Climbs (Emily, #2)', 0.5730841755867004)]

In [72]:
w2v_model.most_similar('Divergent (Divergent, #1)')

  """Entry point for launching an IPython kernel.


[('Free Four: Tobias Tells the Divergent Knife-Throwing Scene (Divergent, #1.5)',
  0.6214812397956848),
 ('Legend (Legend, #1)', 0.558851420879364),
 ('City of Bones (The Mortal Instruments, #1)', 0.5427226424217224),
 ('The Fault in Our Stars', 0.5403486490249634),
 ('Clockwork Angel (The Infernal Devices, #1)', 0.5294054746627808),
 ('Life Before Legend: Stories of the Criminal and the Prodigy (Legend, #0.5)',
  0.5218158960342407),
 ('Matched (Matched, #1)', 0.5208295583724976),
 ('Delirium (Delirium, #1)', 0.5208221673965454),
 ('The Transfer (Divergent, #0.1)', 0.5157473087310791),
 ('Pandemonium (Delirium, #2)', 0.5157122015953064)]

In [73]:
w2v_model.most_similar('The Fountainhead')

  """Entry point for launching an IPython kernel.


[('Atlas Shrugged', 0.8595150113105774),
 ('Siddhartha', 0.7032516002655029),
 ('The Road to Serfdom', 0.6446232199668884),
 ('The Unbearable Lightness of Being', 0.6166320443153381),
 ('Second Treatise of Government', 0.600271999835968),
 ('The Virtue of Selfishness: A New Concept of Egoism', 0.5978139638900757),
 ('We the Living', 0.5961178541183472),
 ('Invisible Man', 0.5879529714584351),
 ('A Confederacy of Dunces', 0.5840548276901245),
 ('Economics in One Lesson: The Shortest & Surest Way to Understand Basic Economics',
  0.5810875296592712)]

In [74]:
w2v_model.most_similar('Watership Down (Watership Down, #1)')

  """Entry point for launching an IPython kernel.


[('The Once and Future King (The Once and Future King #1-4)',
  0.5737870335578918),
 ('Siddhartha', 0.5642685294151306),
 ('Redwall (Redwall, #1)', 0.5616549849510193),
 ('Lord Brocktree (Redwall, #13)', 0.5556131601333618),
 ('Atlas Shrugged', 0.5433738827705383),
 ('The Wind in the Willows', 0.5381594896316528),
 ('The Fountainhead', 0.5341587662696838),
 ('The Dragonbone Chair (Memory, Sorrow, and Thorn, #1)', 0.5295690298080444),
 ('The Princess Bride ', 0.5285208821296692),
 ('The Wonderful Wizard of Oz (Oz, #1)', 0.5249261260032654)]

In [75]:
w2v_model.most_similar('Atlas Shrugged')

  """Entry point for launching an IPython kernel.


[('The Fountainhead', 0.8595148921012878),
 ('Siddhartha', 0.7408323884010315),
 ('The Road to Serfdom', 0.6345528364181519),
 ("Zen Mind, Beginner's Mind: Informal Talks on Zen Meditation and Practice",
  0.5868801474571228),
 ('A History of Western Philosophy', 0.5863711833953857),
 ('Economics in One Lesson: The Shortest & Surest Way to Understand Basic Economics',
  0.5814582705497742),
 ('Common Sense, The Rights of Man and Other Essential Writings',
  0.5786297917366028),
 ('The Virtue of Selfishness: A New Concept of Egoism', 0.578102707862854),
 ('Second Treatise of Government', 0.5713158845901489),
 ('Two Treatises of Government', 0.5684026479721069)]

In [76]:
w2v_model.most_similar('The Wind in the Willows')

  """Entry point for launching an IPython kernel.


[('Peter Pan', 0.7439976334571838),
 ('The Wonderful Wizard of Oz (Oz, #1)', 0.6846657395362854),
 ("The Complete Grimm's Fairy Tales", 0.6781201958656311),
 ('Just So Stories', 0.6502512693405151),
 ('The Tale of Mrs. Tiggy-Winkle', 0.6474913954734802),
 ('A Little Princess', 0.6240366101264954),
 ('When We Were Very Young (Winnie-the-Pooh, #3)', 0.614892840385437),
 ('The Tale of Peter Rabbit', 0.6147154569625854),
 ('Alice in Wonderland', 0.6146117448806763),
 ('The Complete Fairy Tales', 0.608862042427063)]

In [77]:
w2v_model.most_similar('The Hobbit')

  """Entry point for launching an IPython kernel.


[('The History of the Hobbit, Part One: Mr. Baggins', 0.5569245219230652),
 ('The Voyage of the Dawn Treader (Chronicles of Narnia, #3)',
  0.5200710296630859),
 ('To Green Angel Tower (Memory, Sorrow, and Thorn, #3)', 0.46554747223854065),
 ('Magician: Master (The Riftwar Saga, #2)', 0.4654303789138794),
 ('To Green Angel Tower, Part 2 (Memory, Sorrow, and Thorn, #3; Part 2)',
  0.4631262421607971),
 ("Magician's Gambit (The Belgariad, #3)", 0.44797012209892273),
 ('The Black Cauldron (The Chronicles of Prydain #2)', 0.44619518518447876),
 ('Stone of Farewell (Memory, Sorrow, and Thorn, #2)', 0.4458349645137787),
 ('The Hobbit: Graphic Novel', 0.4432245194911957),
 ('The Great Hunt (Wheel of Time, #2)', 0.4424341320991516)]

In [82]:
w2v_model.most_similar('Unfinished Tales of Númenor and Middle-Earth')

  """Entry point for launching an IPython kernel.


[('The Children of Húrin', 0.8336758017539978),
 ('The Silmarillion (Middle-Earth Universe)', 0.7907360792160034),
 ('The Return of the King (The Lord of the Rings, #3)', 0.7257494926452637),
 ('The Two Towers (The Lord of the Rings, #2)', 0.720095157623291),
 ('The Power That Preserves (The Chronicles of Thomas Covenant the Unbeliever, #3)',
  0.6097647547721863),
 ("The Dark Elf Trilogy Collector's Edition (Forgotten Realms: Dark Elf Trilogy, #1-3; Legend of Drizzt, #1-3)",
  0.5934082269668579),
 ('The Lord of the Rings (The Lord of the Rings, #1-3)', 0.5835007429122925),
 ('The Complete Guide to Middle-Earth', 0.5775114297866821),
 ('The White Rose (The Chronicles of the Black Company, #3)',
  0.572107195854187),
 ("The Icewind Dale Trilogy Collector's Edition (Forgotten Realms: Icewind Dale, #1-3; Legend of Drizzt, #4-6)",
  0.5699194073677063)]

In [83]:
w2v_model.most_similar('The Silmarillion (Middle-Earth Universe)')

  """Entry point for launching an IPython kernel.


[('The Children of Húrin', 0.8240677714347839),
 ('Unfinished Tales of Númenor and Middle-Earth', 0.7907360792160034),
 ('The Two Towers (The Lord of the Rings, #2)', 0.757845938205719),
 ('The Return of the King (The Lord of the Rings, #3)', 0.740568220615387),
 ('The Lord of the Rings (The Lord of the Rings, #1-3)', 0.5943979620933533),
 ("The Belgariad Boxed Set: Pawn of Prophecy / Queen of Sorcery / Magician's Gambit / Castle of Wizardry / Enchanters' End Game (The Belgariad, #1-5)",
  0.5734027028083801),
 ('The Chronicles of Thomas Covenant, the Unbeliever (The Chronicles of Thomas Covenant the Unbeliever, #1-3)',
  0.5632948875427246),
 ('The White Rose (The Chronicles of the Black Company, #3)',
  0.5627120137214661),
 ('J.R.R. Tolkien 4-Book Boxed Set: The Hobbit and The Lord of the Rings',
  0.5557882785797119),
 ("The Dark Elf Trilogy Collector's Edition (Forgotten Realms: Dark Elf Trilogy, #1-3; Legend of Drizzt, #1-3)",
  0.55531907081604)]

In [85]:
w2v_model.most_similar('The Fellowship of the Ring (The Lord of the Rings, #1)')

  """Entry point for launching an IPython kernel.


[('The Complete Guide to Middle-Earth', 0.5543145537376404),
 ('The Hobbit: Graphic Novel', 0.5456080436706543),
 ('The Farthest Shore (Earthsea Cycle, #3)', 0.5403525829315186),
 ('The History of the Hobbit, Part One: Mr. Baggins', 0.5347411632537842),
 ('The Lord of the Rings: The Art of The Fellowship of the Ring',
  0.5178254246711731),
 ("Enchanters' End Game (The Belgariad, #5)", 0.49791544675827026),
 ('Magician: Master (The Riftwar Saga, #2)', 0.4675130546092987),
 ('The Dragon Reborn (Wheel of Time, #3)', 0.45706117153167725),
 ('The Lion, the Witch, and the Wardrobe (Chronicles of Narnia, #1)',
  0.4564213752746582),
 ('The Legend of Huma (Dragonlance: Heroes, #1)', 0.4528716504573822)]

In [86]:
w2v_model.most_similar('Dracula')

  """Entry point for launching an IPython kernel.


[('The Picture of Dorian Gray', 0.6947566866874695),
 ('The Count of Monte Cristo', 0.6364543437957764),
 ('Persuasion', 0.6280645132064819),
 ('Emma', 0.6268529295921326),
 ('Frankenstein', 0.6254807114601135),
 ('Les Misérables', 0.6021900177001953),
 ('Dracula (Marvel Illustrated)', 0.60067218542099),
 ('Great Expectations', 0.5952202677726746),
 ('The Legend of Sleepy Hollow (Graphic Novel)', 0.5939986705780029),
 ('The Complete Works of H.P. Lovecraft', 0.5873639583587646)]

In [87]:
w2v_model.most_similar('Wuthering Heights')

  """Entry point for launching an IPython kernel.


[('Sense and Sensibility', 0.6686612367630005),
 ('Jane Eyre', 0.6627859473228455),
 ('The Picture of Dorian Gray', 0.6483259201049805),
 ('Little Women (Little Women, #1)', 0.6320464015007019),
 ('Manga Classics: Les Misérables', 0.6314364671707153),
 ('Emma', 0.6196061968803406),
 ('Anna Karenina', 0.6060368418693542),
 ('Vanity Fair', 0.6053493618965149),
 ('Pride and Prejudice', 0.595467209815979),
 ('The Tenant of Wildfell Hall', 0.5943660736083984)]

In [88]:
w2v_model.most_similar('Pride and Prejudice') 

  """Entry point for launching an IPython kernel.


[('Wuthering Heights', 0.5954671502113342),
 ('Jane Eyre', 0.573065459728241),
 ('Manga Classics: Les Misérables', 0.48895323276519775),
 ('Vanity Fair', 0.4879949986934662),
 ('The Great Gatsby', 0.48110389709472656),
 ('North and South', 0.48090535402297974),
 ('Sense & Sensibility (The Austen Project, #1)', 0.47603723406791687),
 ('Little Women (Little Women, #1)', 0.47198620438575745),
 ('Lorna Doone', 0.4678046405315399),
 ('Shirley', 0.4572826623916626)]

In [90]:
w2v_model.most_similar('Carmilla')

  """Entry point for launching an IPython kernel.


[('The Monk', 0.7028868198394775),
 ('Lost Souls', 0.6770556569099426),
 ('The Haunting of Hill House', 0.6384148001670837),
 ('Hell House', 0.6359653472900391),
 ('The Castle of Otranto', 0.6357819437980652),
 ('Shadows over Innsmouth', 0.6298846006393433),
 ('The Case of Charles Dexter Ward', 0.6209717392921448),
 ('Books of Blood: Volumes One to Three (Books of Blood #1-3)',
  0.6109519004821777),
 ('The Call of Cthulhu and Other Weird Stories', 0.6067954897880554),
 ('Let the Right One In', 0.5981661081314087)]

In [92]:
w2v_model.most_similar('Northanger Abbey') 

  """Entry point for launching an IPython kernel.


[('Mansfield Park', 0.8853601813316345),
 ('Persuasion', 0.7055795192718506),
 ('Lady Susan', 0.6169921159744263),
 ('A Tale of Two Cities / Great Expectations', 0.5910705924034119),
 ("A Midsummer Night's Dream", 0.5796450972557068),
 ('Emma', 0.5545459389686584),
 ('Macbeth', 0.5459154844284058),
 ('Frederica', 0.5424497723579407),
 ('Daniel Deronda', 0.5401150584220886),
 ('Cranford', 0.539344847202301)]

In [174]:
w2v_model.most_similar('A Wizard of Earthsea (Earthsea Cycle, #1)')

  """Entry point for launching an IPython kernel.


[('Nine Princes in Amber (The Chronicles of Amber #1)', 0.7095693945884705),
 ('Tigana', 0.6830148100852966),
 ("Wizard's First Rule (Sword of Truth, #1)", 0.6810122132301331),
 ('Elric of Melniboné (Elric, #1)', 0.679237961769104),
 ('Tehanu (Earthsea Cycle, #4)', 0.6629476547241211),
 ('Tales from Earthsea (Earthsea Cycle, #5)', 0.6553428173065186),
 ('The Magic of Recluce (The Saga of Recluce #1)', 0.6529965400695801),
 ('The Tombs of Atuan (Earthsea Cycle, #2)', 0.6527249813079834),
 ('The Earthsea Trilogy', 0.6500858068466187),
 ('The Dragonbone Chair (Memory, Sorrow, and Thorn, #1)', 0.6474732756614685)]

### Using the embeddings for user recommendations

In [132]:
from sklearn.cross_validation import train_test_split
from operator import itemgetter

In [98]:
ratings = ratings.merge(books[['book_id','title']],on='book_id')
ratings.head()

Unnamed: 0,user_id,book_id,rating,title
0,1,258,5,The Shadow of the Wind (The Cemetery of Forgot...
1,11,258,3,The Shadow of the Wind (The Cemetery of Forgot...
2,143,258,4,The Shadow of the Wind (The Cemetery of Forgot...
3,242,258,5,The Shadow of the Wind (The Cemetery of Forgot...
4,325,258,4,The Shadow of the Wind (The Cemetery of Forgot...


In [102]:
print(len(train.drop_duplicates('user_id')))
print(len(test.drop_duplicates('user_id')))

53424
53424


In [167]:
# class for generating recommendations with the embeddings
def w2v_book_recs(model,user_books,neighborhood_size=20,nr_recs=10):
    recs_dict={}
    for book in user_books:
        book_recs = model.wv.most_similar(book,topn=neighborhood_size) 
        for rec in book_recs:
            if rec[0] not in user_books:
                if rec in recs_dict.keys():
                    recs_dict[rec[0]] += rec[1]
                else:
                    recs_dict[rec[0]] = rec[1]
    recs_list = list(recs_dict.items())
    recs_list.sort(key=itemgetter(1),reverse=True)

    return recs_list[:nr_recs]

In [262]:
# Major works of Tolkien
w2v_book_recs(model=w2v_model
                 ,user_books=['The Hobbit'
                  ,'The Fellowship of the Ring (The Lord of the Rings, #1)'
                  ,'The Two Towers (The Lord of the Rings, #2)'
                  ,'The Return of the King (The Lord of the Rings, #3)']
                ,nr_recs=10,neighborhood_size=5)

[('The Silmarillion (Middle-Earth Universe)', 0.7405682802200317),
 ('The Children of Húrin', 0.7274069786071777),
 ('Unfinished Tales of Númenor and Middle-Earth', 0.7257495522499084),
 ('The Lord of the Rings (The Lord of the Rings, #1-3)', 0.6612474918365479),
 ('The Complete Guide to Middle-Earth', 0.5543145537376404),
 ('The Hobbit: Graphic Novel', 0.5456080436706543),
 ('The Farthest Shore (Earthsea Cycle, #3)', 0.5403525829315186),
 ('The History of the Hobbit, Part One: Mr. Baggins', 0.5347411632537842),
 ('The Voyage of the Dawn Treader (Chronicles of Narnia, #3)',
  0.5200710296630859),
 ('The Lord of the Rings: The Art of The Fellowship of the Ring',
  0.5178254246711731)]

In [263]:
# Gothic novels
w2v_book_recs(w2v_model
                 ,['Dracula'
                  ,'Carmilla'
                  ,'Frankenstein'
                  ,'Northanger Abbey'
                  ,'Wuthering Heights'
                  ,'The Monk']
                ,neighborhood_size=5
              ,nr_recs=10
            )

[('Mansfield Park', 0.8853601813316345),
 ('The Castle of Otranto', 0.7581470608711243),
 ('Persuasion', 0.7055795192718506),
 ('The Strange Case of Dr. Jekyll and Mr. Hyde and Other Stories ',
  0.6850782632827759),
 ('Lost Souls', 0.6770556569099426),
 ('Sense and Sensibility', 0.6686612367630005),
 ('Jane Eyre', 0.6627859473228455),
 ('Sentimental Education', 0.6613252758979797),
 ('The Charterhouse of Parma', 0.6533415913581848),
 ('The Picture of Dorian Gray', 0.6483259201049805)]

In [232]:
from numpy.random import choice

In [272]:
user_books = user_ratings_dict[251]
holdout_set = choice(user_ratings_dict[1],size=int(np.ceil(0.2*len(user_books))),replace=False)
train_set = [x for x in user_books if x not in holdout_set]

In [273]:
holdout_set

array(['Crime and Punishment',
       'The Chronicles of Narnia (Chronicles of Narnia, #1-7)',
       'A Prayer for Owen Meany', 'Gone with the Wind', 'Emma',
       'Three Junes', 'Cry to Heaven', "Ender's Game (Ender's Saga, #1)",
       'Unaccustomed Earth', 'The Poisonwood Bible',
       'Balzac and the Little Chinese Seamstress', 'The Glass Castle',
       'Sense and Sensibility', 'War and Peace',
       'A Man Without a Country',
       'World Without End (The Kingsbridge Series, #2)',
       "She's Come Undone", 'Les Misérables',
       'Snow Flower and the Secret Fan', 'The Giving Tree',
       'The Scarlet Letter', 'West with the Night', 'Life of Pi',
       "Brunelleschi's Dome: How a Renaissance Genius Reinvented Architecture",
       'The Art of Fielding', 'Little Bee', 'Mystic River', 'The Idiot',
       'The Paris Wife'], dtype='<U78')

In [274]:
w2v_book_recs(w2v_model,train_set,neighborhood_size=5,nr_recs=50)

[('Second Foundation (Foundation #3)', 0.9196539521217346),
 ('Drawing from Memory', 0.8915365934371948),
 ('Diamonds Are Forever (James Bond, #4)', 0.8826935291290283),
 ('Amerika', 0.8703593611717224),
 ("There's Treasure Everywhere: A Calvin and Hobbes Collection",
  0.868524432182312),
 ('Moonraker (James Bond, #3)', 0.8672071695327759),
 ('Brain Droppings', 0.8661783337593079),
 ('The Calvin and Hobbes Tenth Anniversary Book', 0.8612014651298523),
 ('The Magic Mirror of M.C. Escher', 0.8573482036590576),
 ('Live and Let Die (James Bond, #2)', 0.8561528921127319),
 ('Andy Goldsworthy: A Collaboration with Nature', 0.853991687297821),
 ('The Calvin and Hobbes Lazy Sunday Book', 0.8527666330337524),
 ('An Illustrated Life: Drawing Inspiration From The Private Sketchbooks Of Artists, Illustrators And Designers',
  0.8518355488777161),
 ('The Revenge of the Baby-Sat', 0.8515295386314392),
 ('The Lion and the Mouse', 0.8507595658302307),
 ('Snow Falling on Cedars', 0.8489660024642944),
