In [3]:
import pandas as pd
from nltk import RegexpTokenizer
from nltk.corpus import stopwords
import re
import string
from gensim.parsing.preprocessing import STOPWORDS as stop_words
from gensim.parsing.preprocessing import preprocess_documents
from gensim.models import Doc2Vec, Phrases
from gensim.models.doc2vec import TaggedDocument
from random import shuffle
import multiprocessing

In [2]:
# load in movie reviews and drop some extra columns
dfm = pd.read_csv('/Users/genna/Downloads/movies_commonReviewers_grouped.csv')
dfm.drop(dfm.columns[0], axis=1, inplace=True)
dfm.drop(dfm.columns[0], axis=1, inplace=True)

In [6]:
# take a look at the data
dfm.head()

Unnamed: 0,asin,title,imUrl,reviewText,genre
0,5019281,An American Christmas Carol,http://ecx.images-amazon.com/images/I/51EN601N...,This is a charming version of the classic Dick...,
1,5119367,Joseph,http://ecx.images-amazon.com/images/I/51PfmPOP...,Though this interpretation of Joseph's life ha...,Warner Home Video All Titles
2,307141985,Mouse on the Mayflower,http://ecx.images-amazon.com/images/I/21T26ZVP...,"""Set sail with the tiniest Pilgrim for a music...",Sony Pictures Home Entertainment All Sony ...
3,307142469,Frosty the Snowman,http://ecx.images-amazon.com/images/I/51VE7CDD...,Have always loved the old movies and this is t...,Science Fiction Fantasy Science Fiction An...
4,307142477,Frosty Returns,http://ecx.images-amazon.com/images/I/513AB22V...,"Great Christmas classic, always enjoy watching...",Science Fiction Fantasy Science Fiction An...


In [7]:
# some titles are also missing and could not be found by their ASIN
sum(dfm.title.isna())

223

In [8]:
# there are not many of them, so we just remove those entries
dfm.drop(dfm[dfm.title.isnull()].index, inplace=True)

In [13]:
# some genres as missing so we make them empty strings
dfm['genre'] = dfm['genre'].fillna('')

In [25]:
# there might be nicer ways of formatted genres, but for now we just concatenate
# with the review text (other option: treat as doc tags)
dfm['reviewText'] = dfm['reviewText'] + dfm['genre']

In [33]:
# get rid of the genre column and take a look at the resulting data
dfm.drop(columns='genre', axis=1, inplace=True)
dfm.head()

In [9]:
# repeat this process for books
dfb = pd.read_csv('/Users/genna/Downloads/books_commonReviewers_grouped.csv')

In [11]:
dfb.head()

Unnamed: 0.1,Unnamed: 0,asin,reviewText,title,imUrl
0,0,000100039X,This is one my must have books. It is a master...,The Prophet,http://ecx.images-amazon.com/images/I/81ZKLPiv...
1,1,0001055178,Historic fiction requires exceptional skills t...,Master Georgie,http://ecx.images-amazon.com/images/I/51ZSC6TK...
2,2,0001473123,This is a wonderful book to study with The Gre...,The Book of Revelation,http://ecx.images-amazon.com/images/I/317G8R2T...
3,3,0001473727,This book is amazing. The charts really make i...,The Greatest Book on &quot;Dispensational Trut...,http://ecx.images-amazon.com/images/I/512M299K...
4,4,0001473905,This book is an excellent book to read in addi...,Rightly Dividing The Word,http://ecx.images-amazon.com/images/I/61KPC59B...


In [10]:
dfb.drop(dfb.columns[0], axis=1, inplace=True)

In [22]:
sum(dfb.isnull().title)

34522

In [11]:
dfb.drop(dfb[dfb['title'].isnull()].index, inplace=True)

In [40]:
# that's a lot of books to drop, 
# but only a small percentage of the total number of books
# those also couldn't be found easily by their ASIN (=ISBN)
dfb.shape

(237801, 4)

In [41]:
dfm.shape

(49050, 4)

In [None]:
# keep track of books and movies
dfm['medium'] = 'movie'
ddb['medium'] = 'book'

In [13]:
# combine all of the data together
dfAll = pd.concat([dfb, dfm])

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


In [23]:
dfAll.head()

Unnamed: 0,asin,imUrl,title,medium
0,000100039X,http://ecx.images-amazon.com/images/I/81ZKLPiv...,The Prophet,book
1,0001055178,http://ecx.images-amazon.com/images/I/51ZSC6TK...,Master Georgie,book
2,0001473123,http://ecx.images-amazon.com/images/I/317G8R2T...,The Book of Revelation,book
3,0001473727,http://ecx.images-amazon.com/images/I/512M299K...,The Greatest Book on &quot;Dispensational Trut...,book
4,0001473905,http://ecx.images-amazon.com/images/I/61KPC59B...,Rightly Dividing The Word,book


In [92]:
# clear up some space
del dfb, dfm

In [54]:
# some are missing review text for some reason, but only a few, so drop these
sum(dfAll.reviewText.isnull())

10

In [55]:
dfAll.drop(dfAll[dfAll.reviewText.isnull()].index, inplace=True)

In [44]:
# start cleaning reviews:
# we don't want to restrict words to English dictionary because we want to 
# allow people to search by author name or other funny titles

# remove stop words
#tokenizer = RegexpTokenizer('\w+|\$[\d\.]+|\S+')
#stopword_set = set(stopwords.words('english'))

In [45]:
RE_PUNCT = re.compile('([%s])+' % re.escape(string.punctuation), re.UNICODE)

In [46]:
# remove punctuation, convert to lowercase, split (tokenize)
def preprocess(text):
    # Remove all punctuation and make all lowercase 
    return RE_PUNCT.sub(" ", text).lower().split()

In [None]:
#preprocess_documents([dfAll.title.iloc[0], dfAll.reviewText.iloc[0]])

In [50]:
# remove special characters
letters = list('abcdefghijklmnopqrstuvwxyz')
numbers = list('123456789')
stop_words = stop_words.union(set(letters)).union(set(numbers))

In [57]:
# doctag will just be the ASIN so we don't need to process
# preprocess review text using above code
# allow for common phrases to be grouped together like "science_fiction"
def make_movie_doc(text, title, drop_stopwords=True):
    """Make documents into LabeledSentence objects for doc2vec training"""
    doctag = title #'_'.join(preprocess(title))    
    docwords = list(filter(lambda word: word not in stop_words,
                      bigram[preprocess(text)]))
    return TaggedDocument(docwords, [doctag])

# note: can also lemmatize and stem, but seemed to work ok without it
# not sure how stemming would work with names (e.g. author)

In [56]:
bigram = Phrases(map(preprocess, dfAll.reviewText.tolist())) 

In [60]:
DOCS = [make_movie_doc(text, title) for text, title in
        zip(dfAll.reviewText.tolist(), dfAll.asin.tolist())]

In [61]:
shuffle(DOCS)

In [62]:
cores = multiprocessing.cpu_count()

In [63]:
model = Doc2Vec(dm=0, dbow_words=1, min_count=2, negative=5,
                hs=0, sample=0, window=5, vector_size=300, epochs=20, workers=cores)

In [64]:
model.build_vocab(DOCS)

In [66]:
# train the model
model.train(DOCS, total_examples=len(DOCS), epochs=20)

In [71]:
# helper functions for convertin from ASIN to other info
def getTitle(asin):
    return dfAll[dfAll['asin']==asin].title.values[0]

In [93]:
def getMedium(asin):
    return dfAll[dfAll['asin']==asin].medium.values[0]

In [72]:
getTitle('0141194529')

'The Spy Who Came in from the Cold (French Edition)'

In [143]:
# check word embeddings:
model.wv.most_similar_cosmul(positive = ['love'])

[('adore', 0.8502992391586304),
 ('loved', 0.8260433077812195),
 ('loves', 0.806640088558197),
 ('routined', 0.7842212319374084),
 ('mustt', 0.7833786606788635),
 ('enjoy', 0.7832635641098022),
 ('hylander', 0.7832064032554626),
 ('lynn_mcclure', 0.7787688970565796),
 ('storyandbite', 0.777694821357727),
 ('copeing', 0.7748092412948608)]

In [144]:
model.wv.most_similar_cosmul(positive = ['war'])

[('wars', 0.8714832067489624),
 ('war_ii', 0.8387480974197388),
 ('wwi', 0.8385044932365417),
 ('wwii', 0.8348910808563232),
 ('civil_war', 0.8262279629707336),
 ('ww_ii', 0.8186373114585876),
 ('warfare', 0.8158881068229675),
 ('ww2', 0.8144698143005371),
 ('waged', 0.8095280528068542),
 ('ww1', 0.8082234263420105)]

In [145]:
model.wv.most_similar_cosmul(positive = ['space'])

[('as2001', 0.7907372117042542),
 ('space_travel', 0.7858055233955383),
 ('chiep', 0.7854652404785156),
 ('continuem', 0.7849547863006592),
 ('spaceship', 0.781318724155426),
 ('spacecraft', 0.7809067964553833),
 ('newtownian', 0.7794908285140991),
 ('transluminal', 0.7785918116569519),
 ('2850', 0.777809739112854),
 ('collapsers', 0.7769001126289368)]

In [146]:
# the "classic" demonstration in doc2vec
model.wv.most_similar_cosmul(negative = ["man"], positive = ["king","woman"])

[('queen', 0.9074888825416565),
 ('fredrico', 0.8191695809364319),
 ('likedmary', 0.8183016777038574),
 ('etampes', 0.815883457660675),
 ('schahzeman', 0.805086076259613),
 ('scotsemblems', 0.8044558763504028),
 ('princess', 0.8033193349838257),
 ('ayuthaya', 0.8028899431228638),
 ('thesseus', 0.8025191426277161),
 ('aquitain', 0.8015168905258179)]

In [1]:
# try out some of the recommendations:
for rec in model.docvecs.most_similar([model['spy']]):
    print(getMedium(rec[0]), ': ', getTitle(rec[0]), rec[1])

NameError: name 'model' is not defined

In [103]:
# neat! it adds Spy Princess and Highland Brides which were not there before
for rec in model.docvecs.most_similar([model['spy'] + model['woman']]):
    print(getMedium(rec[0]), ': ', getTitle(rec[0]), rec[1])

book :  Matt Helm - Murderers' Row 0.5641220808029175
book :  Highland Scoundrel (Highland Brides) 0.5342029333114624
book :  Spy Princess: The Life of Noor Inayat Khan 0.5263435244560242
book :  The Nicholas Sparks Holiday Collection 0.5212256908416748
book :  After the Greek Affair 0.5169734954833984
book :  Papadaddy's book for new fathers 0.5104568600654602
book :  The Best of Our Spies 0.5089303255081177
book :  The Unofficial Suitor 0.5067099928855896
book :  Tempt the Devil 0.5026914477348328
book :  Date Out of Your League 0.5025393962860107


In [130]:
for rec in model.docvecs.most_similar([model['space'],model['alien'],model['invasion']]):
    print(getMedium(rec[0]), ': ', getTitle(rec[0]), rec[1])

book :  Nebula 0.5876638889312744
book :  We Will Destroy Your Planet: An Alien's Guide to Conquering the Earth (Dark) 0.5814547538757324
book :  Sherlock Holmes: The Coils of Time & Other Stories 0.5679984092712402
book :  Who Was Paul Revere? 0.5587623715400696
book :  Brother Assassin (berserker saga, two) 0.5407229065895081
book :  Aftermath (Star Trek) (Starfleet Corps of Engineers #29) 0.5397956967353821
book :  From the Sea to the Stars 0.5314494967460632
book :  George Adamski - A Herald for the Space Brothers 0.5282795429229736
movie :  4 Film Favorites: Sci-Fi  0.5274326801300049
book :  Fleet of Angels 0.5264274477958679


In [121]:
from operator import itemgetter

In [123]:
for asin in dfAll.asin.tolist()[4000:4005]:
    print("\nQuery: %s"% getTitle(asin))
    #doctag = '_'.join(preprocess(title))
    dists = []
    for m in dfAll.asin[dfAll.medium=='movie'].to_list():
        dists.append(model.docvecs.distance(asin, m))
    dists = list(zip(dfAll.asin[dfAll.medium=='movie'].to_list(), dists))
    dists.sort(key=itemgetter(1))
    for movieRec in dists[0:10]:
        print('movie:', getTitle(movieRec[0]), movieRec[1])


Query: A Moorland Hanging
movie: Indian Doctor: Series Two 0.447557270526886
movie: Star Trek: Next Generation - Chain of Command  0.49740082025527954
movie: Catherine Cookson Boxed Set 0.5029106140136719
movie: The Gene Autry Show - The Final Season 0.5066783130168915
movie: Dalziel & Pascoe Series 2  0.5081641674041748
movie: Gimme A Break: Season 2 0.5153225660324097
movie: Triple Feature: The Last House on the Left / The Strangers / A Perfect Getaway 0.5276615917682648
movie: Report to the Commissioner  0.5293062329292297
movie: In the Family 0.5334608852863312
movie: Silver Screen Romances: 8 Movie Set 0.5389738082885742

Query: An offer he can't refuse
movie: Zatoichi 23 - Zatoichi at Large 0.666980654001236
movie: MISTAKEN IDENTITY 0.6681159138679504
movie: Buffalo 66  0.6718172430992126
movie: The Bible Collection - 12  Set - TNT 0.6723695993423462
movie: Juden Chan 0.6737367510795593
movie: Porkchop 0.677799642086029
movie: Simon & Simon: Season 5 0.6794920265674591
movie: Di

In [73]:
model.save('d2v_2.model')

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [104]:
model.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)

In [131]:
model.docvecs

<gensim.models.keyedvectors.Doc2VecKeyedVectors at 0x1e2521db70>

In [137]:
model.docvecs.get_vector(dfAll.asin.iloc[0])

TypeError: unhashable type: 'list'

In [9]:
import numpy as np

In [4]:
t = np.load('d2v_2.model.docvecs.vectors_docs.npy')

NameError: name 'np' is not defined

In [5]:
model= Doc2Vec.load('webapp2/recallapp/data/d2v_2.model')

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [18]:
docvecAsins = list(model.docvecs.doctags.keys())

In [41]:
df = pd.read_csv('webapp2/recallapp/data/asinCombo.csv')

In [42]:
df.drop(df.columns[0], axis=1, inplace=True)

In [43]:
df.iloc[1].imUrl

'http://ecx.images-amazon.com/images/I/51ZSC6TKS3L.jpg'

In [53]:
asinVals = df.asin.values

In [48]:
missingAsins = []

In [54]:
for d in docvecAsins:
    if d not in asinVals:
        missingAsins.append(d)

KeyboardInterrupt: 

In [46]:
len(missingAsins)

283817

In [56]:
missingAsins = list(set(asinVals) - set(docvecAsins))

In [57]:
len(missingAsins)

141697

In [59]:
df2 = pd.read_csv('/Users/genna/Downloads/movieMetaData.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [66]:
foundSome = list(set(df2.asin.values).intersection(missingAsins))

In [67]:
len(foundSome)

2

In [76]:
df3 = pd.read_csv('/Users/genna/Downloads/books_commonReviewers_grouped.csv')

In [77]:
foundSome = list(set(df3.asin.values).intersection(missingAsins))

In [78]:
len(foundSome)

252