In [0]:
# Mount Google Drive at /content/gdrive; the data and model paths used below live under this mount point.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
import gzip
import json
import re
import os
import sys
import numpy as np
import pandas as pd
import sqlite3
from sqlite3 import Error
pd.options.display.float_format = '{:,}'.format

In [0]:
# Fetch the NLTK packages named below (stop words, averaged perceptron tagger, WordNet).
import nltk
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [0]:
# Data folder on the mounted Drive; DIR_GENRE is its genre/ sub-folder (with trailing slash).
DIR = '/content/gdrive/My Drive/Goodreads/data'
DIR_GENRE = DIR + '/genre/'

In [0]:
# Load the review table (df_reviews1000.csv) from the data folder.
df_reviews = pd.read_csv(os.path.join(DIR, 'df_reviews1000.csv'))

In [0]:
# Show the table dimensions, then preview the first rows.
print(df_reviews.shape)
df_reviews.head()

(1000, 8)


Unnamed: 0,book_id,review_text,average_rating,image_url,title,description,ratings_count,review_clean
0,2767052,I cracked and finally picked this up. Very enj...,4.34,https://images.gr-assets.com/books/1447303603m...,"The Hunger Games (The Hunger Games, #1)",Winning will make you famous.\nLosing means ce...,4899965,crack finally pick enjoyable quick read put li...
1,3,Tuve el gusto de leerlo antes de que saliera l...,4.45,https://images.gr-assets.com/books/1474154022m...,Harry Potter and the Sorcerer's Stone (Harry P...,Harry Potter's life is miserable. His parents ...,4765497,tuve el gusto de leerlo ante de que saliera la...
2,41865,"If I was 15, I would have LOVED this. It wasn'...",3.57,https://images.gr-assets.com/books/1361039443m...,"Twilight (Twilight, #1)",About three things I was absolutely positive.\...,3941381,would love terrible fact perfect quick read ra...
3,2657,"Still relevant and beautiful., R.I.P. Ms. Harp...",4.26,https://images.gr-assets.com/books/1361975680m...,To Kill a Mockingbird,The unforgettable novel of a childhood in a sl...,3255518,still relevant beautiful r.i.p harper lee feb ...
4,4671,I read this classic twice in high school. Its ...,3.89,https://images.gr-assets.com/books/1490528560m...,The Great Gatsby,"THE GREAT GATSBY, F. Scott Fitzgerald's third ...",2758812,read classic twice high school hard like book ...


In [0]:
# Fit a TF-IDF model on the cleaned review text.
# The vocabulary and IDF weights are learned from every row of df_reviews['review_clean'].

import pickle
from sklearn.feature_extraction.text import TfidfVectorizer

tf = TfidfVectorizer(analyzer='word',
                     min_df=10,
                     ngram_range=(1, 2),
                     #max_features=1000,
                     stop_words='english')

# Learn the vocabulary and build the document-term matrix in a single pass
# (same result as calling fit() and then transform() on the same data).
tfidf_matrix = tf.fit_transform(df_reviews['review_clean'])

# Persist the fitted vectorizer; the `with` block makes sure the file is
# flushed and closed before the cell finishes.
path_model = '/content/gdrive/My Drive/Goodreads/models/'
os.makedirs(path_model, exist_ok=True)
with open(path_model+'tfidf_model.pkl', "wb") as model_file:
    pickle.dump(tf, model_file)


print(tfidf_matrix.shape)

(1000, 755652)


In [0]:
# Compress the TF-IDF matrix with truncated SVD.
from sklearn.decomposition import TruncatedSVD

# A fixed random_state keeps the fitted components identical across
# Restart-and-Run-All; the seed value matches the one used for Doc2Vec.
svd = TruncatedSVD(n_components=500, random_state=0)
latent_matrix = svd.fit_transform(tfidf_matrix)

# Persist the fitted SVD model; the `with` block closes the file handle.
with open(path_model+"svd_model.pkl", "wb") as model_file:
    pickle.dump(svd, model_file)

print(latent_matrix.shape)

(1000, 500)


In [0]:
n = 100  # number of leading SVD components to keep
# Use elbow and cumulative explained-variance plots to pick the number of components;
# the retained components should explain a high amount of variance.
doc_labels = df_reviews[['book_id','title','average_rating','image_url']]

# Index labels must be one-dimensional, so a DataFrame cannot be passed as
# `index` directly; the label columns are converted to a MultiIndex with one
# level per column instead.
svd_feature_matrix = pd.DataFrame(latent_matrix[:, 0:n],
                                  index=pd.MultiIndex.from_frame(doc_labels))
print(svd_feature_matrix.shape)

# Persist the embeddings; the `with` block closes the file handle.
with open(path_model+"lsa_embeddings.pkl", "wb") as embeddings_file:
    pickle.dump(svd_feature_matrix, embeddings_file)

(1000, 100)


Doc2Vec Model



In [0]:
import re
import gensim

# Use the cleaned reviews as the training vocabulary.
reviews = df_reviews.review_clean.values.tolist()

# Tokenise each review: every non-word character becomes a space, then the
# text is split on whitespace. The pattern is a raw string so that \w reaches
# the regex engine instead of being parsed as an (invalid) string escape.
non_word = re.compile(r"[^\w]")
documents = [non_word.sub(" ", review).split() for review in reviews]

In [24]:
# Sanity check: the number of token lists should equal the number of review rows.
print(len(df_reviews))
print(len(documents))

1000
1000


In [0]:
import re
import gensim

# Wrap each token list in a TaggedDocument whose single tag is the list's
# position in `documents`.
formatted_documents = []
for position, tokens in enumerate(documents):
    formatted_documents.append(gensim.models.doc2vec.TaggedDocument(tokens, [position]))

# Configure the Doc2Vec model, then build its vocabulary from the tagged documents.
model = gensim.models.doc2vec.Doc2Vec(
    dm=1,
    vector_size=100,
    window=3,
    min_count=5,
    epochs=200,
    seed=0,
)
model.build_vocab(formatted_documents)

In [26]:
# Number of tagged documents the vocabulary was built from.
len(formatted_documents)

1000

In [27]:
# Train on the tagged documents using the example count and epoch count stored on the model; %time reports the duration.
%time model.train(formatted_documents, total_examples=model.corpus_count, epochs=model.epochs)

CPU times: user 1h 41min 11s, sys: 20.6 s, total: 1h 41min 32s
Wall time: 36min 55s


In [28]:
from gensim.test.utils import get_tmpfile

# NOTE(review): `fname` is assigned here but is not used by the save/load calls below — confirm whether it is still needed.
fname = get_tmpfile(path_model+"doc2vec_model")
# Save the trained model under path_model, then reload it from the same file.
model.save(path_model+"doc2vec_model")
model = gensim.models.doc2vec.Doc2Vec.load(path_model+"doc2vec_model")

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [29]:
# Smoke test: infer a vector for a small token list with the reloaded model and display it.
vector = model.infer_vector(doc_words=["this", "is", "a", "test"], epochs=50)
vector

array([ 0.39108917,  0.05473473, -0.21939065, -0.32847852,  0.5202276 ,
       -0.27876744,  0.04379517,  0.07739755, -0.01308818,  0.08775933,
       -0.18052739,  0.18347256, -0.23883481,  0.12862064,  0.04611352,
       -0.27762583, -0.27841395,  0.15595004,  0.10337353,  0.03625776,
       -0.5964288 ,  0.04668012,  0.08189996, -0.06802316,  0.07210035,
        0.31003597, -0.09062755,  0.46164316,  0.29247853,  0.42368692,
        0.20929736,  0.08012825,  0.39776835, -0.15439355, -0.42472976,
       -0.27936187, -0.03492057, -0.25677806,  0.23903026,  0.16261692,
       -0.21710336, -0.34183028, -0.24077508,  0.01072529, -0.16878504,
        0.28701925,  0.20691174,  0.22889   ,  0.06651591,  0.1551375 ,
       -0.05615679, -0.15715086,  0.3522663 ,  0.02767293,  0.6035184 ,
       -0.20143166,  0.44172716,  0.43059564,  0.1191323 ,  0.22162677,
       -0.17100157,  0.14422834, -0.04273085, -0.05375472, -0.1363518 ,
        0.15275182, -0.93235064,  0.1286226 , -0.4132261 ,  0.31

In [30]:
# Pair the document vectors held by the model with the book label columns.
# Index labels must be one-dimensional, so a DataFrame cannot be passed as
# `index` directly; the label columns are converted to a MultiIndex with one
# level per column instead.
doctovec_feature_matrix = pd.DataFrame(model.docvecs.vectors_docs,
                                       index=pd.MultiIndex.from_frame(doc_labels))
print(doctovec_feature_matrix.shape)

# Persist the embeddings; the `with` block closes the file handle.
with open(path_model+"doctovec_embeddings.pkl", "wb") as embeddings_file:
    pickle.dump(doctovec_feature_matrix, embeddings_file)

(1000, 100)


In [31]:
# Display the label columns that were supplied as the index of both embedding matrices.
doc_labels

Unnamed: 0,book_id,title,average_rating,image_url
0,2767052,"The Hunger Games (The Hunger Games, #1)",4.34,https://images.gr-assets.com/books/1447303603m...
1,3,Harry Potter and the Sorcerer's Stone (Harry P...,4.45,https://images.gr-assets.com/books/1474154022m...
2,41865,"Twilight (Twilight, #1)",3.57,https://images.gr-assets.com/books/1361039443m...
3,2657,To Kill a Mockingbird,4.26,https://images.gr-assets.com/books/1361975680m...
4,4671,The Great Gatsby,3.89,https://images.gr-assets.com/books/1490528560m...
...,...,...,...,...
995,6584188,"It's Not Summer Without You (Summer, #2)",4.14,https://images.gr-assets.com/books/1479864019m...
996,13615258,Free Four: Tobias Tells the Divergent Knife-Th...,4.18,https://images.gr-assets.com/books/1399503802m...
997,7779,Horton Hears a Who!,4.16,https://images.gr-assets.com/books/1327924655m...
998,334643,"Abhorsen (Abhorsen, #3)",4.29,https://images.gr-assets.com/books/1401683669m...
