In [1]:
from itertools import chain
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import spacy
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from tqdm import tqdm
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


tqdm.pandas()

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df = pd.read_pickle('5p_books.pickle')
df = df.sample(frac=0.5, random_state = 1, ignore_index = True)
df.head()

Unnamed: 0,series,country_code,language_code,is_ebook,average_rating,similar_books,description,authors,publisher,num_pages,publication_year,book_id,title,title_without_series,genres,text
0,[],US,,False,3.51,[],The sequel to Zenescope's hit Grimm Fairy Tale...,[Joe Brusha],Zenescope Entertainment,120.0,2012,14654379,Neverland: Hook,Neverland: Hook,"{'comics, graphic': 38, 'fantasy, paranormal':...",Neverland: Hook The sequel to Zenescope's hit ...
1,[Maggie's Grove],US,en-GB,True,3.88,"[The Geek Job, Rare Vintage (Lords of the Were...",A seduction-gone-wrong leaves vampire Parker H...,[Dana Marie Bell],Carina Press,,2011,11068029,"Blood of the Maple (Maggie's Grove, #1)","Blood of the Maple (Maggie's Grove, #1)","{'fantasy, paranormal': 185, 'romance': 129, '...","Blood of the Maple (Maggie's Grove, #1) A sedu..."
2,[],US,,False,3.33,[],,[Nicci Talbot],Amorata Press,128.0,2008,2412568,Going Down: An Illustrated Guide to Giving Him...,Going Down: An Illustrated Guide to Giving Him...,{},Going Down: An Illustrated Guide to Giving Him...
3,[],US,,False,3.93,[],When Alan Greenberg first showed up at Werner ...,"[Alan Greenberg, Werner Herzog]",Chicago Review Press,224.0,2012,13225960,Every Night the Trees Disappear: Werner Herzog...,Every Night the Trees Disappear: Werner Herzog...,"{'non-fiction': 4, 'history, historical fictio...",Every Night the Trees Disappear: Werner Herzog...
4,[],US,en-GB,True,4.71,[],The Bodyguard's Vow novella is a lighthearted ...,[Regan Black],"Getaway Reads, LLC (April 23, 2013)",,2013,17875022,The Bodyguard's Vow,The Bodyguard's Vow,"{'romance': 1, 'fantasy, paranormal': 1}",The Bodyguard's Vow The Bodyguard's Vow novell...


In [3]:
nlp = spacy.load('en_core_web_lg')

def text_process(text:str, nlp, lemmatize = False):
    """
    Use spacy lemmatizer to tokenize or lemmatize text. Remove stopwords, punctuation
    Input: text (string)
    Output: list of tokens """
    if lemmatize:
        return ' '.join([token.lemma_ for token in nlp(text.strip(), disable=["parser", "ner"]) if not token.is_stop and not token.is_punct ])
    else:
        
        text = ' '.join([token.text for token in nlp(text.strip()) 
                     if not token.like_email and not token.like_url and not token.is_space ])

        return text

def genre_extractor(d):
    """Extract keys from dict into a flatten list"""
    keys = [key.split(',') for key in d.keys()]
    keys = list(chain(*keys))
    keys = [key.strip() for key in keys]
    return keys



def unpack_list(l):
    """Unpack elements of a list and join them into a string"""
    return ', '.join([item for item in l])

print(genre_extractor(df.genres.iloc[0]))
print(unpack_list(df.series.iloc[0]))

['comics', 'graphic', 'fantasy', 'paranormal', 'fiction']



In [4]:
# drop empty rows
df = df.dropna(how = 'all')
#Lemmatize, remove stopwords, punctuation
df['Clean_text'] = df.description.progress_apply(text_process, nlp = nlp, lemmatize = True)
#Unpack series, ooks, authors
df['series'] = df.series.progress_apply(unpack_list)
df['similar_books'] = df.similar_books.progress_apply(unpack_list)
df['authors']= df.authors.progress_apply(unpack_list)
#Get list of genres
df['genre_keys'] = df.genres.progress_apply(genre_extractor)
# Concatenate Lammetized description, serie, authors, similar books
df['Clean_text'] = df.title + df.Clean_text + ' ' + df.series+ ' ' +df.authors+' ' +df.similar_books
#Convert str to int
df['num_pages'] = pd.to_numeric(df.loc[:,'num_pages'], downcast="integer")
df['num_pages'] = df.num_pages.fillna(df.num_pages.mean())
df['average_rating'] = pd.to_numeric(df.loc[:,'average_rating'], downcast="integer")
df['publication_year'] = df.loc[:,'publication_year'].astype(int)
df['is_ebook'] = df['is_ebook'].map({'false':0, 'true':1})
df['is_ebook'] = df.is_ebook.fillna(0)


100%|██████████| 33581/33581 [03:12<00:00, 174.15it/s]
100%|██████████| 33581/33581 [00:00<00:00, 524700.11it/s]
100%|██████████| 33581/33581 [00:00<00:00, 375862.74it/s]
100%|██████████| 33581/33581 [00:00<00:00, 473524.52it/s]
100%|██████████| 33581/33581 [00:00<00:00, 320274.24it/s]


In [5]:
genres = df.genre_keys.tolist()
genres = set(list(chain(*genres)))
genres

{'biography',
 'children',
 'comics',
 'crime',
 'fantasy',
 'fiction',
 'graphic',
 'historical fiction',
 'history',
 'mystery',
 'non-fiction',
 'paranormal',
 'poetry',
 'romance',
 'thriller',
 'young-adult'}

In [6]:
columns = ['is_ebook', 
       #'num_pages', 'publication_year',
       #'genre_keys','average_rating',
       ]
# keep following columns: ['is_ebook',   'format', 'num_pages', 'genre_keys']
df_bin = df.loc[:, df.columns.isin(columns)]
df_bin.head()

Unnamed: 0,is_ebook
0,0
1,1
2,0
3,0
4,1


In [7]:
#genres 
for genre in genres:
    print(genre)
    df_bin[genre] = df.genre_keys.progress_apply(lambda x: genre in x)
    
#Rating columns
rating_columns = ['avg_rating>={}'.format(i) for i in range(1,5)]+ ['avg_rating>=4.5']

for rating_column in rating_columns:
    df_bin[rating_column] = 0
    #print(type(float(rating_column.split('=')[-1])))
    df_bin.loc[df['average_rating'] > float(rating_column.split('=')[-1]), rating_column] = 1


#Publication year
publication_y_cols = ["published_before_70's"]+ ["published_in_the_{:02d}'s".format(i%100) for i in range(70, 111, 10)]
publication_y_cols

for col in publication_y_cols:
    df_bin[col] = 0
df_bin.loc[df['publication_year'] < 1970, publication_y_cols[0]] = 1
df_bin.loc[((df['publication_year'] >= 1970) & (df['publication_year'] < 1980)), publication_y_cols[1]] = 1
df_bin.loc[((df['publication_year'] >= 1980) & (df['publication_year'] < 1990)), publication_y_cols[2]] = 1
df_bin.loc[((df['publication_year'] >= 1990 )& (df['publication_year'] < 2000)), publication_y_cols[3]] = 1
df_bin.loc[((df['publication_year'] >= 2000) & (df['publication_year'] < 2010)), publication_y_cols[4]] = 1
df_bin.loc[df['publication_year'] >= 2010, publication_y_cols[5]] = 1

#num_pages
page_columns = ['#Pages>200', '#Pages>250', '#Pages>300', '#Pages>500'] #[q33, q45, q72,  q95]
for page_col in page_columns:
    df_bin[page_col] = 0
    df_bin.loc[df['num_pages'] > float(page_col.split('>')[-1]), page_col] = 1

romance


100%|██████████| 33581/33581 [00:00<00:00, 507924.27it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_bin[genre] = df.genre_keys.progress_apply(lambda x: genre in x)


crime


100%|██████████| 33581/33581 [00:00<00:00, 529367.92it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_bin[genre] = df.genre_keys.progress_apply(lambda x: genre in x)


graphic


100%|██████████| 33581/33581 [00:00<00:00, 512476.07it/s]

history



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_bin[genre] = df.genre_keys.progress_apply(lambda x: genre in x)
100%|██████████| 33581/33581 [00:00<00:00, 513756.53it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_bin[genre] = df.genre_keys.progress_apply(lambda x: genre in x)


children


100%|██████████| 33581/33581 [00:00<00:00, 523845.37it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_bin[genre] = df.genre_keys.progress_apply(lambda x: genre in x)


biography


100%|██████████| 33581/33581 [00:00<00:00, 525903.01it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_bin[genre] = df.genre_keys.progress_apply(lambda x: genre in x)


fantasy


100%|██████████| 33581/33581 [00:00<00:00, 512866.07it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_bin[genre] = df.genre_keys.progress_apply(lambda x: genre in x)


non-fiction


100%|██████████| 33581/33581 [00:00<00:00, 520275.28it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_bin[genre] = df.genre_keys.progress_apply(lambda x: genre in x)


fiction


100%|██████████| 33581/33581 [00:00<00:00, 508724.13it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_bin[genre] = df.genre_keys.progress_apply(lambda x: genre in x)


thriller


100%|██████████| 33581/33581 [00:00<00:00, 519172.57it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_bin[genre] = df.genre_keys.progress_apply(lambda x: genre in x)


young-adult


100%|██████████| 33581/33581 [00:00<00:00, 517754.59it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_bin[genre] = df.genre_keys.progress_apply(lambda x: genre in x)


poetry


100%|██████████| 33581/33581 [00:00<00:00, 519925.74it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_bin[genre] = df.genre_keys.progress_apply(lambda x: genre in x)


comics


100%|██████████| 33581/33581 [00:00<00:00, 521222.53it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_bin[genre] = df.genre_keys.progress_apply(lambda x: genre in x)


paranormal


100%|██████████| 33581/33581 [00:00<00:00, 524729.43it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_bin[genre] = df.genre_keys.progress_apply(lambda x: genre in x)


historical fiction


100%|██████████| 33581/33581 [00:00<00:00, 524311.42it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_bin[genre] = df.genre_keys.progress_apply(lambda x: genre in x)


mystery


100%|██████████| 33581/33581 [00:00<00:00, 515124.41it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_bin[genre] = df.genre_keys.progress_apply(lambda x: genre in x)


In [8]:
df_bin.head()

Unnamed: 0,is_ebook,romance,crime,graphic,history,children,biography,fantasy,non-fiction,fiction,...,published_before_70's,published_in_the_70's,published_in_the_80's,published_in_the_90's,published_in_the_00's,published_in_the_10's,#Pages>200,#Pages>250,#Pages>300,#Pages>500
0,0,False,False,True,False,False,False,True,False,True,...,0,0,0,0,0,1,0,0,0,0
1,1,True,False,False,False,False,False,True,False,True,...,0,0,0,0,0,1,1,1,0,0
2,0,False,False,False,False,False,False,False,False,False,...,0,0,0,0,1,0,0,0,0,0
3,0,False,False,False,True,False,True,False,True,False,...,0,0,0,0,0,1,1,0,0,0
4,1,True,False,False,False,False,False,True,False,False,...,0,0,0,0,0,1,1,1,0,0


In [9]:
df_bin.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33581 entries, 0 to 33580
Data columns (total 32 columns):
 #   Column                 Non-Null Count  Dtype
---  ------                 --------------  -----
 0   is_ebook               33581 non-null  int64
 1   romance                33581 non-null  bool 
 2   crime                  33581 non-null  bool 
 3   graphic                33581 non-null  bool 
 4   history                33581 non-null  bool 
 5   children               33581 non-null  bool 
 6   biography              33581 non-null  bool 
 7   fantasy                33581 non-null  bool 
 8   non-fiction            33581 non-null  bool 
 9   fiction                33581 non-null  bool 
 10  thriller               33581 non-null  bool 
 11  young-adult            33581 non-null  bool 
 12  poetry                 33581 non-null  bool 
 13  comics                 33581 non-null  bool 
 14  paranormal             33581 non-null  bool 
 15  historical fiction     33581 non-nul

In [10]:
tfidf = TfidfVectorizer(ngram_range=(1,2), max_df = 0.4, stop_words='english')
tfidf_matrix = tfidf.fit_transform(df.Clean_text)
cosine_sim_txt = np.dot(tfidf_matrix,tfidf_matrix.T).toarray()

#cosine_sim_txt = pd.DataFrame(cosine_sim_txt, index=df.index, columns=df.index)

In [11]:
jac_sim = 1- pairwise_distances(df_bin, metric = "hamming")
#jac_sim = pd.DataFrame(jac_sim, index=df.index, columns=df.index)

In [35]:
def get_recommendations(titles, similarity, df, n=20):
    """Get recommendations based a given similarity table"""
    indices = pd.Series(df.index, index=df['title'])
    #print(indices.head())
    idxs = [indices[title] for title in titles]
    all_similarity_df = pd.DataFrame()
    for idx in idxs:
        #print(all_similarity_df.head())
        # pairwsie similarity scores

        sim_scores = list(enumerate(similarity[idx]))
        # sorting
        #sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True) 
        #sim_scores = sim_scores[1:n] #index 0 is the same book
        #print(sim_scores[:10])
        book_indices = [i[1] for i in sim_scores if i[0] not in idxs]
        #print(article_indices[:10])
        similarity_df = pd.DataFrame(book_indices, columns=[idx])
        all_similarity_df = pd.concat([all_similarity_df, similarity_df],axis =1)
        #print(article_indices)
    #article_indices = [i for i in article_indices if i not in idxs] #Remove index of read books
    all_similarity_df['max_similarity'] = all_similarity_df.max(axis=1)
    return all_similarity_df.max_similarity.tolist()

def get_all_recommendations(titles, similarities, df, n=20):
    """Get similarities given a list of similarity tables"""
    sim_df = pd.DataFrame()
    for similarity in similarities:
        tmp_df = pd.DataFrame(get_recommendations(titles, similarity, df, n))
        sim_df = pd.concat([sim_df, tmp_df], axis=1)
    print(sim_df.head())
    sim_df['similarity_score'] = sim_df.sum(axis=1)/sim_df.shape[1]
    book_indices = sorted(list(enumerate(sim_df.similarity_score)), key=lambda x: x[1], reverse=True) 
    top_n_idx = [x[0] for x in book_indices][:n]
    top_n_sim = [x[1] for x in book_indices][:n]
    print(top_n_sim)
    #article_indices = list(all_similarity_df.drop_duplicates(keep="first")['indices']) #keep first occurrence of list
    return df['title'].iloc[top_n_idx]


In [16]:
#Generate doc2bow models

tagged_data = [TaggedDocument(words=doc.split(' '), tags=[i]) for i, doc in enumerate(df.Clean_text)]
model_d2v = Doc2Vec(vector_size=100,alpha=0.025, min_count=1)
  
model_d2v.build_vocab(tagged_data)

for epoch in tqdm(range(100)):
    model_d2v.train(tagged_data,
                total_examples=model_d2v.corpus_count,
                epochs=model_d2v.epochs)
    
document_embeddings=np.zeros((df.shape[0],100))

for i in range(len(document_embeddings)):
    document_embeddings[i]=model_d2v.docvecs[i]
    
    
doc2bow_sim=cosine_similarity(document_embeddings)

100%|██████████| 100/100 [44:03<00:00, 26.44s/it]
  document_embeddings[i]=model_d2v.docvecs[i]


In [37]:
print(get_all_recommendations(["Miss Fortune (Poison Apple #3)", "Who Stole Alligator's Shoe?", "The Unround Circle"], similarities=[jac_sim, doc2bow_sim], df = df))

         0         0
0  0.71875  0.412034
1  0.75000  0.371968
2  0.81250  0.430004
3  0.75000  0.319309
4  0.84375  0.418888
[0.8436062076196646, 0.8230253453828307, 0.8208633927204041, 0.8175156080884871, 0.814813294328836, 0.8141691256384336, 0.8136436780122307, 0.8118025090491727, 0.8076577335637354, 0.8046392889967215, 0.8045409878831087, 0.8038979822344015, 0.803656776661414, 0.8032579752314765, 0.80308698117489, 0.8018623379102091, 0.8015011628995301, 0.8004100706716216, 0.8002686024233217, 0.7991652103376008]
2230                                         Skymningsland
21574    Heroes for All Time: Connecticut Civil War Sol...
12428                               Miss Appleby's Academy
26989                                         سماوات جائعة
26932                                     Praying for Time
12065            A Companion to Wolves (Iskryne World, #1)
28078    Energize!: Energizers and Other Great Cooperat...
24619                                         Straw Writes
6895 