In [1]:
#!pip install texthero
import pandas as pd
import numpy as np
import texthero as hero
from texthero import preprocessing
from texthero import stopwords
from texthero.visualization import wordcloud
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import NearestNeighbors

In [31]:
df_b_r = pd.read_csv('Data/df_books_rev.csv', index_col=0)

In [32]:
df_b_r.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 94029 entries, 0 to 94028
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   asin        94029 non-null  object
 1   reviewText  94029 non-null  object
dtypes: object(2)
memory usage: 2.2+ MB


In [33]:
df_b_r.head()

Unnamed: 0,asin,reviewText
0,B0012GTZCK,Not what I was expecting. I was looking for a ...
1,B0012LHGJ4,IT WAS DONE AS A FILM IN THE 1950-60s. VERY P...
2,B0012RMVH0,Needs pictures Excellent book. This is somethi...
3,B0012TAD1O,She always delivers with an exciting storyline...
4,B0012U0NKE,"<a data-hook=""product-link-linked"" class=""a-li..."


In [34]:
def clean_review(df):
    custom_pipeline = [preprocessing.lowercase,
                  preprocessing.remove_punctuation,
                  preprocessing.remove_stopwords,
                  preprocessing.stem,
                  preprocessing.remove_whitespace]
    df['clean_text'] = hero.clean(df['reviewText'], custom_pipeline)

In [35]:
clean_review(df_b_r)

In [7]:
#default_stopwords = stopwords.DEFAULT
#custom_stopwords = default_stopwords.union(set(["book", "stori", "read", "author"]))
#df_b_r['clean_text'] = preprocessing.remove_stopwords(df_b_r['clean_text'], custom_stopwords)

In [36]:
df_b_r.set_index('asin', inplace=True)
df_b_r.drop(columns=['reviewText'], inplace=True)

In [37]:
df_b_r.head()

Unnamed: 0_level_0,clean_text
asin,Unnamed: 1_level_1
B0012GTZCK,expect look histori christian influenc live pe...
B0012LHGJ4,done film 1950 60s popular film back today kid...
B0012RMVH0,need pictur excel book someth want book give k...
B0012TAD1O,alway deliv excit storylin lot heat cours imag...
B0012U0NKE,data hook product link link class link normal ...


In [38]:
df_b_r.to_parquet('Data/df_b_r.parquet')

In [39]:
tf = TfidfVectorizer(max_features=250)
dtm = tf.fit_transform(df_b_r['clean_text'])
dtm = pd.DataFrame(dtm.todense(), columns=tf.get_feature_names_out(), index=df_b_r.index)

In [23]:
features = tf.get_feature_names_out()

In [13]:
dtm.head()

Unnamed: 0_level_0,abl,absolut,action,actual,adventur,almost,along,alreadi,also,although,...,world,worth,would,write,writer,written,wrong,year,yet,young
asin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
B0012GTZCK,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
B0012LHGJ4,0.0,0.0,0.0,0.0,0.362908,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.183927,0.0,0.0,0.0,0.0,0.0,0.0,0.327082
B0012RMVH0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.100931,0.0,0.0
B0012TAD1O,0.0,0.0,0.090261,0.090717,0.0,0.0,0.0,0.0,0.0,0.098413,...,0.085004,0.0,0.110181,0.066369,0.198286,0.065399,0.0,0.0,0.0,0.0
B0012U0NKE,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [40]:
dtm.to_parquet('Data/df_dtm.parquet')

In [41]:
df_meta_all = pd.read_csv('Data/meta_all.csv', index_col='asin')
df_meta_all.drop(columns =['Unnamed: 0'], inplace=True)
df_meta_all.info()

<class 'pandas.core.frame.DataFrame'>
Index: 94029 entries, B0012GTZCK to B01HJENY3Y
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   title         94029 non-null  object 
 1   brand         94029 non-null  object 
 2   genre         94029 non-null  object 
 3   print_length  94029 non-null  float64
 4   word_wise     94029 non-null  int64  
 5   lending       94029 non-null  int64  
dtypes: float64(1), int64(2), object(3)
memory usage: 5.0+ MB


In [42]:
df_meta_all.head()

Unnamed: 0_level_0,title,brand,genre,print_length,word_wise,lending
asin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
B0012GTZCK,God in the White House,Randall Balmer,History,260.0,1,0
B0012LHGJ4,Vingt Mille Lieues sous les mers (French Editi...,Jules Verne,Reference,606.0,0,1
B0012RMVH0,Wood-Carving Design and Workmanship - Kindle e...,George Jack,Arts & Photography,310.0,0,0
B0012TAD1O,Souls Night (The Vampire Pacts) - Kindle edition,Kallysten,Romance,74.0,1,1
B0012U0NKE,The Misplaced Horse (1) eBook,Constance Downes,"Comics, Manga & Graphic Novels",444.0,0,1


In [57]:
model_df = dtm.merge(df_meta_all, left_index=True, right_index=True)
model_df.drop(columns=['title', 'brand', 'word_wise', 'lending'], inplace=True)
model_df = pd.get_dummies(model_df, columns=['genre'])
model_df.head()

Unnamed: 0_level_0,abl,absolut,action,actual,adventur,almost,along,also,although,alway,...,genre_Politics & Social Sciences,genre_Reference,genre_Religion & Spirituality,genre_Romance,genre_Science & Math,genre_Science Fiction & Fantasy,genre_Self-Help,genre_Sports & Outdoors,genre_Teen & Young Adult,genre_Travel
asin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
B0012GTZCK,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
B0012LHGJ4,0.0,0.0,0.0,0.0,0.397003,0.0,0.0,0.0,0.0,0.0,...,0,1,0,0,0,0,0,0,0,0
B0012RMVH0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
B0012TAD1O,0.0,0.0,0.097197,0.097689,0.0,0.0,0.0,0.0,0.105977,0.165397,...,0,0,0,1,0,0,0,0,0,0
B0012U0NKE,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [58]:
cols = ['print_length']
scaler = MinMaxScaler()
scaled = scaler.fit_transform(model_df[cols])
scaled_column = pd.DataFrame(scaled, index=model_df.index, columns=cols)
model_df['print_length'] = scaled_column['print_length']

In [59]:
y = np.array(model_df.loc['B01HJBPUWA'])
y = y.reshape(1, -1)
cos_sim = cosine_similarity(model_df, y)
cos_sim = pd.DataFrame(data=cos_sim, index=model_df.index)
cos_sim.sort_values(by = 0, ascending=False, inplace=True)
results = cos_sim.head(11)

In [60]:
results

Unnamed: 0_level_0,0
asin,Unnamed: 1_level_1
B01HJBPUWA,1.0
B00BG2WZZE,0.860913
B00B0071TC,0.857805
B00C1MZN5C,0.855353
B006RZNR3Y,0.85438
B00SC7DK2S,0.853264
B005SFRJ6K,0.853168
B00EK5IMOC,0.85292
B01DE64NKE,0.852171
B007IYKF4W,0.85019


In [61]:
def book_review_recommend():
    
    book = input('ASIN: ')
    n_recs = int(input('How many recommendations? '))
    
    y = np.array(model_df.loc[book]).reshape(1, -1)
    cos_sim = cosine_similarity(model_df, y)
    cos_sim = pd.DataFrame(data=cos_sim, index=model_df.index)
    cos_sim.sort_values(by = 0, ascending = False, inplace=True)
    results = cos_sim.head(n_recs+1).index.values[1:]
    results_df = df_meta_all.loc[results]
    return results_df

In [62]:
book_review_recommend()

ASIN: B01HJBPUWA
How many recommendations? 10


Unnamed: 0_level_0,title,brand,genre,print_length,word_wise,lending
asin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
B00BG2WZZE,The Sixth Science Fiction MEGAPACK&reg;,Johnston McCulley,Science Fiction & Fantasy,666.0,1,0
B00B0071TC,Omega Zero eBook,Ryan Henry,Science Fiction & Fantasy,228.0,1,1
B00C1MZN5C,The Grimm Diaries Prequels volume 11- 14,Cameron Jace,Science Fiction & Fantasy,159.0,1,1
B006RZNR3Y,Lacuna eBook,David Adams,Science Fiction & Fantasy,382.0,1,1
B00SC7DK2S,Betrayal&#39;s Price (In Deception&#39;s Shad...,Lisa Blackwood,Science Fiction & Fantasy,311.0,1,1
B005SFRJ6K,Infinity Blade,Brandon Sanderson,Science Fiction & Fantasy,124.0,1,0
B00EK5IMOC,Kindreds,Tani Mura,Science Fiction & Fantasy,330.0,1,1
B01DE64NKE,The Doomsday Chronicles (Future Chronicles Bo...,Samuel Peralta,Science Fiction & Fantasy,438.0,1,0
B007IYKF4W,Trixie &amp; Me (Galactic Exploration Book 2)...,Peter Cawdron,Science Fiction & Fantasy,89.0,1,1
B00B2QZH68,The Runner (Silo Submerged Book 1) eBook,WJ Davies,Science Fiction & Fantasy,61.0,1,1


In [None]:
book_review_recommend()

In [None]:
book_review_recommend()

In [None]:
book_review_recommend()

In [None]:
book_review_recommend()

In [None]:
book_review_recommend()

In [None]:
book_review_recommend()

In [None]:
knn = NearestNeighbors(n_neighbors=10)
knn.fit(model_df)

In [None]:
def book_review_recommend_knn():
    
    book = input('ASIN: ')
    n_recs = int(input('How many recommendations? '))
    
    x = np.array(model_df.loc[book]).reshape(1, -1)
    results = knn.kneighbors(x, n_recs+1, return_distance=False).flatten()
    index = model_df.iloc[results].index.values[1:]
    results_df = df_meta_all.loc[index]
    return results_df

In [None]:
book_review_recommend_knn()

In [None]:
book_review_recommend()

In [None]:
testbook = df_b_r.loc[['B01HJBPUWA']]

In [None]:
testbook_text = preprocessing.tokenize(testbook['clean_text'])

In [None]:
text_list = list(testbook_text)[0]

In [None]:
feature_in_text = [x for x in text_list if x in features]

In [None]:
text_series = pd.Series(feature_in_text)

In [None]:
text_series

In [None]:
wordcloud(text_series)

In [None]:
testbook2 = df_b_r.loc[['B01HJBPUWA']]

In [None]:
testbook_text2 = preprocessing.tokenize(testbook2['clean_text'])
text_list2 = list(testbook_text2)[0]
feature_in_text2 = [x for x in text_list2 if x in features]
text_series2 = pd.Series(feature_in_text2)

In [None]:
wordcloud(text_series2)

In [None]:
testbook2 = df_b_r.loc[['B00BG2WZZE']]
wordcloud(testbook2['clean_text'])