In [36]:
import pandas as pd
import numpy as np
import texthero as hero
from texthero import preprocessing
from texthero import stopwords
from texthero.visualization import wordcloud
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import NearestNeighbors

ModuleNotFoundError: No module named 'fuzzywuzzy'

In [2]:
# Load the per-book review text; the first CSV column is used as the row index.
df_b_r = pd.read_csv('Data/df_books_rev.csv', index_col=0)

In [3]:
# Summarise the columns, dtypes and non-null counts of the loaded reviews.
df_b_r.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 94211 entries, 0 to 94210
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   asin        94211 non-null  object
 1   reviewText  94211 non-null  object
dtypes: object(2)
memory usage: 2.2+ MB


In [4]:
# Preview the first rows of the raw review data.
df_b_r.head()

Unnamed: 0,asin,reviewText
0,B0012GTZCK,Not what I was expecting. I was looking for a ...
1,B0012LHGJ4,IT WAS DONE AS A FILM IN THE 1950-60s. VERY P...
2,B0012RMVH0,Needs pictures Excellent book. This is somethi...
3,B0012TAD1O,She always delivers with an exciting storyline...
4,B0012U0NKE,"<a data-hook=""product-link-linked"" class=""a-li..."


In [5]:
def clean_review(df):
    """Add a ``clean_text`` column with the normalised review text.

    ``df['reviewText']`` is passed through a texthero pipeline whose steps
    are, in order: lowercase, remove_punctuation, remove_stopwords, stem and
    remove_whitespace.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``reviewText`` column. It is modified in place.

    Returns
    -------
    None
    """
    # Order matters: each step receives the output of the previous one.
    steps = [
        preprocessing.lowercase,
        preprocessing.remove_punctuation,
        preprocessing.remove_stopwords,
        preprocessing.stem,
        preprocessing.remove_whitespace,
    ]
    cleaned = hero.clean(df['reviewText'], steps)
    df['clean_text'] = cleaned

In [6]:
# Adds the 'clean_text' column to df_b_r (clean_review modifies the frame in place).
clean_review(df_b_r)

In [7]:
# Key the reviews by ASIN and keep only the cleaned text.
df_b_r = df_b_r.set_index('asin').drop(columns=['reviewText'])

In [8]:
# Preview the cleaned text, now indexed by ASIN.
df_b_r.head()

Unnamed: 0_level_0,clean_text
asin,Unnamed: 1_level_1
B0012GTZCK,expect look histori christian influenc live pe...
B0012LHGJ4,done film 1950 60s popular film back today kid...
B0012RMVH0,need pictur excel book someth want book give k...
B0012TAD1O,alway deliv excit storylin lot heat cours imag...
B0012U0NKE,data hook product link link class link normal ...


In [9]:
# Fit TF-IDF on the cleaned reviews, keeping at most 250 vocabulary terms,
# and expose the result as a dense document-term frame indexed by ASIN.
tf = TfidfVectorizer(max_features=250)
dtm = pd.DataFrame(
    tf.fit_transform(df_b_r['clean_text']).toarray(),
    index=df_b_r.index,
    columns=tf.get_feature_names_out(),
)

In [10]:
# Vocabulary terms learned by the fitted vectorizer (same names as the dtm columns).
features = tf.get_feature_names_out()

In [11]:
# Preview the TF-IDF weights for the first few books.
dtm.head()

Unnamed: 0_level_0,abl,absolut,action,actual,adventur,almost,along,also,although,alway,...,work,world,worth,would,write,writer,written,year,yet,young
asin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
B0012GTZCK,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.161268,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
B0012LHGJ4,0.0,0.0,0.0,0.0,0.397054,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.201192,0.0,0.0,0.0,0.0,0.0,0.357893
B0012RMVH0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.104506,0.0,0.0
B0012TAD1O,0.0,0.0,0.097204,0.097665,0.0,0.0,0.0,0.0,0.105963,0.165358,...,0.0,0.091509,0.0,0.118635,0.071464,0.213452,0.070406,0.0,0.0,0.0
B0012U0NKE,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
# Persist the document-term frame so it can be reloaded without re-fitting TF-IDF.
dtm.to_parquet('Data/df_dtm.parquet')

In [13]:
# Load the book metadata keyed by ASIN, discarding the stray 'Unnamed: 0' column.
df_meta_all = (
    pd.read_csv('Data/meta_all.csv', index_col='asin')
    .drop(columns=['Unnamed: 0'])
)
df_meta_all.info()

<class 'pandas.core.frame.DataFrame'>
Index: 94212 entries, B0012GTZCK to B01HJENY3Y
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   title         94212 non-null  object 
 1   author        94212 non-null  object 
 2   genre         94212 non-null  object 
 3   print_length  94212 non-null  float64
 4   word_wise     94212 non-null  float64
 5   lending       94212 non-null  float64
dtypes: float64(3), object(3)
memory usage: 5.0+ MB


In [14]:
# Preview the metadata columns (title, author, genre, print_length, word_wise, lending).
df_meta_all.head()

Unnamed: 0_level_0,title,author,genre,print_length,word_wise,lending
asin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
B0012GTZCK,God in the White House,Randall Balmer,History,260.0,1.0,0.0
B0012LHGJ4,Vingt Mille Lieues sous les mers (French Editi...,Jules Verne,Reference,606.0,0.0,1.0
B0012RMVH0,Wood-Carving Design and Workmanship - Kindle e...,George Jack,Arts & Photography,310.0,0.0,0.0
B0012TAD1O,Souls Night (The Vampire Pacts) - Kindle edition,Kallysten,Romance,74.0,1.0,1.0
B0012U0NKE,The Misplaced Horse (1) eBook,Constance Downes,"Comics, Manga & Graphic Novels",444.0,0.0,1.0


In [17]:
# Build the model matrix: TF-IDF features + genre (one-hot) + print_length.
#
# Only the metadata columns the model actually uses are merged in. The TF-IDF
# vocabulary can contain tokens that are also metadata column names (e.g.
# 'author'); merging the full metadata frame would make pandas suffix both
# columns (_x/_y), so any later drop-by-name would depend on which words
# happen to be in the vocabulary. Selecting the columns up front avoids that
# and keeps the TF-IDF column names identical to `features`.
META_MODEL_COLS = ['genre', 'print_length']
model_df = dtm.merge(df_meta_all[META_MODEL_COLS], left_index=True, right_index=True)
# One-hot encode the genre so every column of model_df is numeric.
model_df = pd.get_dummies(model_df, columns=['genre'])
model_df.head()

Unnamed: 0_level_0,abl,absolut,action,actual,adventur,almost,along,also,although,alway,...,genre_Politics & Social Sciences,genre_Reference,genre_Religion & Spirituality,genre_Romance,genre_Science & Math,genre_Science Fiction & Fantasy,genre_Self-Help,genre_Sports & Outdoors,genre_Teen & Young Adult,genre_Travel
asin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
B0012GTZCK,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
B0012LHGJ4,0.0,0.0,0.0,0.0,0.397054,0.0,0.0,0.0,0.0,0.0,...,0,1,0,0,0,0,0,0,0,0
B0012RMVH0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
B0012TAD1O,0.0,0.0,0.097204,0.097665,0.0,0.0,0.0,0.0,0.105963,0.165358,...,0,0,0,1,0,0,0,0,0,0
B0012U0NKE,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
# Rescale print_length with MinMaxScaler so it is on a comparable scale to the
# TF-IDF weights and the 0/1 genre dummies.
cols = ['print_length']
scaler = MinMaxScaler()
scaled = scaler.fit_transform(model_df[cols])
# Wrap the scaled array in a frame that shares model_df's index before writing it back.
scaled_column = pd.DataFrame(data=scaled, columns=cols, index=model_df.index)
model_df['print_length'] = scaled_column['print_length']

In [19]:
# Rank every book by cosine similarity to one reference book.
# The reference row is reshaped to (1, n_features), the 2-D form cosine_similarity expects.
y = np.array(model_df.loc['B01HJBPUWA']).reshape(1, -1)
cos_sim = pd.DataFrame(data=cosine_similarity(model_df, y), index=model_df.index)
cos_sim = cos_sim.sort_values(by=0, ascending=False)
# Top 11 rows: the reference book itself plus its ten closest matches.
results = cos_sim.head(11)

In [20]:
# Display the similarity scores of the top matches (column 0 holds the cosine similarity).
results

Unnamed: 0_level_0,0
asin,Unnamed: 1_level_1
B01HJBPUWA,1.0
B00BG2WZZE,0.860933
B00B0071TC,0.85782
B00C1MZN5C,0.855378
B006RZNR3Y,0.854388
B00SC7DK2S,0.853279
B005SFRJ6K,0.853177
B00EK5IMOC,0.852926
B01DE64NKE,0.852195
B007IYKF4W,0.850208


In [None]:
# Do not truncate long cell values (e.g. full book titles) when displaying frames.
pd.set_option('display.max_colwidth', None)

In [57]:
def book_review_recommend(title=None, n_recs=None):
    """Recommend the books most similar to a given title.

    The book is found by exact match on ``df_meta_all['title']``. Every row of
    ``model_df`` is then ranked by cosine similarity to that book's row, and
    the metadata of the closest matches is returned.

    Parameters
    ----------
    title : str, optional
        Exact title to look up. Prompted for with ``input`` when omitted.
    n_recs : int, optional
        Number of recommendations to return. Prompted for when omitted.

    Returns
    -------
    pandas.DataFrame
        Rows of ``df_meta_all`` for the recommended books, most similar first,
        with display-friendly column names, integer-typed ``# Pages``,
        ``Word Wise`` and ``Lending`` columns, and ``asin`` as a regular column.

    Raises
    ------
    ValueError
        If ``n_recs`` is negative or no book with that title has a row in
        ``model_df``.
    """
    if title is None:
        title = input('Title: ')
    if n_recs is None:
        n_recs = input('How many recommendations? ')
    n_recs = int(n_recs)
    if n_recs < 0:
        raise ValueError(f'n_recs must be non-negative, got {n_recs}.')

    # An exact title match can yield zero or several ASINs; keep only those
    # that actually have a feature row, and use the first one as the query.
    candidates = df_meta_all.index[df_meta_all['title'] == title]
    candidates = [asin for asin in candidates if asin in model_df.index]
    if not candidates:
        raise ValueError(f'No book titled {title!r} found in the model data.')
    book = candidates[0]

    # .loc[[book]].iloc[[0]] always yields exactly one row, shape (1, n_features),
    # even if the ASIN appears more than once in the index.
    y = model_df.loc[[book]].iloc[[0]].to_numpy(dtype=float)
    cos_sim = pd.Series(cosine_similarity(model_df, y).ravel(), index=model_df.index)

    # Remove the query book by label instead of assuming it sorts first:
    # another book with an identical vector would tie at similarity 1.0.
    cos_sim = cos_sim.drop(index=book).sort_values(ascending=False)
    results = cos_sim.head(n_recs).index.values

    results_df = df_meta_all.loc[results].rename(
        columns={'title': 'Title', 'author': 'Author',
                 'genre': 'Genre', 'print_length': '# Pages',
                 'word_wise': 'Word Wise', 'lending': 'Lending'})
    int_cols = ['# Pages', 'Word Wise', 'Lending']
    results_df[int_cols] = results_df[int_cols].astype(int)
    return results_df.reset_index()

In [60]:
# Do not truncate long cell values, so full titles are visible in the recommendations.
pd.set_option('display.max_colwidth', None)

In [61]:
# Interactive run: prompts for a title and the number of recommendations.
book_review_recommend()

Title: The Atlantis Gene: A Thriller (The Origin Mystery, Book 1) - Kindle edition
How many recommendations? 5


Unnamed: 0,asin,Title,Author,Genre,# Pages,Word Wise,Lending
0,B01AW25GRC,Doubt (Caroline Auden Book 1) - Kindle edition,C. E. Tobisman,Literature & Fiction,348,1,0
1,B0070CSKE4,"The Twelve Stones (The Twelve Stones, Book 1) - Kindle edition",RJ Johnson,Literature & Fiction,354,1,1
2,B018SCGDWK,Dead Lawyers Don't Lie: A Gripping Thriller (Jake Wolfe Book 1) - Kindle edition,Mark Nolan,Literature & Fiction,602,1,1
3,B00KH3Z53W,Hacker For Hire (Ted Higuera Series Book 2) - Kindle edition,Pendelton Wallace,Literature & Fiction,484,1,1
4,B00NQLAPQE,On Distant Shores (Earth Exiles Book 1) eBook,Mark Harritt,Literature & Fiction,269,1,1


In [None]:
# Another interactive run of the cosine-similarity recommender (prompts for input).
book_review_recommend()

In [None]:
# Fit a nearest-neighbours index on the model matrix.
# n_neighbors=10 is only the default; kneighbors() can be given a different count per query.
knn = NearestNeighbors(n_neighbors=10)
knn.fit(model_df)

In [None]:
def book_review_recommend_knn(book=None, n_recs=None):
    """Recommend the nearest neighbours of a book using the fitted ``knn`` index.

    Parameters
    ----------
    book : str, optional
        ASIN of the query book. Prompted for with ``input`` when omitted.
    n_recs : int, optional
        Number of recommendations to return. Prompted for when omitted.

    Returns
    -------
    pandas.DataFrame
        Rows of ``df_meta_all`` for the recommended books, in the order
        returned by ``knn.kneighbors``.

    Raises
    ------
    KeyError
        If the ASIN has no row in ``model_df``.
    ValueError
        If ``n_recs`` is negative.
    """
    if book is None:
        book = input('ASIN: ')
    if n_recs is None:
        n_recs = input('How many recommendations? ')
    n_recs = int(n_recs)
    if n_recs < 0:
        raise ValueError(f'n_recs must be non-negative, got {n_recs}.')
    if book not in model_df.index:
        raise KeyError(f'ASIN {book!r} not found in the model data.')

    # Query with a one-row DataFrame so the column names match what knn was
    # fitted on; .iloc[[0]] guards against a duplicated ASIN in the index.
    x = model_df.loc[[book]].iloc[[0]]
    # Ask for one extra neighbour because the query book itself is normally among them.
    results = knn.kneighbors(x, n_recs + 1, return_distance=False).flatten()

    # Drop the query book by label (it is not guaranteed to be first when
    # other books have an identical vector), then trim to the requested count.
    index = [asin for asin in model_df.index[results] if asin != book][:n_recs]
    results_df = df_meta_all.loc[index]
    return results_df

In [None]:
# Select the cleaned review of one test book; the list selector keeps the result a DataFrame.
testbook2 = df_b_r.loc[['B01HJBPUWA']]

In [None]:
# Tokenise the test book's cleaned review and keep only the tokens that are
# part of the TF-IDF vocabulary (order and repeats are preserved).
testbook_text2 = preprocessing.tokenize(testbook2['clean_text'])
text_list2 = list(testbook_text2)[0]
vocabulary = set(features)
feature_in_text2 = list(filter(lambda token: token in vocabulary, text_list2))
text_series2 = pd.Series(feature_in_text2)