In [2]:
# Configuration for JupyterLab
%config IPCompleter.greedy=True

# Importing Required Libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os
import random
from tqdm import tqdm
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
import warnings
import os
warnings.filterwarnings("ignore")

In [3]:
# Read in the Data from a CSV
users_csv = pd.read_csv('data/Users.csv', encoding='ISO-8859-1', delimiter=';')
books_csv = pd.read_csv('data/Books.csv', error_bad_lines=False, encoding='ISO-8859-1', delimiter=';')
ratings_csv = pd.read_csv('data/Ratings.csv', error_bad_lines=False, encoding='ISO-8859-1', delimiter=';')


b'Skipping line 6452: expected 8 fields, saw 9\nSkipping line 43667: expected 8 fields, saw 10\nSkipping line 51751: expected 8 fields, saw 9\n'
b'Skipping line 92038: expected 8 fields, saw 9\nSkipping line 104319: expected 8 fields, saw 9\nSkipping line 121768: expected 8 fields, saw 9\n'
b'Skipping line 144058: expected 8 fields, saw 9\nSkipping line 150789: expected 8 fields, saw 9\nSkipping line 157128: expected 8 fields, saw 9\nSkipping line 180189: expected 8 fields, saw 9\nSkipping line 185738: expected 8 fields, saw 9\n'
b'Skipping line 209388: expected 8 fields, saw 9\nSkipping line 220626: expected 8 fields, saw 9\nSkipping line 227933: expected 8 fields, saw 11\nSkipping line 228957: expected 8 fields, saw 10\nSkipping line 245933: expected 8 fields, saw 9\nSkipping line 251296: expected 8 fields, saw 9\nSkipping line 259941: expected 8 fields, saw 9\nSkipping line 261529: expected 8 fields, saw 9\n'


In [4]:
# Create the Dataset
dataset = pd.merge(ratings_csv, users_csv, on='User-ID', how='inner')
dataset = pd.merge(dataset, books_csv, on='ISBN', how='inner')
dataset.drop(columns=['Image-URL-M', 'Image-URL-L', 'Image-URL-S'], inplace=True)
dataset

Unnamed: 0,User-ID,ISBN,Book-Rating,Location,Age,Book-Title,Book-Author,Year-Of-Publication,Publisher
0,276725,034545104X,0,"tyler, texas, usa",,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books
1,2313,034545104X,5,"cincinnati, ohio, usa",23.0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books
2,6543,034545104X,0,"strafford, missouri, usa",34.0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books
3,8680,034545104X,5,"st. charles county, missouri, usa",2.0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books
4,10314,034545104X,9,"beaverton, oregon, usa",,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books
5,23768,034545104X,0,"st. louis, missouri, usa",45.0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books
6,28266,034545104X,0,"portland, oregon, usa",,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books
7,28523,034545104X,0,"springfield, missouri, usa",24.0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books
8,39002,034545104X,0,"san jose, ,",,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books
9,50403,034545104X,9,"conway, arkansas, usa",,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books


In [5]:
# Find Issues with Data
null_age = dataset['Age'].isnull().sum()
null_author = dataset['Book-Author'].isnull().sum()
null_publisher = dataset['Publisher'].isnull().sum()
null_rating = dataset['Book-Rating'].isnull().sum()
null_publish_date = dataset['Year-Of-Publication'].isnull().sum()

In [6]:
# Handle Null Data
dataset['Book-Rating'] = dataset['Book-Rating'].replace(0, None)
dataset['Publisher'] = dataset['Publisher'].fillna('Unknown')
dataset['Book-Author'] = dataset['Book-Author'].fillna('Unknown')

In [7]:
# Handle Outliers
upper_range = dataset['Age'].median() + dataset['Age'].std()
lower_range = dataset['Age'].median() - dataset['Age'].std()
possible_ages = np.random.randint(lower_range, upper_range, size=null_age)
age_copy = dataset['Age'].copy()
age_copy[pd.isnull(age_copy)] = possible_ages
dataset['Age'] = age_copy.astype(int)
dataset

Unnamed: 0,User-ID,ISBN,Book-Rating,Location,Age,Book-Title,Book-Author,Year-Of-Publication,Publisher
0,276725,034545104X,0,"tyler, texas, usa",26,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books
1,2313,034545104X,5,"cincinnati, ohio, usa",23,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books
2,6543,034545104X,5,"strafford, missouri, usa",34,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books
3,8680,034545104X,5,"st. charles county, missouri, usa",2,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books
4,10314,034545104X,9,"beaverton, oregon, usa",42,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books
5,23768,034545104X,9,"st. louis, missouri, usa",45,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books
6,28266,034545104X,9,"portland, oregon, usa",43,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books
7,28523,034545104X,9,"springfield, missouri, usa",24,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books
8,39002,034545104X,9,"san jose, ,",27,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books
9,50403,034545104X,9,"conway, arkansas, usa",22,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books


In [7]:
# Remove Extraneous Information in Location
location_copy = dataset['Location'].copy()
location_copy.shape
for item in range(location_copy.shape[0]):
    location_copy[item] = str(location_copy[item]).split(',')[-1]
    if(location_copy[item] == None or location_copy[item]=="") :
        location_copy[item] = "usa"
dataset['Location'] = location_copy

In [8]:
# Create Training and Testing
X_train, X_test, y_train, y_test = train_test_split(dataset['User-ID'].unique(), dataset['User-ID'].unique(), test_size=0.25, shuffle=True)
train_dataset = dataset[dataset['User-ID'].isin(X_train)]
test_dataset = dataset[dataset['User-ID'].isin(X_test)]

In [52]:
# Create searchable dictionary
all_books = test_dataset[['ISBN', 'Book-Title']].copy()
all_books.drop_duplicates(inplace=True, subset='ISBN', keep='last')
all_books_dict = all_books.groupby('ISBN')['Book-Title'].apply(list).to_dict()

In [9]:
# Load Word2Vec Model if already Generated
model = Word2Vec.load("embedding_model_1.model")

In [26]:
# Data for Training Embeddings
reading_training = list()
for i in tqdm(train_dataset['User-ID'].unique()):
    reading_training.append(train_dataset[train_dataset['User-ID'] == i]['ISBN'].tolist())

100%|██████████| 69079/69079 [04:04<00:00, 282.94it/s]


In [27]:
# Data for Validating Embeddings
reading_validation = list()
for i in tqdm(test_dataset['User-ID'].unique()):
    reading_validation.append(test_dataset[test_dataset['User-ID'] == i]['ISBN'].tolist())

100%|██████████| 23027/23027 [00:51<00:00, 444.66it/s]


In [30]:
# Train Embeddings
model = Word2Vec(window=9,sg=1,hs=0,negative=10,alpha=.0290,min_alpha=.0008)
model.build_vocab(reading_training, progress_per=300)
model.train(reading_training, total_examples=model.corpus_count, epochs=15, report_delay=1)

(6921015, 11517120)

In [38]:
# Save Model for Later Usage
model.save("embedding_model_1.model")

In [10]:
# Precompute L2-normalized vectors.
model.init_sims(replace=True)

In [75]:
# Extract the Vectors of Items in the Vocab 
vocab = model[model.wv.vocab]

In [55]:
# Test Reccomendations
dataset[dataset['Book-Title'].str.contains('Dreamcatcher')].sample()

Unnamed: 0,User-ID,ISBN,Book-Rating,Location,Age,Book-Title,Book-Author,Year-Of-Publication,Publisher
333937,82955,743467523,9,usa,48,Dreamcatcher,Stephen King,2003,Pocket


In [77]:
similar_isbns = model.similar_by_vector('0743467523')
similar_books = []
for isbn in similar_isbns:
    pair = (all_books_dict[isbn[0]][0], isbn[1])
    similar_books.append(pair)
print("Books similar to Dreamcatcher by Stephen King")
similar_books

Books similar to Dreamcatcher by Stephen King


[('Sleepwalk', 0.7612367868423462),
 ('Wizard and Glass (The Dark Tower, Book 4)', 0.7372331023216248),
 ('The Formula', 0.7296830415725708),
 ('Outrage: The Five Reasons Why O. J. Simpson Got Away With Murder',
  0.7212274074554443),
 ('The Regulators', 0.7150816321372986),
 ('Night Shift', 0.7086629867553711),
 ('Masquerade', 0.7068302631378174),
 ('The Servants of Twilight', 0.7037485837936401),
 ('The Blooding', 0.7024586200714111),
 ('The Courtship of Princess Leia (Star Wars)', 0.7002884149551392)]