In [47]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import NearestNeighbors


import warnings
warnings.filterwarnings("ignore")

In [48]:
data = pd.read_csv("book_with_genre_dataset.csv")

##### VIEWING THE IMPORTED DATASET


In [49]:
data.head(5)

Unnamed: 0,Book Id,Title,Author,average_rating,isbn,isbn13,language_code,num_pages,ratings_count,text_reviews_count,publication_date,publisher,genres
0,1,Harry Potter and the Half-Blood Prince (Harry ...,J.K. Rowling/Mary GrandPré,4.57,0439785960,9780439785969,eng,652,2095690,27591,9/16/2006,Scholastic Inc.,"Fantasy;Young Adult;Fiction;Fantasy,Magic;Chil..."
1,2,Harry Potter and the Order of the Phoenix (Har...,J.K. Rowling/Mary GrandPré,4.49,0439358078,9780439358071,eng,870,2153167,29221,9/1/2004,Scholastic Inc.,"Fantasy;Young Adult;Fiction;Fantasy,Magic;Chil..."
2,4,Harry Potter and the Chamber of Secrets (Harry...,J.K. Rowling,4.42,0439554896,9780439554893,eng,352,6333,244,11/1/2003,Scholastic,"Fantasy;Fiction;Young Adult;Fantasy,Magic;Chil..."
3,5,Harry Potter and the Prisoner of Azkaban (Harr...,J.K. Rowling/Mary GrandPré,4.56,043965548X,9780439655484,eng,435,2339585,36325,5/1/2004,Scholastic Inc.,"Fantasy;Fiction;Young Adult;Fantasy,Magic;Chil..."
4,8,Harry Potter Boxed Set Books 1-5 (Harry Potte...,J.K. Rowling/Mary GrandPré,4.78,0439682584,9780439682589,eng,2690,41428,164,9/13/2004,Scholastic,"Fantasy;Young Adult;Fiction;Fantasy,Magic;Adve..."


#### DATA PREPROCESSING

In [50]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11127 entries, 0 to 11126
Data columns (total 13 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Book Id             11127 non-null  int64  
 1   Title               11127 non-null  object 
 2   Author              11127 non-null  object 
 3   average_rating      11127 non-null  float64
 4   isbn                11127 non-null  object 
 5   isbn13              11127 non-null  int64  
 6   language_code       11127 non-null  object 
 7   num_pages           11127 non-null  int64  
 8   ratings_count       11127 non-null  int64  
 9   text_reviews_count  11127 non-null  int64  
 10  publication_date    11127 non-null  object 
 11  publisher           11127 non-null  object 
 12  genres              11030 non-null  object 
dtypes: float64(1), int64(5), object(7)
memory usage: 1.1+ MB


In [51]:
data.describe()

Unnamed: 0,Book Id,average_rating,isbn13,num_pages,ratings_count,text_reviews_count
count,11127.0,11127.0,11127.0,11127.0,11127.0,11127.0
mean,21310.938887,3.933631,9759888000000.0,336.376921,17936.41,541.854498
std,13093.358023,0.352445,442896400000.0,241.127305,112479.4,2576.176608
min,1.0,0.0,8987060000.0,0.0,0.0,0.0
25%,10287.0,3.77,9780345000000.0,192.0,104.0,9.0
50%,20287.0,3.96,9780586000000.0,299.0,745.0,46.0
75%,32104.5,4.135,9780873000000.0,416.0,4993.5,237.5
max,45641.0,5.0,9790008000000.0,6576.0,4597666.0,94265.0


Checking for missing values

In [52]:
data.isnull().sum()

Book Id                0
Title                  0
Author                 0
average_rating         0
isbn                   0
isbn13                 0
language_code          0
num_pages              0
ratings_count          0
text_reviews_count     0
publication_date       0
publisher              0
genres                97
dtype: int64

In [53]:
data = data.dropna(subset=['genres'])

In [54]:
data.head(2)

Unnamed: 0,Book Id,Title,Author,average_rating,isbn,isbn13,language_code,num_pages,ratings_count,text_reviews_count,publication_date,publisher,genres
0,1,Harry Potter and the Half-Blood Prince (Harry ...,J.K. Rowling/Mary GrandPré,4.57,439785960,9780439785969,eng,652,2095690,27591,9/16/2006,Scholastic Inc.,"Fantasy;Young Adult;Fiction;Fantasy,Magic;Chil..."
1,2,Harry Potter and the Order of the Phoenix (Har...,J.K. Rowling/Mary GrandPré,4.49,439358078,9780439358071,eng,870,2153167,29221,9/1/2004,Scholastic Inc.,"Fantasy;Young Adult;Fiction;Fantasy,Magic;Chil..."


In [55]:
data.drop(["isbn", "isbn13", "ratings_count", "text_reviews_count"], axis=1, inplace=True)

In [56]:
data.head()

Unnamed: 0,Book Id,Title,Author,average_rating,language_code,num_pages,publication_date,publisher,genres
0,1,Harry Potter and the Half-Blood Prince (Harry ...,J.K. Rowling/Mary GrandPré,4.57,eng,652,9/16/2006,Scholastic Inc.,"Fantasy;Young Adult;Fiction;Fantasy,Magic;Chil..."
1,2,Harry Potter and the Order of the Phoenix (Har...,J.K. Rowling/Mary GrandPré,4.49,eng,870,9/1/2004,Scholastic Inc.,"Fantasy;Young Adult;Fiction;Fantasy,Magic;Chil..."
2,4,Harry Potter and the Chamber of Secrets (Harry...,J.K. Rowling,4.42,eng,352,11/1/2003,Scholastic,"Fantasy;Fiction;Young Adult;Fantasy,Magic;Chil..."
3,5,Harry Potter and the Prisoner of Azkaban (Harr...,J.K. Rowling/Mary GrandPré,4.56,eng,435,5/1/2004,Scholastic Inc.,"Fantasy;Fiction;Young Adult;Fantasy,Magic;Chil..."
4,8,Harry Potter Boxed Set Books 1-5 (Harry Potte...,J.K. Rowling/Mary GrandPré,4.78,eng,2690,9/13/2004,Scholastic,"Fantasy;Young Adult;Fiction;Fantasy,Magic;Adve..."


In [57]:
data["genres"] = data["genres"].str.split(";")




In [58]:
def split_with_comma(value:list):

  new_list = []

  for i in range(len(value)):

      if "," in value[i]:
          temp_value = value[i]

          new_list.extend(temp_value.split(","))

      else:
          new_list.append(value[i])

  return new_list



data["genres"] = data["genres"].apply(split_with_comma)

GETTING ALL UNIQUE GENRES AND ADDING THEM TO A SET

In [59]:
unique_genres = set()

for genres in data['genres']:
  unique_genres.update(genres)


unique_genres

{'12th Century',
 '13th Century',
 '14th Century',
 '15th Century',
 '16th Century',
 '17th Century',
 '18th Century',
 '1961-1975',
 '19th Century',
 '1st Grade',
 '20th Century',
 '21st Century',
 '2nd Grade',
 '40k',
 'AUTOBIOGRAPHY',
 'Abuse',
 'Academia',
 'Academic',
 'Academics',
 'Accounting',
 'Action',
 'Activism',
 'Activities',
 'Adaptations',
 'Adhd',
 'Adolescence',
 'Adoption',
 'Adult',
 'Adult Fiction',
 'Adventure',
 'Africa',
 'African American',
 'African American Literature',
 'African American Romance',
 'African Literature',
 'Agriculture',
 'Alchemy',
 'Alcohol',
 'Algeria',
 'Algorithms',
 'Aliens',
 'Alternate History',
 'Alternative Medicine',
 'Amateur Sleuth',
 'Amazon',
 'American',
 'American Civil War',
 'American Fiction',
 'American History',
 'American Revolution',
 'American Revolutionary War',
 'American fiction',
 'American literature',
 'Americana',
 'Amish',
 'Anarchism',
 'Ancient',
 'Ancient History',
 'Angels',
 'Anglo Saxon',
 'Animal Fiction

In [60]:
for genre in unique_genres:
    data[genre] = data['genres'].apply(lambda x: 1 if genre in x else 0)

In [61]:
preprocessed_data = data.copy()

In [62]:
preprocessed_data.drop(["language_code", "num_pages", "publication_date", "average_rating", "Author", "Title", "publisher"], axis=1, inplace=True)

In [63]:
preprocessed_data.head(2)

Unnamed: 0,Book Id,genres,Comic Book,M F Romance,Comic Strips,American History,Ancient History,Geography,Anthropomorphic,Classical Studies,...,International Development,Holocaust,Hungarian Literature,Jazz,Love,Cozy Mystery,Alcohol,Social Work,Classics,German Literature
0,1,"[Fantasy, Young Adult, Fiction, Fantasy, Magic...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,2,"[Fantasy, Young Adult, Fiction, Fantasy, Magic...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [64]:
preprocessed_data.set_index("Book Id", inplace=True)

In [70]:
preprocessed_data.drop("genres", axis=1, inplace=True)

In [71]:
preprocessed_data

Unnamed: 0_level_0,Comic Book,M F Romance,Comic Strips,American History,Ancient History,Geography,Anthropomorphic,Classical Studies,13th Century,Fairies,...,International Development,Holocaust,Hungarian Literature,Jazz,Love,Cozy Mystery,Alcohol,Social Work,Classics,German Literature
Book Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45631,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
45633,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
45634,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
45639,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NOW WE CAN BUILD OUR RECOMMENDER

In [105]:
preprocessed_data.loc[1]

Comic Book           0
M F Romance          0
Comic Strips         0
American History     0
Ancient History      0
                    ..
Cozy Mystery         0
Alcohol              0
Social Work          0
Classics             1
German Literature    0
Name: 1, Length: 897, dtype: int64

In [126]:
from sklearn.neighbors import NearestNeighbors


nn_model = NearestNeighbors(metric="minkowski")
nn_model.fit(preprocessed_data)

target_book_field = preprocessed_data.loc[28].values.reshape(1, -1)



distances, neighbors = nn_model.kneighbors(target_book_field, n_neighbors=30)

In [127]:
neighbors

array([[   19, 10432,    17,    15,  2806, 10429,    18, 10434, 10440,
         2981,  5175,  4480,  9080, 10433,  1930,  9631,  1072,  1594,
         2739,  4800,  6146,  7571,  5178,  2743,  5166,  4652,  2637,
         4801, 10035,  9606]])

In [131]:
data.iloc[17]

Book Id                                                             26
Title                The Lost Continent: Travels in Small Town America
Author                                                     Bill Bryson
average_rating                                                    3.83
language_code                                                      eng
                                           ...                        
Cozy Mystery                                                         0
Alcohol                                                              0
Social Work                                                          0
Classics                                                             0
German Literature                                                    0
Name: 17, Length: 906, dtype: object

In [130]:
row = data.loc[data['Book Id'] == 10432]


list(row["genres"])

[]

In [125]:
data.head(20)

Unnamed: 0,Book Id,Title,Author,average_rating,language_code,num_pages,publication_date,publisher,genres,Comic Book,...,International Development,Holocaust,Hungarian Literature,Jazz,Love,Cozy Mystery,Alcohol,Social Work,Classics,German Literature
0,1,Harry Potter and the Half-Blood Prince (Harry ...,J.K. Rowling/Mary GrandPré,4.57,eng,652,9/16/2006,Scholastic Inc.,"[Fantasy, Young Adult, Fiction, Fantasy, Magic...",0,...,0,0,0,0,0,0,0,0,1,0
1,2,Harry Potter and the Order of the Phoenix (Har...,J.K. Rowling/Mary GrandPré,4.49,eng,870,9/1/2004,Scholastic Inc.,"[Fantasy, Young Adult, Fiction, Fantasy, Magic...",0,...,0,0,0,0,0,0,0,0,1,0
2,4,Harry Potter and the Chamber of Secrets (Harry...,J.K. Rowling,4.42,eng,352,11/1/2003,Scholastic,"[Fantasy, Fiction, Young Adult, Fantasy, Magic...",0,...,0,0,0,0,0,0,0,0,1,0
3,5,Harry Potter and the Prisoner of Azkaban (Harr...,J.K. Rowling/Mary GrandPré,4.56,eng,435,5/1/2004,Scholastic Inc.,"[Fantasy, Fiction, Young Adult, Fantasy, Magic...",0,...,0,0,0,0,0,0,0,0,1,0
4,8,Harry Potter Boxed Set Books 1-5 (Harry Potte...,J.K. Rowling/Mary GrandPré,4.78,eng,2690,9/13/2004,Scholastic,"[Fantasy, Young Adult, Fiction, Fantasy, Magic...",0,...,0,0,0,0,0,0,0,0,0,0
5,9,"Unauthorized Harry Potter Book Seven News: ""Ha...",W. Frederick Zimmerman,3.74,en-US,152,4/26/2005,Nimble Books,[Fiction],0,...,0,0,0,0,0,0,0,0,0,0
6,10,Harry Potter Collection (Harry Potter #1-6),J.K. Rowling,4.73,eng,3342,9/12/2005,Scholastic,"[Fantasy, Fiction, Young Adult, Fantasy, Magic...",0,...,0,0,0,0,0,0,0,0,1,0
7,12,The Ultimate Hitchhiker's Guide: Five Complete...,Douglas Adams,4.38,eng,815,11/1/2005,Gramercy Books,"[Science Fiction, Fiction, Humor, Fantasy, Cla...",0,...,0,0,0,0,0,0,0,0,1,0
8,13,The Ultimate Hitchhiker's Guide to the Galaxy ...,Douglas Adams,4.38,eng,815,4/30/2002,Del Rey Books,"[Science Fiction, Fiction, Humor, Fantasy, Cla...",0,...,0,0,0,0,0,0,0,0,1,0
9,14,The Hitchhiker's Guide to the Galaxy (Hitchhik...,Douglas Adams,4.22,eng,215,8/3/2004,Crown,"[Science Fiction, Fiction, Humor, Classics, Fa...",0,...,0,0,0,0,0,0,0,0,1,0
