In [2]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split

In [3]:
# Load the book metadata CSV and preview its first five rows.
books_df = pd.read_csv('book600k-700k.csv', sep=',')
books_df.head(n=5)

Unnamed: 0,Id,Name,Authors,ISBN,Rating,PublishYear,PublishMonth,PublishDay,Publisher,RatingDist5,RatingDist4,RatingDist3,RatingDist2,RatingDist1,RatingDistTotal,CountsOfReview,Language,pagesNumber,Description
0,600000,"Lessons Learned (Great Chefs, #2)",Nora Roberts,037351025X,3.74,1993,15,2,Silhouette,5:947,4:1016,3:1061,2:287,1:63,total:3374,86,eng,250,LESSONS LEARNED...<br /><br />Coordinating the...
1,600001,Walking by Faith: Lessons Learned in the Dark,Jennifer Rothschild,0633099325,4.27,2003,1,1,Lifeway Church Resources,5:367,4:246,3:109,2:22,1:5,total:749,7,,112,"At the age of fifteen, Jennifer Rothschild con..."
2,600003,Better Health in Africa: Experience and Lesson...,World Bank Group,0821328174,5.0,1994,1,1,World Bank Publications,5:1,4:0,3:0,2:0,1:0,total:1,1,,240,
3,600004,"The Blended Learning Book: Best Practices, Pro...",Josh Bersin,0787972967,4.1,2004,1,10,Pfeiffer,5:8,4:6,3:6,2:0,1:0,total:20,3,,319,<i>The Blended Learning Book</i> is your user'...
4,600005,Lessons Learned: Successes Achieved: Be Prepar...,Robert G. Gillio,0595417566,3.0,2006,30,11,iUniverse,5:0,4:0,3:1,2:0,1:0,total:1,0,,168,"""<b>Lessons Learned: Successes Achieved</b>"" w..."


In [4]:
# Load the user ratings CSV and preview its first five rows.
ratings_df = pd.read_csv('user_rating_0_to_1000.csv', sep=',')
ratings_df.head(n=5)

Unnamed: 0,ID,Name,Rating
0,1,Agile Web Development with Rails: A Pragmatic ...,it was amazing
1,1,The Restaurant at the End of the Universe (Hit...,it was amazing
2,1,Siddhartha,it was amazing
3,1,The Clock of the Long Now: Time and Responsibi...,really liked it
4,1,"Ready Player One (Ready Player One, #1)",really liked it


In [5]:
# Show the column labels of the books frame.
books_df.columns

Index(['Id', 'Name', 'Authors', 'ISBN', 'Rating', 'PublishYear',
       'PublishMonth', 'PublishDay', 'Publisher', 'RatingDist5', 'RatingDist4',
       'RatingDist3', 'RatingDist2', 'RatingDist1', 'RatingDistTotal',
       'CountsOfReview', 'Language', 'pagesNumber', 'Description'],
      dtype='object')

In [6]:
# Inner-join books and user ratings on the book title. Both frames carry a
# 'Rating' column, so pandas suffixes them as 'Rating_x' (from books_df) and
# 'Rating_y' (from ratings_df).
merged_df = books_df.merge(ratings_df, on='Name', how='inner')
merged_df.head(n=5)

Unnamed: 0,Id,Name,Authors,ISBN,Rating_x,PublishYear,PublishMonth,PublishDay,Publisher,RatingDist5,...,RatingDist3,RatingDist2,RatingDist1,RatingDistTotal,CountsOfReview,Language,pagesNumber,Description,ID,Rating_y
0,600143,Telling Secrets,Jane Orleman,878687297,0.0,1998,1,1,CWLA Press (Child Welfare League of America),5:0,...,3:0,2:0,1:0,total:0,0,,115,Telling Secrets presents the art and dreams of...,850,really liked it
1,600188,The Turn of the Screw,Susan Conforti,883014084,1.0,1979,1,7,Pendulum Press,5:0,...,3:0,2:0,1:1,total:1,0,,64,,166,really liked it
2,600188,The Turn of the Screw,Susan Conforti,883014084,1.0,1979,1,7,Pendulum Press,5:0,...,3:0,2:0,1:1,total:1,0,,64,,192,liked it
3,600188,The Turn of the Screw,Susan Conforti,883014084,1.0,1979,1,7,Pendulum Press,5:0,...,3:0,2:0,1:1,total:1,0,,64,,260,it was ok
4,600188,The Turn of the Screw,Susan Conforti,883014084,1.0,1979,1,7,Pendulum Press,5:0,...,3:0,2:0,1:1,total:1,0,,64,,284,really liked it


In [7]:
# Show the column labels of the merged frame (including the suffixed rating columns).
merged_df.columns

Index(['Id', 'Name', 'Authors', 'ISBN', 'Rating_x', 'PublishYear',
       'PublishMonth', 'PublishDay', 'Publisher', 'RatingDist5', 'RatingDist4',
       'RatingDist3', 'RatingDist2', 'RatingDist1', 'RatingDistTotal',
       'CountsOfReview', 'Language', 'pagesNumber', 'Description', 'ID',
       'Rating_y'],
      dtype='object')

In [8]:
# Give the two identifier columns distinct, descriptive names:
# 'Id' is the book's ID, while 'ID' is the ID of the user who rated it.
merged_df.rename(columns={'Id': 'bookID', 'ID': 'userID'}, inplace=True)
merged_df.columns

Index(['bookID', 'Name', 'Authors', 'ISBN', 'Rating_x', 'PublishYear',
       'PublishMonth', 'PublishDay', 'Publisher', 'RatingDist5', 'RatingDist4',
       'RatingDist3', 'RatingDist2', 'RatingDist1', 'RatingDistTotal',
       'CountsOfReview', 'Language', 'pagesNumber', 'Description', 'userID',
       'Rating_y'],
      dtype='object')

In [9]:
# Count the missing entries in every column of the merged frame.
missing_per_column = merged_df.isna().sum()
print(missing_per_column)

bookID                0
Name                  0
Authors               0
ISBN                162
Rating_x              0
PublishYear           0
PublishMonth          0
PublishDay            0
Publisher            12
RatingDist5           0
RatingDist4           0
RatingDist3           0
RatingDist2           0
RatingDist1           0
RatingDistTotal       0
CountsOfReview        0
Language           2232
pagesNumber           0
Description         218
userID                0
Rating_y              0
dtype: int64


In [10]:
# For ISBN and Publisher we can ignore the missing values, as these columns are not really relevant to our analysis.
# Language, however, can be quite important for a book recommendation system, as it directly affects user preferences
# and accessibility.

# Mode imputation for missing values:
# The filled column is assigned back to the frame rather than calling
# fillna(inplace=True) on the `merged_df['Language']` selection. That chained
# in-place form raises a FutureWarning on recent pandas 2.x and leaves
# merged_df unchanged once Copy-on-Write is enabled (the default in pandas 3.0).
mode_language = merged_df['Language'].mode().iloc[0]
merged_df['Language'] = merged_df['Language'].fillna(mode_language)

In [11]:
# Tokenizing and lemmatizing columns:

# Cache for the NLTK resources so they are built once, not once per row.
_NLP_RESOURCES = {}


def _get_nlp_resources():
    """Return the (English stop-word set, lemmatizer) pair, building it on first use."""
    if not _NLP_RESOURCES:
        _NLP_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _NLP_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _NLP_RESOURCES['stop_words'], _NLP_RESOURCES['lemmatizer']


def tokenize_and_lemmatize(col):
    """Turn one cell value into a list of lower-cased, lemmatized word tokens.

    The value is converted to a string, lower-cased and tokenized with
    ``word_tokenize``. Tokens that are not purely alphabetic or that are
    English stop words are dropped, and the remaining tokens are lemmatized
    with ``WordNetLemmatizer``.

    Parameters
    ----------
    col : object
        A single cell value (typically a string). Missing values
        (``None``, ``NaN``, ``pd.NA``) are accepted.

    Returns
    -------
    list of str
        The lemmatized tokens; an empty list for a missing value.

    Notes
    -----
    NOTE(review): the NLTK tokenizer, stop-word and WordNet data must already
    be installed; no ``nltk.download`` call is visible in this notebook.
    """
    # A missing cell must not be stringified: str(float('nan')) is 'nan',
    # which would otherwise survive the filters as a bogus token.
    if pd.api.types.is_scalar(col) and pd.isna(col):
        return []

    stop_words, lemmatizer = _get_nlp_resources()
    tokens = word_tokenize(str(col).lower())
    # Keep alphabetic, non-stop-word tokens and lemmatize them in one pass.
    return [
        lemmatizer.lemmatize(word)
        for word in tokens
        if word.isalpha() and word not in stop_words
    ]

In [13]:
# Apply the tokenizing and lemmatization function to the "text" cols.
# Each entry maps the new processed column to the raw column it is built from.
# Publisher is not that relevant for our analysis, but it is processed too
# just in case we use it later on.
processed_to_source = {
    'Processed_Name': 'Name',
    'Processed_Authors': 'Authors',
    'Processed_Publisher': 'Publisher',
    'Processed_Rating': 'Rating_y',
    'Processed_Description': 'Description',
}
for processed_col, source_col in processed_to_source.items():
    merged_df[processed_col] = merged_df[source_col].apply(tokenize_and_lemmatize)

In [14]:
# Compare the raw user rating text with its processed token list (first five rows).
rating_preview = merged_df[['Rating_y', 'Processed_Rating']].head()
print(rating_preview)

          Rating_y Processed_Rating
0  really liked it  [really, liked]
1  really liked it  [really, liked]
2         liked it          [liked]
3        it was ok             [ok]
4  really liked it  [really, liked]


In [15]:
# Shuffle the merged frame and hold out 20% of its rows as a validation set.
df_train_set, df_valid_set = train_test_split(
    merged_df,
    test_size=0.2,
    shuffle=True,
    random_state=42,  # fixed seed for reproducibility
)